1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM) && defined(__AARCH64EL__) && defined(__ELF__)
12#if defined(BORINGSSL_PREFIX)
13#include <boringssl_prefix_symbols_asm.h>
14#endif
15#include <openssl/arm_arch.h>
16.section	.rodata
17
18.align	7
19.Lchacha20_consts:
20.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
21.Linc:
22.long	1,2,3,4
23.Lrol8:
24.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
25.Lclamp:
26.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
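    // Constants used below: .Lchacha20_consts is the "expand 32-byte k" sigma,
    // .Linc supplies per-block counter increments for the four vertically-computed
    // blocks, .Lrol8 is a byte-shuffle table implementing a 32-bit rotate left by 8
    // (used with tbl), and .Lclamp is the standard Poly1305 clamp mask applied to r.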
27
28.text
29
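// Poly1305 absorption of the AAD. As used below, the AAD pointer is in x3 and
// its length in x4, the accumulator in x10:x9:x8, the clamped r key in x17:x16
// and the constant 1 in x15; whole 16-byte blocks are hashed directly and any
// tail is zero-padded into v20 before the final multiply-and-reduce.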
30.type	.Lpoly_hash_ad_internal,%function
31.align	6
32.Lpoly_hash_ad_internal:
33.cfi_startproc
34	cbnz	x4, .Lpoly_hash_intro
35	ret
36
37.Lpoly_hash_intro:
38	cmp	x4, #16
39	b.lt	.Lpoly_hash_ad_tail
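	// One Poly1305 block: add the 16 message bytes (plus the implicit 2^128 pad
	// bit via x15 = 1) into the accumulator, multiply by r = x17:x16 into the
	// 256-bit product t3:t2:t1:t0, then reduce lazily mod 2^130 - 5 by folding
	// the bits above 2^130 back in as 5*h = 4*h + h, leaving acc2 only a few bits.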
40	ldp	x11, x12, [x3], 16
41	adds	x8, x8, x11
42	adcs	x9, x9, x12
43	adc	x10, x10, x15
44	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
45	umulh	x12, x8, x16
46	mul	x13, x9, x16
47	umulh	x14, x9, x16
48	adds	x12, x12, x13
49	mul	x13, x10, x16
50	adc	x13, x13, x14
51	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
52	umulh	x8, x8, x17
53	adds	x12, x12, x14
54	mul	x14, x9, x17
55	umulh	x9, x9, x17
56	adcs	x14, x14, x8
57	mul	x10, x10, x17
58	adc	x10, x10, x9
59	adds	x13, x13, x14
60	adc	x14, x10, xzr
61	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
62	and	x8, x13, #-4
63	extr	x13, x14, x13, #2
64	adds	x8, x8, x11
65	lsr	x11, x14, #2
66	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
67	adds	x8, x8, x13
68	adcs	x9, x9, x12
69	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
70	sub	x4, x4, #16
71	b	.Lpoly_hash_ad_internal
72
73.Lpoly_hash_ad_tail:
74	cbz	x4, .Lpoly_hash_ad_ret
75
76	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
77	sub	x4, x4, #1
78
79.Lpoly_hash_tail_16_compose:
80	ext	v20.16b, v20.16b, v20.16b, #15
81	ldrb	w11, [x3, x4]
82	mov	v20.b[0], w11
83	subs	x4, x4, #1
84	b.ge	.Lpoly_hash_tail_16_compose
85	mov	x11, v20.d[0]
86	mov	x12, v20.d[1]
87	adds	x8, x8, x11
88	adcs	x9, x9, x12
89	adc	x10, x10, x15
90	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
91	umulh	x12, x8, x16
92	mul	x13, x9, x16
93	umulh	x14, x9, x16
94	adds	x12, x12, x13
95	mul	x13, x10, x16
96	adc	x13, x13, x14
97	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
98	umulh	x8, x8, x17
99	adds	x12, x12, x14
100	mul	x14, x9, x17
101	umulh	x9, x9, x17
102	adcs	x14, x14, x8
103	mul	x10, x10, x17
104	adc	x10, x10, x9
105	adds	x13, x13, x14
106	adc	x14, x10, xzr
107	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
108	and	x8, x13, #-4
109	extr	x13, x14, x13, #2
110	adds	x8, x8, x11
111	lsr	x11, x14, #2
112	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
113	adds	x8, x8, x13
114	adcs	x9, x9, x12
115	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
116
117.Lpoly_hash_ad_ret:
118	ret
119.cfi_endproc
120.size	.Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal
121
122/////////////////////////////////
123//
124// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
125//
126.globl	chacha20_poly1305_seal
127.hidden	chacha20_poly1305_seal
128.type	chacha20_poly1305_seal,%function
129.align	6
130chacha20_poly1305_seal:
131	AARCH64_SIGN_LINK_REGISTER
132.cfi_startproc
133	stp	x29, x30, [sp, #-80]!
134.cfi_def_cfa_offset	80
135.cfi_offset	w30, -72
136.cfi_offset	w29, -80
137	mov	x29, sp
138    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
139    // we don't actually use the frame pointer like that, it's probably not
140    // worth bothering.
141	stp	d8, d9, [sp, #16]
142	stp	d10, d11, [sp, #32]
143	stp	d12, d13, [sp, #48]
144	stp	d14, d15, [sp, #64]
145.cfi_offset	b15, -8
146.cfi_offset	b14, -16
147.cfi_offset	b13, -24
148.cfi_offset	b12, -32
149.cfi_offset	b11, -40
150.cfi_offset	b10, -48
151.cfi_offset	b9, -56
152.cfi_offset	b8, -64
153
154	adrp	x11, .Lchacha20_consts
155	add	x11, x11, :lo12:.Lchacha20_consts
156
157	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
158	ld1	{v28.16b - v30.16b}, [x5]
159
160	mov	x15, #1 // Prepare the Poly1305 state
161	mov	x8, #0
162	mov	x9, #0
163	mov	x10, #0
164
165	ldr	x12, [x5, #56]   // The total cipher text length includes extra_in_len
166	add	x12, x12, x2
167	mov	v31.d[0], x4  // Store the input and aad lengths
168	mov	v31.d[1], x12
169
170	cmp	x2, #128
171	b.le	.Lseal_128 // Optimization for smaller buffers
172
173    // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
174    // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
175    // the fifth block (A4-D4) horizontally.
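    // ld4r de-interleaves four consecutive 32-bit state words and replicates each
    // across a full vector, so v0-v3, v5-v8, v10-v13 and v15-v18 each hold one
    // state word for all four vertical blocks, while v4/v9/v14/v19 hold the fifth
    // block in the usual row layout. Adding v25 (.Linc) gives the four vertical
    // blocks distinct counters.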
176	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
177	mov	v4.16b, v24.16b
178
179	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
180	mov	v9.16b, v28.16b
181
182	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
183	mov	v14.16b, v29.16b
184
185	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
186	add	v15.4s, v15.4s, v25.4s
187	mov	v19.16b, v30.16b
188
189	sub	x5, x5, #32
190
191	mov	x6, #10
192
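    // Each iteration below is one ChaCha20 double round (10 iterations = 20 rounds).
    // The rotations are implemented as: rev32 on .8h lanes = rotate left 16, tbl
    // with the .Lrol8 table = rotate left 8, and the ushr/sli pairs = rotates left
    // by 12 and 7. For the vertical blocks the diagonal step is expressed by
    // re-pairing registers; the horizontal fifth block uses the ext instructions.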
193.align	5
194.Lseal_init_rounds:
195	add	v0.4s, v0.4s, v5.4s
196	add	v1.4s, v1.4s, v6.4s
197	add	v2.4s, v2.4s, v7.4s
198	add	v3.4s, v3.4s, v8.4s
199	add	v4.4s, v4.4s, v9.4s
200
201	eor	v15.16b, v15.16b, v0.16b
202	eor	v16.16b, v16.16b, v1.16b
203	eor	v17.16b, v17.16b, v2.16b
204	eor	v18.16b, v18.16b, v3.16b
205	eor	v19.16b, v19.16b, v4.16b
206
207	rev32	v15.8h, v15.8h
208	rev32	v16.8h, v16.8h
209	rev32	v17.8h, v17.8h
210	rev32	v18.8h, v18.8h
211	rev32	v19.8h, v19.8h
212
213	add	v10.4s, v10.4s, v15.4s
214	add	v11.4s, v11.4s, v16.4s
215	add	v12.4s, v12.4s, v17.4s
216	add	v13.4s, v13.4s, v18.4s
217	add	v14.4s, v14.4s, v19.4s
218
219	eor	v5.16b, v5.16b, v10.16b
220	eor	v6.16b, v6.16b, v11.16b
221	eor	v7.16b, v7.16b, v12.16b
222	eor	v8.16b, v8.16b, v13.16b
223	eor	v9.16b, v9.16b, v14.16b
224
225	ushr	v20.4s, v5.4s, #20
226	sli	v20.4s, v5.4s, #12
227	ushr	v5.4s, v6.4s, #20
228	sli	v5.4s, v6.4s, #12
229	ushr	v6.4s, v7.4s, #20
230	sli	v6.4s, v7.4s, #12
231	ushr	v7.4s, v8.4s, #20
232	sli	v7.4s, v8.4s, #12
233	ushr	v8.4s, v9.4s, #20
234	sli	v8.4s, v9.4s, #12
235
236	add	v0.4s, v0.4s, v20.4s
237	add	v1.4s, v1.4s, v5.4s
238	add	v2.4s, v2.4s, v6.4s
239	add	v3.4s, v3.4s, v7.4s
240	add	v4.4s, v4.4s, v8.4s
241
242	eor	v15.16b, v15.16b, v0.16b
243	eor	v16.16b, v16.16b, v1.16b
244	eor	v17.16b, v17.16b, v2.16b
245	eor	v18.16b, v18.16b, v3.16b
246	eor	v19.16b, v19.16b, v4.16b
247
248	tbl	v15.16b, {v15.16b}, v26.16b
249	tbl	v16.16b, {v16.16b}, v26.16b
250	tbl	v17.16b, {v17.16b}, v26.16b
251	tbl	v18.16b, {v18.16b}, v26.16b
252	tbl	v19.16b, {v19.16b}, v26.16b
253
254	add	v10.4s, v10.4s, v15.4s
255	add	v11.4s, v11.4s, v16.4s
256	add	v12.4s, v12.4s, v17.4s
257	add	v13.4s, v13.4s, v18.4s
258	add	v14.4s, v14.4s, v19.4s
259
260	eor	v20.16b, v20.16b, v10.16b
261	eor	v5.16b, v5.16b, v11.16b
262	eor	v6.16b, v6.16b, v12.16b
263	eor	v7.16b, v7.16b, v13.16b
264	eor	v8.16b, v8.16b, v14.16b
265
266	ushr	v9.4s, v8.4s, #25
267	sli	v9.4s, v8.4s, #7
268	ushr	v8.4s, v7.4s, #25
269	sli	v8.4s, v7.4s, #7
270	ushr	v7.4s, v6.4s, #25
271	sli	v7.4s, v6.4s, #7
272	ushr	v6.4s, v5.4s, #25
273	sli	v6.4s, v5.4s, #7
274	ushr	v5.4s, v20.4s, #25
275	sli	v5.4s, v20.4s, #7
276
277	ext	v9.16b, v9.16b, v9.16b, #4
278	ext	v14.16b, v14.16b, v14.16b, #8
279	ext	v19.16b, v19.16b, v19.16b, #12
280	add	v0.4s, v0.4s, v6.4s
281	add	v1.4s, v1.4s, v7.4s
282	add	v2.4s, v2.4s, v8.4s
283	add	v3.4s, v3.4s, v5.4s
284	add	v4.4s, v4.4s, v9.4s
285
286	eor	v18.16b, v18.16b, v0.16b
287	eor	v15.16b, v15.16b, v1.16b
288	eor	v16.16b, v16.16b, v2.16b
289	eor	v17.16b, v17.16b, v3.16b
290	eor	v19.16b, v19.16b, v4.16b
291
292	rev32	v18.8h, v18.8h
293	rev32	v15.8h, v15.8h
294	rev32	v16.8h, v16.8h
295	rev32	v17.8h, v17.8h
296	rev32	v19.8h, v19.8h
297
298	add	v12.4s, v12.4s, v18.4s
299	add	v13.4s, v13.4s, v15.4s
300	add	v10.4s, v10.4s, v16.4s
301	add	v11.4s, v11.4s, v17.4s
302	add	v14.4s, v14.4s, v19.4s
303
304	eor	v6.16b, v6.16b, v12.16b
305	eor	v7.16b, v7.16b, v13.16b
306	eor	v8.16b, v8.16b, v10.16b
307	eor	v5.16b, v5.16b, v11.16b
308	eor	v9.16b, v9.16b, v14.16b
309
310	ushr	v20.4s, v6.4s, #20
311	sli	v20.4s, v6.4s, #12
312	ushr	v6.4s, v7.4s, #20
313	sli	v6.4s, v7.4s, #12
314	ushr	v7.4s, v8.4s, #20
315	sli	v7.4s, v8.4s, #12
316	ushr	v8.4s, v5.4s, #20
317	sli	v8.4s, v5.4s, #12
318	ushr	v5.4s, v9.4s, #20
319	sli	v5.4s, v9.4s, #12
320
321	add	v0.4s, v0.4s, v20.4s
322	add	v1.4s, v1.4s, v6.4s
323	add	v2.4s, v2.4s, v7.4s
324	add	v3.4s, v3.4s, v8.4s
325	add	v4.4s, v4.4s, v5.4s
326
327	eor	v18.16b, v18.16b, v0.16b
328	eor	v15.16b, v15.16b, v1.16b
329	eor	v16.16b, v16.16b, v2.16b
330	eor	v17.16b, v17.16b, v3.16b
331	eor	v19.16b, v19.16b, v4.16b
332
333	tbl	v18.16b, {v18.16b}, v26.16b
334	tbl	v15.16b, {v15.16b}, v26.16b
335	tbl	v16.16b, {v16.16b}, v26.16b
336	tbl	v17.16b, {v17.16b}, v26.16b
337	tbl	v19.16b, {v19.16b}, v26.16b
338
339	add	v12.4s, v12.4s, v18.4s
340	add	v13.4s, v13.4s, v15.4s
341	add	v10.4s, v10.4s, v16.4s
342	add	v11.4s, v11.4s, v17.4s
343	add	v14.4s, v14.4s, v19.4s
344
345	eor	v20.16b, v20.16b, v12.16b
346	eor	v6.16b, v6.16b, v13.16b
347	eor	v7.16b, v7.16b, v10.16b
348	eor	v8.16b, v8.16b, v11.16b
349	eor	v5.16b, v5.16b, v14.16b
350
351	ushr	v9.4s, v5.4s, #25
352	sli	v9.4s, v5.4s, #7
353	ushr	v5.4s, v8.4s, #25
354	sli	v5.4s, v8.4s, #7
355	ushr	v8.4s, v7.4s, #25
356	sli	v8.4s, v7.4s, #7
357	ushr	v7.4s, v6.4s, #25
358	sli	v7.4s, v6.4s, #7
359	ushr	v6.4s, v20.4s, #25
360	sli	v6.4s, v20.4s, #7
361
362	ext	v9.16b, v9.16b, v9.16b, #12
363	ext	v14.16b, v14.16b, v14.16b, #8
364	ext	v19.16b, v19.16b, v19.16b, #4
365	subs	x6, x6, #1
366	b.hi	.Lseal_init_rounds
367
368	add	v15.4s, v15.4s, v25.4s
369	mov	x11, #4
370	dup	v20.4s, w11
371	add	v25.4s, v25.4s, v20.4s
372
373	zip1	v20.4s, v0.4s, v1.4s
374	zip2	v21.4s, v0.4s, v1.4s
375	zip1	v22.4s, v2.4s, v3.4s
376	zip2	v23.4s, v2.4s, v3.4s
377
378	zip1	v0.2d, v20.2d, v22.2d
379	zip2	v1.2d, v20.2d, v22.2d
380	zip1	v2.2d, v21.2d, v23.2d
381	zip2	v3.2d, v21.2d, v23.2d
382
383	zip1	v20.4s, v5.4s, v6.4s
384	zip2	v21.4s, v5.4s, v6.4s
385	zip1	v22.4s, v7.4s, v8.4s
386	zip2	v23.4s, v7.4s, v8.4s
387
388	zip1	v5.2d, v20.2d, v22.2d
389	zip2	v6.2d, v20.2d, v22.2d
390	zip1	v7.2d, v21.2d, v23.2d
391	zip2	v8.2d, v21.2d, v23.2d
392
393	zip1	v20.4s, v10.4s, v11.4s
394	zip2	v21.4s, v10.4s, v11.4s
395	zip1	v22.4s, v12.4s, v13.4s
396	zip2	v23.4s, v12.4s, v13.4s
397
398	zip1	v10.2d, v20.2d, v22.2d
399	zip2	v11.2d, v20.2d, v22.2d
400	zip1	v12.2d, v21.2d, v23.2d
401	zip2	v13.2d, v21.2d, v23.2d
402
403	zip1	v20.4s, v15.4s, v16.4s
404	zip2	v21.4s, v15.4s, v16.4s
405	zip1	v22.4s, v17.4s, v18.4s
406	zip2	v23.4s, v17.4s, v18.4s
407
408	zip1	v15.2d, v20.2d, v22.2d
409	zip2	v16.2d, v20.2d, v22.2d
410	zip1	v17.2d, v21.2d, v23.2d
411	zip2	v18.2d, v21.2d, v23.2d
412
413	add	v4.4s, v4.4s, v24.4s
414	add	v9.4s, v9.4s, v28.4s
415	and	v4.16b, v4.16b, v27.16b
416
417	add	v0.4s, v0.4s, v24.4s
418	add	v5.4s, v5.4s, v28.4s
419	add	v10.4s, v10.4s, v29.4s
420	add	v15.4s, v15.4s, v30.4s
421
422	add	v1.4s, v1.4s, v24.4s
423	add	v6.4s, v6.4s, v28.4s
424	add	v11.4s, v11.4s, v29.4s
425	add	v16.4s, v16.4s, v30.4s
426
427	add	v2.4s, v2.4s, v24.4s
428	add	v7.4s, v7.4s, v28.4s
429	add	v12.4s, v12.4s, v29.4s
430	add	v17.4s, v17.4s, v30.4s
431
432	add	v3.4s, v3.4s, v24.4s
433	add	v8.4s, v8.4s, v28.4s
434	add	v13.4s, v13.4s, v29.4s
435	add	v18.4s, v18.4s, v30.4s
436
437	mov	x16, v4.d[0] // Move the R key to GPRs
438	mov	x17, v4.d[1]
439	mov	v27.16b, v9.16b // Store the S key
440
441	bl	.Lpoly_hash_ad_internal
442
443	mov	x3, x0
444	cmp	x2, #256
445	b.le	.Lseal_tail
446
447	ld1	{v20.16b - v23.16b}, [x1], #64
448	eor	v20.16b, v20.16b, v0.16b
449	eor	v21.16b, v21.16b, v5.16b
450	eor	v22.16b, v22.16b, v10.16b
451	eor	v23.16b, v23.16b, v15.16b
452	st1	{v20.16b - v23.16b}, [x0], #64
453
454	ld1	{v20.16b - v23.16b}, [x1], #64
455	eor	v20.16b, v20.16b, v1.16b
456	eor	v21.16b, v21.16b, v6.16b
457	eor	v22.16b, v22.16b, v11.16b
458	eor	v23.16b, v23.16b, v16.16b
459	st1	{v20.16b - v23.16b}, [x0], #64
460
461	ld1	{v20.16b - v23.16b}, [x1], #64
462	eor	v20.16b, v20.16b, v2.16b
463	eor	v21.16b, v21.16b, v7.16b
464	eor	v22.16b, v22.16b, v12.16b
465	eor	v23.16b, v23.16b, v17.16b
466	st1	{v20.16b - v23.16b}, [x0], #64
467
468	ld1	{v20.16b - v23.16b}, [x1], #64
469	eor	v20.16b, v20.16b, v3.16b
470	eor	v21.16b, v21.16b, v8.16b
471	eor	v22.16b, v22.16b, v13.16b
472	eor	v23.16b, v23.16b, v18.16b
473	st1	{v20.16b - v23.16b}, [x0], #64
474
475	sub	x2, x2, #256
476
477	mov	x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
478	mov	x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
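	// The hashing of the previous chunk's ciphertext is interleaved with these
	// rounds: every double round hashes one 16-byte block inside its body, and
	// once the x6 counter underflows the fall-through after the b.ge hashes a
	// second block per round, with x7 counting those remaining double rounds.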
479
480.Lseal_main_loop:
481	adrp	x11, .Lchacha20_consts
482	add	x11, x11, :lo12:.Lchacha20_consts
483
484	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
485	mov	v4.16b, v24.16b
486
487	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
488	mov	v9.16b, v28.16b
489
490	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
491	mov	v14.16b, v29.16b
492
493	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
494	add	v15.4s, v15.4s, v25.4s
495	mov	v19.16b, v30.16b
496
497	eor	v20.16b, v20.16b, v20.16b //zero
498	not	v21.16b, v20.16b // -1
499	sub	v21.4s, v25.4s, v21.4s // Add +1
500	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
501	add	v19.4s, v19.4s, v20.4s
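	// i.e. advance the counter lane of the fifth (horizontal) block by one more
	// than the last .Linc element, so its counter follows the four vertical blocks.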
502
503	sub	x5, x5, #32
504.align	5
505.Lseal_main_loop_rounds:
506	add	v0.4s, v0.4s, v5.4s
507	add	v1.4s, v1.4s, v6.4s
508	add	v2.4s, v2.4s, v7.4s
509	add	v3.4s, v3.4s, v8.4s
510	add	v4.4s, v4.4s, v9.4s
511
512	eor	v15.16b, v15.16b, v0.16b
513	eor	v16.16b, v16.16b, v1.16b
514	eor	v17.16b, v17.16b, v2.16b
515	eor	v18.16b, v18.16b, v3.16b
516	eor	v19.16b, v19.16b, v4.16b
517
518	rev32	v15.8h, v15.8h
519	rev32	v16.8h, v16.8h
520	rev32	v17.8h, v17.8h
521	rev32	v18.8h, v18.8h
522	rev32	v19.8h, v19.8h
523
524	add	v10.4s, v10.4s, v15.4s
525	add	v11.4s, v11.4s, v16.4s
526	add	v12.4s, v12.4s, v17.4s
527	add	v13.4s, v13.4s, v18.4s
528	add	v14.4s, v14.4s, v19.4s
529
530	eor	v5.16b, v5.16b, v10.16b
531	eor	v6.16b, v6.16b, v11.16b
532	eor	v7.16b, v7.16b, v12.16b
533	eor	v8.16b, v8.16b, v13.16b
534	eor	v9.16b, v9.16b, v14.16b
535
536	ushr	v20.4s, v5.4s, #20
537	sli	v20.4s, v5.4s, #12
538	ushr	v5.4s, v6.4s, #20
539	sli	v5.4s, v6.4s, #12
540	ushr	v6.4s, v7.4s, #20
541	sli	v6.4s, v7.4s, #12
542	ushr	v7.4s, v8.4s, #20
543	sli	v7.4s, v8.4s, #12
544	ushr	v8.4s, v9.4s, #20
545	sli	v8.4s, v9.4s, #12
546
547	add	v0.4s, v0.4s, v20.4s
548	add	v1.4s, v1.4s, v5.4s
549	add	v2.4s, v2.4s, v6.4s
550	add	v3.4s, v3.4s, v7.4s
551	add	v4.4s, v4.4s, v8.4s
552
553	eor	v15.16b, v15.16b, v0.16b
554	eor	v16.16b, v16.16b, v1.16b
555	eor	v17.16b, v17.16b, v2.16b
556	eor	v18.16b, v18.16b, v3.16b
557	eor	v19.16b, v19.16b, v4.16b
558
559	tbl	v15.16b, {v15.16b}, v26.16b
560	tbl	v16.16b, {v16.16b}, v26.16b
561	tbl	v17.16b, {v17.16b}, v26.16b
562	tbl	v18.16b, {v18.16b}, v26.16b
563	tbl	v19.16b, {v19.16b}, v26.16b
564
565	add	v10.4s, v10.4s, v15.4s
566	add	v11.4s, v11.4s, v16.4s
567	add	v12.4s, v12.4s, v17.4s
568	add	v13.4s, v13.4s, v18.4s
569	add	v14.4s, v14.4s, v19.4s
570
571	eor	v20.16b, v20.16b, v10.16b
572	eor	v5.16b, v5.16b, v11.16b
573	eor	v6.16b, v6.16b, v12.16b
574	eor	v7.16b, v7.16b, v13.16b
575	eor	v8.16b, v8.16b, v14.16b
576
577	ushr	v9.4s, v8.4s, #25
578	sli	v9.4s, v8.4s, #7
579	ushr	v8.4s, v7.4s, #25
580	sli	v8.4s, v7.4s, #7
581	ushr	v7.4s, v6.4s, #25
582	sli	v7.4s, v6.4s, #7
583	ushr	v6.4s, v5.4s, #25
584	sli	v6.4s, v5.4s, #7
585	ushr	v5.4s, v20.4s, #25
586	sli	v5.4s, v20.4s, #7
587
588	ext	v9.16b, v9.16b, v9.16b, #4
589	ext	v14.16b, v14.16b, v14.16b, #8
590	ext	v19.16b, v19.16b, v19.16b, #12
591	ldp	x11, x12, [x3], 16
592	adds	x8, x8, x11
593	adcs	x9, x9, x12
594	adc	x10, x10, x15
595	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
596	umulh	x12, x8, x16
597	mul	x13, x9, x16
598	umulh	x14, x9, x16
599	adds	x12, x12, x13
600	mul	x13, x10, x16
601	adc	x13, x13, x14
602	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
603	umulh	x8, x8, x17
604	adds	x12, x12, x14
605	mul	x14, x9, x17
606	umulh	x9, x9, x17
607	adcs	x14, x14, x8
608	mul	x10, x10, x17
609	adc	x10, x10, x9
610	adds	x13, x13, x14
611	adc	x14, x10, xzr
612	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
613	and	x8, x13, #-4
614	extr	x13, x14, x13, #2
615	adds	x8, x8, x11
616	lsr	x11, x14, #2
617	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
618	adds	x8, x8, x13
619	adcs	x9, x9, x12
620	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
621	add	v0.4s, v0.4s, v6.4s
622	add	v1.4s, v1.4s, v7.4s
623	add	v2.4s, v2.4s, v8.4s
624	add	v3.4s, v3.4s, v5.4s
625	add	v4.4s, v4.4s, v9.4s
626
627	eor	v18.16b, v18.16b, v0.16b
628	eor	v15.16b, v15.16b, v1.16b
629	eor	v16.16b, v16.16b, v2.16b
630	eor	v17.16b, v17.16b, v3.16b
631	eor	v19.16b, v19.16b, v4.16b
632
633	rev32	v18.8h, v18.8h
634	rev32	v15.8h, v15.8h
635	rev32	v16.8h, v16.8h
636	rev32	v17.8h, v17.8h
637	rev32	v19.8h, v19.8h
638
639	add	v12.4s, v12.4s, v18.4s
640	add	v13.4s, v13.4s, v15.4s
641	add	v10.4s, v10.4s, v16.4s
642	add	v11.4s, v11.4s, v17.4s
643	add	v14.4s, v14.4s, v19.4s
644
645	eor	v6.16b, v6.16b, v12.16b
646	eor	v7.16b, v7.16b, v13.16b
647	eor	v8.16b, v8.16b, v10.16b
648	eor	v5.16b, v5.16b, v11.16b
649	eor	v9.16b, v9.16b, v14.16b
650
651	ushr	v20.4s, v6.4s, #20
652	sli	v20.4s, v6.4s, #12
653	ushr	v6.4s, v7.4s, #20
654	sli	v6.4s, v7.4s, #12
655	ushr	v7.4s, v8.4s, #20
656	sli	v7.4s, v8.4s, #12
657	ushr	v8.4s, v5.4s, #20
658	sli	v8.4s, v5.4s, #12
659	ushr	v5.4s, v9.4s, #20
660	sli	v5.4s, v9.4s, #12
661
662	add	v0.4s, v0.4s, v20.4s
663	add	v1.4s, v1.4s, v6.4s
664	add	v2.4s, v2.4s, v7.4s
665	add	v3.4s, v3.4s, v8.4s
666	add	v4.4s, v4.4s, v5.4s
667
668	eor	v18.16b, v18.16b, v0.16b
669	eor	v15.16b, v15.16b, v1.16b
670	eor	v16.16b, v16.16b, v2.16b
671	eor	v17.16b, v17.16b, v3.16b
672	eor	v19.16b, v19.16b, v4.16b
673
674	tbl	v18.16b, {v18.16b}, v26.16b
675	tbl	v15.16b, {v15.16b}, v26.16b
676	tbl	v16.16b, {v16.16b}, v26.16b
677	tbl	v17.16b, {v17.16b}, v26.16b
678	tbl	v19.16b, {v19.16b}, v26.16b
679
680	add	v12.4s, v12.4s, v18.4s
681	add	v13.4s, v13.4s, v15.4s
682	add	v10.4s, v10.4s, v16.4s
683	add	v11.4s, v11.4s, v17.4s
684	add	v14.4s, v14.4s, v19.4s
685
686	eor	v20.16b, v20.16b, v12.16b
687	eor	v6.16b, v6.16b, v13.16b
688	eor	v7.16b, v7.16b, v10.16b
689	eor	v8.16b, v8.16b, v11.16b
690	eor	v5.16b, v5.16b, v14.16b
691
692	ushr	v9.4s, v5.4s, #25
693	sli	v9.4s, v5.4s, #7
694	ushr	v5.4s, v8.4s, #25
695	sli	v5.4s, v8.4s, #7
696	ushr	v8.4s, v7.4s, #25
697	sli	v8.4s, v7.4s, #7
698	ushr	v7.4s, v6.4s, #25
699	sli	v7.4s, v6.4s, #7
700	ushr	v6.4s, v20.4s, #25
701	sli	v6.4s, v20.4s, #7
702
703	ext	v9.16b, v9.16b, v9.16b, #12
704	ext	v14.16b, v14.16b, v14.16b, #8
705	ext	v19.16b, v19.16b, v19.16b, #4
706	subs	x6, x6, #1
707	b.ge	.Lseal_main_loop_rounds
708	ldp	x11, x12, [x3], 16
709	adds	x8, x8, x11
710	adcs	x9, x9, x12
711	adc	x10, x10, x15
712	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
713	umulh	x12, x8, x16
714	mul	x13, x9, x16
715	umulh	x14, x9, x16
716	adds	x12, x12, x13
717	mul	x13, x10, x16
718	adc	x13, x13, x14
719	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
720	umulh	x8, x8, x17
721	adds	x12, x12, x14
722	mul	x14, x9, x17
723	umulh	x9, x9, x17
724	adcs	x14, x14, x8
725	mul	x10, x10, x17
726	adc	x10, x10, x9
727	adds	x13, x13, x14
728	adc	x14, x10, xzr
729	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
730	and	x8, x13, #-4
731	extr	x13, x14, x13, #2
732	adds	x8, x8, x11
733	lsr	x11, x14, #2
734	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
735	adds	x8, x8, x13
736	adcs	x9, x9, x12
737	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
738	subs	x7, x7, #1
739	b.gt	.Lseal_main_loop_rounds
740
741	eor	v20.16b, v20.16b, v20.16b //zero
742	not	v21.16b, v20.16b // -1
743	sub	v21.4s, v25.4s, v21.4s // Add +1
744	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
745	add	v19.4s, v19.4s, v20.4s
746
747	add	v15.4s, v15.4s, v25.4s
748	mov	x11, #5
749	dup	v20.4s, w11
750	add	v25.4s, v25.4s, v20.4s
751
752	zip1	v20.4s, v0.4s, v1.4s
753	zip2	v21.4s, v0.4s, v1.4s
754	zip1	v22.4s, v2.4s, v3.4s
755	zip2	v23.4s, v2.4s, v3.4s
756
757	zip1	v0.2d, v20.2d, v22.2d
758	zip2	v1.2d, v20.2d, v22.2d
759	zip1	v2.2d, v21.2d, v23.2d
760	zip2	v3.2d, v21.2d, v23.2d
761
762	zip1	v20.4s, v5.4s, v6.4s
763	zip2	v21.4s, v5.4s, v6.4s
764	zip1	v22.4s, v7.4s, v8.4s
765	zip2	v23.4s, v7.4s, v8.4s
766
767	zip1	v5.2d, v20.2d, v22.2d
768	zip2	v6.2d, v20.2d, v22.2d
769	zip1	v7.2d, v21.2d, v23.2d
770	zip2	v8.2d, v21.2d, v23.2d
771
772	zip1	v20.4s, v10.4s, v11.4s
773	zip2	v21.4s, v10.4s, v11.4s
774	zip1	v22.4s, v12.4s, v13.4s
775	zip2	v23.4s, v12.4s, v13.4s
776
777	zip1	v10.2d, v20.2d, v22.2d
778	zip2	v11.2d, v20.2d, v22.2d
779	zip1	v12.2d, v21.2d, v23.2d
780	zip2	v13.2d, v21.2d, v23.2d
781
782	zip1	v20.4s, v15.4s, v16.4s
783	zip2	v21.4s, v15.4s, v16.4s
784	zip1	v22.4s, v17.4s, v18.4s
785	zip2	v23.4s, v17.4s, v18.4s
786
787	zip1	v15.2d, v20.2d, v22.2d
788	zip2	v16.2d, v20.2d, v22.2d
789	zip1	v17.2d, v21.2d, v23.2d
790	zip2	v18.2d, v21.2d, v23.2d
791
792	add	v0.4s, v0.4s, v24.4s
793	add	v5.4s, v5.4s, v28.4s
794	add	v10.4s, v10.4s, v29.4s
795	add	v15.4s, v15.4s, v30.4s
796
797	add	v1.4s, v1.4s, v24.4s
798	add	v6.4s, v6.4s, v28.4s
799	add	v11.4s, v11.4s, v29.4s
800	add	v16.4s, v16.4s, v30.4s
801
802	add	v2.4s, v2.4s, v24.4s
803	add	v7.4s, v7.4s, v28.4s
804	add	v12.4s, v12.4s, v29.4s
805	add	v17.4s, v17.4s, v30.4s
806
807	add	v3.4s, v3.4s, v24.4s
808	add	v8.4s, v8.4s, v28.4s
809	add	v13.4s, v13.4s, v29.4s
810	add	v18.4s, v18.4s, v30.4s
811
812	add	v4.4s, v4.4s, v24.4s
813	add	v9.4s, v9.4s, v28.4s
814	add	v14.4s, v14.4s, v29.4s
815	add	v19.4s, v19.4s, v30.4s
816
817	cmp	x2, #320
818	b.le	.Lseal_tail
819
820	ld1	{v20.16b - v23.16b}, [x1], #64
821	eor	v20.16b, v20.16b, v0.16b
822	eor	v21.16b, v21.16b, v5.16b
823	eor	v22.16b, v22.16b, v10.16b
824	eor	v23.16b, v23.16b, v15.16b
825	st1	{v20.16b - v23.16b}, [x0], #64
826
827	ld1	{v20.16b - v23.16b}, [x1], #64
828	eor	v20.16b, v20.16b, v1.16b
829	eor	v21.16b, v21.16b, v6.16b
830	eor	v22.16b, v22.16b, v11.16b
831	eor	v23.16b, v23.16b, v16.16b
832	st1	{v20.16b - v23.16b}, [x0], #64
833
834	ld1	{v20.16b - v23.16b}, [x1], #64
835	eor	v20.16b, v20.16b, v2.16b
836	eor	v21.16b, v21.16b, v7.16b
837	eor	v22.16b, v22.16b, v12.16b
838	eor	v23.16b, v23.16b, v17.16b
839	st1	{v20.16b - v23.16b}, [x0], #64
840
841	ld1	{v20.16b - v23.16b}, [x1], #64
842	eor	v20.16b, v20.16b, v3.16b
843	eor	v21.16b, v21.16b, v8.16b
844	eor	v22.16b, v22.16b, v13.16b
845	eor	v23.16b, v23.16b, v18.16b
846	st1	{v20.16b - v23.16b}, [x0], #64
847
848	ld1	{v20.16b - v23.16b}, [x1], #64
849	eor	v20.16b, v20.16b, v4.16b
850	eor	v21.16b, v21.16b, v9.16b
851	eor	v22.16b, v22.16b, v14.16b
852	eor	v23.16b, v23.16b, v19.16b
853	st1	{v20.16b - v23.16b}, [x0], #64
854
855	sub	x2, x2, #320
856
857	mov	x6, #0
858	mov	x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
859
860	b	.Lseal_main_loop
861
862.Lseal_tail:
863    // This part of the function handles the storage and authentication of the last [0,320) bytes
864    // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
865	cmp	x2, #64
866	b.lt	.Lseal_tail_64
867
868    // Store and authenticate 64B blocks per iteration
869	ld1	{v20.16b - v23.16b}, [x1], #64
870
871	eor	v20.16b, v20.16b, v0.16b
872	eor	v21.16b, v21.16b, v5.16b
873	eor	v22.16b, v22.16b, v10.16b
874	eor	v23.16b, v23.16b, v15.16b
875	mov	x11, v20.d[0]
876	mov	x12, v20.d[1]
877	adds	x8, x8, x11
878	adcs	x9, x9, x12
879	adc	x10, x10, x15
880	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
881	umulh	x12, x8, x16
882	mul	x13, x9, x16
883	umulh	x14, x9, x16
884	adds	x12, x12, x13
885	mul	x13, x10, x16
886	adc	x13, x13, x14
887	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
888	umulh	x8, x8, x17
889	adds	x12, x12, x14
890	mul	x14, x9, x17
891	umulh	x9, x9, x17
892	adcs	x14, x14, x8
893	mul	x10, x10, x17
894	adc	x10, x10, x9
895	adds	x13, x13, x14
896	adc	x14, x10, xzr
897	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
898	and	x8, x13, #-4
899	extr	x13, x14, x13, #2
900	adds	x8, x8, x11
901	lsr	x11, x14, #2
902	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
903	adds	x8, x8, x13
904	adcs	x9, x9, x12
905	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
906	mov	x11, v21.d[0]
907	mov	x12, v21.d[1]
908	adds	x8, x8, x11
909	adcs	x9, x9, x12
910	adc	x10, x10, x15
911	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
912	umulh	x12, x8, x16
913	mul	x13, x9, x16
914	umulh	x14, x9, x16
915	adds	x12, x12, x13
916	mul	x13, x10, x16
917	adc	x13, x13, x14
918	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
919	umulh	x8, x8, x17
920	adds	x12, x12, x14
921	mul	x14, x9, x17
922	umulh	x9, x9, x17
923	adcs	x14, x14, x8
924	mul	x10, x10, x17
925	adc	x10, x10, x9
926	adds	x13, x13, x14
927	adc	x14, x10, xzr
928	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
929	and	x8, x13, #-4
930	extr	x13, x14, x13, #2
931	adds	x8, x8, x11
932	lsr	x11, x14, #2
933	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
934	adds	x8, x8, x13
935	adcs	x9, x9, x12
936	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
937	mov	x11, v22.d[0]
938	mov	x12, v22.d[1]
939	adds	x8, x8, x11
940	adcs	x9, x9, x12
941	adc	x10, x10, x15
942	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
943	umulh	x12, x8, x16
944	mul	x13, x9, x16
945	umulh	x14, x9, x16
946	adds	x12, x12, x13
947	mul	x13, x10, x16
948	adc	x13, x13, x14
949	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
950	umulh	x8, x8, x17
951	adds	x12, x12, x14
952	mul	x14, x9, x17
953	umulh	x9, x9, x17
954	adcs	x14, x14, x8
955	mul	x10, x10, x17
956	adc	x10, x10, x9
957	adds	x13, x13, x14
958	adc	x14, x10, xzr
959	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
960	and	x8, x13, #-4
961	extr	x13, x14, x13, #2
962	adds	x8, x8, x11
963	lsr	x11, x14, #2
964	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
965	adds	x8, x8, x13
966	adcs	x9, x9, x12
967	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
968	mov	x11, v23.d[0]
969	mov	x12, v23.d[1]
970	adds	x8, x8, x11
971	adcs	x9, x9, x12
972	adc	x10, x10, x15
973	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
974	umulh	x12, x8, x16
975	mul	x13, x9, x16
976	umulh	x14, x9, x16
977	adds	x12, x12, x13
978	mul	x13, x10, x16
979	adc	x13, x13, x14
980	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
981	umulh	x8, x8, x17
982	adds	x12, x12, x14
983	mul	x14, x9, x17
984	umulh	x9, x9, x17
985	adcs	x14, x14, x8
986	mul	x10, x10, x17
987	adc	x10, x10, x9
988	adds	x13, x13, x14
989	adc	x14, x10, xzr
990	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
991	and	x8, x13, #-4
992	extr	x13, x14, x13, #2
993	adds	x8, x8, x11
994	lsr	x11, x14, #2
995	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
996	adds	x8, x8, x13
997	adcs	x9, x9, x12
998	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
999	st1	{v20.16b - v23.16b}, [x0], #64
1000	sub	x2, x2, #64
1001
1002    // Shift the state left by 64 bytes for the next iteration of the loop
1003	mov	v0.16b, v1.16b
1004	mov	v5.16b, v6.16b
1005	mov	v10.16b, v11.16b
1006	mov	v15.16b, v16.16b
1007
1008	mov	v1.16b, v2.16b
1009	mov	v6.16b, v7.16b
1010	mov	v11.16b, v12.16b
1011	mov	v16.16b, v17.16b
1012
1013	mov	v2.16b, v3.16b
1014	mov	v7.16b, v8.16b
1015	mov	v12.16b, v13.16b
1016	mov	v17.16b, v18.16b
1017
1018	mov	v3.16b, v4.16b
1019	mov	v8.16b, v9.16b
1020	mov	v13.16b, v14.16b
1021	mov	v18.16b, v19.16b
1022
1023	b	.Lseal_tail
1024
1025.Lseal_tail_64:
1026	ldp	x3, x4, [x5, #48] // extra_in_len and extra_in_ptr
1027
1028    // Here we handle the last [0,64) bytes of plaintext
1029	cmp	x2, #16
1030	b.lt	.Lseal_tail_16
1031    // Each iteration encrypts and authenticates a 16B block
1032	ld1	{v20.16b}, [x1], #16
1033	eor	v20.16b, v20.16b, v0.16b
1034	mov	x11, v20.d[0]
1035	mov	x12, v20.d[1]
1036	adds	x8, x8, x11
1037	adcs	x9, x9, x12
1038	adc	x10, x10, x15
1039	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1040	umulh	x12, x8, x16
1041	mul	x13, x9, x16
1042	umulh	x14, x9, x16
1043	adds	x12, x12, x13
1044	mul	x13, x10, x16
1045	adc	x13, x13, x14
1046	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1047	umulh	x8, x8, x17
1048	adds	x12, x12, x14
1049	mul	x14, x9, x17
1050	umulh	x9, x9, x17
1051	adcs	x14, x14, x8
1052	mul	x10, x10, x17
1053	adc	x10, x10, x9
1054	adds	x13, x13, x14
1055	adc	x14, x10, xzr
1056	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1057	and	x8, x13, #-4
1058	extr	x13, x14, x13, #2
1059	adds	x8, x8, x11
1060	lsr	x11, x14, #2
1061	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1062	adds	x8, x8, x13
1063	adcs	x9, x9, x12
1064	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1065	st1	{v20.16b}, [x0], #16
1066
1067	sub	x2, x2, #16
1068
1069    // Shift the state left by 16 bytes for the next iteration of the loop
1070	mov	v0.16b, v5.16b
1071	mov	v5.16b, v10.16b
1072	mov	v10.16b, v15.16b
1073
1074	b	.Lseal_tail_64
1075
1076.Lseal_tail_16:
1077    // Here we handle the last [0,16) bytes of ciphertext that require a padded block
1078	cbz	x2, .Lseal_hash_extra
1079
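	// Build the final padded block byte-by-byte from the end: v20 collects the
	// remaining plaintext (with extra_in bytes following it when available),
	// while v21 accumulates a 0xff mask covering only the positions that are
	// actually encrypted, so the keystream is masked before the XOR and the
	// untouched extra_in bytes are hashed as-is after the last ciphertext bytes.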
1080	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
1081	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
1082	not	v22.16b, v20.16b
1083
1084	mov	x6, x2
1085	add	x1, x1, x2
1086
1087	cbz	x4, .Lseal_tail_16_compose // No extra data to pad with, zero padding
1088
1089	mov	x7, #16          // We need to load some extra_in first for padding
1090	sub	x7, x7, x2
1091	cmp	x4, x7
1092	csel	x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
1093	mov	x12, x7
1094	add	x3, x3, x7
1095	sub	x4, x4, x7
1096
1097.Lseal_tail16_compose_extra_in:
1098	ext	v20.16b, v20.16b, v20.16b, #15
1099	ldrb	w11, [x3, #-1]!
1100	mov	v20.b[0], w11
1101	subs	x7, x7, #1
1102	b.gt	.Lseal_tail16_compose_extra_in
1103
1104	add	x3, x3, x12
1105
1106.Lseal_tail_16_compose:
1107	ext	v20.16b, v20.16b, v20.16b, #15
1108	ldrb	w11, [x1, #-1]!
1109	mov	v20.b[0], w11
1110	ext	v21.16b, v22.16b, v21.16b, #15
1111	subs	x2, x2, #1
1112	b.gt	.Lseal_tail_16_compose
1113
1114	and	v0.16b, v0.16b, v21.16b
1115	eor	v20.16b, v20.16b, v0.16b
1116	mov	v21.16b, v20.16b
1117
1118.Lseal_tail_16_store:
1119	umov	w11, v20.b[0]
1120	strb	w11, [x0], #1
1121	ext	v20.16b, v20.16b, v20.16b, #1
1122	subs	x6, x6, #1
1123	b.gt	.Lseal_tail_16_store
1124
1125    // Hash in the final ct block concatenated with extra_in
1126	mov	x11, v21.d[0]
1127	mov	x12, v21.d[1]
1128	adds	x8, x8, x11
1129	adcs	x9, x9, x12
1130	adc	x10, x10, x15
1131	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1132	umulh	x12, x8, x16
1133	mul	x13, x9, x16
1134	umulh	x14, x9, x16
1135	adds	x12, x12, x13
1136	mul	x13, x10, x16
1137	adc	x13, x13, x14
1138	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1139	umulh	x8, x8, x17
1140	adds	x12, x12, x14
1141	mul	x14, x9, x17
1142	umulh	x9, x9, x17
1143	adcs	x14, x14, x8
1144	mul	x10, x10, x17
1145	adc	x10, x10, x9
1146	adds	x13, x13, x14
1147	adc	x14, x10, xzr
1148	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1149	and	x8, x13, #-4
1150	extr	x13, x14, x13, #2
1151	adds	x8, x8, x11
1152	lsr	x11, x14, #2
1153	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1154	adds	x8, x8, x13
1155	adcs	x9, x9, x12
1156	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1157
1158.Lseal_hash_extra:
1159	cbz	x4, .Lseal_finalize
1160
1161.Lseal_hash_extra_loop:
1162	cmp	x4, #16
1163	b.lt	.Lseal_hash_extra_tail
1164	ld1	{v20.16b}, [x3], #16
1165	mov	x11, v20.d[0]
1166	mov	x12, v20.d[1]
1167	adds	x8, x8, x11
1168	adcs	x9, x9, x12
1169	adc	x10, x10, x15
1170	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1171	umulh	x12, x8, x16
1172	mul	x13, x9, x16
1173	umulh	x14, x9, x16
1174	adds	x12, x12, x13
1175	mul	x13, x10, x16
1176	adc	x13, x13, x14
1177	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1178	umulh	x8, x8, x17
1179	adds	x12, x12, x14
1180	mul	x14, x9, x17
1181	umulh	x9, x9, x17
1182	adcs	x14, x14, x8
1183	mul	x10, x10, x17
1184	adc	x10, x10, x9
1185	adds	x13, x13, x14
1186	adc	x14, x10, xzr
1187	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1188	and	x8, x13, #-4
1189	extr	x13, x14, x13, #2
1190	adds	x8, x8, x11
1191	lsr	x11, x14, #2
1192	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1193	adds	x8, x8, x13
1194	adcs	x9, x9, x12
1195	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1196	sub	x4, x4, #16
1197	b	.Lseal_hash_extra_loop
1198
1199.Lseal_hash_extra_tail:
1200	cbz	x4, .Lseal_finalize
1201	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
1202	add	x3, x3, x4
1203
1204.Lseal_hash_extra_load:
1205	ext	v20.16b, v20.16b, v20.16b, #15
1206	ldrb	w11, [x3, #-1]!
1207	mov	v20.b[0], w11
1208	subs	x4, x4, #1
1209	b.gt	.Lseal_hash_extra_load
1210
1211    // Hash in the final padded extra_in block
1212	mov	x11, v20.d[0]
1213	mov	x12, v20.d[1]
1214	adds	x8, x8, x11
1215	adcs	x9, x9, x12
1216	adc	x10, x10, x15
1217	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1218	umulh	x12, x8, x16
1219	mul	x13, x9, x16
1220	umulh	x14, x9, x16
1221	adds	x12, x12, x13
1222	mul	x13, x10, x16
1223	adc	x13, x13, x14
1224	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1225	umulh	x8, x8, x17
1226	adds	x12, x12, x14
1227	mul	x14, x9, x17
1228	umulh	x9, x9, x17
1229	adcs	x14, x14, x8
1230	mul	x10, x10, x17
1231	adc	x10, x10, x9
1232	adds	x13, x13, x14
1233	adc	x14, x10, xzr
1234	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1235	and	x8, x13, #-4
1236	extr	x13, x14, x13, #2
1237	adds	x8, x8, x11
1238	lsr	x11, x14, #2
1239	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1240	adds	x8, x8, x13
1241	adcs	x9, x9, x12
1242	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1243
1244.Lseal_finalize:
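	// Hash the final length block (8-byte AAD length || 8-byte total ciphertext
	// length, saved in v31 at entry), then compute the tag.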
1245	mov	x11, v31.d[0]
1246	mov	x12, v31.d[1]
1247	adds	x8, x8, x11
1248	adcs	x9, x9, x12
1249	adc	x10, x10, x15
1250	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1251	umulh	x12, x8, x16
1252	mul	x13, x9, x16
1253	umulh	x14, x9, x16
1254	adds	x12, x12, x13
1255	mul	x13, x10, x16
1256	adc	x13, x13, x14
1257	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1258	umulh	x8, x8, x17
1259	adds	x12, x12, x14
1260	mul	x14, x9, x17
1261	umulh	x9, x9, x17
1262	adcs	x14, x14, x8
1263	mul	x10, x10, x17
1264	adc	x10, x10, x9
1265	adds	x13, x13, x14
1266	adc	x14, x10, xzr
1267	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1268	and	x8, x13, #-4
1269	extr	x13, x14, x13, #2
1270	adds	x8, x8, x11
1271	lsr	x11, x14, #2
1272	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1273	adds	x8, x8, x13
1274	adcs	x9, x9, x12
1275	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1276    // Final reduction step
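	// Conditionally subtract p = 2^130 - 5: compute acc - p limb-wise (as
	// acc + 5 - 2^130) and keep the result only if the subtraction does not
	// borrow (cs), then add the Poly1305 s key saved in v27 and store the
	// 16-byte tag at [x5].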
1277	sub	x12, xzr, x15
1278	orr	x13, xzr, #3
1279	subs	x11, x8, #-5
1280	sbcs	x12, x9, x12
1281	sbcs	x13, x10, x13
1282	csel	x8, x11, x8, cs
1283	csel	x9, x12, x9, cs
1284	csel	x10, x13, x10, cs
1285	mov	x11, v27.d[0]
1286	mov	x12, v27.d[1]
1287	adds	x8, x8, x11
1288	adcs	x9, x9, x12
1289	adc	x10, x10, x15
1290
1291	stp	x8, x9, [x5]
1292
1293	ldp	d8, d9, [sp, #16]
1294	ldp	d10, d11, [sp, #32]
1295	ldp	d12, d13, [sp, #48]
1296	ldp	d14, d15, [sp, #64]
1297.cfi_restore	b15
1298.cfi_restore	b14
1299.cfi_restore	b13
1300.cfi_restore	b12
1301.cfi_restore	b11
1302.cfi_restore	b10
1303.cfi_restore	b9
1304.cfi_restore	b8
1305	ldp	x29, x30, [sp], 80
1306.cfi_restore	w29
1307.cfi_restore	w30
1308.cfi_def_cfa_offset	0
1309	AARCH64_VALIDATE_LINK_REGISTER
1310	ret
1311
1312.Lseal_128:
1313    // On some architectures preparing 5 blocks for small buffers is wasteful
1314	eor	v25.16b, v25.16b, v25.16b
1315	mov	x11, #1
1316	mov	v25.s[0], w11
1317	mov	v0.16b, v24.16b
1318	mov	v1.16b, v24.16b
1319	mov	v2.16b, v24.16b
1320	mov	v5.16b, v28.16b
1321	mov	v6.16b, v28.16b
1322	mov	v7.16b, v28.16b
1323	mov	v10.16b, v29.16b
1324	mov	v11.16b, v29.16b
1325	mov	v12.16b, v29.16b
1326	mov	v17.16b, v30.16b
1327	add	v15.4s, v17.4s, v25.4s
1328	add	v16.4s, v15.4s, v25.4s
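	// Three blocks in row layout: v2/v7/v12/v17 keep counter 0 and later supply
	// the Poly1305 r and s keys, while the other two blocks (counters 1 and 2)
	// provide up to 128 bytes of keystream.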
1329
1330	mov	x6, #10
1331
1332.Lseal_128_rounds:
1333	add	v0.4s, v0.4s, v5.4s
1334	add	v1.4s, v1.4s, v6.4s
1335	add	v2.4s, v2.4s, v7.4s
1336	eor	v15.16b, v15.16b, v0.16b
1337	eor	v16.16b, v16.16b, v1.16b
1338	eor	v17.16b, v17.16b, v2.16b
1339	rev32	v15.8h, v15.8h
1340	rev32	v16.8h, v16.8h
1341	rev32	v17.8h, v17.8h
1342
1343	add	v10.4s, v10.4s, v15.4s
1344	add	v11.4s, v11.4s, v16.4s
1345	add	v12.4s, v12.4s, v17.4s
1346	eor	v5.16b, v5.16b, v10.16b
1347	eor	v6.16b, v6.16b, v11.16b
1348	eor	v7.16b, v7.16b, v12.16b
1349	ushr	v20.4s, v5.4s, #20
1350	sli	v20.4s, v5.4s, #12
1351	ushr	v5.4s, v6.4s, #20
1352	sli	v5.4s, v6.4s, #12
1353	ushr	v6.4s, v7.4s, #20
1354	sli	v6.4s, v7.4s, #12
1355
1356	add	v0.4s, v0.4s, v20.4s
1357	add	v1.4s, v1.4s, v5.4s
1358	add	v2.4s, v2.4s, v6.4s
1359	eor	v15.16b, v15.16b, v0.16b
1360	eor	v16.16b, v16.16b, v1.16b
1361	eor	v17.16b, v17.16b, v2.16b
1362	tbl	v15.16b, {v15.16b}, v26.16b
1363	tbl	v16.16b, {v16.16b}, v26.16b
1364	tbl	v17.16b, {v17.16b}, v26.16b
1365
1366	add	v10.4s, v10.4s, v15.4s
1367	add	v11.4s, v11.4s, v16.4s
1368	add	v12.4s, v12.4s, v17.4s
1369	eor	v20.16b, v20.16b, v10.16b
1370	eor	v5.16b, v5.16b, v11.16b
1371	eor	v6.16b, v6.16b, v12.16b
1372	ushr	v7.4s, v6.4s, #25
1373	sli	v7.4s, v6.4s, #7
1374	ushr	v6.4s, v5.4s, #25
1375	sli	v6.4s, v5.4s, #7
1376	ushr	v5.4s, v20.4s, #25
1377	sli	v5.4s, v20.4s, #7
1378
1379	ext	v5.16b, v5.16b, v5.16b, #4
1380	ext	v6.16b, v6.16b, v6.16b, #4
1381	ext	v7.16b, v7.16b, v7.16b, #4
1382
1383	ext	v10.16b, v10.16b, v10.16b, #8
1384	ext	v11.16b, v11.16b, v11.16b, #8
1385	ext	v12.16b, v12.16b, v12.16b, #8
1386
1387	ext	v15.16b, v15.16b, v15.16b, #12
1388	ext	v16.16b, v16.16b, v16.16b, #12
1389	ext	v17.16b, v17.16b, v17.16b, #12
1390	add	v0.4s, v0.4s, v5.4s
1391	add	v1.4s, v1.4s, v6.4s
1392	add	v2.4s, v2.4s, v7.4s
1393	eor	v15.16b, v15.16b, v0.16b
1394	eor	v16.16b, v16.16b, v1.16b
1395	eor	v17.16b, v17.16b, v2.16b
1396	rev32	v15.8h, v15.8h
1397	rev32	v16.8h, v16.8h
1398	rev32	v17.8h, v17.8h
1399
1400	add	v10.4s, v10.4s, v15.4s
1401	add	v11.4s, v11.4s, v16.4s
1402	add	v12.4s, v12.4s, v17.4s
1403	eor	v5.16b, v5.16b, v10.16b
1404	eor	v6.16b, v6.16b, v11.16b
1405	eor	v7.16b, v7.16b, v12.16b
1406	ushr	v20.4s, v5.4s, #20
1407	sli	v20.4s, v5.4s, #12
1408	ushr	v5.4s, v6.4s, #20
1409	sli	v5.4s, v6.4s, #12
1410	ushr	v6.4s, v7.4s, #20
1411	sli	v6.4s, v7.4s, #12
1412
1413	add	v0.4s, v0.4s, v20.4s
1414	add	v1.4s, v1.4s, v5.4s
1415	add	v2.4s, v2.4s, v6.4s
1416	eor	v15.16b, v15.16b, v0.16b
1417	eor	v16.16b, v16.16b, v1.16b
1418	eor	v17.16b, v17.16b, v2.16b
1419	tbl	v15.16b, {v15.16b}, v26.16b
1420	tbl	v16.16b, {v16.16b}, v26.16b
1421	tbl	v17.16b, {v17.16b}, v26.16b
1422
1423	add	v10.4s, v10.4s, v15.4s
1424	add	v11.4s, v11.4s, v16.4s
1425	add	v12.4s, v12.4s, v17.4s
1426	eor	v20.16b, v20.16b, v10.16b
1427	eor	v5.16b, v5.16b, v11.16b
1428	eor	v6.16b, v6.16b, v12.16b
1429	ushr	v7.4s, v6.4s, #25
1430	sli	v7.4s, v6.4s, #7
1431	ushr	v6.4s, v5.4s, #25
1432	sli	v6.4s, v5.4s, #7
1433	ushr	v5.4s, v20.4s, #25
1434	sli	v5.4s, v20.4s, #7
1435
1436	ext	v5.16b, v5.16b, v5.16b, #12
1437	ext	v6.16b, v6.16b, v6.16b, #12
1438	ext	v7.16b, v7.16b, v7.16b, #12
1439
1440	ext	v10.16b, v10.16b, v10.16b, #8
1441	ext	v11.16b, v11.16b, v11.16b, #8
1442	ext	v12.16b, v12.16b, v12.16b, #8
1443
1444	ext	v15.16b, v15.16b, v15.16b, #4
1445	ext	v16.16b, v16.16b, v16.16b, #4
1446	ext	v17.16b, v17.16b, v17.16b, #4
1447	subs	x6, x6, #1
1448	b.hi	.Lseal_128_rounds
1449
1450	add	v0.4s, v0.4s, v24.4s
1451	add	v1.4s, v1.4s, v24.4s
1452	add	v2.4s, v2.4s, v24.4s
1453
1454	add	v5.4s, v5.4s, v28.4s
1455	add	v6.4s, v6.4s, v28.4s
1456	add	v7.4s, v7.4s, v28.4s
1457
1458    // Only the first 32 bytes of the third block (counter = 0) are needed,
1459    // so skip updating v12 and v17.
1460	add	v10.4s, v10.4s, v29.4s
1461	add	v11.4s, v11.4s, v29.4s
1462
1463	add	v30.4s, v30.4s, v25.4s
1464	add	v15.4s, v15.4s, v30.4s
1465	add	v30.4s, v30.4s, v25.4s
1466	add	v16.4s, v16.4s, v30.4s
1467
1468	and	v2.16b, v2.16b, v27.16b
1469	mov	x16, v2.d[0] // Move the R key to GPRs
1470	mov	x17, v2.d[1]
1471	mov	v27.16b, v7.16b // Store the S key
1472
1473	bl	.Lpoly_hash_ad_internal
1474	b	.Lseal_tail
1475.cfi_endproc
1476.size	chacha20_poly1305_seal,.-chacha20_poly1305_seal
1477
1478/////////////////////////////////
1479//
1480// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
1481//
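// The argument layout matches the seal path above; for open the lengths hashed
// at the end are len_ad and len_in directly (no extra_in), and the Poly1305
// keys are derived from a single ChaCha20 block.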
1482.globl	chacha20_poly1305_open
1483.hidden	chacha20_poly1305_open
1484.type	chacha20_poly1305_open,%function
1485.align	6
1486chacha20_poly1305_open:
1487	AARCH64_SIGN_LINK_REGISTER
1488.cfi_startproc
1489	stp	x29, x30, [sp, #-80]!
1490.cfi_def_cfa_offset	80
1491.cfi_offset	w30, -72
1492.cfi_offset	w29, -80
1493	mov	x29, sp
1494    // We probably could do .cfi_def_cfa w29, 80 at this point, but since
1495    // we don't actually use the frame pointer like that, it's probably not
1496    // worth bothering.
1497	stp	d8, d9, [sp, #16]
1498	stp	d10, d11, [sp, #32]
1499	stp	d12, d13, [sp, #48]
1500	stp	d14, d15, [sp, #64]
1501.cfi_offset	b15, -8
1502.cfi_offset	b14, -16
1503.cfi_offset	b13, -24
1504.cfi_offset	b12, -32
1505.cfi_offset	b11, -40
1506.cfi_offset	b10, -48
1507.cfi_offset	b9, -56
1508.cfi_offset	b8, -64
1509
1510	adrp	x11, .Lchacha20_consts
1511	add	x11, x11, :lo12:.Lchacha20_consts
1512
1513	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
1514	ld1	{v28.16b - v30.16b}, [x5]
1515
1516	mov	x15, #1 // Prepare the Poly1305 state
1517	mov	x8, #0
1518	mov	x9, #0
1519	mov	x10, #0
1520
1521	mov	v31.d[0], x4  // Store the input and aad lengths
1522	mov	v31.d[1], x2
1523
1524	cmp	x2, #128
1525	b.le	.Lopen_128 // Optimization for smaller buffers
1526
1527    // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
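    // Only this one block is run through the 20 rounds; the first 16 bytes of
    // its output are clamped with .Lclamp to form r and the next 16 become s.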
1528	mov	v0.16b, v24.16b
1529	mov	v5.16b, v28.16b
1530	mov	v10.16b, v29.16b
1531	mov	v15.16b, v30.16b
1532
1533	mov	x6, #10
1534
1535.align	5
1536.Lopen_init_rounds:
1537	add	v0.4s, v0.4s, v5.4s
1538	eor	v15.16b, v15.16b, v0.16b
1539	rev32	v15.8h, v15.8h
1540
1541	add	v10.4s, v10.4s, v15.4s
1542	eor	v5.16b, v5.16b, v10.16b
1543	ushr	v20.4s, v5.4s, #20
1544	sli	v20.4s, v5.4s, #12
1545	add	v0.4s, v0.4s, v20.4s
1546	eor	v15.16b, v15.16b, v0.16b
1547	tbl	v15.16b, {v15.16b}, v26.16b
1548
1549	add	v10.4s, v10.4s, v15.4s
1550	eor	v20.16b, v20.16b, v10.16b
1551	ushr	v5.4s, v20.4s, #25
1552	sli	v5.4s, v20.4s, #7
1553	ext	v5.16b, v5.16b, v5.16b, #4
1554	ext	v10.16b, v10.16b, v10.16b, #8
1555	ext	v15.16b, v15.16b, v15.16b, #12
1556	add	v0.4s, v0.4s, v5.4s
1557	eor	v15.16b, v15.16b, v0.16b
1558	rev32	v15.8h, v15.8h
1559
1560	add	v10.4s, v10.4s, v15.4s
1561	eor	v5.16b, v5.16b, v10.16b
1562	ushr	v20.4s, v5.4s, #20
1563	sli	v20.4s, v5.4s, #12
1564	add	v0.4s, v0.4s, v20.4s
1565	eor	v15.16b, v15.16b, v0.16b
1566	tbl	v15.16b, {v15.16b}, v26.16b
1567
1568	add	v10.4s, v10.4s, v15.4s
1569	eor	v20.16b, v20.16b, v10.16b
1570	ushr	v5.4s, v20.4s, #25
1571	sli	v5.4s, v20.4s, #7
1572	ext	v5.16b, v5.16b, v5.16b, #12
1573	ext	v10.16b, v10.16b, v10.16b, #8
1574	ext	v15.16b, v15.16b, v15.16b, #4
1575	subs	x6, x6, #1
1576	b.hi	.Lopen_init_rounds
1577
1578	add	v0.4s, v0.4s, v24.4s
1579	add	v5.4s, v5.4s, v28.4s
1580
1581	and	v0.16b, v0.16b, v27.16b
1582	mov	x16, v0.d[0] // Move the R key to GPRs
1583	mov	x17, v0.d[1]
1584	mov	v27.16b, v5.16b // Store the S key
1585
1586	bl	.Lpoly_hash_ad_internal
1587
1588.Lopen_ad_done:
1589	mov	x3, x1
1590
1591// Each iteration of the loop hashes 320 bytes and prepares the stream for 320 bytes
1592.Lopen_main_loop:
1593
1594	cmp	x2, #192
1595	b.lt	.Lopen_tail
1596
1597	adrp	x11, .Lchacha20_consts
1598	add	x11, x11, :lo12:.Lchacha20_consts
1599
1600	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
1601	mov	v4.16b, v24.16b
1602
1603	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
1604	mov	v9.16b, v28.16b
1605
1606	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
1607	mov	v14.16b, v29.16b
1608
1609	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
1610	sub	x5, x5, #32
1611	add	v15.4s, v15.4s, v25.4s
1612	mov	v19.16b, v30.16b
1613
1614	eor	v20.16b, v20.16b, v20.16b //zero
1615	not	v21.16b, v20.16b // -1
1616	sub	v21.4s, v25.4s, v21.4s // Add +1
1617	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
1618	add	v19.4s, v19.4s, v20.4s
1619
1620	lsr	x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12
1621	sub	x4, x4, #10
1622
1623	mov	x7, #10
1624	subs	x6, x7, x4
1625	subs	x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
1626	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
1627
1628	cbz	x7, .Lopen_main_loop_rounds_short
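	// For open the ciphertext is hashed before it is decrypted, so the amount to
	// absorb can fall short of 320 bytes on the last pass: x7 double rounds hash
	// two 16-byte blocks each and any remaining ones (entered at
	// .Lopen_main_loop_rounds_short) hash one, so up to 320 bytes are absorbed
	// per pass while the next 320 bytes of keystream are produced.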
1629
1630.align	5
1631.Lopen_main_loop_rounds:
1632	ldp	x11, x12, [x3], 16
1633	adds	x8, x8, x11
1634	adcs	x9, x9, x12
1635	adc	x10, x10, x15
1636	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1637	umulh	x12, x8, x16
1638	mul	x13, x9, x16
1639	umulh	x14, x9, x16
1640	adds	x12, x12, x13
1641	mul	x13, x10, x16
1642	adc	x13, x13, x14
1643	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1644	umulh	x8, x8, x17
1645	adds	x12, x12, x14
1646	mul	x14, x9, x17
1647	umulh	x9, x9, x17
1648	adcs	x14, x14, x8
1649	mul	x10, x10, x17
1650	adc	x10, x10, x9
1651	adds	x13, x13, x14
1652	adc	x14, x10, xzr
1653	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1654	and	x8, x13, #-4
1655	extr	x13, x14, x13, #2
1656	adds	x8, x8, x11
1657	lsr	x11, x14, #2
1658	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1659	adds	x8, x8, x13
1660	adcs	x9, x9, x12
1661	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1662.Lopen_main_loop_rounds_short:
1663	add	v0.4s, v0.4s, v5.4s
1664	add	v1.4s, v1.4s, v6.4s
1665	add	v2.4s, v2.4s, v7.4s
1666	add	v3.4s, v3.4s, v8.4s
1667	add	v4.4s, v4.4s, v9.4s
1668
1669	eor	v15.16b, v15.16b, v0.16b
1670	eor	v16.16b, v16.16b, v1.16b
1671	eor	v17.16b, v17.16b, v2.16b
1672	eor	v18.16b, v18.16b, v3.16b
1673	eor	v19.16b, v19.16b, v4.16b
1674
1675	rev32	v15.8h, v15.8h
1676	rev32	v16.8h, v16.8h
1677	rev32	v17.8h, v17.8h
1678	rev32	v18.8h, v18.8h
1679	rev32	v19.8h, v19.8h
1680
1681	add	v10.4s, v10.4s, v15.4s
1682	add	v11.4s, v11.4s, v16.4s
1683	add	v12.4s, v12.4s, v17.4s
1684	add	v13.4s, v13.4s, v18.4s
1685	add	v14.4s, v14.4s, v19.4s
1686
1687	eor	v5.16b, v5.16b, v10.16b
1688	eor	v6.16b, v6.16b, v11.16b
1689	eor	v7.16b, v7.16b, v12.16b
1690	eor	v8.16b, v8.16b, v13.16b
1691	eor	v9.16b, v9.16b, v14.16b
1692
1693	ushr	v20.4s, v5.4s, #20
1694	sli	v20.4s, v5.4s, #12
1695	ushr	v5.4s, v6.4s, #20
1696	sli	v5.4s, v6.4s, #12
1697	ushr	v6.4s, v7.4s, #20
1698	sli	v6.4s, v7.4s, #12
1699	ushr	v7.4s, v8.4s, #20
1700	sli	v7.4s, v8.4s, #12
1701	ushr	v8.4s, v9.4s, #20
1702	sli	v8.4s, v9.4s, #12
1703
1704	add	v0.4s, v0.4s, v20.4s
1705	add	v1.4s, v1.4s, v5.4s
1706	add	v2.4s, v2.4s, v6.4s
1707	add	v3.4s, v3.4s, v7.4s
1708	add	v4.4s, v4.4s, v8.4s
1709
1710	eor	v15.16b, v15.16b, v0.16b
1711	eor	v16.16b, v16.16b, v1.16b
1712	eor	v17.16b, v17.16b, v2.16b
1713	eor	v18.16b, v18.16b, v3.16b
1714	eor	v19.16b, v19.16b, v4.16b
1715
1716	tbl	v15.16b, {v15.16b}, v26.16b
1717	tbl	v16.16b, {v16.16b}, v26.16b
1718	tbl	v17.16b, {v17.16b}, v26.16b
1719	tbl	v18.16b, {v18.16b}, v26.16b
1720	tbl	v19.16b, {v19.16b}, v26.16b
1721
1722	add	v10.4s, v10.4s, v15.4s
1723	add	v11.4s, v11.4s, v16.4s
1724	add	v12.4s, v12.4s, v17.4s
1725	add	v13.4s, v13.4s, v18.4s
1726	add	v14.4s, v14.4s, v19.4s
1727
1728	eor	v20.16b, v20.16b, v10.16b
1729	eor	v5.16b, v5.16b, v11.16b
1730	eor	v6.16b, v6.16b, v12.16b
1731	eor	v7.16b, v7.16b, v13.16b
1732	eor	v8.16b, v8.16b, v14.16b
1733
1734	ushr	v9.4s, v8.4s, #25
1735	sli	v9.4s, v8.4s, #7
1736	ushr	v8.4s, v7.4s, #25
1737	sli	v8.4s, v7.4s, #7
1738	ushr	v7.4s, v6.4s, #25
1739	sli	v7.4s, v6.4s, #7
1740	ushr	v6.4s, v5.4s, #25
1741	sli	v6.4s, v5.4s, #7
1742	ushr	v5.4s, v20.4s, #25
1743	sli	v5.4s, v20.4s, #7
1744
1745	ext	v9.16b, v9.16b, v9.16b, #4
1746	ext	v14.16b, v14.16b, v14.16b, #8
1747	ext	v19.16b, v19.16b, v19.16b, #12
1748	ldp	x11, x12, [x3], 16
1749	adds	x8, x8, x11
1750	adcs	x9, x9, x12
1751	adc	x10, x10, x15
1752	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1753	umulh	x12, x8, x16
1754	mul	x13, x9, x16
1755	umulh	x14, x9, x16
1756	adds	x12, x12, x13
1757	mul	x13, x10, x16
1758	adc	x13, x13, x14
1759	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1760	umulh	x8, x8, x17
1761	adds	x12, x12, x14
1762	mul	x14, x9, x17
1763	umulh	x9, x9, x17
1764	adcs	x14, x14, x8
1765	mul	x10, x10, x17
1766	adc	x10, x10, x9
1767	adds	x13, x13, x14
1768	adc	x14, x10, xzr
1769	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1770	and	x8, x13, #-4
1771	extr	x13, x14, x13, #2
1772	adds	x8, x8, x11
1773	lsr	x11, x14, #2
1774	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1775	adds	x8, x8, x13
1776	adcs	x9, x9, x12
1777	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1778	add	v0.4s, v0.4s, v6.4s
1779	add	v1.4s, v1.4s, v7.4s
1780	add	v2.4s, v2.4s, v8.4s
1781	add	v3.4s, v3.4s, v5.4s
1782	add	v4.4s, v4.4s, v9.4s
1783
1784	eor	v18.16b, v18.16b, v0.16b
1785	eor	v15.16b, v15.16b, v1.16b
1786	eor	v16.16b, v16.16b, v2.16b
1787	eor	v17.16b, v17.16b, v3.16b
1788	eor	v19.16b, v19.16b, v4.16b
1789
1790	rev32	v18.8h, v18.8h
1791	rev32	v15.8h, v15.8h
1792	rev32	v16.8h, v16.8h
1793	rev32	v17.8h, v17.8h
1794	rev32	v19.8h, v19.8h
1795
1796	add	v12.4s, v12.4s, v18.4s
1797	add	v13.4s, v13.4s, v15.4s
1798	add	v10.4s, v10.4s, v16.4s
1799	add	v11.4s, v11.4s, v17.4s
1800	add	v14.4s, v14.4s, v19.4s
1801
1802	eor	v6.16b, v6.16b, v12.16b
1803	eor	v7.16b, v7.16b, v13.16b
1804	eor	v8.16b, v8.16b, v10.16b
1805	eor	v5.16b, v5.16b, v11.16b
1806	eor	v9.16b, v9.16b, v14.16b
1807
1808	ushr	v20.4s, v6.4s, #20
1809	sli	v20.4s, v6.4s, #12
1810	ushr	v6.4s, v7.4s, #20
1811	sli	v6.4s, v7.4s, #12
1812	ushr	v7.4s, v8.4s, #20
1813	sli	v7.4s, v8.4s, #12
1814	ushr	v8.4s, v5.4s, #20
1815	sli	v8.4s, v5.4s, #12
1816	ushr	v5.4s, v9.4s, #20
1817	sli	v5.4s, v9.4s, #12
1818
1819	add	v0.4s, v0.4s, v20.4s
1820	add	v1.4s, v1.4s, v6.4s
1821	add	v2.4s, v2.4s, v7.4s
1822	add	v3.4s, v3.4s, v8.4s
1823	add	v4.4s, v4.4s, v5.4s
1824
1825	eor	v18.16b, v18.16b, v0.16b
1826	eor	v15.16b, v15.16b, v1.16b
1827	eor	v16.16b, v16.16b, v2.16b
1828	eor	v17.16b, v17.16b, v3.16b
1829	eor	v19.16b, v19.16b, v4.16b
1830
1831	tbl	v18.16b, {v18.16b}, v26.16b
1832	tbl	v15.16b, {v15.16b}, v26.16b
1833	tbl	v16.16b, {v16.16b}, v26.16b
1834	tbl	v17.16b, {v17.16b}, v26.16b
1835	tbl	v19.16b, {v19.16b}, v26.16b
1836
1837	add	v12.4s, v12.4s, v18.4s
1838	add	v13.4s, v13.4s, v15.4s
1839	add	v10.4s, v10.4s, v16.4s
1840	add	v11.4s, v11.4s, v17.4s
1841	add	v14.4s, v14.4s, v19.4s
1842
1843	eor	v20.16b, v20.16b, v12.16b
1844	eor	v6.16b, v6.16b, v13.16b
1845	eor	v7.16b, v7.16b, v10.16b
1846	eor	v8.16b, v8.16b, v11.16b
1847	eor	v5.16b, v5.16b, v14.16b
1848
1849	ushr	v9.4s, v5.4s, #25
1850	sli	v9.4s, v5.4s, #7
1851	ushr	v5.4s, v8.4s, #25
1852	sli	v5.4s, v8.4s, #7
1853	ushr	v8.4s, v7.4s, #25
1854	sli	v8.4s, v7.4s, #7
1855	ushr	v7.4s, v6.4s, #25
1856	sli	v7.4s, v6.4s, #7
1857	ushr	v6.4s, v20.4s, #25
1858	sli	v6.4s, v20.4s, #7
1859
1860	ext	v9.16b, v9.16b, v9.16b, #12
1861	ext	v14.16b, v14.16b, v14.16b, #8
1862	ext	v19.16b, v19.16b, v19.16b, #4
1863	subs	x7, x7, #1
1864	b.gt	.Lopen_main_loop_rounds
1865	subs	x6, x6, #1
1866	b.ge	.Lopen_main_loop_rounds_short
1867
1868	eor	v20.16b, v20.16b, v20.16b //zero
1869	not	v21.16b, v20.16b // -1
1870	sub	v21.4s, v25.4s, v21.4s // Add +1
1871	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
1872	add	v19.4s, v19.4s, v20.4s
1873
1874	add	v15.4s, v15.4s, v25.4s
1875	mov	x11, #5
1876	dup	v20.4s, w11
1877	add	v25.4s, v25.4s, v20.4s
1878
1879	zip1	v20.4s, v0.4s, v1.4s
1880	zip2	v21.4s, v0.4s, v1.4s
1881	zip1	v22.4s, v2.4s, v3.4s
1882	zip2	v23.4s, v2.4s, v3.4s
1883
1884	zip1	v0.2d, v20.2d, v22.2d
1885	zip2	v1.2d, v20.2d, v22.2d
1886	zip1	v2.2d, v21.2d, v23.2d
1887	zip2	v3.2d, v21.2d, v23.2d
1888
1889	zip1	v20.4s, v5.4s, v6.4s
1890	zip2	v21.4s, v5.4s, v6.4s
1891	zip1	v22.4s, v7.4s, v8.4s
1892	zip2	v23.4s, v7.4s, v8.4s
1893
1894	zip1	v5.2d, v20.2d, v22.2d
1895	zip2	v6.2d, v20.2d, v22.2d
1896	zip1	v7.2d, v21.2d, v23.2d
1897	zip2	v8.2d, v21.2d, v23.2d
1898
1899	zip1	v20.4s, v10.4s, v11.4s
1900	zip2	v21.4s, v10.4s, v11.4s
1901	zip1	v22.4s, v12.4s, v13.4s
1902	zip2	v23.4s, v12.4s, v13.4s
1903
1904	zip1	v10.2d, v20.2d, v22.2d
1905	zip2	v11.2d, v20.2d, v22.2d
1906	zip1	v12.2d, v21.2d, v23.2d
1907	zip2	v13.2d, v21.2d, v23.2d
1908
1909	zip1	v20.4s, v15.4s, v16.4s
1910	zip2	v21.4s, v15.4s, v16.4s
1911	zip1	v22.4s, v17.4s, v18.4s
1912	zip2	v23.4s, v17.4s, v18.4s
1913
1914	zip1	v15.2d, v20.2d, v22.2d
1915	zip2	v16.2d, v20.2d, v22.2d
1916	zip1	v17.2d, v21.2d, v23.2d
1917	zip2	v18.2d, v21.2d, v23.2d
1918
1919	add	v0.4s, v0.4s, v24.4s
1920	add	v5.4s, v5.4s, v28.4s
1921	add	v10.4s, v10.4s, v29.4s
1922	add	v15.4s, v15.4s, v30.4s
1923
1924	add	v1.4s, v1.4s, v24.4s
1925	add	v6.4s, v6.4s, v28.4s
1926	add	v11.4s, v11.4s, v29.4s
1927	add	v16.4s, v16.4s, v30.4s
1928
1929	add	v2.4s, v2.4s, v24.4s
1930	add	v7.4s, v7.4s, v28.4s
1931	add	v12.4s, v12.4s, v29.4s
1932	add	v17.4s, v17.4s, v30.4s
1933
1934	add	v3.4s, v3.4s, v24.4s
1935	add	v8.4s, v8.4s, v28.4s
1936	add	v13.4s, v13.4s, v29.4s
1937	add	v18.4s, v18.4s, v30.4s
1938
1939	add	v4.4s, v4.4s, v24.4s
1940	add	v9.4s, v9.4s, v28.4s
1941	add	v14.4s, v14.4s, v29.4s
1942	add	v19.4s, v19.4s, v30.4s
1943
1944    // We can always safely store 192 bytes
1945	ld1	{v20.16b - v23.16b}, [x1], #64
1946	eor	v20.16b, v20.16b, v0.16b
1947	eor	v21.16b, v21.16b, v5.16b
1948	eor	v22.16b, v22.16b, v10.16b
1949	eor	v23.16b, v23.16b, v15.16b
1950	st1	{v20.16b - v23.16b}, [x0], #64
1951
1952	ld1	{v20.16b - v23.16b}, [x1], #64
1953	eor	v20.16b, v20.16b, v1.16b
1954	eor	v21.16b, v21.16b, v6.16b
1955	eor	v22.16b, v22.16b, v11.16b
1956	eor	v23.16b, v23.16b, v16.16b
1957	st1	{v20.16b - v23.16b}, [x0], #64
1958
1959	ld1	{v20.16b - v23.16b}, [x1], #64
1960	eor	v20.16b, v20.16b, v2.16b
1961	eor	v21.16b, v21.16b, v7.16b
1962	eor	v22.16b, v22.16b, v12.16b
1963	eor	v23.16b, v23.16b, v17.16b
1964	st1	{v20.16b - v23.16b}, [x0], #64
1965
1966	sub	x2, x2, #192
1967
1968	mov	v0.16b, v3.16b
1969	mov	v5.16b, v8.16b
1970	mov	v10.16b, v13.16b
1971	mov	v15.16b, v18.16b
1972
1973	cmp	x2, #64
1974	b.lt	.Lopen_tail_64_store
1975
1976	ld1	{v20.16b - v23.16b}, [x1], #64
1977	eor	v20.16b, v20.16b, v3.16b
1978	eor	v21.16b, v21.16b, v8.16b
1979	eor	v22.16b, v22.16b, v13.16b
1980	eor	v23.16b, v23.16b, v18.16b
1981	st1	{v20.16b - v23.16b}, [x0], #64
1982
1983	sub	x2, x2, #64
1984
1985	mov	v0.16b, v4.16b
1986	mov	v5.16b, v9.16b
1987	mov	v10.16b, v14.16b
1988	mov	v15.16b, v19.16b
1989
1990	cmp	x2, #64
1991	b.lt	.Lopen_tail_64_store
1992
1993	ld1	{v20.16b - v23.16b}, [x1], #64
1994	eor	v20.16b, v20.16b, v4.16b
1995	eor	v21.16b, v21.16b, v9.16b
1996	eor	v22.16b, v22.16b, v14.16b
1997	eor	v23.16b, v23.16b, v19.16b
1998	st1	{v20.16b - v23.16b}, [x0], #64
1999
2000	sub	x2, x2, #64
2001	b	.Lopen_main_loop
2002
2003.Lopen_tail:
2004
2005	cbz	x2, .Lopen_finalize
2006
2007	lsr	x4, x2, #4 // How many whole blocks we have to hash
2008
2009	cmp	x2, #64
2010	b.le	.Lopen_tail_64
2011	cmp	x2, #128
2012	b.le	.Lopen_tail_128
2013
2014.Lopen_tail_192:
2015     // We need three more blocks
2016	mov	v0.16b, v24.16b
2017	mov	v1.16b, v24.16b
2018	mov	v2.16b, v24.16b
2019	mov	v5.16b, v28.16b
2020	mov	v6.16b, v28.16b
2021	mov	v7.16b, v28.16b
2022	mov	v10.16b, v29.16b
2023	mov	v11.16b, v29.16b
2024	mov	v12.16b, v29.16b
2025	mov	v15.16b, v30.16b
2026	mov	v16.16b, v30.16b
2027	mov	v17.16b, v30.16b
2028	eor	v23.16b, v23.16b, v23.16b
2029	eor	v21.16b, v21.16b, v21.16b
2030	ins	v23.s[0], v25.s[0]
2031	ins	v21.d[0], x15
2032
2033	add	v22.4s, v23.4s, v21.4s
2034	add	v21.4s, v22.4s, v21.4s
2035
2036	add	v15.4s, v15.4s, v21.4s
2037	add	v16.4s, v16.4s, v23.4s
2038	add	v17.4s, v17.4s, v22.4s
2039
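    // x15 holds 1, so v23, v22 and v21 become three consecutive counter
    // offsets (base, base+1, base+2, the base coming from v25). Block 1 (v16)
    // gets the lowest counter, block 2 (v17) the next, and block 0 (v15),
    // which will supply the final partial output, the highest.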
2040	mov	x7, #10
2041	subs	x6, x7, x4 // x6 = 10 - x4, which is negative if more than 160 bytes are left to hash
2042	csel	x7, x7, x4, le // if x6 <= 0, keep x7 = 10 so that all 10 rounds hash; otherwise only x4 rounds hash
2043	sub	x4, x4, x7
2044
2045	cbz	x7, .Lopen_tail_192_rounds_no_hash
2046
2047.Lopen_tail_192_rounds:
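    // Each pass below that begins with an ldp absorbs one 16-byte block into
    // the Poly1305 accumulator held in x8/x9/x10, roughly:
    //   acc = acc + block + 2**128        (adds/adcs/adc, with x15 = 1)
    //   acc = (acc * r) mod (2**130 - 5)  (mul/umulh schoolbook product with
    //                                      r = x17:x16, then the high limbs
    //                                      folded back using 2**130 == 5 mod p)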
2048	ldp	x11, x12, [x3], 16
2049	adds	x8, x8, x11
2050	adcs	x9, x9, x12
2051	adc	x10, x10, x15
2052	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2053	umulh	x12, x8, x16
2054	mul	x13, x9, x16
2055	umulh	x14, x9, x16
2056	adds	x12, x12, x13
2057	mul	x13, x10, x16
2058	adc	x13, x13, x14
2059	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2060	umulh	x8, x8, x17
2061	adds	x12, x12, x14
2062	mul	x14, x9, x17
2063	umulh	x9, x9, x17
2064	adcs	x14, x14, x8
2065	mul	x10, x10, x17
2066	adc	x10, x10, x9
2067	adds	x13, x13, x14
2068	adc	x14, x10, xzr
2069	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2070	and	x8, x13, #-4
2071	extr	x13, x14, x13, #2
2072	adds	x8, x8, x11
2073	lsr	x11, x14, #2
2074	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2075	adds	x8, x8, x13
2076	adcs	x9, x9, x12
2077	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2078.Lopen_tail_192_rounds_no_hash:
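    // One ChaCha20 double round over the three blocks follows: rev32 .8h is
    // the 16-bit rotation, the ushr/sli pairs are the 12- and 7-bit rotations,
    // tbl with v26 (loaded earlier from the .Lrol8 shuffle table) is the
    // 8-bit rotation, and the ext instructions rotate lanes to switch between
    // the column and diagonal quarter rounds.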
2079	add	v0.4s, v0.4s, v5.4s
2080	add	v1.4s, v1.4s, v6.4s
2081	add	v2.4s, v2.4s, v7.4s
2082	eor	v15.16b, v15.16b, v0.16b
2083	eor	v16.16b, v16.16b, v1.16b
2084	eor	v17.16b, v17.16b, v2.16b
2085	rev32	v15.8h, v15.8h
2086	rev32	v16.8h, v16.8h
2087	rev32	v17.8h, v17.8h
2088
2089	add	v10.4s, v10.4s, v15.4s
2090	add	v11.4s, v11.4s, v16.4s
2091	add	v12.4s, v12.4s, v17.4s
2092	eor	v5.16b, v5.16b, v10.16b
2093	eor	v6.16b, v6.16b, v11.16b
2094	eor	v7.16b, v7.16b, v12.16b
2095	ushr	v20.4s, v5.4s, #20
2096	sli	v20.4s, v5.4s, #12
2097	ushr	v5.4s, v6.4s, #20
2098	sli	v5.4s, v6.4s, #12
2099	ushr	v6.4s, v7.4s, #20
2100	sli	v6.4s, v7.4s, #12
2101
2102	add	v0.4s, v0.4s, v20.4s
2103	add	v1.4s, v1.4s, v5.4s
2104	add	v2.4s, v2.4s, v6.4s
2105	eor	v15.16b, v15.16b, v0.16b
2106	eor	v16.16b, v16.16b, v1.16b
2107	eor	v17.16b, v17.16b, v2.16b
2108	tbl	v15.16b, {v15.16b}, v26.16b
2109	tbl	v16.16b, {v16.16b}, v26.16b
2110	tbl	v17.16b, {v17.16b}, v26.16b
2111
2112	add	v10.4s, v10.4s, v15.4s
2113	add	v11.4s, v11.4s, v16.4s
2114	add	v12.4s, v12.4s, v17.4s
2115	eor	v20.16b, v20.16b, v10.16b
2116	eor	v5.16b, v5.16b, v11.16b
2117	eor	v6.16b, v6.16b, v12.16b
2118	ushr	v7.4s, v6.4s, #25
2119	sli	v7.4s, v6.4s, #7
2120	ushr	v6.4s, v5.4s, #25
2121	sli	v6.4s, v5.4s, #7
2122	ushr	v5.4s, v20.4s, #25
2123	sli	v5.4s, v20.4s, #7
2124
2125	ext	v5.16b, v5.16b, v5.16b, #4
2126	ext	v6.16b, v6.16b, v6.16b, #4
2127	ext	v7.16b, v7.16b, v7.16b, #4
2128
2129	ext	v10.16b, v10.16b, v10.16b, #8
2130	ext	v11.16b, v11.16b, v11.16b, #8
2131	ext	v12.16b, v12.16b, v12.16b, #8
2132
2133	ext	v15.16b, v15.16b, v15.16b, #12
2134	ext	v16.16b, v16.16b, v16.16b, #12
2135	ext	v17.16b, v17.16b, v17.16b, #12
2136	add	v0.4s, v0.4s, v5.4s
2137	add	v1.4s, v1.4s, v6.4s
2138	add	v2.4s, v2.4s, v7.4s
2139	eor	v15.16b, v15.16b, v0.16b
2140	eor	v16.16b, v16.16b, v1.16b
2141	eor	v17.16b, v17.16b, v2.16b
2142	rev32	v15.8h, v15.8h
2143	rev32	v16.8h, v16.8h
2144	rev32	v17.8h, v17.8h
2145
2146	add	v10.4s, v10.4s, v15.4s
2147	add	v11.4s, v11.4s, v16.4s
2148	add	v12.4s, v12.4s, v17.4s
2149	eor	v5.16b, v5.16b, v10.16b
2150	eor	v6.16b, v6.16b, v11.16b
2151	eor	v7.16b, v7.16b, v12.16b
2152	ushr	v20.4s, v5.4s, #20
2153	sli	v20.4s, v5.4s, #12
2154	ushr	v5.4s, v6.4s, #20
2155	sli	v5.4s, v6.4s, #12
2156	ushr	v6.4s, v7.4s, #20
2157	sli	v6.4s, v7.4s, #12
2158
2159	add	v0.4s, v0.4s, v20.4s
2160	add	v1.4s, v1.4s, v5.4s
2161	add	v2.4s, v2.4s, v6.4s
2162	eor	v15.16b, v15.16b, v0.16b
2163	eor	v16.16b, v16.16b, v1.16b
2164	eor	v17.16b, v17.16b, v2.16b
2165	tbl	v15.16b, {v15.16b}, v26.16b
2166	tbl	v16.16b, {v16.16b}, v26.16b
2167	tbl	v17.16b, {v17.16b}, v26.16b
2168
2169	add	v10.4s, v10.4s, v15.4s
2170	add	v11.4s, v11.4s, v16.4s
2171	add	v12.4s, v12.4s, v17.4s
2172	eor	v20.16b, v20.16b, v10.16b
2173	eor	v5.16b, v5.16b, v11.16b
2174	eor	v6.16b, v6.16b, v12.16b
2175	ushr	v7.4s, v6.4s, #25
2176	sli	v7.4s, v6.4s, #7
2177	ushr	v6.4s, v5.4s, #25
2178	sli	v6.4s, v5.4s, #7
2179	ushr	v5.4s, v20.4s, #25
2180	sli	v5.4s, v20.4s, #7
2181
2182	ext	v5.16b, v5.16b, v5.16b, #12
2183	ext	v6.16b, v6.16b, v6.16b, #12
2184	ext	v7.16b, v7.16b, v7.16b, #12
2185
2186	ext	v10.16b, v10.16b, v10.16b, #8
2187	ext	v11.16b, v11.16b, v11.16b, #8
2188	ext	v12.16b, v12.16b, v12.16b, #8
2189
2190	ext	v15.16b, v15.16b, v15.16b, #4
2191	ext	v16.16b, v16.16b, v16.16b, #4
2192	ext	v17.16b, v17.16b, v17.16b, #4
2193	subs	x7, x7, #1
2194	b.gt	.Lopen_tail_192_rounds
2195	subs	x6, x6, #1
2196	b.ge	.Lopen_tail_192_rounds_no_hash
2197
2198    // At most 160 bytes were hashed in the rounds above; up to 32 bytes may still be left to hash
2199.Lopen_tail_192_hash:
2200	cbz	x4, .Lopen_tail_192_hash_done
2201	ldp	x11, x12, [x3], 16
2202	adds	x8, x8, x11
2203	adcs	x9, x9, x12
2204	adc	x10, x10, x15
2205	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2206	umulh	x12, x8, x16
2207	mul	x13, x9, x16
2208	umulh	x14, x9, x16
2209	adds	x12, x12, x13
2210	mul	x13, x10, x16
2211	adc	x13, x13, x14
2212	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2213	umulh	x8, x8, x17
2214	adds	x12, x12, x14
2215	mul	x14, x9, x17
2216	umulh	x9, x9, x17
2217	adcs	x14, x14, x8
2218	mul	x10, x10, x17
2219	adc	x10, x10, x9
2220	adds	x13, x13, x14
2221	adc	x14, x10, xzr
2222	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2223	and	x8, x13, #-4
2224	extr	x13, x14, x13, #2
2225	adds	x8, x8, x11
2226	lsr	x11, x14, #2
2227	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2228	adds	x8, x8, x13
2229	adcs	x9, x9, x12
2230	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2231	sub	x4, x4, #1
2232	b	.Lopen_tail_192_hash
2233
2234.Lopen_tail_192_hash_done:
2235
2236	add	v0.4s, v0.4s, v24.4s
2237	add	v1.4s, v1.4s, v24.4s
2238	add	v2.4s, v2.4s, v24.4s
2239	add	v5.4s, v5.4s, v28.4s
2240	add	v6.4s, v6.4s, v28.4s
2241	add	v7.4s, v7.4s, v28.4s
2242	add	v10.4s, v10.4s, v29.4s
2243	add	v11.4s, v11.4s, v29.4s
2244	add	v12.4s, v12.4s, v29.4s
2245	add	v15.4s, v15.4s, v30.4s
2246	add	v16.4s, v16.4s, v30.4s
2247	add	v17.4s, v17.4s, v30.4s
2248
2249	add	v15.4s, v15.4s, v21.4s
2250	add	v16.4s, v16.4s, v23.4s
2251	add	v17.4s, v17.4s, v22.4s
2252
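    // Only blocks 1 and 2 (128 bytes) are decrypted and stored here; block 0,
    // kept in v0/v5/v10/v15, is left for the partial path at
    // .Lopen_tail_64_store.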
2253	ld1	{v20.16b - v23.16b}, [x1], #64
2254
2255	eor	v20.16b, v20.16b, v1.16b
2256	eor	v21.16b, v21.16b, v6.16b
2257	eor	v22.16b, v22.16b, v11.16b
2258	eor	v23.16b, v23.16b, v16.16b
2259
2260	st1	{v20.16b - v23.16b}, [x0], #64
2261
2262	ld1	{v20.16b - v23.16b}, [x1], #64
2263
2264	eor	v20.16b, v20.16b, v2.16b
2265	eor	v21.16b, v21.16b, v7.16b
2266	eor	v22.16b, v22.16b, v12.16b
2267	eor	v23.16b, v23.16b, v17.16b
2268
2269	st1	{v20.16b - v23.16b}, [x0], #64
2270
2271	sub	x2, x2, #128
2272	b	.Lopen_tail_64_store
2273
2274.Lopen_tail_128:
2275     // We need two more blocks
2276	mov	v0.16b, v24.16b
2277	mov	v1.16b, v24.16b
2278	mov	v5.16b, v28.16b
2279	mov	v6.16b, v28.16b
2280	mov	v10.16b, v29.16b
2281	mov	v11.16b, v29.16b
2282	mov	v15.16b, v30.16b
2283	mov	v16.16b, v30.16b
2284	eor	v23.16b, v23.16b, v23.16b
2285	eor	v22.16b, v22.16b, v22.16b
2286	ins	v23.s[0], v25.s[0]
2287	ins	v22.d[0], x15
2288	add	v22.4s, v22.4s, v23.4s
2289
2290	add	v15.4s, v15.4s, v22.4s
2291	add	v16.4s, v16.4s, v23.4s
2292
2293	mov	x6, #10
2294	sub	x6, x6, x4
2295
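    // x6 = 10 - x4: the first 10 - x4 double rounds below run without
    // hashing; the remaining x4 iterations each absorb one more 16-byte
    // ciphertext block into Poly1305 before looping back.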
2296.Lopen_tail_128_rounds:
2297	add	v0.4s, v0.4s, v5.4s
2298	eor	v15.16b, v15.16b, v0.16b
2299	rev32	v15.8h, v15.8h
2300
2301	add	v10.4s, v10.4s, v15.4s
2302	eor	v5.16b, v5.16b, v10.16b
2303	ushr	v20.4s, v5.4s, #20
2304	sli	v20.4s, v5.4s, #12
2305	add	v0.4s, v0.4s, v20.4s
2306	eor	v15.16b, v15.16b, v0.16b
2307	tbl	v15.16b, {v15.16b}, v26.16b
2308
2309	add	v10.4s, v10.4s, v15.4s
2310	eor	v20.16b, v20.16b, v10.16b
2311	ushr	v5.4s, v20.4s, #25
2312	sli	v5.4s, v20.4s, #7
2313	ext	v5.16b, v5.16b, v5.16b, #4
2314	ext	v10.16b, v10.16b, v10.16b, #8
2315	ext	v15.16b, v15.16b, v15.16b, #12
2316	add	v1.4s, v1.4s, v6.4s
2317	eor	v16.16b, v16.16b, v1.16b
2318	rev32	v16.8h, v16.8h
2319
2320	add	v11.4s, v11.4s, v16.4s
2321	eor	v6.16b, v6.16b, v11.16b
2322	ushr	v20.4s, v6.4s, #20
2323	sli	v20.4s, v6.4s, #12
2324	add	v1.4s, v1.4s, v20.4s
2325	eor	v16.16b, v16.16b, v1.16b
2326	tbl	v16.16b, {v16.16b}, v26.16b
2327
2328	add	v11.4s, v11.4s, v16.4s
2329	eor	v20.16b, v20.16b, v11.16b
2330	ushr	v6.4s, v20.4s, #25
2331	sli	v6.4s, v20.4s, #7
2332	ext	v6.16b, v6.16b, v6.16b, #4
2333	ext	v11.16b, v11.16b, v11.16b, #8
2334	ext	v16.16b, v16.16b, v16.16b, #12
2335	add	v0.4s, v0.4s, v5.4s
2336	eor	v15.16b, v15.16b, v0.16b
2337	rev32	v15.8h, v15.8h
2338
2339	add	v10.4s, v10.4s, v15.4s
2340	eor	v5.16b, v5.16b, v10.16b
2341	ushr	v20.4s, v5.4s, #20
2342	sli	v20.4s, v5.4s, #12
2343	add	v0.4s, v0.4s, v20.4s
2344	eor	v15.16b, v15.16b, v0.16b
2345	tbl	v15.16b, {v15.16b}, v26.16b
2346
2347	add	v10.4s, v10.4s, v15.4s
2348	eor	v20.16b, v20.16b, v10.16b
2349	ushr	v5.4s, v20.4s, #25
2350	sli	v5.4s, v20.4s, #7
2351	ext	v5.16b, v5.16b, v5.16b, #12
2352	ext	v10.16b, v10.16b, v10.16b, #8
2353	ext	v15.16b, v15.16b, v15.16b, #4
2354	add	v1.4s, v1.4s, v6.4s
2355	eor	v16.16b, v16.16b, v1.16b
2356	rev32	v16.8h, v16.8h
2357
2358	add	v11.4s, v11.4s, v16.4s
2359	eor	v6.16b, v6.16b, v11.16b
2360	ushr	v20.4s, v6.4s, #20
2361	sli	v20.4s, v6.4s, #12
2362	add	v1.4s, v1.4s, v20.4s
2363	eor	v16.16b, v16.16b, v1.16b
2364	tbl	v16.16b, {v16.16b}, v26.16b
2365
2366	add	v11.4s, v11.4s, v16.4s
2367	eor	v20.16b, v20.16b, v11.16b
2368	ushr	v6.4s, v20.4s, #25
2369	sli	v6.4s, v20.4s, #7
2370	ext	v6.16b, v6.16b, v6.16b, #12
2371	ext	v11.16b, v11.16b, v11.16b, #8
2372	ext	v16.16b, v16.16b, v16.16b, #4
2373	subs	x6, x6, #1
2374	b.gt	.Lopen_tail_128_rounds
2375	cbz	x4, .Lopen_tail_128_rounds_done
2376	subs	x4, x4, #1
2377	ldp	x11, x12, [x3], 16
2378	adds	x8, x8, x11
2379	adcs	x9, x9, x12
2380	adc	x10, x10, x15
2381	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2382	umulh	x12, x8, x16
2383	mul	x13, x9, x16
2384	umulh	x14, x9, x16
2385	adds	x12, x12, x13
2386	mul	x13, x10, x16
2387	adc	x13, x13, x14
2388	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2389	umulh	x8, x8, x17
2390	adds	x12, x12, x14
2391	mul	x14, x9, x17
2392	umulh	x9, x9, x17
2393	adcs	x14, x14, x8
2394	mul	x10, x10, x17
2395	adc	x10, x10, x9
2396	adds	x13, x13, x14
2397	adc	x14, x10, xzr
2398	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2399	and	x8, x13, #-4
2400	extr	x13, x14, x13, #2
2401	adds	x8, x8, x11
2402	lsr	x11, x14, #2
2403	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2404	adds	x8, x8, x13
2405	adcs	x9, x9, x12
2406	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2407	b	.Lopen_tail_128_rounds
2408
2409.Lopen_tail_128_rounds_done:
2410	add	v0.4s, v0.4s, v24.4s
2411	add	v1.4s, v1.4s, v24.4s
2412	add	v5.4s, v5.4s, v28.4s
2413	add	v6.4s, v6.4s, v28.4s
2414	add	v10.4s, v10.4s, v29.4s
2415	add	v11.4s, v11.4s, v29.4s
2416	add	v15.4s, v15.4s, v30.4s
2417	add	v16.4s, v16.4s, v30.4s
2418	add	v15.4s, v15.4s, v22.4s
2419	add	v16.4s, v16.4s, v23.4s
2420
2421	ld1	{v20.16b - v23.16b}, [x1], #64
2422
2423	eor	v20.16b, v20.16b, v1.16b
2424	eor	v21.16b, v21.16b, v6.16b
2425	eor	v22.16b, v22.16b, v11.16b
2426	eor	v23.16b, v23.16b, v16.16b
2427
2428	st1	{v20.16b - v23.16b}, [x0], #64
2429	sub	x2, x2, #64
2430
2431	b	.Lopen_tail_64_store
2432
2433.Lopen_tail_64:
2434    // We just need a single block
2435	mov	v0.16b, v24.16b
2436	mov	v5.16b, v28.16b
2437	mov	v10.16b, v29.16b
2438	mov	v15.16b, v30.16b
2439	eor	v23.16b, v23.16b, v23.16b
2440	ins	v23.s[0], v25.s[0]
2441	add	v15.4s, v15.4s, v23.4s
2442
2443	mov	x6, #10
2444	sub	x6, x6, x4
2445
2446.Lopen_tail_64_rounds:
2447	add	v0.4s, v0.4s, v5.4s
2448	eor	v15.16b, v15.16b, v0.16b
2449	rev32	v15.8h, v15.8h
2450
2451	add	v10.4s, v10.4s, v15.4s
2452	eor	v5.16b, v5.16b, v10.16b
2453	ushr	v20.4s, v5.4s, #20
2454	sli	v20.4s, v5.4s, #12
2455	add	v0.4s, v0.4s, v20.4s
2456	eor	v15.16b, v15.16b, v0.16b
2457	tbl	v15.16b, {v15.16b}, v26.16b
2458
2459	add	v10.4s, v10.4s, v15.4s
2460	eor	v20.16b, v20.16b, v10.16b
2461	ushr	v5.4s, v20.4s, #25
2462	sli	v5.4s, v20.4s, #7
2463	ext	v5.16b, v5.16b, v5.16b, #4
2464	ext	v10.16b, v10.16b, v10.16b, #8
2465	ext	v15.16b, v15.16b, v15.16b, #12
2466	add	v0.4s, v0.4s, v5.4s
2467	eor	v15.16b, v15.16b, v0.16b
2468	rev32	v15.8h, v15.8h
2469
2470	add	v10.4s, v10.4s, v15.4s
2471	eor	v5.16b, v5.16b, v10.16b
2472	ushr	v20.4s, v5.4s, #20
2473	sli	v20.4s, v5.4s, #12
2474	add	v0.4s, v0.4s, v20.4s
2475	eor	v15.16b, v15.16b, v0.16b
2476	tbl	v15.16b, {v15.16b}, v26.16b
2477
2478	add	v10.4s, v10.4s, v15.4s
2479	eor	v20.16b, v20.16b, v10.16b
2480	ushr	v5.4s, v20.4s, #25
2481	sli	v5.4s, v20.4s, #7
2482	ext	v5.16b, v5.16b, v5.16b, #12
2483	ext	v10.16b, v10.16b, v10.16b, #8
2484	ext	v15.16b, v15.16b, v15.16b, #4
2485	subs	x6, x6, #1
2486	b.gt	.Lopen_tail_64_rounds
2487	cbz	x4, .Lopen_tail_64_rounds_done
2488	subs	x4, x4, #1
2489	ldp	x11, x12, [x3], 16
2490	adds	x8, x8, x11
2491	adcs	x9, x9, x12
2492	adc	x10, x10, x15
2493	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2494	umulh	x12, x8, x16
2495	mul	x13, x9, x16
2496	umulh	x14, x9, x16
2497	adds	x12, x12, x13
2498	mul	x13, x10, x16
2499	adc	x13, x13, x14
2500	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2501	umulh	x8, x8, x17
2502	adds	x12, x12, x14
2503	mul	x14, x9, x17
2504	umulh	x9, x9, x17
2505	adcs	x14, x14, x8
2506	mul	x10, x10, x17
2507	adc	x10, x10, x9
2508	adds	x13, x13, x14
2509	adc	x14, x10, xzr
2510	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2511	and	x8, x13, #-4
2512	extr	x13, x14, x13, #2
2513	adds	x8, x8, x11
2514	lsr	x11, x14, #2
2515	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2516	adds	x8, x8, x13
2517	adcs	x9, x9, x12
2518	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2519	b	.Lopen_tail_64_rounds
2520
2521.Lopen_tail_64_rounds_done:
2522	add	v0.4s, v0.4s, v24.4s
2523	add	v5.4s, v5.4s, v28.4s
2524	add	v10.4s, v10.4s, v29.4s
2525	add	v15.4s, v15.4s, v30.4s
2526	add	v15.4s, v15.4s, v23.4s
2527
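    // Decrypt the remaining data 16 bytes at a time: XOR with the keystream
    // row in v0, then shift the next rows down (v5 -> v0, v10 -> v5,
    // v15 -> v10) until fewer than 16 bytes remain.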
2528.Lopen_tail_64_store:
2529	cmp	x2, #16
2530	b.lt	.Lopen_tail_16
2531
2532	ld1	{v20.16b}, [x1], #16
2533	eor	v20.16b, v20.16b, v0.16b
2534	st1	{v20.16b}, [x0], #16
2535	mov	v0.16b, v5.16b
2536	mov	v5.16b, v10.16b
2537	mov	v10.16b, v15.16b
2538	sub	x2, x2, #16
2539	b	.Lopen_tail_64_store
2540
2541.Lopen_tail_16:
2542    // Here we handle the last [0,16) bytes that require a padded block
2543	cbz	x2, .Lopen_finalize
2544
2545	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
2546	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
2547	not	v22.16b, v20.16b
2548
2549	add	x7, x1, x2
2550	mov	x6, x2
2551
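    // The loop below walks the last x2 bytes backwards from x7 = x1 + x2,
    // shifting each ciphertext byte into v20 while building a matching byte
    // mask in v21 (one 0xff byte per input byte, taken from the all-ones v22).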
2552.Lopen_tail_16_compose:
2553	ext	v20.16b, v20.16b, v20.16b, #15
2554	ldrb	w11, [x7, #-1]!
2555	mov	v20.b[0], w11
2556	ext	v21.16b, v22.16b, v21.16b, #15
2557	subs	x2, x2, #1
2558	b.gt	.Lopen_tail_16_compose
2559
2560	and	v20.16b, v20.16b, v21.16b
2561    // Hash in the final padded block
2562	mov	x11, v20.d[0]
2563	mov	x12, v20.d[1]
2564	adds	x8, x8, x11
2565	adcs	x9, x9, x12
2566	adc	x10, x10, x15
2567	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2568	umulh	x12, x8, x16
2569	mul	x13, x9, x16
2570	umulh	x14, x9, x16
2571	adds	x12, x12, x13
2572	mul	x13, x10, x16
2573	adc	x13, x13, x14
2574	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2575	umulh	x8, x8, x17
2576	adds	x12, x12, x14
2577	mul	x14, x9, x17
2578	umulh	x9, x9, x17
2579	adcs	x14, x14, x8
2580	mul	x10, x10, x17
2581	adc	x10, x10, x9
2582	adds	x13, x13, x14
2583	adc	x14, x10, xzr
2584	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2585	and	x8, x13, #-4
2586	extr	x13, x14, x13, #2
2587	adds	x8, x8, x11
2588	lsr	x11, x14, #2
2589	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2590	adds	x8, x8, x13
2591	adcs	x9, x9, x12
2592	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2593	eor	v20.16b, v20.16b, v0.16b
2594
2595.Lopen_tail_16_store:
2596	umov	w11, v20.b[0]
2597	strb	w11, [x0], #1
2598	ext	v20.16b, v20.16b, v20.16b, #1
2599	subs	x6, x6, #1
2600	b.gt	.Lopen_tail_16_store
2601
2602.Lopen_finalize:
2603	mov	x11, v31.d[0]
2604	mov	x12, v31.d[1]
2605	adds	x8, x8, x11
2606	adcs	x9, x9, x12
2607	adc	x10, x10, x15
2608	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2609	umulh	x12, x8, x16
2610	mul	x13, x9, x16
2611	umulh	x14, x9, x16
2612	adds	x12, x12, x13
2613	mul	x13, x10, x16
2614	adc	x13, x13, x14
2615	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2616	umulh	x8, x8, x17
2617	adds	x12, x12, x14
2618	mul	x14, x9, x17
2619	umulh	x9, x9, x17
2620	adcs	x14, x14, x8
2621	mul	x10, x10, x17
2622	adc	x10, x10, x9
2623	adds	x13, x13, x14
2624	adc	x14, x10, xzr
2625	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2626	and	x8, x13, #-4
2627	extr	x13, x14, x13, #2
2628	adds	x8, x8, x11
2629	lsr	x11, x14, #2
2630	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2631	adds	x8, x8, x13
2632	adcs	x9, x9, x12
2633	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2634    // Final reduction step
2635	sub	x12, xzr, x15
2636	orr	x13, xzr, #3
2637	subs	x11, x8, #-5
2638	sbcs	x12, x9, x12
2639	sbcs	x13, x10, x13
2640	csel	x8, x11, x8, cs
2641	csel	x9, x12, x9, cs
2642	csel	x10, x13, x10, cs
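    // The subs/sbcs/csel sequence above conditionally subtracts
    // p = 2**130 - 5 (computed as acc + 5 - 2**130 and selected on carry),
    // giving acc mod p. The s half of the one-time key, saved in v27, is then
    // added and the low 128 bits are written out as the Poly1305 tag.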
2643	mov	x11, v27.d[0]
2644	mov	x12, v27.d[1]
2645	adds	x8, x8, x11
2646	adcs	x9, x9, x12
2647	adc	x10, x10, x15
2648
2649	stp	x8, x9, [x5]
2650
2651	ldp	d8, d9, [sp, #16]
2652	ldp	d10, d11, [sp, #32]
2653	ldp	d12, d13, [sp, #48]
2654	ldp	d14, d15, [sp, #64]
2655.cfi_restore	b15
2656.cfi_restore	b14
2657.cfi_restore	b13
2658.cfi_restore	b12
2659.cfi_restore	b11
2660.cfi_restore	b10
2661.cfi_restore	b9
2662.cfi_restore	b8
2663	ldp	x29, x30, [sp], 80
2664.cfi_restore	w29
2665.cfi_restore	w30
2666.cfi_def_cfa_offset	0
2667	AARCH64_VALIDATE_LINK_REGISTER
2668	ret
2669
2670.Lopen_128:
2671    // On some architectures preparing 5 blocks for small buffers is wasteful
2672	eor	v25.16b, v25.16b, v25.16b
2673	mov	x11, #1
2674	mov	v25.s[0], w11
2675	mov	v0.16b, v24.16b
2676	mov	v1.16b, v24.16b
2677	mov	v2.16b, v24.16b
2678	mov	v5.16b, v28.16b
2679	mov	v6.16b, v28.16b
2680	mov	v7.16b, v28.16b
2681	mov	v10.16b, v29.16b
2682	mov	v11.16b, v29.16b
2683	mov	v12.16b, v29.16b
2684	mov	v17.16b, v30.16b
2685	add	v15.4s, v17.4s, v25.4s
2686	add	v16.4s, v15.4s, v25.4s
2687
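    // Three blocks are enough for inputs of at most 128 bytes: the block in
    // v2/v7/v12/v17 uses the counter row as-is and will become the Poly1305
    // key block, while v0/v5/v10/v15 and v1/v6/v11/v16 get counters +1 and +2
    // (via v25 = 1) and provide the keystream for the data.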
2688	mov	x6, #10
2689
2690.Lopen_128_rounds:
2691	add	v0.4s, v0.4s, v5.4s
2692	add	v1.4s, v1.4s, v6.4s
2693	add	v2.4s, v2.4s, v7.4s
2694	eor	v15.16b, v15.16b, v0.16b
2695	eor	v16.16b, v16.16b, v1.16b
2696	eor	v17.16b, v17.16b, v2.16b
2697	rev32	v15.8h, v15.8h
2698	rev32	v16.8h, v16.8h
2699	rev32	v17.8h, v17.8h
2700
2701	add	v10.4s, v10.4s, v15.4s
2702	add	v11.4s, v11.4s, v16.4s
2703	add	v12.4s, v12.4s, v17.4s
2704	eor	v5.16b, v5.16b, v10.16b
2705	eor	v6.16b, v6.16b, v11.16b
2706	eor	v7.16b, v7.16b, v12.16b
2707	ushr	v20.4s, v5.4s, #20
2708	sli	v20.4s, v5.4s, #12
2709	ushr	v5.4s, v6.4s, #20
2710	sli	v5.4s, v6.4s, #12
2711	ushr	v6.4s, v7.4s, #20
2712	sli	v6.4s, v7.4s, #12
2713
2714	add	v0.4s, v0.4s, v20.4s
2715	add	v1.4s, v1.4s, v5.4s
2716	add	v2.4s, v2.4s, v6.4s
2717	eor	v15.16b, v15.16b, v0.16b
2718	eor	v16.16b, v16.16b, v1.16b
2719	eor	v17.16b, v17.16b, v2.16b
2720	tbl	v15.16b, {v15.16b}, v26.16b
2721	tbl	v16.16b, {v16.16b}, v26.16b
2722	tbl	v17.16b, {v17.16b}, v26.16b
2723
2724	add	v10.4s, v10.4s, v15.4s
2725	add	v11.4s, v11.4s, v16.4s
2726	add	v12.4s, v12.4s, v17.4s
2727	eor	v20.16b, v20.16b, v10.16b
2728	eor	v5.16b, v5.16b, v11.16b
2729	eor	v6.16b, v6.16b, v12.16b
2730	ushr	v7.4s, v6.4s, #25
2731	sli	v7.4s, v6.4s, #7
2732	ushr	v6.4s, v5.4s, #25
2733	sli	v6.4s, v5.4s, #7
2734	ushr	v5.4s, v20.4s, #25
2735	sli	v5.4s, v20.4s, #7
2736
2737	ext	v5.16b, v5.16b, v5.16b, #4
2738	ext	v6.16b, v6.16b, v6.16b, #4
2739	ext	v7.16b, v7.16b, v7.16b, #4
2740
2741	ext	v10.16b, v10.16b, v10.16b, #8
2742	ext	v11.16b, v11.16b, v11.16b, #8
2743	ext	v12.16b, v12.16b, v12.16b, #8
2744
2745	ext	v15.16b, v15.16b, v15.16b, #12
2746	ext	v16.16b, v16.16b, v16.16b, #12
2747	ext	v17.16b, v17.16b, v17.16b, #12
2748	add	v0.4s, v0.4s, v5.4s
2749	add	v1.4s, v1.4s, v6.4s
2750	add	v2.4s, v2.4s, v7.4s
2751	eor	v15.16b, v15.16b, v0.16b
2752	eor	v16.16b, v16.16b, v1.16b
2753	eor	v17.16b, v17.16b, v2.16b
2754	rev32	v15.8h, v15.8h
2755	rev32	v16.8h, v16.8h
2756	rev32	v17.8h, v17.8h
2757
2758	add	v10.4s, v10.4s, v15.4s
2759	add	v11.4s, v11.4s, v16.4s
2760	add	v12.4s, v12.4s, v17.4s
2761	eor	v5.16b, v5.16b, v10.16b
2762	eor	v6.16b, v6.16b, v11.16b
2763	eor	v7.16b, v7.16b, v12.16b
2764	ushr	v20.4s, v5.4s, #20
2765	sli	v20.4s, v5.4s, #12
2766	ushr	v5.4s, v6.4s, #20
2767	sli	v5.4s, v6.4s, #12
2768	ushr	v6.4s, v7.4s, #20
2769	sli	v6.4s, v7.4s, #12
2770
2771	add	v0.4s, v0.4s, v20.4s
2772	add	v1.4s, v1.4s, v5.4s
2773	add	v2.4s, v2.4s, v6.4s
2774	eor	v15.16b, v15.16b, v0.16b
2775	eor	v16.16b, v16.16b, v1.16b
2776	eor	v17.16b, v17.16b, v2.16b
2777	tbl	v15.16b, {v15.16b}, v26.16b
2778	tbl	v16.16b, {v16.16b}, v26.16b
2779	tbl	v17.16b, {v17.16b}, v26.16b
2780
2781	add	v10.4s, v10.4s, v15.4s
2782	add	v11.4s, v11.4s, v16.4s
2783	add	v12.4s, v12.4s, v17.4s
2784	eor	v20.16b, v20.16b, v10.16b
2785	eor	v5.16b, v5.16b, v11.16b
2786	eor	v6.16b, v6.16b, v12.16b
2787	ushr	v7.4s, v6.4s, #25
2788	sli	v7.4s, v6.4s, #7
2789	ushr	v6.4s, v5.4s, #25
2790	sli	v6.4s, v5.4s, #7
2791	ushr	v5.4s, v20.4s, #25
2792	sli	v5.4s, v20.4s, #7
2793
2794	ext	v5.16b, v5.16b, v5.16b, #12
2795	ext	v6.16b, v6.16b, v6.16b, #12
2796	ext	v7.16b, v7.16b, v7.16b, #12
2797
2798	ext	v10.16b, v10.16b, v10.16b, #8
2799	ext	v11.16b, v11.16b, v11.16b, #8
2800	ext	v12.16b, v12.16b, v12.16b, #8
2801
2802	ext	v15.16b, v15.16b, v15.16b, #4
2803	ext	v16.16b, v16.16b, v16.16b, #4
2804	ext	v17.16b, v17.16b, v17.16b, #4
2805	subs	x6, x6, #1
2806	b.hi	.Lopen_128_rounds
2807
2808	add	v0.4s, v0.4s, v24.4s
2809	add	v1.4s, v1.4s, v24.4s
2810	add	v2.4s, v2.4s, v24.4s
2811
2812	add	v5.4s, v5.4s, v28.4s
2813	add	v6.4s, v6.4s, v28.4s
2814	add	v7.4s, v7.4s, v28.4s
2815
2816	add	v10.4s, v10.4s, v29.4s
2817	add	v11.4s, v11.4s, v29.4s
2818
2819	add	v30.4s, v30.4s, v25.4s
2820	add	v15.4s, v15.4s, v30.4s
2821	add	v30.4s, v30.4s, v25.4s
2822	add	v16.4s, v16.4s, v30.4s
2823
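    // Derive the one-time Poly1305 key from the block with the lowest
    // counter: its first 16 bytes, clamped with the mask in v27 (.Lclamp),
    // become r in x16/x17, and its next 16 bytes replace v27 as the s half.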
2824	and	v2.16b, v2.16b, v27.16b
2825	mov	x16, v2.d[0] // Move the R key to GPRs
2826	mov	x17, v2.d[1]
2827	mov	v27.16b, v7.16b // Store the S key
2828
2829	bl	.Lpoly_hash_ad_internal
2830
2831.Lopen_128_store:
2832	cmp	x2, #64
2833	b.lt	.Lopen_128_store_64
2834
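    // At least 64 bytes remain: hash the four 16-byte ciphertext words in
    // v20-v23 into Poly1305 first (this is the open/decrypt direction), then
    // XOR with the keystream and store the plaintext.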
2835	ld1	{v20.16b - v23.16b}, [x1], #64
2836
2837	mov	x11, v20.d[0]
2838	mov	x12, v20.d[1]
2839	adds	x8, x8, x11
2840	adcs	x9, x9, x12
2841	adc	x10, x10, x15
2842	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2843	umulh	x12, x8, x16
2844	mul	x13, x9, x16
2845	umulh	x14, x9, x16
2846	adds	x12, x12, x13
2847	mul	x13, x10, x16
2848	adc	x13, x13, x14
2849	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2850	umulh	x8, x8, x17
2851	adds	x12, x12, x14
2852	mul	x14, x9, x17
2853	umulh	x9, x9, x17
2854	adcs	x14, x14, x8
2855	mul	x10, x10, x17
2856	adc	x10, x10, x9
2857	adds	x13, x13, x14
2858	adc	x14, x10, xzr
2859	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2860	and	x8, x13, #-4
2861	extr	x13, x14, x13, #2
2862	adds	x8, x8, x11
2863	lsr	x11, x14, #2
2864	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2865	adds	x8, x8, x13
2866	adcs	x9, x9, x12
2867	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2868	mov	x11, v21.d[0]
2869	mov	x12, v21.d[1]
2870	adds	x8, x8, x11
2871	adcs	x9, x9, x12
2872	adc	x10, x10, x15
2873	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2874	umulh	x12, x8, x16
2875	mul	x13, x9, x16
2876	umulh	x14, x9, x16
2877	adds	x12, x12, x13
2878	mul	x13, x10, x16
2879	adc	x13, x13, x14
2880	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2881	umulh	x8, x8, x17
2882	adds	x12, x12, x14
2883	mul	x14, x9, x17
2884	umulh	x9, x9, x17
2885	adcs	x14, x14, x8
2886	mul	x10, x10, x17
2887	adc	x10, x10, x9
2888	adds	x13, x13, x14
2889	adc	x14, x10, xzr
2890	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2891	and	x8, x13, #-4
2892	extr	x13, x14, x13, #2
2893	adds	x8, x8, x11
2894	lsr	x11, x14, #2
2895	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2896	adds	x8, x8, x13
2897	adcs	x9, x9, x12
2898	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2899	mov	x11, v22.d[0]
2900	mov	x12, v22.d[1]
2901	adds	x8, x8, x11
2902	adcs	x9, x9, x12
2903	adc	x10, x10, x15
2904	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2905	umulh	x12, x8, x16
2906	mul	x13, x9, x16
2907	umulh	x14, x9, x16
2908	adds	x12, x12, x13
2909	mul	x13, x10, x16
2910	adc	x13, x13, x14
2911	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2912	umulh	x8, x8, x17
2913	adds	x12, x12, x14
2914	mul	x14, x9, x17
2915	umulh	x9, x9, x17
2916	adcs	x14, x14, x8
2917	mul	x10, x10, x17
2918	adc	x10, x10, x9
2919	adds	x13, x13, x14
2920	adc	x14, x10, xzr
2921	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2922	and	x8, x13, #-4
2923	extr	x13, x14, x13, #2
2924	adds	x8, x8, x11
2925	lsr	x11, x14, #2
2926	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2927	adds	x8, x8, x13
2928	adcs	x9, x9, x12
2929	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2930	mov	x11, v23.d[0]
2931	mov	x12, v23.d[1]
2932	adds	x8, x8, x11
2933	adcs	x9, x9, x12
2934	adc	x10, x10, x15
2935	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2936	umulh	x12, x8, x16
2937	mul	x13, x9, x16
2938	umulh	x14, x9, x16
2939	adds	x12, x12, x13
2940	mul	x13, x10, x16
2941	adc	x13, x13, x14
2942	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2943	umulh	x8, x8, x17
2944	adds	x12, x12, x14
2945	mul	x14, x9, x17
2946	umulh	x9, x9, x17
2947	adcs	x14, x14, x8
2948	mul	x10, x10, x17
2949	adc	x10, x10, x9
2950	adds	x13, x13, x14
2951	adc	x14, x10, xzr
2952	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2953	and	x8, x13, #-4
2954	extr	x13, x14, x13, #2
2955	adds	x8, x8, x11
2956	lsr	x11, x14, #2
2957	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2958	adds	x8, x8, x13
2959	adcs	x9, x9, x12
2960	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2961
2962	eor	v20.16b, v20.16b, v0.16b
2963	eor	v21.16b, v21.16b, v5.16b
2964	eor	v22.16b, v22.16b, v10.16b
2965	eor	v23.16b, v23.16b, v15.16b
2966
2967	st1	{v20.16b - v23.16b}, [x0], #64
2968
2969	sub	x2, x2, #64
2970
2971	mov	v0.16b, v1.16b
2972	mov	v5.16b, v6.16b
2973	mov	v10.16b, v11.16b
2974	mov	v15.16b, v16.16b
2975
2976.Lopen_128_store_64:
2977
2978	lsr	x4, x2, #4
2979	mov	x3, x1
2980
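    // Fewer than 64 bytes remain. They still have to be authenticated, so
    // hash the whole 16-byte blocks starting at x3 = x1 here, then fall
    // through to .Lopen_tail_64_store / .Lopen_tail_16 for the actual
    // decryption.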
2981.Lopen_128_hash_64:
2982	cbz	x4, .Lopen_tail_64_store
2983	ldp	x11, x12, [x3], 16
2984	adds	x8, x8, x11
2985	adcs	x9, x9, x12
2986	adc	x10, x10, x15
2987	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2988	umulh	x12, x8, x16
2989	mul	x13, x9, x16
2990	umulh	x14, x9, x16
2991	adds	x12, x12, x13
2992	mul	x13, x10, x16
2993	adc	x13, x13, x14
2994	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2995	umulh	x8, x8, x17
2996	adds	x12, x12, x14
2997	mul	x14, x9, x17
2998	umulh	x9, x9, x17
2999	adcs	x14, x14, x8
3000	mul	x10, x10, x17
3001	adc	x10, x10, x9
3002	adds	x13, x13, x14
3003	adc	x14, x10, xzr
3004	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
3005	and	x8, x13, #-4
3006	extr	x13, x14, x13, #2
3007	adds	x8, x8, x11
3008	lsr	x11, x14, #2
3009	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
3010	adds	x8, x8, x13
3011	adcs	x9, x9, x12
3012	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
3013	sub	x4, x4, #1
3014	b	.Lopen_128_hash_64
3015.cfi_endproc
3016.size	chacha20_poly1305_open,.-chacha20_poly1305_open
3017#endif  // !OPENSSL_NO_ASM && defined(__AARCH64EL__) && defined(__ELF__)
3018#if defined(__ELF__)
3019// See https://www.airs.com/blog/archives/518.
3020.section .note.GNU-stack,"",%progbits
3021#endif
3022