1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#if defined(BORINGSSL_PREFIX)
14#include <boringssl_prefix_symbols_asm.h>
15#endif
16#include <openssl/arm_arch.h>
17.section	.rodata
18
19.align	7
20.Lchacha20_consts:
21.byte	'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
22.Linc:
23.long	1,2,3,4
24.Lrol8:
25.byte	3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
26.Lclamp:
27.quad	0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
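// The constants above are, in order: the standard ChaCha20 "expand 32-byte k"
// block header, a {1,2,3,4} increment vector used to advance the per-block
// counters, a byte-permutation table that implements rotate-left-by-8 via TBL,
// and the Poly1305 clamping mask (RFC 8439) that is ANDed into the "r" key.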
28
29.text
30
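// Poly1305 absorption of the additional data. As used by the callers in this
// file: x3 = AAD pointer, x4 = AAD length, x8-x10 = 130-bit accumulator,
// x16/x17 = clamped "r" key, and x15 holds the constant 1, which supplies the
// 2^128 padding bit added with each full 16-byte block.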
31.type	.Lpoly_hash_ad_internal,%function
32.align	6
33.Lpoly_hash_ad_internal:
34.cfi_startproc
35	cbnz	x4, .Lpoly_hash_intro
36	ret
37
38.Lpoly_hash_intro:
39	cmp	x4, #16
40	b.lt	.Lpoly_hash_ad_tail
41	ldp	x11, x12, [x3], 16
42	adds	x8, x8, x11
43	adcs	x9, x9, x12
44	adc	x10, x10, x15
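	// The multiply/reduce sequence below (repeated throughout this file)
	// computes acc = acc * r mod 2^130 - 5, with acc = [x10:x9:x8] and the
	// clamped key r = [x17:x16]. After forming the full product [t3:t2:t1:t0],
	// the bits at 2^130 and above, u = product >> 130, are folded back in as
	// 5*u = 4*u + u, using the identity 2^130 ≡ 5 (mod 2^130 - 5).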
45	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
46	umulh	x12, x8, x16
47	mul	x13, x9, x16
48	umulh	x14, x9, x16
49	adds	x12, x12, x13
50	mul	x13, x10, x16
51	adc	x13, x13, x14
52	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
53	umulh	x8, x8, x17
54	adds	x12, x12, x14
55	mul	x14, x9, x17
56	umulh	x9, x9, x17
57	adcs	x14, x14, x8
58	mul	x10, x10, x17
59	adc	x10, x10, x9
60	adds	x13, x13, x14
61	adc	x14, x10, xzr
62	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
63	and	x8, x13, #-4
64	extr	x13, x14, x13, #2
65	adds	x8, x8, x11
66	lsr	x11, x14, #2
67	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
68	adds	x8, x8, x13
69	adcs	x9, x9, x12
70	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
71	sub	x4, x4, #16
72	b	.Lpoly_hash_ad_internal
73
74.Lpoly_hash_ad_tail:
75	cbz	x4, .Lpoly_hash_ad_ret
76
77	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the AAD
78	sub	x4, x4, #1
79
80.Lpoly_hash_tail_16_compose:
81	ext	v20.16b, v20.16b, v20.16b, #15
82	ldrb	w11, [x3, x4]
83	mov	v20.b[0], w11
84	subs	x4, x4, #1
85	b.ge	.Lpoly_hash_tail_16_compose
86	mov	x11, v20.d[0]
87	mov	x12, v20.d[1]
88	adds	x8, x8, x11
89	adcs	x9, x9, x12
90	adc	x10, x10, x15
91	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
92	umulh	x12, x8, x16
93	mul	x13, x9, x16
94	umulh	x14, x9, x16
95	adds	x12, x12, x13
96	mul	x13, x10, x16
97	adc	x13, x13, x14
98	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
99	umulh	x8, x8, x17
100	adds	x12, x12, x14
101	mul	x14, x9, x17
102	umulh	x9, x9, x17
103	adcs	x14, x14, x8
104	mul	x10, x10, x17
105	adc	x10, x10, x9
106	adds	x13, x13, x14
107	adc	x14, x10, xzr
108	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
109	and	x8, x13, #-4
110	extr	x13, x14, x13, #2
111	adds	x8, x8, x11
112	lsr	x11, x14, #2
113	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
114	adds	x8, x8, x13
115	adcs	x9, x9, x12
116	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
117
118.Lpoly_hash_ad_ret:
119	ret
120.cfi_endproc
121.size	.Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal
122
123/////////////////////////////////
124//
125// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data);
126//
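// Per AAPCS64 the six arguments arrive in x0-x5. In the body below, x1 is the
// input (plaintext) pointer and x0 the output (ciphertext) pointer, x2 the
// input length, x3/x4 the AD pointer and length, and x5 the seal_data block
// holding the key, nonce and extra_in; the final Poly1305 tag is written back
// through x5 at the end of the routine.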
127.globl	chacha20_poly1305_seal
128.hidden	chacha20_poly1305_seal
129.type	chacha20_poly1305_seal,%function
130.align	6
131chacha20_poly1305_seal:
132	AARCH64_SIGN_LINK_REGISTER
133.cfi_startproc
134	stp	x29, x30, [sp, #-80]!
135.cfi_def_cfa_offset	80
136.cfi_offset	w30, -72
137.cfi_offset	w29, -80
138	mov	x29, sp
139# We probably could do .cfi_def_cfa w29, 80 at this point, but since
140# we don't actually use the frame pointer like that, it's probably not
141# worth bothering.
142	stp	d8, d9, [sp, #16]
143	stp	d10, d11, [sp, #32]
144	stp	d12, d13, [sp, #48]
145	stp	d14, d15, [sp, #64]
146.cfi_offset	b15, -8
147.cfi_offset	b14, -16
148.cfi_offset	b13, -24
149.cfi_offset	b12, -32
150.cfi_offset	b11, -40
151.cfi_offset	b10, -48
152.cfi_offset	b9, -56
153.cfi_offset	b8, -64
154
155	adrp	x11, .Lchacha20_consts
156	add	x11, x11, :lo12:.Lchacha20_consts
157
158	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
159	ld1	{v28.16b - v30.16b}, [x5]
160
161	mov	x15, #1 // Prepare the Poly1305 state
162	mov	x8, #0
163	mov	x9, #0
164	mov	x10, #0
165
166	ldr	x12, [x5, #56]   // The total cipher text length includes extra_in_len
167	add	x12, x12, x2
168	mov	v31.d[0], x4  // Store the input and aad lengths
169	mov	v31.d[1], x12
170
171	cmp	x2, #128
172	b.le	.Lseal_128 // Optimization for smaller buffers
173
174    // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext,
175    // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically,
176    // the fifth block (A4-D4) horizontally.
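    // "Vertically" means the ld4r loads put one state word per register, with
    // SIMD lane n holding that word for block n, so each vector instruction
    // advances four blocks at once. The fifth block (v4, v9, v14, v19) keeps the
    // usual row-per-register layout and is rotated with ext for the diagonal
    // rounds; the first 32 bytes of its keystream become the Poly1305 r and s keys.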
177	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
178	mov	v4.16b, v24.16b
179
180	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
181	mov	v9.16b, v28.16b
182
183	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
184	mov	v14.16b, v29.16b
185
186	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
187	add	v15.4s, v15.4s, v25.4s
188	mov	v19.16b, v30.16b
189
190	sub	x5, x5, #32
191
192	mov	x6, #10
193
194.align	5
195.Lseal_init_rounds:
196	add	v0.4s, v0.4s, v5.4s
197	add	v1.4s, v1.4s, v6.4s
198	add	v2.4s, v2.4s, v7.4s
199	add	v3.4s, v3.4s, v8.4s
200	add	v4.4s, v4.4s, v9.4s
201
202	eor	v15.16b, v15.16b, v0.16b
203	eor	v16.16b, v16.16b, v1.16b
204	eor	v17.16b, v17.16b, v2.16b
205	eor	v18.16b, v18.16b, v3.16b
206	eor	v19.16b, v19.16b, v4.16b
207
208	rev32	v15.8h, v15.8h
209	rev32	v16.8h, v16.8h
210	rev32	v17.8h, v17.8h
211	rev32	v18.8h, v18.8h
212	rev32	v19.8h, v19.8h
213
214	add	v10.4s, v10.4s, v15.4s
215	add	v11.4s, v11.4s, v16.4s
216	add	v12.4s, v12.4s, v17.4s
217	add	v13.4s, v13.4s, v18.4s
218	add	v14.4s, v14.4s, v19.4s
219
220	eor	v5.16b, v5.16b, v10.16b
221	eor	v6.16b, v6.16b, v11.16b
222	eor	v7.16b, v7.16b, v12.16b
223	eor	v8.16b, v8.16b, v13.16b
224	eor	v9.16b, v9.16b, v14.16b
225
226	ushr	v20.4s, v5.4s, #20
227	sli	v20.4s, v5.4s, #12
228	ushr	v5.4s, v6.4s, #20
229	sli	v5.4s, v6.4s, #12
230	ushr	v6.4s, v7.4s, #20
231	sli	v6.4s, v7.4s, #12
232	ushr	v7.4s, v8.4s, #20
233	sli	v7.4s, v8.4s, #12
234	ushr	v8.4s, v9.4s, #20
235	sli	v8.4s, v9.4s, #12
236
237	add	v0.4s, v0.4s, v20.4s
238	add	v1.4s, v1.4s, v5.4s
239	add	v2.4s, v2.4s, v6.4s
240	add	v3.4s, v3.4s, v7.4s
241	add	v4.4s, v4.4s, v8.4s
242
243	eor	v15.16b, v15.16b, v0.16b
244	eor	v16.16b, v16.16b, v1.16b
245	eor	v17.16b, v17.16b, v2.16b
246	eor	v18.16b, v18.16b, v3.16b
247	eor	v19.16b, v19.16b, v4.16b
248
249	tbl	v15.16b, {v15.16b}, v26.16b
250	tbl	v16.16b, {v16.16b}, v26.16b
251	tbl	v17.16b, {v17.16b}, v26.16b
252	tbl	v18.16b, {v18.16b}, v26.16b
253	tbl	v19.16b, {v19.16b}, v26.16b
254
255	add	v10.4s, v10.4s, v15.4s
256	add	v11.4s, v11.4s, v16.4s
257	add	v12.4s, v12.4s, v17.4s
258	add	v13.4s, v13.4s, v18.4s
259	add	v14.4s, v14.4s, v19.4s
260
261	eor	v20.16b, v20.16b, v10.16b
262	eor	v5.16b, v5.16b, v11.16b
263	eor	v6.16b, v6.16b, v12.16b
264	eor	v7.16b, v7.16b, v13.16b
265	eor	v8.16b, v8.16b, v14.16b
266
267	ushr	v9.4s, v8.4s, #25
268	sli	v9.4s, v8.4s, #7
269	ushr	v8.4s, v7.4s, #25
270	sli	v8.4s, v7.4s, #7
271	ushr	v7.4s, v6.4s, #25
272	sli	v7.4s, v6.4s, #7
273	ushr	v6.4s, v5.4s, #25
274	sli	v6.4s, v5.4s, #7
275	ushr	v5.4s, v20.4s, #25
276	sli	v5.4s, v20.4s, #7
277
278	ext	v9.16b, v9.16b, v9.16b, #4
279	ext	v14.16b, v14.16b, v14.16b, #8
280	ext	v19.16b, v19.16b, v19.16b, #12
281	add	v0.4s, v0.4s, v6.4s
282	add	v1.4s, v1.4s, v7.4s
283	add	v2.4s, v2.4s, v8.4s
284	add	v3.4s, v3.4s, v5.4s
285	add	v4.4s, v4.4s, v9.4s
286
287	eor	v18.16b, v18.16b, v0.16b
288	eor	v15.16b, v15.16b, v1.16b
289	eor	v16.16b, v16.16b, v2.16b
290	eor	v17.16b, v17.16b, v3.16b
291	eor	v19.16b, v19.16b, v4.16b
292
293	rev32	v18.8h, v18.8h
294	rev32	v15.8h, v15.8h
295	rev32	v16.8h, v16.8h
296	rev32	v17.8h, v17.8h
297	rev32	v19.8h, v19.8h
298
299	add	v12.4s, v12.4s, v18.4s
300	add	v13.4s, v13.4s, v15.4s
301	add	v10.4s, v10.4s, v16.4s
302	add	v11.4s, v11.4s, v17.4s
303	add	v14.4s, v14.4s, v19.4s
304
305	eor	v6.16b, v6.16b, v12.16b
306	eor	v7.16b, v7.16b, v13.16b
307	eor	v8.16b, v8.16b, v10.16b
308	eor	v5.16b, v5.16b, v11.16b
309	eor	v9.16b, v9.16b, v14.16b
310
311	ushr	v20.4s, v6.4s, #20
312	sli	v20.4s, v6.4s, #12
313	ushr	v6.4s, v7.4s, #20
314	sli	v6.4s, v7.4s, #12
315	ushr	v7.4s, v8.4s, #20
316	sli	v7.4s, v8.4s, #12
317	ushr	v8.4s, v5.4s, #20
318	sli	v8.4s, v5.4s, #12
319	ushr	v5.4s, v9.4s, #20
320	sli	v5.4s, v9.4s, #12
321
322	add	v0.4s, v0.4s, v20.4s
323	add	v1.4s, v1.4s, v6.4s
324	add	v2.4s, v2.4s, v7.4s
325	add	v3.4s, v3.4s, v8.4s
326	add	v4.4s, v4.4s, v5.4s
327
328	eor	v18.16b, v18.16b, v0.16b
329	eor	v15.16b, v15.16b, v1.16b
330	eor	v16.16b, v16.16b, v2.16b
331	eor	v17.16b, v17.16b, v3.16b
332	eor	v19.16b, v19.16b, v4.16b
333
334	tbl	v18.16b, {v18.16b}, v26.16b
335	tbl	v15.16b, {v15.16b}, v26.16b
336	tbl	v16.16b, {v16.16b}, v26.16b
337	tbl	v17.16b, {v17.16b}, v26.16b
338	tbl	v19.16b, {v19.16b}, v26.16b
339
340	add	v12.4s, v12.4s, v18.4s
341	add	v13.4s, v13.4s, v15.4s
342	add	v10.4s, v10.4s, v16.4s
343	add	v11.4s, v11.4s, v17.4s
344	add	v14.4s, v14.4s, v19.4s
345
346	eor	v20.16b, v20.16b, v12.16b
347	eor	v6.16b, v6.16b, v13.16b
348	eor	v7.16b, v7.16b, v10.16b
349	eor	v8.16b, v8.16b, v11.16b
350	eor	v5.16b, v5.16b, v14.16b
351
352	ushr	v9.4s, v5.4s, #25
353	sli	v9.4s, v5.4s, #7
354	ushr	v5.4s, v8.4s, #25
355	sli	v5.4s, v8.4s, #7
356	ushr	v8.4s, v7.4s, #25
357	sli	v8.4s, v7.4s, #7
358	ushr	v7.4s, v6.4s, #25
359	sli	v7.4s, v6.4s, #7
360	ushr	v6.4s, v20.4s, #25
361	sli	v6.4s, v20.4s, #7
362
363	ext	v9.16b, v9.16b, v9.16b, #12
364	ext	v14.16b, v14.16b, v14.16b, #8
365	ext	v19.16b, v19.16b, v19.16b, #4
366	subs	x6, x6, #1
367	b.hi	.Lseal_init_rounds
368
369	add	v15.4s, v15.4s, v25.4s
370	mov	x11, #4
371	dup	v20.4s, w11
372	add	v25.4s, v25.4s, v20.4s
373
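    // The zip1/zip2 pairs below transpose the lane-sliced state back into four
    // ordinary ChaCha20 blocks, so that (v0,v5,v10,v15), (v1,v6,v11,v16), etc.
    // each hold one block and can be XORed against contiguous 64-byte chunks of input.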
374	zip1	v20.4s, v0.4s, v1.4s
375	zip2	v21.4s, v0.4s, v1.4s
376	zip1	v22.4s, v2.4s, v3.4s
377	zip2	v23.4s, v2.4s, v3.4s
378
379	zip1	v0.2d, v20.2d, v22.2d
380	zip2	v1.2d, v20.2d, v22.2d
381	zip1	v2.2d, v21.2d, v23.2d
382	zip2	v3.2d, v21.2d, v23.2d
383
384	zip1	v20.4s, v5.4s, v6.4s
385	zip2	v21.4s, v5.4s, v6.4s
386	zip1	v22.4s, v7.4s, v8.4s
387	zip2	v23.4s, v7.4s, v8.4s
388
389	zip1	v5.2d, v20.2d, v22.2d
390	zip2	v6.2d, v20.2d, v22.2d
391	zip1	v7.2d, v21.2d, v23.2d
392	zip2	v8.2d, v21.2d, v23.2d
393
394	zip1	v20.4s, v10.4s, v11.4s
395	zip2	v21.4s, v10.4s, v11.4s
396	zip1	v22.4s, v12.4s, v13.4s
397	zip2	v23.4s, v12.4s, v13.4s
398
399	zip1	v10.2d, v20.2d, v22.2d
400	zip2	v11.2d, v20.2d, v22.2d
401	zip1	v12.2d, v21.2d, v23.2d
402	zip2	v13.2d, v21.2d, v23.2d
403
404	zip1	v20.4s, v15.4s, v16.4s
405	zip2	v21.4s, v15.4s, v16.4s
406	zip1	v22.4s, v17.4s, v18.4s
407	zip2	v23.4s, v17.4s, v18.4s
408
409	zip1	v15.2d, v20.2d, v22.2d
410	zip2	v16.2d, v20.2d, v22.2d
411	zip1	v17.2d, v21.2d, v23.2d
412	zip2	v18.2d, v21.2d, v23.2d
413
414	add	v4.4s, v4.4s, v24.4s
415	add	v9.4s, v9.4s, v28.4s
416	and	v4.16b, v4.16b, v27.16b
417
418	add	v0.4s, v0.4s, v24.4s
419	add	v5.4s, v5.4s, v28.4s
420	add	v10.4s, v10.4s, v29.4s
421	add	v15.4s, v15.4s, v30.4s
422
423	add	v1.4s, v1.4s, v24.4s
424	add	v6.4s, v6.4s, v28.4s
425	add	v11.4s, v11.4s, v29.4s
426	add	v16.4s, v16.4s, v30.4s
427
428	add	v2.4s, v2.4s, v24.4s
429	add	v7.4s, v7.4s, v28.4s
430	add	v12.4s, v12.4s, v29.4s
431	add	v17.4s, v17.4s, v30.4s
432
433	add	v3.4s, v3.4s, v24.4s
434	add	v8.4s, v8.4s, v28.4s
435	add	v13.4s, v13.4s, v29.4s
436	add	v18.4s, v18.4s, v30.4s
437
438	mov	x16, v4.d[0] // Move the R key to GPRs
439	mov	x17, v4.d[1]
440	mov	v27.16b, v9.16b // Store the S key
441
442	bl	.Lpoly_hash_ad_internal
443
444	mov	x3, x0
445	cmp	x2, #256
446	b.le	.Lseal_tail
447
448	ld1	{v20.16b - v23.16b}, [x1], #64
449	eor	v20.16b, v20.16b, v0.16b
450	eor	v21.16b, v21.16b, v5.16b
451	eor	v22.16b, v22.16b, v10.16b
452	eor	v23.16b, v23.16b, v15.16b
453	st1	{v20.16b - v23.16b}, [x0], #64
454
455	ld1	{v20.16b - v23.16b}, [x1], #64
456	eor	v20.16b, v20.16b, v1.16b
457	eor	v21.16b, v21.16b, v6.16b
458	eor	v22.16b, v22.16b, v11.16b
459	eor	v23.16b, v23.16b, v16.16b
460	st1	{v20.16b - v23.16b}, [x0], #64
461
462	ld1	{v20.16b - v23.16b}, [x1], #64
463	eor	v20.16b, v20.16b, v2.16b
464	eor	v21.16b, v21.16b, v7.16b
465	eor	v22.16b, v22.16b, v12.16b
466	eor	v23.16b, v23.16b, v17.16b
467	st1	{v20.16b - v23.16b}, [x0], #64
468
469	ld1	{v20.16b - v23.16b}, [x1], #64
470	eor	v20.16b, v20.16b, v3.16b
471	eor	v21.16b, v21.16b, v8.16b
472	eor	v22.16b, v22.16b, v13.16b
473	eor	v23.16b, v23.16b, v18.16b
474	st1	{v20.16b - v23.16b}, [x0], #64
475
476	sub	x2, x2, #256
477
478	mov	x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds
479	mov	x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256
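    // Inside .Lseal_main_loop the Poly1305 absorption of already-written
    // ciphertext (the ldp loads from x3) is interleaved with the vector rounds
    // for the next 320-byte chunk, letting the scalar multiplies overlap with the SIMD work.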
480
481.Lseal_main_loop:
482	adrp	x11, .Lchacha20_consts
483	add	x11, x11, :lo12:.Lchacha20_consts
484
485	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
486	mov	v4.16b, v24.16b
487
488	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
489	mov	v9.16b, v28.16b
490
491	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
492	mov	v14.16b, v29.16b
493
494	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
495	add	v15.4s, v15.4s, v25.4s
496	mov	v19.16b, v30.16b
497
498	eor	v20.16b, v20.16b, v20.16b //zero
499	not	v21.16b, v20.16b // -1
500	sub	v21.4s, v25.4s, v21.4s // Add +1
501	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
502	add	v19.4s, v19.4s, v20.4s
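	// Net effect: lane 0 of v19 is increased by v25.s[3] + 1, giving the
	// horizontally computed fifth block the counter just past the four vertical blocks.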
503
504	sub	x5, x5, #32
505.align	5
506.Lseal_main_loop_rounds:
507	add	v0.4s, v0.4s, v5.4s
508	add	v1.4s, v1.4s, v6.4s
509	add	v2.4s, v2.4s, v7.4s
510	add	v3.4s, v3.4s, v8.4s
511	add	v4.4s, v4.4s, v9.4s
512
513	eor	v15.16b, v15.16b, v0.16b
514	eor	v16.16b, v16.16b, v1.16b
515	eor	v17.16b, v17.16b, v2.16b
516	eor	v18.16b, v18.16b, v3.16b
517	eor	v19.16b, v19.16b, v4.16b
518
519	rev32	v15.8h, v15.8h
520	rev32	v16.8h, v16.8h
521	rev32	v17.8h, v17.8h
522	rev32	v18.8h, v18.8h
523	rev32	v19.8h, v19.8h
524
525	add	v10.4s, v10.4s, v15.4s
526	add	v11.4s, v11.4s, v16.4s
527	add	v12.4s, v12.4s, v17.4s
528	add	v13.4s, v13.4s, v18.4s
529	add	v14.4s, v14.4s, v19.4s
530
531	eor	v5.16b, v5.16b, v10.16b
532	eor	v6.16b, v6.16b, v11.16b
533	eor	v7.16b, v7.16b, v12.16b
534	eor	v8.16b, v8.16b, v13.16b
535	eor	v9.16b, v9.16b, v14.16b
536
537	ushr	v20.4s, v5.4s, #20
538	sli	v20.4s, v5.4s, #12
539	ushr	v5.4s, v6.4s, #20
540	sli	v5.4s, v6.4s, #12
541	ushr	v6.4s, v7.4s, #20
542	sli	v6.4s, v7.4s, #12
543	ushr	v7.4s, v8.4s, #20
544	sli	v7.4s, v8.4s, #12
545	ushr	v8.4s, v9.4s, #20
546	sli	v8.4s, v9.4s, #12
547
548	add	v0.4s, v0.4s, v20.4s
549	add	v1.4s, v1.4s, v5.4s
550	add	v2.4s, v2.4s, v6.4s
551	add	v3.4s, v3.4s, v7.4s
552	add	v4.4s, v4.4s, v8.4s
553
554	eor	v15.16b, v15.16b, v0.16b
555	eor	v16.16b, v16.16b, v1.16b
556	eor	v17.16b, v17.16b, v2.16b
557	eor	v18.16b, v18.16b, v3.16b
558	eor	v19.16b, v19.16b, v4.16b
559
560	tbl	v15.16b, {v15.16b}, v26.16b
561	tbl	v16.16b, {v16.16b}, v26.16b
562	tbl	v17.16b, {v17.16b}, v26.16b
563	tbl	v18.16b, {v18.16b}, v26.16b
564	tbl	v19.16b, {v19.16b}, v26.16b
565
566	add	v10.4s, v10.4s, v15.4s
567	add	v11.4s, v11.4s, v16.4s
568	add	v12.4s, v12.4s, v17.4s
569	add	v13.4s, v13.4s, v18.4s
570	add	v14.4s, v14.4s, v19.4s
571
572	eor	v20.16b, v20.16b, v10.16b
573	eor	v5.16b, v5.16b, v11.16b
574	eor	v6.16b, v6.16b, v12.16b
575	eor	v7.16b, v7.16b, v13.16b
576	eor	v8.16b, v8.16b, v14.16b
577
578	ushr	v9.4s, v8.4s, #25
579	sli	v9.4s, v8.4s, #7
580	ushr	v8.4s, v7.4s, #25
581	sli	v8.4s, v7.4s, #7
582	ushr	v7.4s, v6.4s, #25
583	sli	v7.4s, v6.4s, #7
584	ushr	v6.4s, v5.4s, #25
585	sli	v6.4s, v5.4s, #7
586	ushr	v5.4s, v20.4s, #25
587	sli	v5.4s, v20.4s, #7
588
589	ext	v9.16b, v9.16b, v9.16b, #4
590	ext	v14.16b, v14.16b, v14.16b, #8
591	ext	v19.16b, v19.16b, v19.16b, #12
592	ldp	x11, x12, [x3], 16
593	adds	x8, x8, x11
594	adcs	x9, x9, x12
595	adc	x10, x10, x15
596	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
597	umulh	x12, x8, x16
598	mul	x13, x9, x16
599	umulh	x14, x9, x16
600	adds	x12, x12, x13
601	mul	x13, x10, x16
602	adc	x13, x13, x14
603	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
604	umulh	x8, x8, x17
605	adds	x12, x12, x14
606	mul	x14, x9, x17
607	umulh	x9, x9, x17
608	adcs	x14, x14, x8
609	mul	x10, x10, x17
610	adc	x10, x10, x9
611	adds	x13, x13, x14
612	adc	x14, x10, xzr
613	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
614	and	x8, x13, #-4
615	extr	x13, x14, x13, #2
616	adds	x8, x8, x11
617	lsr	x11, x14, #2
618	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
619	adds	x8, x8, x13
620	adcs	x9, x9, x12
621	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
622	add	v0.4s, v0.4s, v6.4s
623	add	v1.4s, v1.4s, v7.4s
624	add	v2.4s, v2.4s, v8.4s
625	add	v3.4s, v3.4s, v5.4s
626	add	v4.4s, v4.4s, v9.4s
627
628	eor	v18.16b, v18.16b, v0.16b
629	eor	v15.16b, v15.16b, v1.16b
630	eor	v16.16b, v16.16b, v2.16b
631	eor	v17.16b, v17.16b, v3.16b
632	eor	v19.16b, v19.16b, v4.16b
633
634	rev32	v18.8h, v18.8h
635	rev32	v15.8h, v15.8h
636	rev32	v16.8h, v16.8h
637	rev32	v17.8h, v17.8h
638	rev32	v19.8h, v19.8h
639
640	add	v12.4s, v12.4s, v18.4s
641	add	v13.4s, v13.4s, v15.4s
642	add	v10.4s, v10.4s, v16.4s
643	add	v11.4s, v11.4s, v17.4s
644	add	v14.4s, v14.4s, v19.4s
645
646	eor	v6.16b, v6.16b, v12.16b
647	eor	v7.16b, v7.16b, v13.16b
648	eor	v8.16b, v8.16b, v10.16b
649	eor	v5.16b, v5.16b, v11.16b
650	eor	v9.16b, v9.16b, v14.16b
651
652	ushr	v20.4s, v6.4s, #20
653	sli	v20.4s, v6.4s, #12
654	ushr	v6.4s, v7.4s, #20
655	sli	v6.4s, v7.4s, #12
656	ushr	v7.4s, v8.4s, #20
657	sli	v7.4s, v8.4s, #12
658	ushr	v8.4s, v5.4s, #20
659	sli	v8.4s, v5.4s, #12
660	ushr	v5.4s, v9.4s, #20
661	sli	v5.4s, v9.4s, #12
662
663	add	v0.4s, v0.4s, v20.4s
664	add	v1.4s, v1.4s, v6.4s
665	add	v2.4s, v2.4s, v7.4s
666	add	v3.4s, v3.4s, v8.4s
667	add	v4.4s, v4.4s, v5.4s
668
669	eor	v18.16b, v18.16b, v0.16b
670	eor	v15.16b, v15.16b, v1.16b
671	eor	v16.16b, v16.16b, v2.16b
672	eor	v17.16b, v17.16b, v3.16b
673	eor	v19.16b, v19.16b, v4.16b
674
675	tbl	v18.16b, {v18.16b}, v26.16b
676	tbl	v15.16b, {v15.16b}, v26.16b
677	tbl	v16.16b, {v16.16b}, v26.16b
678	tbl	v17.16b, {v17.16b}, v26.16b
679	tbl	v19.16b, {v19.16b}, v26.16b
680
681	add	v12.4s, v12.4s, v18.4s
682	add	v13.4s, v13.4s, v15.4s
683	add	v10.4s, v10.4s, v16.4s
684	add	v11.4s, v11.4s, v17.4s
685	add	v14.4s, v14.4s, v19.4s
686
687	eor	v20.16b, v20.16b, v12.16b
688	eor	v6.16b, v6.16b, v13.16b
689	eor	v7.16b, v7.16b, v10.16b
690	eor	v8.16b, v8.16b, v11.16b
691	eor	v5.16b, v5.16b, v14.16b
692
693	ushr	v9.4s, v5.4s, #25
694	sli	v9.4s, v5.4s, #7
695	ushr	v5.4s, v8.4s, #25
696	sli	v5.4s, v8.4s, #7
697	ushr	v8.4s, v7.4s, #25
698	sli	v8.4s, v7.4s, #7
699	ushr	v7.4s, v6.4s, #25
700	sli	v7.4s, v6.4s, #7
701	ushr	v6.4s, v20.4s, #25
702	sli	v6.4s, v20.4s, #7
703
704	ext	v9.16b, v9.16b, v9.16b, #12
705	ext	v14.16b, v14.16b, v14.16b, #8
706	ext	v19.16b, v19.16b, v19.16b, #4
707	subs	x6, x6, #1
708	b.ge	.Lseal_main_loop_rounds
709	ldp	x11, x12, [x3], 16
710	adds	x8, x8, x11
711	adcs	x9, x9, x12
712	adc	x10, x10, x15
713	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
714	umulh	x12, x8, x16
715	mul	x13, x9, x16
716	umulh	x14, x9, x16
717	adds	x12, x12, x13
718	mul	x13, x10, x16
719	adc	x13, x13, x14
720	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
721	umulh	x8, x8, x17
722	adds	x12, x12, x14
723	mul	x14, x9, x17
724	umulh	x9, x9, x17
725	adcs	x14, x14, x8
726	mul	x10, x10, x17
727	adc	x10, x10, x9
728	adds	x13, x13, x14
729	adc	x14, x10, xzr
730	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
731	and	x8, x13, #-4
732	extr	x13, x14, x13, #2
733	adds	x8, x8, x11
734	lsr	x11, x14, #2
735	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
736	adds	x8, x8, x13
737	adcs	x9, x9, x12
738	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
739	subs	x7, x7, #1
740	b.gt	.Lseal_main_loop_rounds
741
742	eor	v20.16b, v20.16b, v20.16b //zero
743	not	v21.16b, v20.16b // -1
744	sub	v21.4s, v25.4s, v21.4s // Add +1
745	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
746	add	v19.4s, v19.4s, v20.4s
747
748	add	v15.4s, v15.4s, v25.4s
749	mov	x11, #5
750	dup	v20.4s, w11
751	add	v25.4s, v25.4s, v20.4s
752
753	zip1	v20.4s, v0.4s, v1.4s
754	zip2	v21.4s, v0.4s, v1.4s
755	zip1	v22.4s, v2.4s, v3.4s
756	zip2	v23.4s, v2.4s, v3.4s
757
758	zip1	v0.2d, v20.2d, v22.2d
759	zip2	v1.2d, v20.2d, v22.2d
760	zip1	v2.2d, v21.2d, v23.2d
761	zip2	v3.2d, v21.2d, v23.2d
762
763	zip1	v20.4s, v5.4s, v6.4s
764	zip2	v21.4s, v5.4s, v6.4s
765	zip1	v22.4s, v7.4s, v8.4s
766	zip2	v23.4s, v7.4s, v8.4s
767
768	zip1	v5.2d, v20.2d, v22.2d
769	zip2	v6.2d, v20.2d, v22.2d
770	zip1	v7.2d, v21.2d, v23.2d
771	zip2	v8.2d, v21.2d, v23.2d
772
773	zip1	v20.4s, v10.4s, v11.4s
774	zip2	v21.4s, v10.4s, v11.4s
775	zip1	v22.4s, v12.4s, v13.4s
776	zip2	v23.4s, v12.4s, v13.4s
777
778	zip1	v10.2d, v20.2d, v22.2d
779	zip2	v11.2d, v20.2d, v22.2d
780	zip1	v12.2d, v21.2d, v23.2d
781	zip2	v13.2d, v21.2d, v23.2d
782
783	zip1	v20.4s, v15.4s, v16.4s
784	zip2	v21.4s, v15.4s, v16.4s
785	zip1	v22.4s, v17.4s, v18.4s
786	zip2	v23.4s, v17.4s, v18.4s
787
788	zip1	v15.2d, v20.2d, v22.2d
789	zip2	v16.2d, v20.2d, v22.2d
790	zip1	v17.2d, v21.2d, v23.2d
791	zip2	v18.2d, v21.2d, v23.2d
792
793	add	v0.4s, v0.4s, v24.4s
794	add	v5.4s, v5.4s, v28.4s
795	add	v10.4s, v10.4s, v29.4s
796	add	v15.4s, v15.4s, v30.4s
797
798	add	v1.4s, v1.4s, v24.4s
799	add	v6.4s, v6.4s, v28.4s
800	add	v11.4s, v11.4s, v29.4s
801	add	v16.4s, v16.4s, v30.4s
802
803	add	v2.4s, v2.4s, v24.4s
804	add	v7.4s, v7.4s, v28.4s
805	add	v12.4s, v12.4s, v29.4s
806	add	v17.4s, v17.4s, v30.4s
807
808	add	v3.4s, v3.4s, v24.4s
809	add	v8.4s, v8.4s, v28.4s
810	add	v13.4s, v13.4s, v29.4s
811	add	v18.4s, v18.4s, v30.4s
812
813	add	v4.4s, v4.4s, v24.4s
814	add	v9.4s, v9.4s, v28.4s
815	add	v14.4s, v14.4s, v29.4s
816	add	v19.4s, v19.4s, v30.4s
817
818	cmp	x2, #320
819	b.le	.Lseal_tail
820
821	ld1	{v20.16b - v23.16b}, [x1], #64
822	eor	v20.16b, v20.16b, v0.16b
823	eor	v21.16b, v21.16b, v5.16b
824	eor	v22.16b, v22.16b, v10.16b
825	eor	v23.16b, v23.16b, v15.16b
826	st1	{v20.16b - v23.16b}, [x0], #64
827
828	ld1	{v20.16b - v23.16b}, [x1], #64
829	eor	v20.16b, v20.16b, v1.16b
830	eor	v21.16b, v21.16b, v6.16b
831	eor	v22.16b, v22.16b, v11.16b
832	eor	v23.16b, v23.16b, v16.16b
833	st1	{v20.16b - v23.16b}, [x0], #64
834
835	ld1	{v20.16b - v23.16b}, [x1], #64
836	eor	v20.16b, v20.16b, v2.16b
837	eor	v21.16b, v21.16b, v7.16b
838	eor	v22.16b, v22.16b, v12.16b
839	eor	v23.16b, v23.16b, v17.16b
840	st1	{v20.16b - v23.16b}, [x0], #64
841
842	ld1	{v20.16b - v23.16b}, [x1], #64
843	eor	v20.16b, v20.16b, v3.16b
844	eor	v21.16b, v21.16b, v8.16b
845	eor	v22.16b, v22.16b, v13.16b
846	eor	v23.16b, v23.16b, v18.16b
847	st1	{v20.16b - v23.16b}, [x0], #64
848
849	ld1	{v20.16b - v23.16b}, [x1], #64
850	eor	v20.16b, v20.16b, v4.16b
851	eor	v21.16b, v21.16b, v9.16b
852	eor	v22.16b, v22.16b, v14.16b
853	eor	v23.16b, v23.16b, v19.16b
854	st1	{v20.16b - v23.16b}, [x0], #64
855
856	sub	x2, x2, #320
857
858	mov	x6, #0
859	mov	x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration
860
861	b	.Lseal_main_loop
862
863.Lseal_tail:
864    // This part of the function handles the storage and authentication of the last [0,320) bytes
865    // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data.
866	cmp	x2, #64
867	b.lt	.Lseal_tail_64
868
869    // Store and authenticate 64B blocks per iteration
870	ld1	{v20.16b - v23.16b}, [x1], #64
871
872	eor	v20.16b, v20.16b, v0.16b
873	eor	v21.16b, v21.16b, v5.16b
874	eor	v22.16b, v22.16b, v10.16b
875	eor	v23.16b, v23.16b, v15.16b
876	mov	x11, v20.d[0]
877	mov	x12, v20.d[1]
878	adds	x8, x8, x11
879	adcs	x9, x9, x12
880	adc	x10, x10, x15
881	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
882	umulh	x12, x8, x16
883	mul	x13, x9, x16
884	umulh	x14, x9, x16
885	adds	x12, x12, x13
886	mul	x13, x10, x16
887	adc	x13, x13, x14
888	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
889	umulh	x8, x8, x17
890	adds	x12, x12, x14
891	mul	x14, x9, x17
892	umulh	x9, x9, x17
893	adcs	x14, x14, x8
894	mul	x10, x10, x17
895	adc	x10, x10, x9
896	adds	x13, x13, x14
897	adc	x14, x10, xzr
898	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
899	and	x8, x13, #-4
900	extr	x13, x14, x13, #2
901	adds	x8, x8, x11
902	lsr	x11, x14, #2
903	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
904	adds	x8, x8, x13
905	adcs	x9, x9, x12
906	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
907	mov	x11, v21.d[0]
908	mov	x12, v21.d[1]
909	adds	x8, x8, x11
910	adcs	x9, x9, x12
911	adc	x10, x10, x15
912	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
913	umulh	x12, x8, x16
914	mul	x13, x9, x16
915	umulh	x14, x9, x16
916	adds	x12, x12, x13
917	mul	x13, x10, x16
918	adc	x13, x13, x14
919	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
920	umulh	x8, x8, x17
921	adds	x12, x12, x14
922	mul	x14, x9, x17
923	umulh	x9, x9, x17
924	adcs	x14, x14, x8
925	mul	x10, x10, x17
926	adc	x10, x10, x9
927	adds	x13, x13, x14
928	adc	x14, x10, xzr
929	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
930	and	x8, x13, #-4
931	extr	x13, x14, x13, #2
932	adds	x8, x8, x11
933	lsr	x11, x14, #2
934	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
935	adds	x8, x8, x13
936	adcs	x9, x9, x12
937	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
938	mov	x11, v22.d[0]
939	mov	x12, v22.d[1]
940	adds	x8, x8, x11
941	adcs	x9, x9, x12
942	adc	x10, x10, x15
943	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
944	umulh	x12, x8, x16
945	mul	x13, x9, x16
946	umulh	x14, x9, x16
947	adds	x12, x12, x13
948	mul	x13, x10, x16
949	adc	x13, x13, x14
950	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
951	umulh	x8, x8, x17
952	adds	x12, x12, x14
953	mul	x14, x9, x17
954	umulh	x9, x9, x17
955	adcs	x14, x14, x8
956	mul	x10, x10, x17
957	adc	x10, x10, x9
958	adds	x13, x13, x14
959	adc	x14, x10, xzr
960	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
961	and	x8, x13, #-4
962	extr	x13, x14, x13, #2
963	adds	x8, x8, x11
964	lsr	x11, x14, #2
965	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
966	adds	x8, x8, x13
967	adcs	x9, x9, x12
968	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
969	mov	x11, v23.d[0]
970	mov	x12, v23.d[1]
971	adds	x8, x8, x11
972	adcs	x9, x9, x12
973	adc	x10, x10, x15
974	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
975	umulh	x12, x8, x16
976	mul	x13, x9, x16
977	umulh	x14, x9, x16
978	adds	x12, x12, x13
979	mul	x13, x10, x16
980	adc	x13, x13, x14
981	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
982	umulh	x8, x8, x17
983	adds	x12, x12, x14
984	mul	x14, x9, x17
985	umulh	x9, x9, x17
986	adcs	x14, x14, x8
987	mul	x10, x10, x17
988	adc	x10, x10, x9
989	adds	x13, x13, x14
990	adc	x14, x10, xzr
991	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
992	and	x8, x13, #-4
993	extr	x13, x14, x13, #2
994	adds	x8, x8, x11
995	lsr	x11, x14, #2
996	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
997	adds	x8, x8, x13
998	adcs	x9, x9, x12
999	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1000	st1	{v20.16b - v23.16b}, [x0], #64
1001	sub	x2, x2, #64
1002
1003    // Shift the state left by 64 bytes for the next iteration of the loop
1004	mov	v0.16b, v1.16b
1005	mov	v5.16b, v6.16b
1006	mov	v10.16b, v11.16b
1007	mov	v15.16b, v16.16b
1008
1009	mov	v1.16b, v2.16b
1010	mov	v6.16b, v7.16b
1011	mov	v11.16b, v12.16b
1012	mov	v16.16b, v17.16b
1013
1014	mov	v2.16b, v3.16b
1015	mov	v7.16b, v8.16b
1016	mov	v12.16b, v13.16b
1017	mov	v17.16b, v18.16b
1018
1019	mov	v3.16b, v4.16b
1020	mov	v8.16b, v9.16b
1021	mov	v13.16b, v14.16b
1022	mov	v18.16b, v19.16b
1023
1024	b	.Lseal_tail
1025
1026.Lseal_tail_64:
1027	ldp	x3, x4, [x5, #48] // extra_in ptr (x3) and extra_in_len (x4)
1028
1029    // Here we handle the last [0,64) bytes of plaintext
1030	cmp	x2, #16
1031	b.lt	.Lseal_tail_16
1032    // Each iteration encrypts and authenticates a 16B block
1033	ld1	{v20.16b}, [x1], #16
1034	eor	v20.16b, v20.16b, v0.16b
1035	mov	x11, v20.d[0]
1036	mov	x12, v20.d[1]
1037	adds	x8, x8, x11
1038	adcs	x9, x9, x12
1039	adc	x10, x10, x15
1040	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1041	umulh	x12, x8, x16
1042	mul	x13, x9, x16
1043	umulh	x14, x9, x16
1044	adds	x12, x12, x13
1045	mul	x13, x10, x16
1046	adc	x13, x13, x14
1047	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1048	umulh	x8, x8, x17
1049	adds	x12, x12, x14
1050	mul	x14, x9, x17
1051	umulh	x9, x9, x17
1052	adcs	x14, x14, x8
1053	mul	x10, x10, x17
1054	adc	x10, x10, x9
1055	adds	x13, x13, x14
1056	adc	x14, x10, xzr
1057	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1058	and	x8, x13, #-4
1059	extr	x13, x14, x13, #2
1060	adds	x8, x8, x11
1061	lsr	x11, x14, #2
1062	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1063	adds	x8, x8, x13
1064	adcs	x9, x9, x12
1065	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1066	st1	{v20.16b}, [x0], #16
1067
1068	sub	x2, x2, #16
1069
1070    // Shift the state left by 16 bytes for the next iteration of the loop
1071	mov	v0.16b, v5.16b
1072	mov	v5.16b, v10.16b
1073	mov	v10.16b, v15.16b
1074
1075	b	.Lseal_tail_64
1076
1077.Lseal_tail_16:
1078    // Here we handle the last [0,16) bytes of ciphertext that require a padded block
1079	cbz	x2, .Lseal_hash_extra
1080
1081	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in
1082	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes
1083	not	v22.16b, v20.16b
1084
1085	mov	x6, x2
1086	add	x1, x1, x2
1087
1088	cbz	x4, .Lseal_tail_16_compose // No extra data to pad with, zero padding
1089
1090	mov	x7, #16          // We need to load some extra_in first for padding
1091	sub	x7, x7, x2
1092	cmp	x4, x7
1093	csel	x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register
1094	mov	x12, x7
1095	add	x3, x3, x7
1096	sub	x4, x4, x7
1097
1098.Lseal_tail16_compose_extra_in:
1099	ext	v20.16b, v20.16b, v20.16b, #15
1100	ldrb	w11, [x3, #-1]!
1101	mov	v20.b[0], w11
1102	subs	x7, x7, #1
1103	b.gt	.Lseal_tail16_compose_extra_in
1104
1105	add	x3, x3, x12
1106
1107.Lseal_tail_16_compose:
1108	ext	v20.16b, v20.16b, v20.16b, #15
1109	ldrb	w11, [x1, #-1]!
1110	mov	v20.b[0], w11
1111	ext	v21.16b, v22.16b, v21.16b, #15
1112	subs	x2, x2, #1
1113	b.gt	.Lseal_tail_16_compose
1114
1115	and	v0.16b, v0.16b, v21.16b
1116	eor	v20.16b, v20.16b, v0.16b
1117	mov	v21.16b, v20.16b
1118
1119.Lseal_tail_16_store:
1120	umov	w11, v20.b[0]
1121	strb	w11, [x0], #1
1122	ext	v20.16b, v20.16b, v20.16b, #1
1123	subs	x6, x6, #1
1124	b.gt	.Lseal_tail_16_store
1125
1126    // Hash in the final ct block concatenated with extra_in
1127	mov	x11, v21.d[0]
1128	mov	x12, v21.d[1]
1129	adds	x8, x8, x11
1130	adcs	x9, x9, x12
1131	adc	x10, x10, x15
1132	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1133	umulh	x12, x8, x16
1134	mul	x13, x9, x16
1135	umulh	x14, x9, x16
1136	adds	x12, x12, x13
1137	mul	x13, x10, x16
1138	adc	x13, x13, x14
1139	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1140	umulh	x8, x8, x17
1141	adds	x12, x12, x14
1142	mul	x14, x9, x17
1143	umulh	x9, x9, x17
1144	adcs	x14, x14, x8
1145	mul	x10, x10, x17
1146	adc	x10, x10, x9
1147	adds	x13, x13, x14
1148	adc	x14, x10, xzr
1149	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1150	and	x8, x13, #-4
1151	extr	x13, x14, x13, #2
1152	adds	x8, x8, x11
1153	lsr	x11, x14, #2
1154	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1155	adds	x8, x8, x13
1156	adcs	x9, x9, x12
1157	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1158
1159.Lseal_hash_extra:
1160	cbz	x4, .Lseal_finalize
1161
1162.Lseal_hash_extra_loop:
1163	cmp	x4, #16
1164	b.lt	.Lseal_hash_extra_tail
1165	ld1	{v20.16b}, [x3], #16
1166	mov	x11, v20.d[0]
1167	mov	x12, v20.d[1]
1168	adds	x8, x8, x11
1169	adcs	x9, x9, x12
1170	adc	x10, x10, x15
1171	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1172	umulh	x12, x8, x16
1173	mul	x13, x9, x16
1174	umulh	x14, x9, x16
1175	adds	x12, x12, x13
1176	mul	x13, x10, x16
1177	adc	x13, x13, x14
1178	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1179	umulh	x8, x8, x17
1180	adds	x12, x12, x14
1181	mul	x14, x9, x17
1182	umulh	x9, x9, x17
1183	adcs	x14, x14, x8
1184	mul	x10, x10, x17
1185	adc	x10, x10, x9
1186	adds	x13, x13, x14
1187	adc	x14, x10, xzr
1188	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1189	and	x8, x13, #-4
1190	extr	x13, x14, x13, #2
1191	adds	x8, x8, x11
1192	lsr	x11, x14, #2
1193	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1194	adds	x8, x8, x13
1195	adcs	x9, x9, x12
1196	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1197	sub	x4, x4, #16
1198	b	.Lseal_hash_extra_loop
1199
1200.Lseal_hash_extra_tail:
1201	cbz	x4, .Lseal_finalize
1202	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext
1203	add	x3, x3, x4
1204
1205.Lseal_hash_extra_load:
1206	ext	v20.16b, v20.16b, v20.16b, #15
1207	ldrb	w11, [x3, #-1]!
1208	mov	v20.b[0], w11
1209	subs	x4, x4, #1
1210	b.gt	.Lseal_hash_extra_load
1211
1212    // Hash in the final padded extra_in block
1213	mov	x11, v20.d[0]
1214	mov	x12, v20.d[1]
1215	adds	x8, x8, x11
1216	adcs	x9, x9, x12
1217	adc	x10, x10, x15
1218	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1219	umulh	x12, x8, x16
1220	mul	x13, x9, x16
1221	umulh	x14, x9, x16
1222	adds	x12, x12, x13
1223	mul	x13, x10, x16
1224	adc	x13, x13, x14
1225	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1226	umulh	x8, x8, x17
1227	adds	x12, x12, x14
1228	mul	x14, x9, x17
1229	umulh	x9, x9, x17
1230	adcs	x14, x14, x8
1231	mul	x10, x10, x17
1232	adc	x10, x10, x9
1233	adds	x13, x13, x14
1234	adc	x14, x10, xzr
1235	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1236	and	x8, x13, #-4
1237	extr	x13, x14, x13, #2
1238	adds	x8, x8, x11
1239	lsr	x11, x14, #2
1240	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1241	adds	x8, x8, x13
1242	adcs	x9, x9, x12
1243	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1244
1245.Lseal_finalize:
1246	mov	x11, v31.d[0]
1247	mov	x12, v31.d[1]
1248	adds	x8, x8, x11
1249	adcs	x9, x9, x12
1250	adc	x10, x10, x15
1251	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1252	umulh	x12, x8, x16
1253	mul	x13, x9, x16
1254	umulh	x14, x9, x16
1255	adds	x12, x12, x13
1256	mul	x13, x10, x16
1257	adc	x13, x13, x14
1258	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1259	umulh	x8, x8, x17
1260	adds	x12, x12, x14
1261	mul	x14, x9, x17
1262	umulh	x9, x9, x17
1263	adcs	x14, x14, x8
1264	mul	x10, x10, x17
1265	adc	x10, x10, x9
1266	adds	x13, x13, x14
1267	adc	x14, x10, xzr
1268	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1269	and	x8, x13, #-4
1270	extr	x13, x14, x13, #2
1271	adds	x8, x8, x11
1272	lsr	x11, x14, #2
1273	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1274	adds	x8, x8, x13
1275	adcs	x9, x9, x12
1276	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1277    # Final reduction step
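	// The three-limb subtraction below conditionally subtracts p = 2^130 - 5
	// (the csel ... cs instructions keep the subtracted value when acc >= p),
	// then adds the saved Poly1305 "s" key from v27; the low 128 bits stored
	// at [x5] are the tag.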
1278	sub	x12, xzr, x15
1279	orr	x13, xzr, #3
1280	subs	x11, x8, #-5
1281	sbcs	x12, x9, x12
1282	sbcs	x13, x10, x13
1283	csel	x8, x11, x8, cs
1284	csel	x9, x12, x9, cs
1285	csel	x10, x13, x10, cs
1286	mov	x11, v27.d[0]
1287	mov	x12, v27.d[1]
1288	adds	x8, x8, x11
1289	adcs	x9, x9, x12
1290	adc	x10, x10, x15
1291
1292	stp	x8, x9, [x5]
1293
1294	ldp	d8, d9, [sp, #16]
1295	ldp	d10, d11, [sp, #32]
1296	ldp	d12, d13, [sp, #48]
1297	ldp	d14, d15, [sp, #64]
1298.cfi_restore	b15
1299.cfi_restore	b14
1300.cfi_restore	b13
1301.cfi_restore	b12
1302.cfi_restore	b11
1303.cfi_restore	b10
1304.cfi_restore	b9
1305.cfi_restore	b8
1306	ldp	x29, x30, [sp], 80
1307.cfi_restore	w29
1308.cfi_restore	w30
1309.cfi_def_cfa_offset	0
1310	AARCH64_VALIDATE_LINK_REGISTER
1311	ret
1312
1313.Lseal_128:
1314    // On some architectures preparing 5 blocks for small buffers is wasteful
1315	eor	v25.16b, v25.16b, v25.16b
1316	mov	x11, #1
1317	mov	v25.s[0], w11
1318	mov	v0.16b, v24.16b
1319	mov	v1.16b, v24.16b
1320	mov	v2.16b, v24.16b
1321	mov	v5.16b, v28.16b
1322	mov	v6.16b, v28.16b
1323	mov	v7.16b, v28.16b
1324	mov	v10.16b, v29.16b
1325	mov	v11.16b, v29.16b
1326	mov	v12.16b, v29.16b
1327	mov	v17.16b, v30.16b
1328	add	v15.4s, v17.4s, v25.4s
1329	add	v16.4s, v15.4s, v25.4s
1330
1331	mov	x6, #10
1332
1333.Lseal_128_rounds:
1334	add	v0.4s, v0.4s, v5.4s
1335	add	v1.4s, v1.4s, v6.4s
1336	add	v2.4s, v2.4s, v7.4s
1337	eor	v15.16b, v15.16b, v0.16b
1338	eor	v16.16b, v16.16b, v1.16b
1339	eor	v17.16b, v17.16b, v2.16b
1340	rev32	v15.8h, v15.8h
1341	rev32	v16.8h, v16.8h
1342	rev32	v17.8h, v17.8h
1343
1344	add	v10.4s, v10.4s, v15.4s
1345	add	v11.4s, v11.4s, v16.4s
1346	add	v12.4s, v12.4s, v17.4s
1347	eor	v5.16b, v5.16b, v10.16b
1348	eor	v6.16b, v6.16b, v11.16b
1349	eor	v7.16b, v7.16b, v12.16b
1350	ushr	v20.4s, v5.4s, #20
1351	sli	v20.4s, v5.4s, #12
1352	ushr	v5.4s, v6.4s, #20
1353	sli	v5.4s, v6.4s, #12
1354	ushr	v6.4s, v7.4s, #20
1355	sli	v6.4s, v7.4s, #12
1356
1357	add	v0.4s, v0.4s, v20.4s
1358	add	v1.4s, v1.4s, v5.4s
1359	add	v2.4s, v2.4s, v6.4s
1360	eor	v15.16b, v15.16b, v0.16b
1361	eor	v16.16b, v16.16b, v1.16b
1362	eor	v17.16b, v17.16b, v2.16b
1363	tbl	v15.16b, {v15.16b}, v26.16b
1364	tbl	v16.16b, {v16.16b}, v26.16b
1365	tbl	v17.16b, {v17.16b}, v26.16b
1366
1367	add	v10.4s, v10.4s, v15.4s
1368	add	v11.4s, v11.4s, v16.4s
1369	add	v12.4s, v12.4s, v17.4s
1370	eor	v20.16b, v20.16b, v10.16b
1371	eor	v5.16b, v5.16b, v11.16b
1372	eor	v6.16b, v6.16b, v12.16b
1373	ushr	v7.4s, v6.4s, #25
1374	sli	v7.4s, v6.4s, #7
1375	ushr	v6.4s, v5.4s, #25
1376	sli	v6.4s, v5.4s, #7
1377	ushr	v5.4s, v20.4s, #25
1378	sli	v5.4s, v20.4s, #7
1379
1380	ext	v5.16b, v5.16b, v5.16b, #4
1381	ext	v6.16b, v6.16b, v6.16b, #4
1382	ext	v7.16b, v7.16b, v7.16b, #4
1383
1384	ext	v10.16b, v10.16b, v10.16b, #8
1385	ext	v11.16b, v11.16b, v11.16b, #8
1386	ext	v12.16b, v12.16b, v12.16b, #8
1387
1388	ext	v15.16b, v15.16b, v15.16b, #12
1389	ext	v16.16b, v16.16b, v16.16b, #12
1390	ext	v17.16b, v17.16b, v17.16b, #12
1391	add	v0.4s, v0.4s, v5.4s
1392	add	v1.4s, v1.4s, v6.4s
1393	add	v2.4s, v2.4s, v7.4s
1394	eor	v15.16b, v15.16b, v0.16b
1395	eor	v16.16b, v16.16b, v1.16b
1396	eor	v17.16b, v17.16b, v2.16b
1397	rev32	v15.8h, v15.8h
1398	rev32	v16.8h, v16.8h
1399	rev32	v17.8h, v17.8h
1400
1401	add	v10.4s, v10.4s, v15.4s
1402	add	v11.4s, v11.4s, v16.4s
1403	add	v12.4s, v12.4s, v17.4s
1404	eor	v5.16b, v5.16b, v10.16b
1405	eor	v6.16b, v6.16b, v11.16b
1406	eor	v7.16b, v7.16b, v12.16b
1407	ushr	v20.4s, v5.4s, #20
1408	sli	v20.4s, v5.4s, #12
1409	ushr	v5.4s, v6.4s, #20
1410	sli	v5.4s, v6.4s, #12
1411	ushr	v6.4s, v7.4s, #20
1412	sli	v6.4s, v7.4s, #12
1413
1414	add	v0.4s, v0.4s, v20.4s
1415	add	v1.4s, v1.4s, v5.4s
1416	add	v2.4s, v2.4s, v6.4s
1417	eor	v15.16b, v15.16b, v0.16b
1418	eor	v16.16b, v16.16b, v1.16b
1419	eor	v17.16b, v17.16b, v2.16b
1420	tbl	v15.16b, {v15.16b}, v26.16b
1421	tbl	v16.16b, {v16.16b}, v26.16b
1422	tbl	v17.16b, {v17.16b}, v26.16b
1423
1424	add	v10.4s, v10.4s, v15.4s
1425	add	v11.4s, v11.4s, v16.4s
1426	add	v12.4s, v12.4s, v17.4s
1427	eor	v20.16b, v20.16b, v10.16b
1428	eor	v5.16b, v5.16b, v11.16b
1429	eor	v6.16b, v6.16b, v12.16b
1430	ushr	v7.4s, v6.4s, #25
1431	sli	v7.4s, v6.4s, #7
1432	ushr	v6.4s, v5.4s, #25
1433	sli	v6.4s, v5.4s, #7
1434	ushr	v5.4s, v20.4s, #25
1435	sli	v5.4s, v20.4s, #7
1436
1437	ext	v5.16b, v5.16b, v5.16b, #12
1438	ext	v6.16b, v6.16b, v6.16b, #12
1439	ext	v7.16b, v7.16b, v7.16b, #12
1440
1441	ext	v10.16b, v10.16b, v10.16b, #8
1442	ext	v11.16b, v11.16b, v11.16b, #8
1443	ext	v12.16b, v12.16b, v12.16b, #8
1444
1445	ext	v15.16b, v15.16b, v15.16b, #4
1446	ext	v16.16b, v16.16b, v16.16b, #4
1447	ext	v17.16b, v17.16b, v17.16b, #4
1448	subs	x6, x6, #1
1449	b.hi	.Lseal_128_rounds
1450
1451	add	v0.4s, v0.4s, v24.4s
1452	add	v1.4s, v1.4s, v24.4s
1453	add	v2.4s, v2.4s, v24.4s
1454
1455	add	v5.4s, v5.4s, v28.4s
1456	add	v6.4s, v6.4s, v28.4s
1457	add	v7.4s, v7.4s, v28.4s
1458
1459    // Only the first 32 bytes of the third block (counter = 0) are needed,
1460    // so skip updating v12 and v17.
1461	add	v10.4s, v10.4s, v29.4s
1462	add	v11.4s, v11.4s, v29.4s
1463
1464	add	v30.4s, v30.4s, v25.4s
1465	add	v15.4s, v15.4s, v30.4s
1466	add	v30.4s, v30.4s, v25.4s
1467	add	v16.4s, v16.4s, v30.4s
1468
1469	and	v2.16b, v2.16b, v27.16b
1470	mov	x16, v2.d[0] // Move the R key to GPRs
1471	mov	x17, v2.d[1]
1472	mov	v27.16b, v7.16b // Store the S key
1473
1474	bl	.Lpoly_hash_ad_internal
1475	b	.Lseal_tail
1476.cfi_endproc
1477.size	chacha20_poly1305_seal,.-chacha20_poly1305_seal
1478
1479/////////////////////////////////
1480//
1481// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data);
1482//
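// As with the seal routine, the arguments arrive in x0-x5 per AAPCS64. For
// open, x1 is the ciphertext source and x0 the plaintext destination, and the
// Poly1305 hash is computed over the ciphertext itself: x3 is pointed at x1
// below and the hashing is interleaved with keystream generation.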
1483.globl	chacha20_poly1305_open
1484.hidden	chacha20_poly1305_open
1485.type	chacha20_poly1305_open,%function
1486.align	6
1487chacha20_poly1305_open:
1488	AARCH64_SIGN_LINK_REGISTER
1489.cfi_startproc
1490	stp	x29, x30, [sp, #-80]!
1491.cfi_def_cfa_offset	80
1492.cfi_offset	w30, -72
1493.cfi_offset	w29, -80
1494	mov	x29, sp
1495# We probably could do .cfi_def_cfa w29, 80 at this point, but since
1496# we don't actually use the frame pointer like that, it's probably not
1497# worth bothering.
1498	stp	d8, d9, [sp, #16]
1499	stp	d10, d11, [sp, #32]
1500	stp	d12, d13, [sp, #48]
1501	stp	d14, d15, [sp, #64]
1502.cfi_offset	b15, -8
1503.cfi_offset	b14, -16
1504.cfi_offset	b13, -24
1505.cfi_offset	b12, -32
1506.cfi_offset	b11, -40
1507.cfi_offset	b10, -48
1508.cfi_offset	b9, -56
1509.cfi_offset	b8, -64
1510
1511	adrp	x11, .Lchacha20_consts
1512	add	x11, x11, :lo12:.Lchacha20_consts
1513
1514	ld1	{v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values
1515	ld1	{v28.16b - v30.16b}, [x5]
1516
1517	mov	x15, #1 // Prepare the Poly1305 state
1518	mov	x8, #0
1519	mov	x9, #0
1520	mov	x10, #0
1521
1522	mov	v31.d[0], x4  // Store the input and aad lengths
1523	mov	v31.d[1], x2
1524
1525	cmp	x2, #128
1526	b.le	.Lopen_128 // Optimization for smaller buffers
1527
1528    // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys
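    // Only one column of state (v0, v5, v10, v15) is run through the rounds here,
    // since just the first 32 bytes of block 0 keystream are needed: they become
    // the clamped r key (x16/x17) and the s key (saved in v27) used for the tag.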
1529	mov	v0.16b, v24.16b
1530	mov	v5.16b, v28.16b
1531	mov	v10.16b, v29.16b
1532	mov	v15.16b, v30.16b
1533
1534	mov	x6, #10
1535
1536.align	5
1537.Lopen_init_rounds:
1538	add	v0.4s, v0.4s, v5.4s
1539	eor	v15.16b, v15.16b, v0.16b
1540	rev32	v15.8h, v15.8h
1541
1542	add	v10.4s, v10.4s, v15.4s
1543	eor	v5.16b, v5.16b, v10.16b
1544	ushr	v20.4s, v5.4s, #20
1545	sli	v20.4s, v5.4s, #12
1546	add	v0.4s, v0.4s, v20.4s
1547	eor	v15.16b, v15.16b, v0.16b
1548	tbl	v15.16b, {v15.16b}, v26.16b
1549
1550	add	v10.4s, v10.4s, v15.4s
1551	eor	v20.16b, v20.16b, v10.16b
1552	ushr	v5.4s, v20.4s, #25
1553	sli	v5.4s, v20.4s, #7
1554	ext	v5.16b, v5.16b, v5.16b, #4
1555	ext	v10.16b, v10.16b, v10.16b, #8
1556	ext	v15.16b, v15.16b, v15.16b, #12
1557	add	v0.4s, v0.4s, v5.4s
1558	eor	v15.16b, v15.16b, v0.16b
1559	rev32	v15.8h, v15.8h
1560
1561	add	v10.4s, v10.4s, v15.4s
1562	eor	v5.16b, v5.16b, v10.16b
1563	ushr	v20.4s, v5.4s, #20
1564	sli	v20.4s, v5.4s, #12
1565	add	v0.4s, v0.4s, v20.4s
1566	eor	v15.16b, v15.16b, v0.16b
1567	tbl	v15.16b, {v15.16b}, v26.16b
1568
1569	add	v10.4s, v10.4s, v15.4s
1570	eor	v20.16b, v20.16b, v10.16b
1571	ushr	v5.4s, v20.4s, #25
1572	sli	v5.4s, v20.4s, #7
1573	ext	v5.16b, v5.16b, v5.16b, #12
1574	ext	v10.16b, v10.16b, v10.16b, #8
1575	ext	v15.16b, v15.16b, v15.16b, #4
1576	subs	x6, x6, #1
1577	b.hi	.Lopen_init_rounds
1578
1579	add	v0.4s, v0.4s, v24.4s
1580	add	v5.4s, v5.4s, v28.4s
1581
1582	and	v0.16b, v0.16b, v27.16b
1583	mov	x16, v0.d[0] // Move the R key to GPRs
1584	mov	x17, v0.d[1]
1585	mov	v27.16b, v5.16b // Store the S key
1586
1587	bl	.Lpoly_hash_ad_internal
1588
1589.Lopen_ad_done:
1590	mov	x3, x1
1591
1592// Each iteration of the loop hashes 320 bytes and prepares keystream for 320 bytes
1593.Lopen_main_loop:
1594
1595	cmp	x2, #192
1596	b.lt	.Lopen_tail
1597
1598	adrp	x11, .Lchacha20_consts
1599	add	x11, x11, :lo12:.Lchacha20_consts
1600
1601	ld4r	{v0.4s,v1.4s,v2.4s,v3.4s}, [x11]
1602	mov	v4.16b, v24.16b
1603
1604	ld4r	{v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16
1605	mov	v9.16b, v28.16b
1606
1607	ld4r	{v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16
1608	mov	v14.16b, v29.16b
1609
1610	ld4r	{v15.4s,v16.4s,v17.4s,v18.4s}, [x5]
1611	sub	x5, x5, #32
1612	add	v15.4s, v15.4s, v25.4s
1613	mov	v19.16b, v30.16b
1614
1615	eor	v20.16b, v20.16b, v20.16b //zero
1616	not	v21.16b, v20.16b // -1
1617	sub	v21.4s, v25.4s, v21.4s // Add +1
1618	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
1619	add	v19.4s, v19.4s, v20.4s
1620
1621	lsr	x4, x2, #4 // How many whole blocks we have to hash; this will always be at least 12
1622	sub	x4, x4, #10
1623
1624	mov	x7, #10
1626	subs	x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash
1627	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full
1628
1629	cbz	x7, .Lopen_main_loop_rounds_short
1630
1631.align	5
1632.Lopen_main_loop_rounds:
1633	ldp	x11, x12, [x3], 16
1634	adds	x8, x8, x11
1635	adcs	x9, x9, x12
1636	adc	x10, x10, x15
1637	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1638	umulh	x12, x8, x16
1639	mul	x13, x9, x16
1640	umulh	x14, x9, x16
1641	adds	x12, x12, x13
1642	mul	x13, x10, x16
1643	adc	x13, x13, x14
1644	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1645	umulh	x8, x8, x17
1646	adds	x12, x12, x14
1647	mul	x14, x9, x17
1648	umulh	x9, x9, x17
1649	adcs	x14, x14, x8
1650	mul	x10, x10, x17
1651	adc	x10, x10, x9
1652	adds	x13, x13, x14
1653	adc	x14, x10, xzr
1654	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1655	and	x8, x13, #-4
1656	extr	x13, x14, x13, #2
1657	adds	x8, x8, x11
1658	lsr	x11, x14, #2
1659	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1660	adds	x8, x8, x13
1661	adcs	x9, x9, x12
1662	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1663.Lopen_main_loop_rounds_short:
1664	add	v0.4s, v0.4s, v5.4s
1665	add	v1.4s, v1.4s, v6.4s
1666	add	v2.4s, v2.4s, v7.4s
1667	add	v3.4s, v3.4s, v8.4s
1668	add	v4.4s, v4.4s, v9.4s
1669
1670	eor	v15.16b, v15.16b, v0.16b
1671	eor	v16.16b, v16.16b, v1.16b
1672	eor	v17.16b, v17.16b, v2.16b
1673	eor	v18.16b, v18.16b, v3.16b
1674	eor	v19.16b, v19.16b, v4.16b
1675
1676	rev32	v15.8h, v15.8h
1677	rev32	v16.8h, v16.8h
1678	rev32	v17.8h, v17.8h
1679	rev32	v18.8h, v18.8h
1680	rev32	v19.8h, v19.8h
1681
1682	add	v10.4s, v10.4s, v15.4s
1683	add	v11.4s, v11.4s, v16.4s
1684	add	v12.4s, v12.4s, v17.4s
1685	add	v13.4s, v13.4s, v18.4s
1686	add	v14.4s, v14.4s, v19.4s
1687
1688	eor	v5.16b, v5.16b, v10.16b
1689	eor	v6.16b, v6.16b, v11.16b
1690	eor	v7.16b, v7.16b, v12.16b
1691	eor	v8.16b, v8.16b, v13.16b
1692	eor	v9.16b, v9.16b, v14.16b
1693
1694	ushr	v20.4s, v5.4s, #20
1695	sli	v20.4s, v5.4s, #12
1696	ushr	v5.4s, v6.4s, #20
1697	sli	v5.4s, v6.4s, #12
1698	ushr	v6.4s, v7.4s, #20
1699	sli	v6.4s, v7.4s, #12
1700	ushr	v7.4s, v8.4s, #20
1701	sli	v7.4s, v8.4s, #12
1702	ushr	v8.4s, v9.4s, #20
1703	sli	v8.4s, v9.4s, #12
1704
1705	add	v0.4s, v0.4s, v20.4s
1706	add	v1.4s, v1.4s, v5.4s
1707	add	v2.4s, v2.4s, v6.4s
1708	add	v3.4s, v3.4s, v7.4s
1709	add	v4.4s, v4.4s, v8.4s
1710
1711	eor	v15.16b, v15.16b, v0.16b
1712	eor	v16.16b, v16.16b, v1.16b
1713	eor	v17.16b, v17.16b, v2.16b
1714	eor	v18.16b, v18.16b, v3.16b
1715	eor	v19.16b, v19.16b, v4.16b
1716
1717	tbl	v15.16b, {v15.16b}, v26.16b
1718	tbl	v16.16b, {v16.16b}, v26.16b
1719	tbl	v17.16b, {v17.16b}, v26.16b
1720	tbl	v18.16b, {v18.16b}, v26.16b
1721	tbl	v19.16b, {v19.16b}, v26.16b
1722
1723	add	v10.4s, v10.4s, v15.4s
1724	add	v11.4s, v11.4s, v16.4s
1725	add	v12.4s, v12.4s, v17.4s
1726	add	v13.4s, v13.4s, v18.4s
1727	add	v14.4s, v14.4s, v19.4s
1728
1729	eor	v20.16b, v20.16b, v10.16b
1730	eor	v5.16b, v5.16b, v11.16b
1731	eor	v6.16b, v6.16b, v12.16b
1732	eor	v7.16b, v7.16b, v13.16b
1733	eor	v8.16b, v8.16b, v14.16b
1734
1735	ushr	v9.4s, v8.4s, #25
1736	sli	v9.4s, v8.4s, #7
1737	ushr	v8.4s, v7.4s, #25
1738	sli	v8.4s, v7.4s, #7
1739	ushr	v7.4s, v6.4s, #25
1740	sli	v7.4s, v6.4s, #7
1741	ushr	v6.4s, v5.4s, #25
1742	sli	v6.4s, v5.4s, #7
1743	ushr	v5.4s, v20.4s, #25
1744	sli	v5.4s, v20.4s, #7
1745
1746	ext	v9.16b, v9.16b, v9.16b, #4
1747	ext	v14.16b, v14.16b, v14.16b, #8
1748	ext	v19.16b, v19.16b, v19.16b, #12
1749	ldp	x11, x12, [x3], 16
1750	adds	x8, x8, x11
1751	adcs	x9, x9, x12
1752	adc	x10, x10, x15
1753	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
1754	umulh	x12, x8, x16
1755	mul	x13, x9, x16
1756	umulh	x14, x9, x16
1757	adds	x12, x12, x13
1758	mul	x13, x10, x16
1759	adc	x13, x13, x14
1760	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
1761	umulh	x8, x8, x17
1762	adds	x12, x12, x14
1763	mul	x14, x9, x17
1764	umulh	x9, x9, x17
1765	adcs	x14, x14, x8
1766	mul	x10, x10, x17
1767	adc	x10, x10, x9
1768	adds	x13, x13, x14
1769	adc	x14, x10, xzr
1770	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
1771	and	x8, x13, #-4
1772	extr	x13, x14, x13, #2
1773	adds	x8, x8, x11
1774	lsr	x11, x14, #2
1775	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
1776	adds	x8, x8, x13
1777	adcs	x9, x9, x12
1778	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
1779	add	v0.4s, v0.4s, v6.4s
1780	add	v1.4s, v1.4s, v7.4s
1781	add	v2.4s, v2.4s, v8.4s
1782	add	v3.4s, v3.4s, v5.4s
1783	add	v4.4s, v4.4s, v9.4s
1784
1785	eor	v18.16b, v18.16b, v0.16b
1786	eor	v15.16b, v15.16b, v1.16b
1787	eor	v16.16b, v16.16b, v2.16b
1788	eor	v17.16b, v17.16b, v3.16b
1789	eor	v19.16b, v19.16b, v4.16b
1790
1791	rev32	v18.8h, v18.8h
1792	rev32	v15.8h, v15.8h
1793	rev32	v16.8h, v16.8h
1794	rev32	v17.8h, v17.8h
1795	rev32	v19.8h, v19.8h
1796
1797	add	v12.4s, v12.4s, v18.4s
1798	add	v13.4s, v13.4s, v15.4s
1799	add	v10.4s, v10.4s, v16.4s
1800	add	v11.4s, v11.4s, v17.4s
1801	add	v14.4s, v14.4s, v19.4s
1802
1803	eor	v6.16b, v6.16b, v12.16b
1804	eor	v7.16b, v7.16b, v13.16b
1805	eor	v8.16b, v8.16b, v10.16b
1806	eor	v5.16b, v5.16b, v11.16b
1807	eor	v9.16b, v9.16b, v14.16b
1808
1809	ushr	v20.4s, v6.4s, #20
1810	sli	v20.4s, v6.4s, #12
1811	ushr	v6.4s, v7.4s, #20
1812	sli	v6.4s, v7.4s, #12
1813	ushr	v7.4s, v8.4s, #20
1814	sli	v7.4s, v8.4s, #12
1815	ushr	v8.4s, v5.4s, #20
1816	sli	v8.4s, v5.4s, #12
1817	ushr	v5.4s, v9.4s, #20
1818	sli	v5.4s, v9.4s, #12
1819
1820	add	v0.4s, v0.4s, v20.4s
1821	add	v1.4s, v1.4s, v6.4s
1822	add	v2.4s, v2.4s, v7.4s
1823	add	v3.4s, v3.4s, v8.4s
1824	add	v4.4s, v4.4s, v5.4s
1825
1826	eor	v18.16b, v18.16b, v0.16b
1827	eor	v15.16b, v15.16b, v1.16b
1828	eor	v16.16b, v16.16b, v2.16b
1829	eor	v17.16b, v17.16b, v3.16b
1830	eor	v19.16b, v19.16b, v4.16b
1831
1832	tbl	v18.16b, {v18.16b}, v26.16b
1833	tbl	v15.16b, {v15.16b}, v26.16b
1834	tbl	v16.16b, {v16.16b}, v26.16b
1835	tbl	v17.16b, {v17.16b}, v26.16b
1836	tbl	v19.16b, {v19.16b}, v26.16b
1837
1838	add	v12.4s, v12.4s, v18.4s
1839	add	v13.4s, v13.4s, v15.4s
1840	add	v10.4s, v10.4s, v16.4s
1841	add	v11.4s, v11.4s, v17.4s
1842	add	v14.4s, v14.4s, v19.4s
1843
1844	eor	v20.16b, v20.16b, v12.16b
1845	eor	v6.16b, v6.16b, v13.16b
1846	eor	v7.16b, v7.16b, v10.16b
1847	eor	v8.16b, v8.16b, v11.16b
1848	eor	v5.16b, v5.16b, v14.16b
1849
1850	ushr	v9.4s, v5.4s, #25
1851	sli	v9.4s, v5.4s, #7
1852	ushr	v5.4s, v8.4s, #25
1853	sli	v5.4s, v8.4s, #7
1854	ushr	v8.4s, v7.4s, #25
1855	sli	v8.4s, v7.4s, #7
1856	ushr	v7.4s, v6.4s, #25
1857	sli	v7.4s, v6.4s, #7
1858	ushr	v6.4s, v20.4s, #25
1859	sli	v6.4s, v20.4s, #7
1860
1861	ext	v9.16b, v9.16b, v9.16b, #12
1862	ext	v14.16b, v14.16b, v14.16b, #8
1863	ext	v19.16b, v19.16b, v19.16b, #4
1864	subs	x7, x7, #1
1865	b.gt	.Lopen_main_loop_rounds
1866	subs	x6, x6, #1
1867	b.ge	.Lopen_main_loop_rounds_short
1868
1869	eor	v20.16b, v20.16b, v20.16b //zero
1870	not	v21.16b, v20.16b // -1
1871	sub	v21.4s, v25.4s, v21.4s // Add +1
1872	ext	v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter)
1873	add	v19.4s, v19.4s, v20.4s
1874
1875	add	v15.4s, v15.4s, v25.4s
1876	mov	x11, #5
1877	dup	v20.4s, w11
1878	add	v25.4s, v25.4s, v20.4s
1879
1880	zip1	v20.4s, v0.4s, v1.4s
1881	zip2	v21.4s, v0.4s, v1.4s
1882	zip1	v22.4s, v2.4s, v3.4s
1883	zip2	v23.4s, v2.4s, v3.4s
1884
1885	zip1	v0.2d, v20.2d, v22.2d
1886	zip2	v1.2d, v20.2d, v22.2d
1887	zip1	v2.2d, v21.2d, v23.2d
1888	zip2	v3.2d, v21.2d, v23.2d
1889
1890	zip1	v20.4s, v5.4s, v6.4s
1891	zip2	v21.4s, v5.4s, v6.4s
1892	zip1	v22.4s, v7.4s, v8.4s
1893	zip2	v23.4s, v7.4s, v8.4s
1894
1895	zip1	v5.2d, v20.2d, v22.2d
1896	zip2	v6.2d, v20.2d, v22.2d
1897	zip1	v7.2d, v21.2d, v23.2d
1898	zip2	v8.2d, v21.2d, v23.2d
1899
1900	zip1	v20.4s, v10.4s, v11.4s
1901	zip2	v21.4s, v10.4s, v11.4s
1902	zip1	v22.4s, v12.4s, v13.4s
1903	zip2	v23.4s, v12.4s, v13.4s
1904
1905	zip1	v10.2d, v20.2d, v22.2d
1906	zip2	v11.2d, v20.2d, v22.2d
1907	zip1	v12.2d, v21.2d, v23.2d
1908	zip2	v13.2d, v21.2d, v23.2d
1909
1910	zip1	v20.4s, v15.4s, v16.4s
1911	zip2	v21.4s, v15.4s, v16.4s
1912	zip1	v22.4s, v17.4s, v18.4s
1913	zip2	v23.4s, v17.4s, v18.4s
1914
1915	zip1	v15.2d, v20.2d, v22.2d
1916	zip2	v16.2d, v20.2d, v22.2d
1917	zip1	v17.2d, v21.2d, v23.2d
1918	zip2	v18.2d, v21.2d, v23.2d
1919
1920	add	v0.4s, v0.4s, v24.4s
1921	add	v5.4s, v5.4s, v28.4s
1922	add	v10.4s, v10.4s, v29.4s
1923	add	v15.4s, v15.4s, v30.4s
1924
1925	add	v1.4s, v1.4s, v24.4s
1926	add	v6.4s, v6.4s, v28.4s
1927	add	v11.4s, v11.4s, v29.4s
1928	add	v16.4s, v16.4s, v30.4s
1929
1930	add	v2.4s, v2.4s, v24.4s
1931	add	v7.4s, v7.4s, v28.4s
1932	add	v12.4s, v12.4s, v29.4s
1933	add	v17.4s, v17.4s, v30.4s
1934
1935	add	v3.4s, v3.4s, v24.4s
1936	add	v8.4s, v8.4s, v28.4s
1937	add	v13.4s, v13.4s, v29.4s
1938	add	v18.4s, v18.4s, v30.4s
1939
1940	add	v4.4s, v4.4s, v24.4s
1941	add	v9.4s, v9.4s, v28.4s
1942	add	v14.4s, v14.4s, v29.4s
1943	add	v19.4s, v19.4s, v30.4s
1944
1945    // We can always safely store 192 bytes
1946	ld1	{v20.16b - v23.16b}, [x1], #64
1947	eor	v20.16b, v20.16b, v0.16b
1948	eor	v21.16b, v21.16b, v5.16b
1949	eor	v22.16b, v22.16b, v10.16b
1950	eor	v23.16b, v23.16b, v15.16b
1951	st1	{v20.16b - v23.16b}, [x0], #64
1952
1953	ld1	{v20.16b - v23.16b}, [x1], #64
1954	eor	v20.16b, v20.16b, v1.16b
1955	eor	v21.16b, v21.16b, v6.16b
1956	eor	v22.16b, v22.16b, v11.16b
1957	eor	v23.16b, v23.16b, v16.16b
1958	st1	{v20.16b - v23.16b}, [x0], #64
1959
1960	ld1	{v20.16b - v23.16b}, [x1], #64
1961	eor	v20.16b, v20.16b, v2.16b
1962	eor	v21.16b, v21.16b, v7.16b
1963	eor	v22.16b, v22.16b, v12.16b
1964	eor	v23.16b, v23.16b, v17.16b
1965	st1	{v20.16b - v23.16b}, [x0], #64
1966
1967	sub	x2, x2, #192
1968
1969	mov	v0.16b, v3.16b
1970	mov	v5.16b, v8.16b
1971	mov	v10.16b, v13.16b
1972	mov	v15.16b, v18.16b
1973
1974	cmp	x2, #64
1975	b.lt	.Lopen_tail_64_store
1976
1977	ld1	{v20.16b - v23.16b}, [x1], #64
1978	eor	v20.16b, v20.16b, v3.16b
1979	eor	v21.16b, v21.16b, v8.16b
1980	eor	v22.16b, v22.16b, v13.16b
1981	eor	v23.16b, v23.16b, v18.16b
1982	st1	{v20.16b - v23.16b}, [x0], #64
1983
1984	sub	x2, x2, #64
1985
1986	mov	v0.16b, v4.16b
1987	mov	v5.16b, v9.16b
1988	mov	v10.16b, v14.16b
1989	mov	v15.16b, v19.16b
1990
1991	cmp	x2, #64
1992	b.lt	.Lopen_tail_64_store
1993
1994	ld1	{v20.16b - v23.16b}, [x1], #64
1995	eor	v20.16b, v20.16b, v4.16b
1996	eor	v21.16b, v21.16b, v9.16b
1997	eor	v22.16b, v22.16b, v14.16b
1998	eor	v23.16b, v23.16b, v19.16b
1999	st1	{v20.16b - v23.16b}, [x0], #64
2000
2001	sub	x2, x2, #64
2002	b	.Lopen_main_loop
2003
2004.Lopen_tail:
2005
2006	cbz	x2, .Lopen_finalize
2007
2008	lsr	x4, x2, #4 // How many whole blocks we have to hash
2009
2010	cmp	x2, #64
2011	b.le	.Lopen_tail_64
2012	cmp	x2, #128
2013	b.le	.Lopen_tail_128
2014
2015.Lopen_tail_192:
2016     // We need three more blocks
2017	mov	v0.16b, v24.16b
2018	mov	v1.16b, v24.16b
2019	mov	v2.16b, v24.16b
2020	mov	v5.16b, v28.16b
2021	mov	v6.16b, v28.16b
2022	mov	v7.16b, v28.16b
2023	mov	v10.16b, v29.16b
2024	mov	v11.16b, v29.16b
2025	mov	v12.16b, v29.16b
2026	mov	v15.16b, v30.16b
2027	mov	v16.16b, v30.16b
2028	mov	v17.16b, v30.16b
2029	eor	v23.16b, v23.16b, v23.16b
2030	eor	v21.16b, v21.16b, v21.16b
2031	ins	v23.s[0], v25.s[0]
2032	ins	v21.d[0], x15
2033
2034	add	v22.4s, v23.4s, v21.4s
2035	add	v21.4s, v22.4s, v21.4s
2036
2037	add	v15.4s, v15.4s, v21.4s
2038	add	v16.4s, v16.4s, v23.4s
2039	add	v17.4s, v17.4s, v22.4s
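    // Lane 0 of v23/v22/v21 now holds counter offsets +0/+1/+2 (base taken from
    // v25.s[0], x15 = 1); they are applied so that the (v1,..,v16) block gets the lowest
    // counter and the (v0,..,v15) block the highest, matching the order in which the
    // blocks are consumed after .Lopen_tail_192_hash_done.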
2040
2041	mov	x7, #10
2042	subs	x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash
2043	csel	x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing
2044	sub	x4, x4, x7
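    // x7 (itr2) = min(x4, 10) double rounds below will each also absorb one ciphertext
    // block; x6 (itr1) is used for the remaining plain rounds, and x4 now holds the
    // blocks left over for .Lopen_tail_192_hash.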
2045
2046	cbz	x7, .Lopen_tail_192_rounds_no_hash
2047
2048.Lopen_tail_192_rounds:
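    // Each full 16-byte ciphertext block is absorbed below as
    //   acc = (acc + block + 2^128) * r  mod  2^130 - 5,
    // with acc in x8..x10 (x10 holds the bits above 2^128), r = (x17:x16) and the
    // constant 1 in x15; the reduction folds the product bits above 2^130 back in
    // using 2^130 == 5 (mod p), i.e. the high part H is added back as H + 4*H.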
2049	ldp	x11, x12, [x3], 16
2050	adds	x8, x8, x11
2051	adcs	x9, x9, x12
2052	adc	x10, x10, x15
2053	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2054	umulh	x12, x8, x16
2055	mul	x13, x9, x16
2056	umulh	x14, x9, x16
2057	adds	x12, x12, x13
2058	mul	x13, x10, x16
2059	adc	x13, x13, x14
2060	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2061	umulh	x8, x8, x17
2062	adds	x12, x12, x14
2063	mul	x14, x9, x17
2064	umulh	x9, x9, x17
2065	adcs	x14, x14, x8
2066	mul	x10, x10, x17
2067	adc	x10, x10, x9
2068	adds	x13, x13, x14
2069	adc	x14, x10, xzr
2070	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2071	and	x8, x13, #-4
2072	extr	x13, x14, x13, #2
2073	adds	x8, x8, x11
2074	lsr	x11, x14, #2
2075	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2076	adds	x8, x8, x13
2077	adcs	x9, x9, x12
2078	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2079.Lopen_tail_192_rounds_no_hash:
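    // One ChaCha20 double round (column round, then diagonal round) over the three
    // blocks. Rotations: 16 via rev32 on .8h lanes, 12 via ushr #20 + sli #12, 8 via tbl
    // with the byte-rotate table in v26 (.Lrol8), 7 via ushr #25 + sli #7; the ext
    // instructions rotate the b/c/d rows into diagonal position and back.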
2080	add	v0.4s, v0.4s, v5.4s
2081	add	v1.4s, v1.4s, v6.4s
2082	add	v2.4s, v2.4s, v7.4s
2083	eor	v15.16b, v15.16b, v0.16b
2084	eor	v16.16b, v16.16b, v1.16b
2085	eor	v17.16b, v17.16b, v2.16b
2086	rev32	v15.8h, v15.8h
2087	rev32	v16.8h, v16.8h
2088	rev32	v17.8h, v17.8h
2089
2090	add	v10.4s, v10.4s, v15.4s
2091	add	v11.4s, v11.4s, v16.4s
2092	add	v12.4s, v12.4s, v17.4s
2093	eor	v5.16b, v5.16b, v10.16b
2094	eor	v6.16b, v6.16b, v11.16b
2095	eor	v7.16b, v7.16b, v12.16b
2096	ushr	v20.4s, v5.4s, #20
2097	sli	v20.4s, v5.4s, #12
2098	ushr	v5.4s, v6.4s, #20
2099	sli	v5.4s, v6.4s, #12
2100	ushr	v6.4s, v7.4s, #20
2101	sli	v6.4s, v7.4s, #12
2102
2103	add	v0.4s, v0.4s, v20.4s
2104	add	v1.4s, v1.4s, v5.4s
2105	add	v2.4s, v2.4s, v6.4s
2106	eor	v15.16b, v15.16b, v0.16b
2107	eor	v16.16b, v16.16b, v1.16b
2108	eor	v17.16b, v17.16b, v2.16b
2109	tbl	v15.16b, {v15.16b}, v26.16b
2110	tbl	v16.16b, {v16.16b}, v26.16b
2111	tbl	v17.16b, {v17.16b}, v26.16b
2112
2113	add	v10.4s, v10.4s, v15.4s
2114	add	v11.4s, v11.4s, v16.4s
2115	add	v12.4s, v12.4s, v17.4s
2116	eor	v20.16b, v20.16b, v10.16b
2117	eor	v5.16b, v5.16b, v11.16b
2118	eor	v6.16b, v6.16b, v12.16b
2119	ushr	v7.4s, v6.4s, #25
2120	sli	v7.4s, v6.4s, #7
2121	ushr	v6.4s, v5.4s, #25
2122	sli	v6.4s, v5.4s, #7
2123	ushr	v5.4s, v20.4s, #25
2124	sli	v5.4s, v20.4s, #7
2125
2126	ext	v5.16b, v5.16b, v5.16b, #4
2127	ext	v6.16b, v6.16b, v6.16b, #4
2128	ext	v7.16b, v7.16b, v7.16b, #4
2129
2130	ext	v10.16b, v10.16b, v10.16b, #8
2131	ext	v11.16b, v11.16b, v11.16b, #8
2132	ext	v12.16b, v12.16b, v12.16b, #8
2133
2134	ext	v15.16b, v15.16b, v15.16b, #12
2135	ext	v16.16b, v16.16b, v16.16b, #12
2136	ext	v17.16b, v17.16b, v17.16b, #12
2137	add	v0.4s, v0.4s, v5.4s
2138	add	v1.4s, v1.4s, v6.4s
2139	add	v2.4s, v2.4s, v7.4s
2140	eor	v15.16b, v15.16b, v0.16b
2141	eor	v16.16b, v16.16b, v1.16b
2142	eor	v17.16b, v17.16b, v2.16b
2143	rev32	v15.8h, v15.8h
2144	rev32	v16.8h, v16.8h
2145	rev32	v17.8h, v17.8h
2146
2147	add	v10.4s, v10.4s, v15.4s
2148	add	v11.4s, v11.4s, v16.4s
2149	add	v12.4s, v12.4s, v17.4s
2150	eor	v5.16b, v5.16b, v10.16b
2151	eor	v6.16b, v6.16b, v11.16b
2152	eor	v7.16b, v7.16b, v12.16b
2153	ushr	v20.4s, v5.4s, #20
2154	sli	v20.4s, v5.4s, #12
2155	ushr	v5.4s, v6.4s, #20
2156	sli	v5.4s, v6.4s, #12
2157	ushr	v6.4s, v7.4s, #20
2158	sli	v6.4s, v7.4s, #12
2159
2160	add	v0.4s, v0.4s, v20.4s
2161	add	v1.4s, v1.4s, v5.4s
2162	add	v2.4s, v2.4s, v6.4s
2163	eor	v15.16b, v15.16b, v0.16b
2164	eor	v16.16b, v16.16b, v1.16b
2165	eor	v17.16b, v17.16b, v2.16b
2166	tbl	v15.16b, {v15.16b}, v26.16b
2167	tbl	v16.16b, {v16.16b}, v26.16b
2168	tbl	v17.16b, {v17.16b}, v26.16b
2169
2170	add	v10.4s, v10.4s, v15.4s
2171	add	v11.4s, v11.4s, v16.4s
2172	add	v12.4s, v12.4s, v17.4s
2173	eor	v20.16b, v20.16b, v10.16b
2174	eor	v5.16b, v5.16b, v11.16b
2175	eor	v6.16b, v6.16b, v12.16b
2176	ushr	v7.4s, v6.4s, #25
2177	sli	v7.4s, v6.4s, #7
2178	ushr	v6.4s, v5.4s, #25
2179	sli	v6.4s, v5.4s, #7
2180	ushr	v5.4s, v20.4s, #25
2181	sli	v5.4s, v20.4s, #7
2182
2183	ext	v5.16b, v5.16b, v5.16b, #12
2184	ext	v6.16b, v6.16b, v6.16b, #12
2185	ext	v7.16b, v7.16b, v7.16b, #12
2186
2187	ext	v10.16b, v10.16b, v10.16b, #8
2188	ext	v11.16b, v11.16b, v11.16b, #8
2189	ext	v12.16b, v12.16b, v12.16b, #8
2190
2191	ext	v15.16b, v15.16b, v15.16b, #4
2192	ext	v16.16b, v16.16b, v16.16b, #4
2193	ext	v17.16b, v17.16b, v17.16b, #4
2194	subs	x7, x7, #1
2195	b.gt	.Lopen_tail_192_rounds
2196	subs	x6, x6, #1
2197	b.ge	.Lopen_tail_192_rounds_no_hash
2198
    // We hashed at most 160 bytes above; up to 32 more bytes may still be left to hash
2200.Lopen_tail_192_hash:
2201	cbz	x4, .Lopen_tail_192_hash_done
2202	ldp	x11, x12, [x3], 16
2203	adds	x8, x8, x11
2204	adcs	x9, x9, x12
2205	adc	x10, x10, x15
2206	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2207	umulh	x12, x8, x16
2208	mul	x13, x9, x16
2209	umulh	x14, x9, x16
2210	adds	x12, x12, x13
2211	mul	x13, x10, x16
2212	adc	x13, x13, x14
2213	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2214	umulh	x8, x8, x17
2215	adds	x12, x12, x14
2216	mul	x14, x9, x17
2217	umulh	x9, x9, x17
2218	adcs	x14, x14, x8
2219	mul	x10, x10, x17
2220	adc	x10, x10, x9
2221	adds	x13, x13, x14
2222	adc	x14, x10, xzr
2223	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2224	and	x8, x13, #-4
2225	extr	x13, x14, x13, #2
2226	adds	x8, x8, x11
2227	lsr	x11, x14, #2
2228	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2229	adds	x8, x8, x13
2230	adcs	x9, x9, x12
2231	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2232	sub	x4, x4, #1
2233	b	.Lopen_tail_192_hash
2234
2235.Lopen_tail_192_hash_done:
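    // Add back the original state and the per-block counters, then decrypt the second
    // and third blocks (128 bytes) here; the first block (v0,v5,v10,v15) is left for the
    // final <=64-byte store path.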
2236
2237	add	v0.4s, v0.4s, v24.4s
2238	add	v1.4s, v1.4s, v24.4s
2239	add	v2.4s, v2.4s, v24.4s
2240	add	v5.4s, v5.4s, v28.4s
2241	add	v6.4s, v6.4s, v28.4s
2242	add	v7.4s, v7.4s, v28.4s
2243	add	v10.4s, v10.4s, v29.4s
2244	add	v11.4s, v11.4s, v29.4s
2245	add	v12.4s, v12.4s, v29.4s
2246	add	v15.4s, v15.4s, v30.4s
2247	add	v16.4s, v16.4s, v30.4s
2248	add	v17.4s, v17.4s, v30.4s
2249
2250	add	v15.4s, v15.4s, v21.4s
2251	add	v16.4s, v16.4s, v23.4s
2252	add	v17.4s, v17.4s, v22.4s
2253
2254	ld1	{v20.16b - v23.16b}, [x1], #64
2255
2256	eor	v20.16b, v20.16b, v1.16b
2257	eor	v21.16b, v21.16b, v6.16b
2258	eor	v22.16b, v22.16b, v11.16b
2259	eor	v23.16b, v23.16b, v16.16b
2260
2261	st1	{v20.16b - v23.16b}, [x0], #64
2262
2263	ld1	{v20.16b - v23.16b}, [x1], #64
2264
2265	eor	v20.16b, v20.16b, v2.16b
2266	eor	v21.16b, v21.16b, v7.16b
2267	eor	v22.16b, v22.16b, v12.16b
2268	eor	v23.16b, v23.16b, v17.16b
2269
2270	st1	{v20.16b - v23.16b}, [x0], #64
2271
2272	sub	x2, x2, #128
2273	b	.Lopen_tail_64_store
2274
2275.Lopen_tail_128:
2276     // We need two more blocks
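    // The two blocks live in (v0,v5,v10,v15) and (v1,v6,v11,v16); their counters are
    // derived from v25.s[0] as in the 192-byte tail above.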
2277	mov	v0.16b, v24.16b
2278	mov	v1.16b, v24.16b
2279	mov	v5.16b, v28.16b
2280	mov	v6.16b, v28.16b
2281	mov	v10.16b, v29.16b
2282	mov	v11.16b, v29.16b
2283	mov	v15.16b, v30.16b
2284	mov	v16.16b, v30.16b
2285	eor	v23.16b, v23.16b, v23.16b
2286	eor	v22.16b, v22.16b, v22.16b
2287	ins	v23.s[0], v25.s[0]
2288	ins	v22.d[0], x15
2289	add	v22.4s, v22.4s, v23.4s
2290
2291	add	v15.4s, v15.4s, v22.4s
2292	add	v16.4s, v16.4s, v23.4s
2293
2294	mov	x6, #10
2295	sub	x6, x6, x4
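    // Run (10 - x4) plain double rounds first, then one double round per 16-byte
    // ciphertext block still to hash, absorbing that block just before the round;
    // 10 double rounds in total.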
2296
2297.Lopen_tail_128_rounds:
2298	add	v0.4s, v0.4s, v5.4s
2299	eor	v15.16b, v15.16b, v0.16b
2300	rev32	v15.8h, v15.8h
2301
2302	add	v10.4s, v10.4s, v15.4s
2303	eor	v5.16b, v5.16b, v10.16b
2304	ushr	v20.4s, v5.4s, #20
2305	sli	v20.4s, v5.4s, #12
2306	add	v0.4s, v0.4s, v20.4s
2307	eor	v15.16b, v15.16b, v0.16b
2308	tbl	v15.16b, {v15.16b}, v26.16b
2309
2310	add	v10.4s, v10.4s, v15.4s
2311	eor	v20.16b, v20.16b, v10.16b
2312	ushr	v5.4s, v20.4s, #25
2313	sli	v5.4s, v20.4s, #7
2314	ext	v5.16b, v5.16b, v5.16b, #4
2315	ext	v10.16b, v10.16b, v10.16b, #8
2316	ext	v15.16b, v15.16b, v15.16b, #12
2317	add	v1.4s, v1.4s, v6.4s
2318	eor	v16.16b, v16.16b, v1.16b
2319	rev32	v16.8h, v16.8h
2320
2321	add	v11.4s, v11.4s, v16.4s
2322	eor	v6.16b, v6.16b, v11.16b
2323	ushr	v20.4s, v6.4s, #20
2324	sli	v20.4s, v6.4s, #12
2325	add	v1.4s, v1.4s, v20.4s
2326	eor	v16.16b, v16.16b, v1.16b
2327	tbl	v16.16b, {v16.16b}, v26.16b
2328
2329	add	v11.4s, v11.4s, v16.4s
2330	eor	v20.16b, v20.16b, v11.16b
2331	ushr	v6.4s, v20.4s, #25
2332	sli	v6.4s, v20.4s, #7
2333	ext	v6.16b, v6.16b, v6.16b, #4
2334	ext	v11.16b, v11.16b, v11.16b, #8
2335	ext	v16.16b, v16.16b, v16.16b, #12
2336	add	v0.4s, v0.4s, v5.4s
2337	eor	v15.16b, v15.16b, v0.16b
2338	rev32	v15.8h, v15.8h
2339
2340	add	v10.4s, v10.4s, v15.4s
2341	eor	v5.16b, v5.16b, v10.16b
2342	ushr	v20.4s, v5.4s, #20
2343	sli	v20.4s, v5.4s, #12
2344	add	v0.4s, v0.4s, v20.4s
2345	eor	v15.16b, v15.16b, v0.16b
2346	tbl	v15.16b, {v15.16b}, v26.16b
2347
2348	add	v10.4s, v10.4s, v15.4s
2349	eor	v20.16b, v20.16b, v10.16b
2350	ushr	v5.4s, v20.4s, #25
2351	sli	v5.4s, v20.4s, #7
2352	ext	v5.16b, v5.16b, v5.16b, #12
2353	ext	v10.16b, v10.16b, v10.16b, #8
2354	ext	v15.16b, v15.16b, v15.16b, #4
2355	add	v1.4s, v1.4s, v6.4s
2356	eor	v16.16b, v16.16b, v1.16b
2357	rev32	v16.8h, v16.8h
2358
2359	add	v11.4s, v11.4s, v16.4s
2360	eor	v6.16b, v6.16b, v11.16b
2361	ushr	v20.4s, v6.4s, #20
2362	sli	v20.4s, v6.4s, #12
2363	add	v1.4s, v1.4s, v20.4s
2364	eor	v16.16b, v16.16b, v1.16b
2365	tbl	v16.16b, {v16.16b}, v26.16b
2366
2367	add	v11.4s, v11.4s, v16.4s
2368	eor	v20.16b, v20.16b, v11.16b
2369	ushr	v6.4s, v20.4s, #25
2370	sli	v6.4s, v20.4s, #7
2371	ext	v6.16b, v6.16b, v6.16b, #12
2372	ext	v11.16b, v11.16b, v11.16b, #8
2373	ext	v16.16b, v16.16b, v16.16b, #4
2374	subs	x6, x6, #1
2375	b.gt	.Lopen_tail_128_rounds
2376	cbz	x4, .Lopen_tail_128_rounds_done
2377	subs	x4, x4, #1
2378	ldp	x11, x12, [x3], 16
2379	adds	x8, x8, x11
2380	adcs	x9, x9, x12
2381	adc	x10, x10, x15
2382	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2383	umulh	x12, x8, x16
2384	mul	x13, x9, x16
2385	umulh	x14, x9, x16
2386	adds	x12, x12, x13
2387	mul	x13, x10, x16
2388	adc	x13, x13, x14
2389	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2390	umulh	x8, x8, x17
2391	adds	x12, x12, x14
2392	mul	x14, x9, x17
2393	umulh	x9, x9, x17
2394	adcs	x14, x14, x8
2395	mul	x10, x10, x17
2396	adc	x10, x10, x9
2397	adds	x13, x13, x14
2398	adc	x14, x10, xzr
2399	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2400	and	x8, x13, #-4
2401	extr	x13, x14, x13, #2
2402	adds	x8, x8, x11
2403	lsr	x11, x14, #2
2404	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2405	adds	x8, x8, x13
2406	adcs	x9, x9, x12
2407	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2408	b	.Lopen_tail_128_rounds
2409
2410.Lopen_tail_128_rounds_done:
2411	add	v0.4s, v0.4s, v24.4s
2412	add	v1.4s, v1.4s, v24.4s
2413	add	v5.4s, v5.4s, v28.4s
2414	add	v6.4s, v6.4s, v28.4s
2415	add	v10.4s, v10.4s, v29.4s
2416	add	v11.4s, v11.4s, v29.4s
2417	add	v15.4s, v15.4s, v30.4s
2418	add	v16.4s, v16.4s, v30.4s
2419	add	v15.4s, v15.4s, v22.4s
2420	add	v16.4s, v16.4s, v23.4s
2421
2422	ld1	{v20.16b - v23.16b}, [x1], #64
2423
2424	eor	v20.16b, v20.16b, v1.16b
2425	eor	v21.16b, v21.16b, v6.16b
2426	eor	v22.16b, v22.16b, v11.16b
2427	eor	v23.16b, v23.16b, v16.16b
2428
2429	st1	{v20.16b - v23.16b}, [x0], #64
2430	sub	x2, x2, #64
2431
2432	b	.Lopen_tail_64_store
2433
2434.Lopen_tail_64:
2435    // We just need a single block
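    // Same interleaving as above: (10 - x4) plain double rounds, then one double round
    // per remaining 16-byte ciphertext block, hashing that block first.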
2436	mov	v0.16b, v24.16b
2437	mov	v5.16b, v28.16b
2438	mov	v10.16b, v29.16b
2439	mov	v15.16b, v30.16b
2440	eor	v23.16b, v23.16b, v23.16b
2441	ins	v23.s[0], v25.s[0]
2442	add	v15.4s, v15.4s, v23.4s
2443
2444	mov	x6, #10
2445	sub	x6, x6, x4
2446
2447.Lopen_tail_64_rounds:
2448	add	v0.4s, v0.4s, v5.4s
2449	eor	v15.16b, v15.16b, v0.16b
2450	rev32	v15.8h, v15.8h
2451
2452	add	v10.4s, v10.4s, v15.4s
2453	eor	v5.16b, v5.16b, v10.16b
2454	ushr	v20.4s, v5.4s, #20
2455	sli	v20.4s, v5.4s, #12
2456	add	v0.4s, v0.4s, v20.4s
2457	eor	v15.16b, v15.16b, v0.16b
2458	tbl	v15.16b, {v15.16b}, v26.16b
2459
2460	add	v10.4s, v10.4s, v15.4s
2461	eor	v20.16b, v20.16b, v10.16b
2462	ushr	v5.4s, v20.4s, #25
2463	sli	v5.4s, v20.4s, #7
2464	ext	v5.16b, v5.16b, v5.16b, #4
2465	ext	v10.16b, v10.16b, v10.16b, #8
2466	ext	v15.16b, v15.16b, v15.16b, #12
2467	add	v0.4s, v0.4s, v5.4s
2468	eor	v15.16b, v15.16b, v0.16b
2469	rev32	v15.8h, v15.8h
2470
2471	add	v10.4s, v10.4s, v15.4s
2472	eor	v5.16b, v5.16b, v10.16b
2473	ushr	v20.4s, v5.4s, #20
2474	sli	v20.4s, v5.4s, #12
2475	add	v0.4s, v0.4s, v20.4s
2476	eor	v15.16b, v15.16b, v0.16b
2477	tbl	v15.16b, {v15.16b}, v26.16b
2478
2479	add	v10.4s, v10.4s, v15.4s
2480	eor	v20.16b, v20.16b, v10.16b
2481	ushr	v5.4s, v20.4s, #25
2482	sli	v5.4s, v20.4s, #7
2483	ext	v5.16b, v5.16b, v5.16b, #12
2484	ext	v10.16b, v10.16b, v10.16b, #8
2485	ext	v15.16b, v15.16b, v15.16b, #4
2486	subs	x6, x6, #1
2487	b.gt	.Lopen_tail_64_rounds
2488	cbz	x4, .Lopen_tail_64_rounds_done
2489	subs	x4, x4, #1
2490	ldp	x11, x12, [x3], 16
2491	adds	x8, x8, x11
2492	adcs	x9, x9, x12
2493	adc	x10, x10, x15
2494	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2495	umulh	x12, x8, x16
2496	mul	x13, x9, x16
2497	umulh	x14, x9, x16
2498	adds	x12, x12, x13
2499	mul	x13, x10, x16
2500	adc	x13, x13, x14
2501	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2502	umulh	x8, x8, x17
2503	adds	x12, x12, x14
2504	mul	x14, x9, x17
2505	umulh	x9, x9, x17
2506	adcs	x14, x14, x8
2507	mul	x10, x10, x17
2508	adc	x10, x10, x9
2509	adds	x13, x13, x14
2510	adc	x14, x10, xzr
2511	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2512	and	x8, x13, #-4
2513	extr	x13, x14, x13, #2
2514	adds	x8, x8, x11
2515	lsr	x11, x14, #2
2516	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2517	adds	x8, x8, x13
2518	adcs	x9, x9, x12
2519	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2520	b	.Lopen_tail_64_rounds
2521
2522.Lopen_tail_64_rounds_done:
2523	add	v0.4s, v0.4s, v24.4s
2524	add	v5.4s, v5.4s, v28.4s
2525	add	v10.4s, v10.4s, v29.4s
2526	add	v15.4s, v15.4s, v30.4s
2527	add	v15.4s, v15.4s, v23.4s
2528
2529.Lopen_tail_64_store:
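    // Consume the remaining keystream 16 bytes at a time: the current 16 bytes live in
    // v0, with the next ones rotated in from v5/v10/v15; anything shorter than 16 bytes
    // falls through to .Lopen_tail_16.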
2530	cmp	x2, #16
2531	b.lt	.Lopen_tail_16
2532
2533	ld1	{v20.16b}, [x1], #16
2534	eor	v20.16b, v20.16b, v0.16b
2535	st1	{v20.16b}, [x0], #16
2536	mov	v0.16b, v5.16b
2537	mov	v5.16b, v10.16b
2538	mov	v10.16b, v15.16b
2539	sub	x2, x2, #16
2540	b	.Lopen_tail_64_store
2541
2542.Lopen_tail_16:
2543    // Here we handle the last [0,16) bytes that require a padded block
2544	cbz	x2, .Lopen_finalize
2545
2546	eor	v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext
2547	eor	v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask
2548	not	v22.16b, v20.16b
2549
2550	add	x7, x1, x2
2551	mov	x6, x2
2552
2553.Lopen_tail_16_compose:
2554	ext	v20.16b, v20.16b, v20.16b, #15
2555	ldrb	w11, [x7, #-1]!
2556	mov	v20.b[0], w11
2557	ext	v21.16b, v22.16b, v21.16b, #15
2558	subs	x2, x2, #1
2559	b.gt	.Lopen_tail_16_compose
2560
2561	and	v20.16b, v20.16b, v21.16b
2562    // Hash in the final padded block
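    // (During open the MAC covers the ciphertext, so the block is hashed before it is
    // XORed with the keystream below.)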
2563	mov	x11, v20.d[0]
2564	mov	x12, v20.d[1]
2565	adds	x8, x8, x11
2566	adcs	x9, x9, x12
2567	adc	x10, x10, x15
2568	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2569	umulh	x12, x8, x16
2570	mul	x13, x9, x16
2571	umulh	x14, x9, x16
2572	adds	x12, x12, x13
2573	mul	x13, x10, x16
2574	adc	x13, x13, x14
2575	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2576	umulh	x8, x8, x17
2577	adds	x12, x12, x14
2578	mul	x14, x9, x17
2579	umulh	x9, x9, x17
2580	adcs	x14, x14, x8
2581	mul	x10, x10, x17
2582	adc	x10, x10, x9
2583	adds	x13, x13, x14
2584	adc	x14, x10, xzr
2585	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2586	and	x8, x13, #-4
2587	extr	x13, x14, x13, #2
2588	adds	x8, x8, x11
2589	lsr	x11, x14, #2
2590	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2591	adds	x8, x8, x13
2592	adcs	x9, x9, x12
2593	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2594	eor	v20.16b, v20.16b, v0.16b
2595
2596.Lopen_tail_16_store:
2597	umov	w11, v20.b[0]
2598	strb	w11, [x0], #1
2599	ext	v20.16b, v20.16b, v20.16b, #1
2600	subs	x6, x6, #1
2601	b.gt	.Lopen_tail_16_store
2602
2603.Lopen_finalize:
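    // Absorb the final Poly1305 block of encoded lengths (kept in v31) before computing
    // the tag.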
2604	mov	x11, v31.d[0]
2605	mov	x12, v31.d[1]
2606	adds	x8, x8, x11
2607	adcs	x9, x9, x12
2608	adc	x10, x10, x15
2609	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2610	umulh	x12, x8, x16
2611	mul	x13, x9, x16
2612	umulh	x14, x9, x16
2613	adds	x12, x12, x13
2614	mul	x13, x10, x16
2615	adc	x13, x13, x14
2616	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2617	umulh	x8, x8, x17
2618	adds	x12, x12, x14
2619	mul	x14, x9, x17
2620	umulh	x9, x9, x17
2621	adcs	x14, x14, x8
2622	mul	x10, x10, x17
2623	adc	x10, x10, x9
2624	adds	x13, x13, x14
2625	adc	x14, x10, xzr
2626	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2627	and	x8, x13, #-4
2628	extr	x13, x14, x13, #2
2629	adds	x8, x8, x11
2630	lsr	x11, x14, #2
2631	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2632	adds	x8, x8, x13
2633	adcs	x9, x9, x12
2634	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
    // Final reduction step
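    // Compute acc - (2^130 - 5): x15 holds 1, so x12 = 2^64 - 1 and x13 = 3 are the two
    // upper limbs of 2^130 - 5. The reduced value is kept only if the subtraction does
    // not borrow (i.e. acc >= 2^130 - 5); the 's' half of the one-time key (saved in
    // v27) is then added and the low 128 bits are stored at [x5] as the tag.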
2636	sub	x12, xzr, x15
2637	orr	x13, xzr, #3
2638	subs	x11, x8, #-5
2639	sbcs	x12, x9, x12
2640	sbcs	x13, x10, x13
2641	csel	x8, x11, x8, cs
2642	csel	x9, x12, x9, cs
2643	csel	x10, x13, x10, cs
2644	mov	x11, v27.d[0]
2645	mov	x12, v27.d[1]
2646	adds	x8, x8, x11
2647	adcs	x9, x9, x12
2648	adc	x10, x10, x15
2649
2650	stp	x8, x9, [x5]
2651
2652	ldp	d8, d9, [sp, #16]
2653	ldp	d10, d11, [sp, #32]
2654	ldp	d12, d13, [sp, #48]
2655	ldp	d14, d15, [sp, #64]
2656.cfi_restore	b15
2657.cfi_restore	b14
2658.cfi_restore	b13
2659.cfi_restore	b12
2660.cfi_restore	b11
2661.cfi_restore	b10
2662.cfi_restore	b9
2663.cfi_restore	b8
2664	ldp	x29, x30, [sp], 80
2665.cfi_restore	w29
2666.cfi_restore	w30
2667.cfi_def_cfa_offset	0
2668	AARCH64_VALIDATE_LINK_REGISTER
2669	ret
2670
2671.Lopen_128:
2672    // On some architectures preparing 5 blocks for small buffers is wasteful
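    // Three blocks are prepared instead: (v2,v7,v12,v17) keeps the base counter and
    // supplies the one-time Poly1305 key (r = v2 clamped, s = v7, extracted after the
    // rounds), while (v0,..,v15) and (v1,..,v16) use counters +1 and +2 and provide
    // keystream for up to 128 bytes of data.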
2673	eor	v25.16b, v25.16b, v25.16b
2674	mov	x11, #1
2675	mov	v25.s[0], w11
2676	mov	v0.16b, v24.16b
2677	mov	v1.16b, v24.16b
2678	mov	v2.16b, v24.16b
2679	mov	v5.16b, v28.16b
2680	mov	v6.16b, v28.16b
2681	mov	v7.16b, v28.16b
2682	mov	v10.16b, v29.16b
2683	mov	v11.16b, v29.16b
2684	mov	v12.16b, v29.16b
2685	mov	v17.16b, v30.16b
2686	add	v15.4s, v17.4s, v25.4s
2687	add	v16.4s, v15.4s, v25.4s
2688
2689	mov	x6, #10
2690
2691.Lopen_128_rounds:
2692	add	v0.4s, v0.4s, v5.4s
2693	add	v1.4s, v1.4s, v6.4s
2694	add	v2.4s, v2.4s, v7.4s
2695	eor	v15.16b, v15.16b, v0.16b
2696	eor	v16.16b, v16.16b, v1.16b
2697	eor	v17.16b, v17.16b, v2.16b
2698	rev32	v15.8h, v15.8h
2699	rev32	v16.8h, v16.8h
2700	rev32	v17.8h, v17.8h
2701
2702	add	v10.4s, v10.4s, v15.4s
2703	add	v11.4s, v11.4s, v16.4s
2704	add	v12.4s, v12.4s, v17.4s
2705	eor	v5.16b, v5.16b, v10.16b
2706	eor	v6.16b, v6.16b, v11.16b
2707	eor	v7.16b, v7.16b, v12.16b
2708	ushr	v20.4s, v5.4s, #20
2709	sli	v20.4s, v5.4s, #12
2710	ushr	v5.4s, v6.4s, #20
2711	sli	v5.4s, v6.4s, #12
2712	ushr	v6.4s, v7.4s, #20
2713	sli	v6.4s, v7.4s, #12
2714
2715	add	v0.4s, v0.4s, v20.4s
2716	add	v1.4s, v1.4s, v5.4s
2717	add	v2.4s, v2.4s, v6.4s
2718	eor	v15.16b, v15.16b, v0.16b
2719	eor	v16.16b, v16.16b, v1.16b
2720	eor	v17.16b, v17.16b, v2.16b
2721	tbl	v15.16b, {v15.16b}, v26.16b
2722	tbl	v16.16b, {v16.16b}, v26.16b
2723	tbl	v17.16b, {v17.16b}, v26.16b
2724
2725	add	v10.4s, v10.4s, v15.4s
2726	add	v11.4s, v11.4s, v16.4s
2727	add	v12.4s, v12.4s, v17.4s
2728	eor	v20.16b, v20.16b, v10.16b
2729	eor	v5.16b, v5.16b, v11.16b
2730	eor	v6.16b, v6.16b, v12.16b
2731	ushr	v7.4s, v6.4s, #25
2732	sli	v7.4s, v6.4s, #7
2733	ushr	v6.4s, v5.4s, #25
2734	sli	v6.4s, v5.4s, #7
2735	ushr	v5.4s, v20.4s, #25
2736	sli	v5.4s, v20.4s, #7
2737
2738	ext	v5.16b, v5.16b, v5.16b, #4
2739	ext	v6.16b, v6.16b, v6.16b, #4
2740	ext	v7.16b, v7.16b, v7.16b, #4
2741
2742	ext	v10.16b, v10.16b, v10.16b, #8
2743	ext	v11.16b, v11.16b, v11.16b, #8
2744	ext	v12.16b, v12.16b, v12.16b, #8
2745
2746	ext	v15.16b, v15.16b, v15.16b, #12
2747	ext	v16.16b, v16.16b, v16.16b, #12
2748	ext	v17.16b, v17.16b, v17.16b, #12
2749	add	v0.4s, v0.4s, v5.4s
2750	add	v1.4s, v1.4s, v6.4s
2751	add	v2.4s, v2.4s, v7.4s
2752	eor	v15.16b, v15.16b, v0.16b
2753	eor	v16.16b, v16.16b, v1.16b
2754	eor	v17.16b, v17.16b, v2.16b
2755	rev32	v15.8h, v15.8h
2756	rev32	v16.8h, v16.8h
2757	rev32	v17.8h, v17.8h
2758
2759	add	v10.4s, v10.4s, v15.4s
2760	add	v11.4s, v11.4s, v16.4s
2761	add	v12.4s, v12.4s, v17.4s
2762	eor	v5.16b, v5.16b, v10.16b
2763	eor	v6.16b, v6.16b, v11.16b
2764	eor	v7.16b, v7.16b, v12.16b
2765	ushr	v20.4s, v5.4s, #20
2766	sli	v20.4s, v5.4s, #12
2767	ushr	v5.4s, v6.4s, #20
2768	sli	v5.4s, v6.4s, #12
2769	ushr	v6.4s, v7.4s, #20
2770	sli	v6.4s, v7.4s, #12
2771
2772	add	v0.4s, v0.4s, v20.4s
2773	add	v1.4s, v1.4s, v5.4s
2774	add	v2.4s, v2.4s, v6.4s
2775	eor	v15.16b, v15.16b, v0.16b
2776	eor	v16.16b, v16.16b, v1.16b
2777	eor	v17.16b, v17.16b, v2.16b
2778	tbl	v15.16b, {v15.16b}, v26.16b
2779	tbl	v16.16b, {v16.16b}, v26.16b
2780	tbl	v17.16b, {v17.16b}, v26.16b
2781
2782	add	v10.4s, v10.4s, v15.4s
2783	add	v11.4s, v11.4s, v16.4s
2784	add	v12.4s, v12.4s, v17.4s
2785	eor	v20.16b, v20.16b, v10.16b
2786	eor	v5.16b, v5.16b, v11.16b
2787	eor	v6.16b, v6.16b, v12.16b
2788	ushr	v7.4s, v6.4s, #25
2789	sli	v7.4s, v6.4s, #7
2790	ushr	v6.4s, v5.4s, #25
2791	sli	v6.4s, v5.4s, #7
2792	ushr	v5.4s, v20.4s, #25
2793	sli	v5.4s, v20.4s, #7
2794
2795	ext	v5.16b, v5.16b, v5.16b, #12
2796	ext	v6.16b, v6.16b, v6.16b, #12
2797	ext	v7.16b, v7.16b, v7.16b, #12
2798
2799	ext	v10.16b, v10.16b, v10.16b, #8
2800	ext	v11.16b, v11.16b, v11.16b, #8
2801	ext	v12.16b, v12.16b, v12.16b, #8
2802
2803	ext	v15.16b, v15.16b, v15.16b, #4
2804	ext	v16.16b, v16.16b, v16.16b, #4
2805	ext	v17.16b, v17.16b, v17.16b, #4
2806	subs	x6, x6, #1
2807	b.hi	.Lopen_128_rounds
2808
2809	add	v0.4s, v0.4s, v24.4s
2810	add	v1.4s, v1.4s, v24.4s
2811	add	v2.4s, v2.4s, v24.4s
2812
2813	add	v5.4s, v5.4s, v28.4s
2814	add	v6.4s, v6.4s, v28.4s
2815	add	v7.4s, v7.4s, v28.4s
2816
2817	add	v10.4s, v10.4s, v29.4s
2818	add	v11.4s, v11.4s, v29.4s
2819
2820	add	v30.4s, v30.4s, v25.4s
2821	add	v15.4s, v15.4s, v30.4s
2822	add	v30.4s, v30.4s, v25.4s
2823	add	v16.4s, v16.4s, v30.4s
2824
2825	and	v2.16b, v2.16b, v27.16b
2826	mov	x16, v2.d[0] // Move the R key to GPRs
2827	mov	x17, v2.d[1]
2828	mov	v27.16b, v7.16b // Store the S key
2829
2830	bl	.Lpoly_hash_ad_internal
2831
2832.Lopen_128_store:
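    // If at least 64 bytes remain, hash those four 16-byte ciphertext blocks and decrypt
    // them with the first keystream block, then shift the second block into
    // (v0,v5,v10,v15). .Lopen_128_store_64 hashes any remaining whole 16-byte blocks
    // (x3 walks the ciphertext) before the shared <=64-byte store path finishes.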
2833	cmp	x2, #64
2834	b.lt	.Lopen_128_store_64
2835
2836	ld1	{v20.16b - v23.16b}, [x1], #64
2837
2838	mov	x11, v20.d[0]
2839	mov	x12, v20.d[1]
2840	adds	x8, x8, x11
2841	adcs	x9, x9, x12
2842	adc	x10, x10, x15
2843	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2844	umulh	x12, x8, x16
2845	mul	x13, x9, x16
2846	umulh	x14, x9, x16
2847	adds	x12, x12, x13
2848	mul	x13, x10, x16
2849	adc	x13, x13, x14
2850	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2851	umulh	x8, x8, x17
2852	adds	x12, x12, x14
2853	mul	x14, x9, x17
2854	umulh	x9, x9, x17
2855	adcs	x14, x14, x8
2856	mul	x10, x10, x17
2857	adc	x10, x10, x9
2858	adds	x13, x13, x14
2859	adc	x14, x10, xzr
2860	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2861	and	x8, x13, #-4
2862	extr	x13, x14, x13, #2
2863	adds	x8, x8, x11
2864	lsr	x11, x14, #2
2865	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2866	adds	x8, x8, x13
2867	adcs	x9, x9, x12
2868	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2869	mov	x11, v21.d[0]
2870	mov	x12, v21.d[1]
2871	adds	x8, x8, x11
2872	adcs	x9, x9, x12
2873	adc	x10, x10, x15
2874	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2875	umulh	x12, x8, x16
2876	mul	x13, x9, x16
2877	umulh	x14, x9, x16
2878	adds	x12, x12, x13
2879	mul	x13, x10, x16
2880	adc	x13, x13, x14
2881	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2882	umulh	x8, x8, x17
2883	adds	x12, x12, x14
2884	mul	x14, x9, x17
2885	umulh	x9, x9, x17
2886	adcs	x14, x14, x8
2887	mul	x10, x10, x17
2888	adc	x10, x10, x9
2889	adds	x13, x13, x14
2890	adc	x14, x10, xzr
2891	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2892	and	x8, x13, #-4
2893	extr	x13, x14, x13, #2
2894	adds	x8, x8, x11
2895	lsr	x11, x14, #2
2896	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2897	adds	x8, x8, x13
2898	adcs	x9, x9, x12
2899	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2900	mov	x11, v22.d[0]
2901	mov	x12, v22.d[1]
2902	adds	x8, x8, x11
2903	adcs	x9, x9, x12
2904	adc	x10, x10, x15
2905	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2906	umulh	x12, x8, x16
2907	mul	x13, x9, x16
2908	umulh	x14, x9, x16
2909	adds	x12, x12, x13
2910	mul	x13, x10, x16
2911	adc	x13, x13, x14
2912	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2913	umulh	x8, x8, x17
2914	adds	x12, x12, x14
2915	mul	x14, x9, x17
2916	umulh	x9, x9, x17
2917	adcs	x14, x14, x8
2918	mul	x10, x10, x17
2919	adc	x10, x10, x9
2920	adds	x13, x13, x14
2921	adc	x14, x10, xzr
2922	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2923	and	x8, x13, #-4
2924	extr	x13, x14, x13, #2
2925	adds	x8, x8, x11
2926	lsr	x11, x14, #2
2927	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2928	adds	x8, x8, x13
2929	adcs	x9, x9, x12
2930	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2931	mov	x11, v23.d[0]
2932	mov	x12, v23.d[1]
2933	adds	x8, x8, x11
2934	adcs	x9, x9, x12
2935	adc	x10, x10, x15
2936	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2937	umulh	x12, x8, x16
2938	mul	x13, x9, x16
2939	umulh	x14, x9, x16
2940	adds	x12, x12, x13
2941	mul	x13, x10, x16
2942	adc	x13, x13, x14
2943	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2944	umulh	x8, x8, x17
2945	adds	x12, x12, x14
2946	mul	x14, x9, x17
2947	umulh	x9, x9, x17
2948	adcs	x14, x14, x8
2949	mul	x10, x10, x17
2950	adc	x10, x10, x9
2951	adds	x13, x13, x14
2952	adc	x14, x10, xzr
2953	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
2954	and	x8, x13, #-4
2955	extr	x13, x14, x13, #2
2956	adds	x8, x8, x11
2957	lsr	x11, x14, #2
2958	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
2959	adds	x8, x8, x13
2960	adcs	x9, x9, x12
2961	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
2962
2963	eor	v20.16b, v20.16b, v0.16b
2964	eor	v21.16b, v21.16b, v5.16b
2965	eor	v22.16b, v22.16b, v10.16b
2966	eor	v23.16b, v23.16b, v15.16b
2967
2968	st1	{v20.16b - v23.16b}, [x0], #64
2969
2970	sub	x2, x2, #64
2971
2972	mov	v0.16b, v1.16b
2973	mov	v5.16b, v6.16b
2974	mov	v10.16b, v11.16b
2975	mov	v15.16b, v16.16b
2976
2977.Lopen_128_store_64:
2978
2979	lsr	x4, x2, #4
2980	mov	x3, x1
2981
2982.Lopen_128_hash_64:
2983	cbz	x4, .Lopen_tail_64_store
2984	ldp	x11, x12, [x3], 16
2985	adds	x8, x8, x11
2986	adcs	x9, x9, x12
2987	adc	x10, x10, x15
2988	mul	x11, x8, x16     // [t2:t1:t0] = [acc2:acc1:acc0] * r0
2989	umulh	x12, x8, x16
2990	mul	x13, x9, x16
2991	umulh	x14, x9, x16
2992	adds	x12, x12, x13
2993	mul	x13, x10, x16
2994	adc	x13, x13, x14
2995	mul	x14, x8, x17       // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0]
2996	umulh	x8, x8, x17
2997	adds	x12, x12, x14
2998	mul	x14, x9, x17
2999	umulh	x9, x9, x17
3000	adcs	x14, x14, x8
3001	mul	x10, x10, x17
3002	adc	x10, x10, x9
3003	adds	x13, x13, x14
3004	adc	x14, x10, xzr
3005	and	x10, x13, #3         // At this point acc2 is 2 bits at most (value of 3)
3006	and	x8, x13, #-4
3007	extr	x13, x14, x13, #2
3008	adds	x8, x8, x11
3009	lsr	x11, x14, #2
3010	adc	x9, x14, x11        // No carry out since t0 is 61 bits and t3 is 63 bits
3011	adds	x8, x8, x13
3012	adcs	x9, x9, x12
3013	adc	x10, x10, xzr      // At this point acc2 has the value of 4 at most
3014	sub	x4, x4, #1
3015	b	.Lopen_128_hash_64
3016.cfi_endproc
3017.size	chacha20_poly1305_open,.-chacha20_poly1305_open
3018#endif
3019#endif  // !OPENSSL_NO_ASM
3020.section	.note.GNU-stack,"",%progbits
3021