// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#include "ring_core_generated/prefix_symbols_asm.h"
14#include <ring-core/arm_arch.h>
15
16
17.hidden	OPENSSL_armcap_P
18
// Read-only constants used by the ChaCha20 routines in this file.
19.section	.rodata
20
21.align	5
// 16 bytes that, stored little-endian, spell the ASCII text "expand 32-byte k".
// The routines load them as the first four 32-bit words of every block.
22.Lsigma:
23.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Four 32-bit lanes {1,0,0,0}. Placed directly after .Lsigma so the NEON code
// can reach it by post-incrementing the .Lsigma pointer by 16; it is added to
// the counter vector to step the first counter word.
24.Lone:
25.long	1,0,0,0
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
26.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
27.align	2
28
29.text
30
// ChaCha20_ctr32 -- XOR a buffer with ChaCha20 keystream (dispatcher plus the
// general-purpose-register implementation).
//
// Register arguments, as consumed by the code below:
//   x0  output pointer (written with stp / strb)
//   x1  input pointer (read with ldp / ldrb)
//   x2  length in bytes; zero returns immediately through .Labort
//   x3  pointer to 32 bytes of key material
//   x4  pointer to the 16-byte counter block
//
// For lengths of 192 bytes or more the routine branches to ChaCha20_neon when
// the ARMV7_NEON bit is set in OPENSSL_armcap_P. Otherwise it produces one
// 64-byte block per .Loop_outer pass.
//
// Register roles in the scalar path:
//   x22-x23          sigma constants     (two 32-bit words per register)
//   x24-x27          key                 (two 32-bit words per register)
//   x28, x30         counter block       (two 32-bit words per register)
//   w5-w8            working words "a"   w9-w12          working words "b"
//   w13-w16          working words "c"   w17,w19,w20,w21 working words "d"
//   x4               round-loop counter (the counter pointer is dead by then)
// x18 is not used.
//
// Stack: a 96-byte frame holds x29/x30 and x19-x28. A further 64 bytes below
// it are scratch for the final partial block and are zeroed after use.
//
// The label .Less_than_64 in the tail is also a branch target of
// ChaCha20_neon, which sets up an identical frame before jumping here.
31.globl	ChaCha20_ctr32
32.hidden	ChaCha20_ctr32
33.type	ChaCha20_ctr32,%function
34.align	5
35ChaCha20_ctr32:
	// Macro supplied by arm_arch.h; its expansion is not visible in this file.
36	AARCH64_VALID_CALL_TARGET
	// Zero-length request: return without touching the stack.
37	cbz	x2,.Labort
38#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
39	adrp	x5,:pg_hi21_nc:OPENSSL_armcap_P
40#else
41	adrp	x5,OPENSSL_armcap_P
42#endif
	// Fewer than 192 bytes: always take the scalar path.
43	cmp	x2,#192
44	b.lo	.Lshort
	// Otherwise test the capability word and tail-branch (not a call) to the
	// NEON routine when the bit is set.
45	ldr	w17,[x5,:lo12:OPENSSL_armcap_P]
46	tst	w17,#ARMV7_NEON
47	b.ne	ChaCha20_neon
48
49.Lshort:
	// Macro supplied by arm_arch.h; paired with AARCH64_VALIDATE_LINK_REGISTER
	// before each ret below.
50	AARCH64_SIGN_LINK_REGISTER
	// Push the 96-byte frame and make x29 point at it.
51	stp	x29,x30,[sp,#-96]!
52	add	x29,sp,#0
53
54	adrp	x5,.Lsigma
55	add	x5,x5,:lo12:.Lsigma
	// Save callee-saved x19-x28 into the frame.
56	stp	x19,x20,[sp,#16]
57	stp	x21,x22,[sp,#32]
58	stp	x23,x24,[sp,#48]
59	stp	x25,x26,[sp,#64]
60	stp	x27,x28,[sp,#80]
	// 64 bytes of scratch for the partial-block keystream.
61	sub	sp,sp,#64
62
63	ldp	x22,x23,[x5]		// load sigma
64	ldp	x24,x25,[x3]		// load key
65	ldp	x26,x27,[x3,#16]
	// x30 is free to hold data here: the return address was saved in the frame.
66	ldp	x28,x30,[x4]		// load counter
67#ifdef	__ARMEB__
	// Big-endian build: swap the two 32-bit halves of each loaded register.
68	ror	x24,x24,#32
69	ror	x25,x25,#32
70	ror	x26,x26,#32
71	ror	x27,x27,#32
72	ror	x28,x28,#32
73	ror	x30,x30,#32
74#endif
75
	// One pass per 64-byte block.
76.Loop_outer:
	// Split every packed 64-bit register into two 32-bit working words:
	// low half via a w-register move, high half via a 32-bit right shift.
77	mov	w5,w22			// unpack key block
78	lsr	x6,x22,#32
79	mov	w7,w23
80	lsr	x8,x23,#32
81	mov	w9,w24
82	lsr	x10,x24,#32
83	mov	w11,w25
84	lsr	x12,x25,#32
85	mov	w13,w26
86	lsr	x14,x26,#32
87	mov	w15,w27
88	lsr	x16,x27,#32
89	mov	w17,w28
90	lsr	x19,x28,#32
91	mov	w20,w30
92	lsr	x21,x30,#32
93
	// Ten passes of .Loop, each doing a column round then a diagonal round.
94	mov	x4,#10
	// The flags set here are consumed by "b.lo .Ltail" and "b.hi .Loop_outer"
	// further down; no instruction in between writes the condition flags.
95	subs	x2,x2,#64
96.Loop:
97	sub	x4,x4,#1
	// Column round. Word pairing: (w5,w9,w13,w17) (w6,w10,w14,w19)
	// (w7,w11,w15,w20) (w8,w12,w16,w21). Steps, four lanes at a time:
	//   a += b; d ^= a; d = rotl(d,16)   (ror #16)
	//   c += d; b ^= c; b = rotl(b,12)   (ror #20)
	//   a += b; d ^= a; d = rotl(d,8)    (ror #24)
	//   c += d; b ^= c; b = rotl(b,7)    (ror #25)
98	add	w5,w5,w9
99	add	w6,w6,w10
100	add	w7,w7,w11
101	add	w8,w8,w12
102	eor	w17,w17,w5
103	eor	w19,w19,w6
104	eor	w20,w20,w7
105	eor	w21,w21,w8
106	ror	w17,w17,#16
107	ror	w19,w19,#16
108	ror	w20,w20,#16
109	ror	w21,w21,#16
110	add	w13,w13,w17
111	add	w14,w14,w19
112	add	w15,w15,w20
113	add	w16,w16,w21
114	eor	w9,w9,w13
115	eor	w10,w10,w14
116	eor	w11,w11,w15
117	eor	w12,w12,w16
118	ror	w9,w9,#20
119	ror	w10,w10,#20
120	ror	w11,w11,#20
121	ror	w12,w12,#20
122	add	w5,w5,w9
123	add	w6,w6,w10
124	add	w7,w7,w11
125	add	w8,w8,w12
126	eor	w17,w17,w5
127	eor	w19,w19,w6
128	eor	w20,w20,w7
129	eor	w21,w21,w8
130	ror	w17,w17,#24
131	ror	w19,w19,#24
132	ror	w20,w20,#24
133	ror	w21,w21,#24
134	add	w13,w13,w17
135	add	w14,w14,w19
136	add	w15,w15,w20
137	add	w16,w16,w21
138	eor	w9,w9,w13
139	eor	w10,w10,w14
140	eor	w11,w11,w15
141	eor	w12,w12,w16
142	ror	w9,w9,#25
143	ror	w10,w10,#25
144	ror	w11,w11,#25
145	ror	w12,w12,#25
	// Diagonal round: same four steps with the pairing shifted to
	// (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
146	add	w5,w5,w10
147	add	w6,w6,w11
148	add	w7,w7,w12
149	add	w8,w8,w9
150	eor	w21,w21,w5
151	eor	w17,w17,w6
152	eor	w19,w19,w7
153	eor	w20,w20,w8
154	ror	w21,w21,#16
155	ror	w17,w17,#16
156	ror	w19,w19,#16
157	ror	w20,w20,#16
158	add	w15,w15,w21
159	add	w16,w16,w17
160	add	w13,w13,w19
161	add	w14,w14,w20
162	eor	w10,w10,w15
163	eor	w11,w11,w16
164	eor	w12,w12,w13
165	eor	w9,w9,w14
166	ror	w10,w10,#20
167	ror	w11,w11,#20
168	ror	w12,w12,#20
169	ror	w9,w9,#20
170	add	w5,w5,w10
171	add	w6,w6,w11
172	add	w7,w7,w12
173	add	w8,w8,w9
174	eor	w21,w21,w5
175	eor	w17,w17,w6
176	eor	w19,w19,w7
177	eor	w20,w20,w8
178	ror	w21,w21,#24
179	ror	w17,w17,#24
180	ror	w19,w19,#24
181	ror	w20,w20,#24
182	add	w15,w15,w21
183	add	w16,w16,w17
184	add	w13,w13,w19
185	add	w14,w14,w20
186	eor	w10,w10,w15
187	eor	w11,w11,w16
188	eor	w12,w12,w13
189	eor	w9,w9,w14
190	ror	w10,w10,#25
191	ror	w11,w11,#25
192	ror	w12,w12,#25
193	ror	w9,w9,#25
194	cbnz	x4,.Loop
195
	// Add the original input words back in. Low halves use 32-bit adds; the
	// odd-numbered words add the shifted-down high half with a 64-bit add
	// (any carry above bit 31 is shifted out by the lsl #32 when packing).
196	add	w5,w5,w22		// accumulate key block
197	add	x6,x6,x22,lsr#32
198	add	w7,w7,w23
199	add	x8,x8,x23,lsr#32
200	add	w9,w9,w24
201	add	x10,x10,x24,lsr#32
202	add	w11,w11,w25
203	add	x12,x12,x25,lsr#32
204	add	w13,w13,w26
205	add	x14,x14,x26,lsr#32
206	add	w15,w15,w27
207	add	x16,x16,x27,lsr#32
208	add	w17,w17,w28
209	add	x19,x19,x28,lsr#32
210	add	w20,w20,w30
211	add	x21,x21,x30,lsr#32
212
	// Flags from "subs x2,x2,#64": fewer than 64 bytes were left, so this
	// block is a partial one.
213	b.lo	.Ltail
214
	// Full block: re-pack word pairs into 64-bit registers while loading the
	// 64 input bytes into the registers that have just been consumed.
215	add	x5,x5,x6,lsl#32	// pack
216	add	x7,x7,x8,lsl#32
217	ldp	x6,x8,[x1,#0]		// load input
218	add	x9,x9,x10,lsl#32
219	add	x11,x11,x12,lsl#32
220	ldp	x10,x12,[x1,#16]
221	add	x13,x13,x14,lsl#32
222	add	x15,x15,x16,lsl#32
223	ldp	x14,x16,[x1,#32]
224	add	x17,x17,x19,lsl#32
225	add	x20,x20,x21,lsl#32
226	ldp	x19,x21,[x1,#48]
227	add	x1,x1,#64
228#ifdef	__ARMEB__
229	rev	x5,x5
230	rev	x7,x7
231	rev	x9,x9
232	rev	x11,x11
233	rev	x13,x13
234	rev	x15,x15
235	rev	x17,x17
236	rev	x20,x20
237#endif
	// output = keystream XOR input
238	eor	x5,x5,x6
239	eor	x7,x7,x8
240	eor	x9,x9,x10
241	eor	x11,x11,x12
242	eor	x13,x13,x14
243	eor	x15,x15,x16
244	eor	x17,x17,x19
245	eor	x20,x20,x21
246
247	stp	x5,x7,[x0,#0]		// store output
	// 64-bit add on the packed register whose low half is the first counter word.
248	add	x28,x28,#1			// increment counter
249	stp	x9,x11,[x0,#16]
250	stp	x13,x15,[x0,#32]
251	stp	x17,x20,[x0,#48]
252	add	x0,x0,#64
253
	// Still the flags from "subs": loop while bytes remain after this block.
254	b.hi	.Loop_outer
255
	// Epilogue: drop the scratch area and restore callee-saved registers via x29.
256	ldp	x19,x20,[x29,#16]
257	add	sp,sp,#64
258	ldp	x21,x22,[x29,#32]
259	ldp	x23,x24,[x29,#48]
260	ldp	x25,x26,[x29,#64]
261	ldp	x27,x28,[x29,#80]
262	ldp	x29,x30,[sp],#96
263	AARCH64_VALIDATE_LINK_REGISTER
	// Shared return; the zero-length path arrives here with no frame pushed.
264.Labort:
265	ret
266
267.align	4
268.Ltail:
	// Undo the "subs" so x2 is again the number of bytes left (1..63).
269	add	x2,x2,#64
	// Also entered from ChaCha20_neon with x2 already holding the remaining count.
270.Less_than_64:
	// Point x1, x0 and x4 at the END of input, output and scratch and count
	// x2 up from -len to 0. x0 is biased by -1 because the loop increments
	// x2 before the store.
271	sub	x0,x0,#1
272	add	x1,x1,x2
273	add	x0,x0,x2
274	add	x4,sp,x2
275	neg	x2,x2
276
277	add	x5,x5,x6,lsl#32	// pack
278	add	x7,x7,x8,lsl#32
279	add	x9,x9,x10,lsl#32
280	add	x11,x11,x12,lsl#32
281	add	x13,x13,x14,lsl#32
282	add	x15,x15,x16,lsl#32
283	add	x17,x17,x19,lsl#32
284	add	x20,x20,x21,lsl#32
285#ifdef	__ARMEB__
286	rev	x5,x5
287	rev	x7,x7
288	rev	x9,x9
289	rev	x11,x11
290	rev	x13,x13
291	rev	x15,x15
292	rev	x17,x17
293	rev	x20,x20
294#endif
	// Spill the whole 64-byte keystream block to the stack scratch area.
295	stp	x5,x7,[sp,#0]
296	stp	x9,x11,[sp,#16]
297	stp	x13,x15,[sp,#32]
298	stp	x17,x20,[sp,#48]
299
	// Byte-at-a-time XOR of the remaining input with the spilled keystream.
300.Loop_tail:
301	ldrb	w10,[x1,x2]
302	ldrb	w11,[x4,x2]
303	add	x2,x2,#1
304	eor	w10,w10,w11
305	strb	w10,[x0,x2]
306	cbnz	x2,.Loop_tail
307
	// Wipe the keystream copy from the stack.
308	stp	xzr,xzr,[sp,#0]
309	stp	xzr,xzr,[sp,#16]
310	stp	xzr,xzr,[sp,#32]
311	stp	xzr,xzr,[sp,#48]
312
313	ldp	x19,x20,[x29,#16]
314	add	sp,sp,#64
315	ldp	x21,x22,[x29,#32]
316	ldp	x23,x24,[x29,#48]
317	ldp	x25,x26,[x29,#64]
318	ldp	x27,x28,[x29,#80]
319	ldp	x29,x30,[sp],#96
320	AARCH64_VALIDATE_LINK_REGISTER
321	ret
322.size	ChaCha20_ctr32,.-ChaCha20_ctr32
323
// ChaCha20_neon -- file-local NEON path, reached by a branch from
// ChaCha20_ctr32 with the same register arguments (x0 out, x1 in, x2 length,
// x3 key pointer, x4 counter-block pointer).
//
// Lengths of 512 bytes or more are forwarded to .L512_or_more_neon, a label
// inside ChaCha20_512_neon that sits after that routine's identical prologue,
// so the frame pushed here is reused.
//
// Otherwise each .Loop_outer_neon pass produces 256 bytes: one block in
// general-purpose registers (same register roles as in ChaCha20_ctr32) and
// three blocks in vector registers, with the two instruction streams
// interleaved.
//
// Vector register roles:
//   v24            sigma row            v25,v26  key rows
//   v27,v28,v29    counter rows for the three vector blocks (+1,+2,+3
//                  relative to the scalar block)
//   v31            {1,0,0,0}, later shifted to {4,0,0,0}
//   v0-v3, v4-v7, v16-v19   working rows of vector blocks 1, 2, 3
//   v20-v23        temporaries (rotation scratch, then input data)
//
// Rotation idioms used on the 32-bit lanes:
//   rev32 vX.8h,vX.8h                 rotate by 16
//   ushr #20 + sli #12                rotate left by 12
//   ushr #24 + sli #8                 rotate left by 8
//   ushr #25 + sli #7                 rotate left by 7
324.type	ChaCha20_neon,%function
325.align	5
326ChaCha20_neon:
	// Macro supplied by arm_arch.h; its expansion is not visible in this file.
327	AARCH64_SIGN_LINK_REGISTER
	// Same 96-byte frame layout as ChaCha20_ctr32.
328	stp	x29,x30,[sp,#-96]!
329	add	x29,sp,#0
330
331	adrp	x5,.Lsigma
332	add	x5,x5,:lo12:.Lsigma
333	stp	x19,x20,[sp,#16]
334	stp	x21,x22,[sp,#32]
335	stp	x23,x24,[sp,#48]
336	stp	x25,x26,[sp,#64]
337	stp	x27,x28,[sp,#80]
	// 512 bytes or more: continue in the wider routine (frame already pushed).
338	cmp	x2,#512
339	b.hs	.L512_or_more_neon
340
	// 64 bytes of scratch for the final partial block.
341	sub	sp,sp,#64
342
	// Each input is loaded twice: packed into x-registers for the scalar
	// block and into a vector register for the NEON blocks.
343	ldp	x22,x23,[x5]		// load sigma
	// Post-increment leaves x5 pointing at .Lone.
344	ld1	{v24.4s},[x5],#16
345	ldp	x24,x25,[x3]		// load key
346	ldp	x26,x27,[x3,#16]
347	ld1	{v25.4s,v26.4s},[x3]
348	ldp	x28,x30,[x4]		// load counter
349	ld1	{v27.4s},[x4]
	// v31 = {1,0,0,0}
350	ld1	{v31.4s},[x5]
351#ifdef	__ARMEB__
352	rev64	v24.4s,v24.4s
353	ror	x24,x24,#32
354	ror	x25,x25,#32
355	ror	x26,x26,#32
356	ror	x27,x27,#32
357	ror	x28,x28,#32
358	ror	x30,x30,#32
359#endif
	// The scalar block keeps the caller's counter (in x28); the vector blocks
	// use counter+1, +2 and +3.
360	add	v27.4s,v27.4s,v31.4s		// += 1
361	add	v28.4s,v27.4s,v31.4s
362	add	v29.4s,v28.4s,v31.4s
	// From here on v31 is the per-pass counter stride.
363	shl	v31.4s,v31.4s,#2			// 1 -> 4
364
	// One pass per 256 bytes.
365.Loop_outer_neon:
	// Initialise the scalar working words and the three sets of vector rows
	// from the saved sigma / key / counter values.
366	mov	w5,w22			// unpack key block
367	lsr	x6,x22,#32
368	mov	v0.16b,v24.16b
369	mov	w7,w23
370	lsr	x8,x23,#32
371	mov	v4.16b,v24.16b
372	mov	w9,w24
373	lsr	x10,x24,#32
374	mov	v16.16b,v24.16b
375	mov	w11,w25
376	mov	v1.16b,v25.16b
377	lsr	x12,x25,#32
378	mov	v5.16b,v25.16b
379	mov	w13,w26
380	mov	v17.16b,v25.16b
381	lsr	x14,x26,#32
382	mov	v3.16b,v27.16b
383	mov	w15,w27
384	mov	v7.16b,v28.16b
385	lsr	x16,x27,#32
386	mov	v19.16b,v29.16b
387	mov	w17,w28
388	mov	v2.16b,v26.16b
389	lsr	x19,x28,#32
390	mov	v6.16b,v26.16b
391	mov	w20,w30
392	mov	v18.16b,v26.16b
393	lsr	x21,x30,#32
394
	// Ten passes of .Loop_neon, each a column round plus a diagonal round.
395	mov	x4,#10
	// Flags set here are consumed by "b.lo .Ltail_neon" and
	// "b.hi .Loop_outer_neon"; nothing in between writes the condition flags.
396	subs	x2,x2,#256
397.Loop_neon:
398	sub	x4,x4,#1
	// First half: column round. Vector and scalar instructions alternate;
	// the scalar stream is the same sequence as .Loop in ChaCha20_ctr32.
399	add	v0.4s,v0.4s,v1.4s
400	add	w5,w5,w9
401	add	v4.4s,v4.4s,v5.4s
402	add	w6,w6,w10
403	add	v16.4s,v16.4s,v17.4s
404	add	w7,w7,w11
405	eor	v3.16b,v3.16b,v0.16b
406	add	w8,w8,w12
407	eor	v7.16b,v7.16b,v4.16b
408	eor	w17,w17,w5
409	eor	v19.16b,v19.16b,v16.16b
410	eor	w19,w19,w6
411	rev32	v3.8h,v3.8h
412	eor	w20,w20,w7
413	rev32	v7.8h,v7.8h
414	eor	w21,w21,w8
415	rev32	v19.8h,v19.8h
416	ror	w17,w17,#16
417	add	v2.4s,v2.4s,v3.4s
418	ror	w19,w19,#16
419	add	v6.4s,v6.4s,v7.4s
420	ror	w20,w20,#16
421	add	v18.4s,v18.4s,v19.4s
422	ror	w21,w21,#16
423	eor	v20.16b,v1.16b,v2.16b
424	add	w13,w13,w17
425	eor	v21.16b,v5.16b,v6.16b
426	add	w14,w14,w19
427	eor	v22.16b,v17.16b,v18.16b
428	add	w15,w15,w20
429	ushr	v1.4s,v20.4s,#20
430	add	w16,w16,w21
431	ushr	v5.4s,v21.4s,#20
432	eor	w9,w9,w13
433	ushr	v17.4s,v22.4s,#20
434	eor	w10,w10,w14
435	sli	v1.4s,v20.4s,#12
436	eor	w11,w11,w15
437	sli	v5.4s,v21.4s,#12
438	eor	w12,w12,w16
439	sli	v17.4s,v22.4s,#12
440	ror	w9,w9,#20
441	add	v0.4s,v0.4s,v1.4s
442	ror	w10,w10,#20
443	add	v4.4s,v4.4s,v5.4s
444	ror	w11,w11,#20
445	add	v16.4s,v16.4s,v17.4s
446	ror	w12,w12,#20
447	eor	v20.16b,v3.16b,v0.16b
448	add	w5,w5,w9
449	eor	v21.16b,v7.16b,v4.16b
450	add	w6,w6,w10
451	eor	v22.16b,v19.16b,v16.16b
452	add	w7,w7,w11
453	ushr	v3.4s,v20.4s,#24
454	add	w8,w8,w12
455	ushr	v7.4s,v21.4s,#24
456	eor	w17,w17,w5
457	ushr	v19.4s,v22.4s,#24
458	eor	w19,w19,w6
459	sli	v3.4s,v20.4s,#8
460	eor	w20,w20,w7
461	sli	v7.4s,v21.4s,#8
462	eor	w21,w21,w8
463	sli	v19.4s,v22.4s,#8
464	ror	w17,w17,#24
465	add	v2.4s,v2.4s,v3.4s
466	ror	w19,w19,#24
467	add	v6.4s,v6.4s,v7.4s
468	ror	w20,w20,#24
469	add	v18.4s,v18.4s,v19.4s
470	ror	w21,w21,#24
471	eor	v20.16b,v1.16b,v2.16b
472	add	w13,w13,w17
473	eor	v21.16b,v5.16b,v6.16b
474	add	w14,w14,w19
475	eor	v22.16b,v17.16b,v18.16b
476	add	w15,w15,w20
477	ushr	v1.4s,v20.4s,#25
478	add	w16,w16,w21
479	ushr	v5.4s,v21.4s,#25
480	eor	w9,w9,w13
481	ushr	v17.4s,v22.4s,#25
482	eor	w10,w10,w14
483	sli	v1.4s,v20.4s,#7
484	eor	w11,w11,w15
485	sli	v5.4s,v21.4s,#7
486	eor	w12,w12,w16
487	sli	v17.4s,v22.4s,#7
488	ror	w9,w9,#25
	// Rotate the lanes of rows c, d and b by 2, 3 and 1 positions so the
	// same lane-wise code operates on diagonals in the second half.
489	ext	v2.16b,v2.16b,v2.16b,#8
490	ror	w10,w10,#25
491	ext	v6.16b,v6.16b,v6.16b,#8
492	ror	w11,w11,#25
493	ext	v18.16b,v18.16b,v18.16b,#8
494	ror	w12,w12,#25
495	ext	v3.16b,v3.16b,v3.16b,#12
496	ext	v7.16b,v7.16b,v7.16b,#12
497	ext	v19.16b,v19.16b,v19.16b,#12
498	ext	v1.16b,v1.16b,v1.16b,#4
499	ext	v5.16b,v5.16b,v5.16b,#4
500	ext	v17.16b,v17.16b,v17.16b,#4
	// Second half: diagonal round (the scalar stream switches register
	// pairing instead of moving data).
501	add	v0.4s,v0.4s,v1.4s
502	add	w5,w5,w10
503	add	v4.4s,v4.4s,v5.4s
504	add	w6,w6,w11
505	add	v16.4s,v16.4s,v17.4s
506	add	w7,w7,w12
507	eor	v3.16b,v3.16b,v0.16b
508	add	w8,w8,w9
509	eor	v7.16b,v7.16b,v4.16b
510	eor	w21,w21,w5
511	eor	v19.16b,v19.16b,v16.16b
512	eor	w17,w17,w6
513	rev32	v3.8h,v3.8h
514	eor	w19,w19,w7
515	rev32	v7.8h,v7.8h
516	eor	w20,w20,w8
517	rev32	v19.8h,v19.8h
518	ror	w21,w21,#16
519	add	v2.4s,v2.4s,v3.4s
520	ror	w17,w17,#16
521	add	v6.4s,v6.4s,v7.4s
522	ror	w19,w19,#16
523	add	v18.4s,v18.4s,v19.4s
524	ror	w20,w20,#16
525	eor	v20.16b,v1.16b,v2.16b
526	add	w15,w15,w21
527	eor	v21.16b,v5.16b,v6.16b
528	add	w16,w16,w17
529	eor	v22.16b,v17.16b,v18.16b
530	add	w13,w13,w19
531	ushr	v1.4s,v20.4s,#20
532	add	w14,w14,w20
533	ushr	v5.4s,v21.4s,#20
534	eor	w10,w10,w15
535	ushr	v17.4s,v22.4s,#20
536	eor	w11,w11,w16
537	sli	v1.4s,v20.4s,#12
538	eor	w12,w12,w13
539	sli	v5.4s,v21.4s,#12
540	eor	w9,w9,w14
541	sli	v17.4s,v22.4s,#12
542	ror	w10,w10,#20
543	add	v0.4s,v0.4s,v1.4s
544	ror	w11,w11,#20
545	add	v4.4s,v4.4s,v5.4s
546	ror	w12,w12,#20
547	add	v16.4s,v16.4s,v17.4s
548	ror	w9,w9,#20
549	eor	v20.16b,v3.16b,v0.16b
550	add	w5,w5,w10
551	eor	v21.16b,v7.16b,v4.16b
552	add	w6,w6,w11
553	eor	v22.16b,v19.16b,v16.16b
554	add	w7,w7,w12
555	ushr	v3.4s,v20.4s,#24
556	add	w8,w8,w9
557	ushr	v7.4s,v21.4s,#24
558	eor	w21,w21,w5
559	ushr	v19.4s,v22.4s,#24
560	eor	w17,w17,w6
561	sli	v3.4s,v20.4s,#8
562	eor	w19,w19,w7
563	sli	v7.4s,v21.4s,#8
564	eor	w20,w20,w8
565	sli	v19.4s,v22.4s,#8
566	ror	w21,w21,#24
567	add	v2.4s,v2.4s,v3.4s
568	ror	w17,w17,#24
569	add	v6.4s,v6.4s,v7.4s
570	ror	w19,w19,#24
571	add	v18.4s,v18.4s,v19.4s
572	ror	w20,w20,#24
573	eor	v20.16b,v1.16b,v2.16b
574	add	w15,w15,w21
575	eor	v21.16b,v5.16b,v6.16b
576	add	w16,w16,w17
577	eor	v22.16b,v17.16b,v18.16b
578	add	w13,w13,w19
579	ushr	v1.4s,v20.4s,#25
580	add	w14,w14,w20
581	ushr	v5.4s,v21.4s,#25
582	eor	w10,w10,w15
583	ushr	v17.4s,v22.4s,#25
584	eor	w11,w11,w16
585	sli	v1.4s,v20.4s,#7
586	eor	w12,w12,w13
587	sli	v5.4s,v21.4s,#7
588	eor	w9,w9,w14
589	sli	v17.4s,v22.4s,#7
590	ror	w10,w10,#25
	// Undo the lane rotation (2, 1 and 3 positions) to return to column order.
591	ext	v2.16b,v2.16b,v2.16b,#8
592	ror	w11,w11,#25
593	ext	v6.16b,v6.16b,v6.16b,#8
594	ror	w12,w12,#25
595	ext	v18.16b,v18.16b,v18.16b,#8
596	ror	w9,w9,#25
597	ext	v3.16b,v3.16b,v3.16b,#4
598	ext	v7.16b,v7.16b,v7.16b,#4
599	ext	v19.16b,v19.16b,v19.16b,#4
600	ext	v1.16b,v1.16b,v1.16b,#12
601	ext	v5.16b,v5.16b,v5.16b,#12
602	ext	v17.16b,v17.16b,v17.16b,#12
603	cbnz	x4,.Loop_neon
604
	// Add the original input rows/words back into all four blocks. Each
	// vector block adds its own counter row (v27, v28 or v29).
605	add	w5,w5,w22		// accumulate key block
606	add	v0.4s,v0.4s,v24.4s
607	add	x6,x6,x22,lsr#32
608	add	v4.4s,v4.4s,v24.4s
609	add	w7,w7,w23
610	add	v16.4s,v16.4s,v24.4s
611	add	x8,x8,x23,lsr#32
612	add	v2.4s,v2.4s,v26.4s
613	add	w9,w9,w24
614	add	v6.4s,v6.4s,v26.4s
615	add	x10,x10,x24,lsr#32
616	add	v18.4s,v18.4s,v26.4s
617	add	w11,w11,w25
618	add	v3.4s,v3.4s,v27.4s
619	add	x12,x12,x25,lsr#32
620	add	w13,w13,w26
621	add	v7.4s,v7.4s,v28.4s
622	add	x14,x14,x26,lsr#32
623	add	w15,w15,w27
624	add	v19.4s,v19.4s,v29.4s
625	add	x16,x16,x27,lsr#32
626	add	w17,w17,w28
627	add	v1.4s,v1.4s,v25.4s
628	add	x19,x19,x28,lsr#32
629	add	w20,w20,w30
630	add	v5.4s,v5.4s,v25.4s
631	add	x21,x21,x30,lsr#32
632	add	v17.4s,v17.4s,v25.4s
633
	// Flags from "subs x2,x2,#256": fewer than 256 bytes were left.
634	b.lo	.Ltail_neon
635
	// Full 256 bytes. The scalar block covers bytes 0..63.
636	add	x5,x5,x6,lsl#32	// pack
637	add	x7,x7,x8,lsl#32
638	ldp	x6,x8,[x1,#0]		// load input
639	add	x9,x9,x10,lsl#32
640	add	x11,x11,x12,lsl#32
641	ldp	x10,x12,[x1,#16]
642	add	x13,x13,x14,lsl#32
643	add	x15,x15,x16,lsl#32
644	ldp	x14,x16,[x1,#32]
645	add	x17,x17,x19,lsl#32
646	add	x20,x20,x21,lsl#32
647	ldp	x19,x21,[x1,#48]
648	add	x1,x1,#64
649#ifdef	__ARMEB__
650	rev	x5,x5
651	rev	x7,x7
652	rev	x9,x9
653	rev	x11,x11
654	rev	x13,x13
655	rev	x15,x15
656	rev	x17,x17
657	rev	x20,x20
658#endif
	// Input bytes 64..127 for vector block 1.
659	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
660	eor	x5,x5,x6
661	eor	x7,x7,x8
662	eor	x9,x9,x10
663	eor	x11,x11,x12
664	eor	x13,x13,x14
665	eor	v0.16b,v0.16b,v20.16b
666	eor	x15,x15,x16
667	eor	v1.16b,v1.16b,v21.16b
668	eor	x17,x17,x19
669	eor	v2.16b,v2.16b,v22.16b
670	eor	x20,x20,x21
671	eor	v3.16b,v3.16b,v23.16b
	// Input bytes 128..191 for vector block 2.
672	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
673
674	stp	x5,x7,[x0,#0]		// store output
	// Advance the scalar counter and the three vector counters by four blocks.
675	add	x28,x28,#4			// increment counter
676	stp	x9,x11,[x0,#16]
677	add	v27.4s,v27.4s,v31.4s		// += 4
678	stp	x13,x15,[x0,#32]
679	add	v28.4s,v28.4s,v31.4s
680	stp	x17,x20,[x0,#48]
681	add	v29.4s,v29.4s,v31.4s
682	add	x0,x0,#64
683
	// v0-v3 are stored, then reused to hold input bytes 192..255.
684	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
685	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
686
687	eor	v4.16b,v4.16b,v20.16b
688	eor	v5.16b,v5.16b,v21.16b
689	eor	v6.16b,v6.16b,v22.16b
690	eor	v7.16b,v7.16b,v23.16b
691	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
692
693	eor	v16.16b,v16.16b,v0.16b
694	eor	v17.16b,v17.16b,v1.16b
695	eor	v18.16b,v18.16b,v2.16b
696	eor	v19.16b,v19.16b,v3.16b
697	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
698
	// Still the flags from "subs": loop while bytes remain.
699	b.hi	.Loop_outer_neon
700
	// Length was an exact multiple of 256: normal epilogue.
701	ldp	x19,x20,[x29,#16]
702	add	sp,sp,#64
703	ldp	x21,x22,[x29,#32]
704	ldp	x23,x24,[x29,#48]
705	ldp	x25,x26,[x29,#64]
706	ldp	x27,x28,[x29,#80]
707	ldp	x29,x30,[sp],#96
708	AARCH64_VALIDATE_LINK_REGISTER
709	ret
710
	// 1..255 bytes remain and four keystream blocks are available: the
	// scalar block, then v0-v3, v4-v7, v16-v19.
711.Ltail_neon:
	// Undo the "subs" so x2 is the remaining byte count again.
712	add	x2,x2,#256
713	cmp	x2,#64
	// Under 64 bytes: finish with the scalar block via the shared byte loop
	// and epilogue in ChaCha20_ctr32 (the frame layout is identical).
714	b.lo	.Less_than_64
715
	// At least 64 bytes: consume the scalar block in full.
716	add	x5,x5,x6,lsl#32	// pack
717	add	x7,x7,x8,lsl#32
718	ldp	x6,x8,[x1,#0]		// load input
719	add	x9,x9,x10,lsl#32
720	add	x11,x11,x12,lsl#32
721	ldp	x10,x12,[x1,#16]
722	add	x13,x13,x14,lsl#32
723	add	x15,x15,x16,lsl#32
724	ldp	x14,x16,[x1,#32]
725	add	x17,x17,x19,lsl#32
726	add	x20,x20,x21,lsl#32
727	ldp	x19,x21,[x1,#48]
728	add	x1,x1,#64
729#ifdef	__ARMEB__
730	rev	x5,x5
731	rev	x7,x7
732	rev	x9,x9
733	rev	x11,x11
734	rev	x13,x13
735	rev	x15,x15
736	rev	x17,x17
737	rev	x20,x20
738#endif
739	eor	x5,x5,x6
740	eor	x7,x7,x8
741	eor	x9,x9,x10
742	eor	x11,x11,x12
743	eor	x13,x13,x14
744	eor	x15,x15,x16
745	eor	x17,x17,x19
746	eor	x20,x20,x21
747
748	stp	x5,x7,[x0,#0]		// store output
	// x28 is not read again on this path; it is reloaded from the frame on exit.
749	add	x28,x28,#4			// increment counter
750	stp	x9,x11,[x0,#16]
751	stp	x13,x15,[x0,#32]
752	stp	x17,x20,[x0,#48]
753	add	x0,x0,#64
	// Flags still come from "cmp x2,#64" above: exactly 64 bytes means done.
754	b.eq	.Ldone_neon
755	sub	x2,x2,#64
756	cmp	x2,#64
757	b.lo	.Less_than_128
758
	// Second full 64 bytes: vector block 1.
759	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
760	eor	v0.16b,v0.16b,v20.16b
761	eor	v1.16b,v1.16b,v21.16b
762	eor	v2.16b,v2.16b,v22.16b
763	eor	v3.16b,v3.16b,v23.16b
764	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	// Flags from the preceding cmp (vector instructions do not change them).
765	b.eq	.Ldone_neon
766	sub	x2,x2,#64
767	cmp	x2,#64
768	b.lo	.Less_than_192
769
	// Third full 64 bytes: vector block 2.
770	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
771	eor	v4.16b,v4.16b,v20.16b
772	eor	v5.16b,v5.16b,v21.16b
773	eor	v6.16b,v6.16b,v22.16b
774	eor	v7.16b,v7.16b,v23.16b
775	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
776	b.eq	.Ldone_neon
777	sub	x2,x2,#64
778
	// Spill whichever block covers the final partial chunk to the stack
	// scratch so the byte loop below can index into it.
779	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
780	b	.Last_neon
781
782.Less_than_128:
783	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
784	b	.Last_neon
785.Less_than_192:
786	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
787	b	.Last_neon
788
789.align	4
790.Last_neon:
	// Same end-relative addressing as the scalar tail: pointers are moved to
	// the end of each buffer and x2 counts up from -len to 0 (x0 is biased
	// by -1 because x2 is incremented before the store).
791	sub	x0,x0,#1
792	add	x1,x1,x2
793	add	x0,x0,x2
794	add	x4,sp,x2
795	neg	x2,x2
796
797.Loop_tail_neon:
798	ldrb	w10,[x1,x2]
799	ldrb	w11,[x4,x2]
800	add	x2,x2,#1
801	eor	w10,w10,w11
802	strb	w10,[x0,x2]
803	cbnz	x2,.Loop_tail_neon
804
	// Wipe the keystream copy from the stack.
805	stp	xzr,xzr,[sp,#0]
806	stp	xzr,xzr,[sp,#16]
807	stp	xzr,xzr,[sp,#32]
808	stp	xzr,xzr,[sp,#48]
809
	// Common exit for the tail paths.
810.Ldone_neon:
811	ldp	x19,x20,[x29,#16]
812	add	sp,sp,#64
813	ldp	x21,x22,[x29,#32]
814	ldp	x23,x24,[x29,#48]
815	ldp	x25,x26,[x29,#64]
816	ldp	x27,x28,[x29,#80]
817	ldp	x29,x30,[sp],#96
818	AARCH64_VALIDATE_LINK_REGISTER
819	ret
820.size	ChaCha20_neon,.-ChaCha20_neon
821.type	ChaCha20_512_neon,%function
822.align	5
823ChaCha20_512_neon:
824	AARCH64_SIGN_LINK_REGISTER
825	stp	x29,x30,[sp,#-96]!
826	add	x29,sp,#0
827
828	adrp	x5,.Lsigma
829	add	x5,x5,:lo12:.Lsigma
830	stp	x19,x20,[sp,#16]
831	stp	x21,x22,[sp,#32]
832	stp	x23,x24,[sp,#48]
833	stp	x25,x26,[sp,#64]
834	stp	x27,x28,[sp,#80]
835
836.L512_or_more_neon:
837	sub	sp,sp,#128+64
838
839	ldp	x22,x23,[x5]		// load sigma
840	ld1	{v24.4s},[x5],#16
841	ldp	x24,x25,[x3]		// load key
842	ldp	x26,x27,[x3,#16]
843	ld1	{v25.4s,v26.4s},[x3]
844	ldp	x28,x30,[x4]		// load counter
845	ld1	{v27.4s},[x4]
846	ld1	{v31.4s},[x5]
847#ifdef	__ARMEB__
848	rev64	v24.4s,v24.4s
849	ror	x24,x24,#32
850	ror	x25,x25,#32
851	ror	x26,x26,#32
852	ror	x27,x27,#32
853	ror	x28,x28,#32
854	ror	x30,x30,#32
855#endif
856	add	v27.4s,v27.4s,v31.4s		// += 1
857	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
858	add	v27.4s,v27.4s,v31.4s		// not typo
859	str	q26,[sp,#32]
860	add	v28.4s,v27.4s,v31.4s
861	add	v29.4s,v28.4s,v31.4s
862	add	v30.4s,v29.4s,v31.4s
863	shl	v31.4s,v31.4s,#2			// 1 -> 4
864
865	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
866	stp	d10,d11,[sp,#128+16]
867	stp	d12,d13,[sp,#128+32]
868	stp	d14,d15,[sp,#128+48]
869
870	sub	x2,x2,#512			// not typo
871
872.Loop_outer_512_neon:
873	mov	v0.16b,v24.16b
874	mov	v4.16b,v24.16b
875	mov	v8.16b,v24.16b
876	mov	v12.16b,v24.16b
877	mov	v16.16b,v24.16b
878	mov	v20.16b,v24.16b
879	mov	v1.16b,v25.16b
880	mov	w5,w22			// unpack key block
881	mov	v5.16b,v25.16b
882	lsr	x6,x22,#32
883	mov	v9.16b,v25.16b
884	mov	w7,w23
885	mov	v13.16b,v25.16b
886	lsr	x8,x23,#32
887	mov	v17.16b,v25.16b
888	mov	w9,w24
889	mov	v21.16b,v25.16b
890	lsr	x10,x24,#32
891	mov	v3.16b,v27.16b
892	mov	w11,w25
893	mov	v7.16b,v28.16b
894	lsr	x12,x25,#32
895	mov	v11.16b,v29.16b
896	mov	w13,w26
897	mov	v15.16b,v30.16b
898	lsr	x14,x26,#32
899	mov	v2.16b,v26.16b
900	mov	w15,w27
901	mov	v6.16b,v26.16b
902	lsr	x16,x27,#32
903	add	v19.4s,v3.4s,v31.4s			// +4
904	mov	w17,w28
905	add	v23.4s,v7.4s,v31.4s			// +4
906	lsr	x19,x28,#32
907	mov	v10.16b,v26.16b
908	mov	w20,w30
909	mov	v14.16b,v26.16b
910	lsr	x21,x30,#32
911	mov	v18.16b,v26.16b
912	stp	q27,q28,[sp,#48]		// off-load key block, variable part
913	mov	v22.16b,v26.16b
914	str	q29,[sp,#80]
915
916	mov	x4,#5
917	subs	x2,x2,#512
918.Loop_upper_neon:
919	sub	x4,x4,#1
920	add	v0.4s,v0.4s,v1.4s
921	add	w5,w5,w9
922	add	v4.4s,v4.4s,v5.4s
923	add	w6,w6,w10
924	add	v8.4s,v8.4s,v9.4s
925	add	w7,w7,w11
926	add	v12.4s,v12.4s,v13.4s
927	add	w8,w8,w12
928	add	v16.4s,v16.4s,v17.4s
929	eor	w17,w17,w5
930	add	v20.4s,v20.4s,v21.4s
931	eor	w19,w19,w6
932	eor	v3.16b,v3.16b,v0.16b
933	eor	w20,w20,w7
934	eor	v7.16b,v7.16b,v4.16b
935	eor	w21,w21,w8
936	eor	v11.16b,v11.16b,v8.16b
937	ror	w17,w17,#16
938	eor	v15.16b,v15.16b,v12.16b
939	ror	w19,w19,#16
940	eor	v19.16b,v19.16b,v16.16b
941	ror	w20,w20,#16
942	eor	v23.16b,v23.16b,v20.16b
943	ror	w21,w21,#16
944	rev32	v3.8h,v3.8h
945	add	w13,w13,w17
946	rev32	v7.8h,v7.8h
947	add	w14,w14,w19
948	rev32	v11.8h,v11.8h
949	add	w15,w15,w20
950	rev32	v15.8h,v15.8h
951	add	w16,w16,w21
952	rev32	v19.8h,v19.8h
953	eor	w9,w9,w13
954	rev32	v23.8h,v23.8h
955	eor	w10,w10,w14
956	add	v2.4s,v2.4s,v3.4s
957	eor	w11,w11,w15
958	add	v6.4s,v6.4s,v7.4s
959	eor	w12,w12,w16
960	add	v10.4s,v10.4s,v11.4s
961	ror	w9,w9,#20
962	add	v14.4s,v14.4s,v15.4s
963	ror	w10,w10,#20
964	add	v18.4s,v18.4s,v19.4s
965	ror	w11,w11,#20
966	add	v22.4s,v22.4s,v23.4s
967	ror	w12,w12,#20
968	eor	v24.16b,v1.16b,v2.16b
969	add	w5,w5,w9
970	eor	v25.16b,v5.16b,v6.16b
971	add	w6,w6,w10
972	eor	v26.16b,v9.16b,v10.16b
973	add	w7,w7,w11
974	eor	v27.16b,v13.16b,v14.16b
975	add	w8,w8,w12
976	eor	v28.16b,v17.16b,v18.16b
977	eor	w17,w17,w5
978	eor	v29.16b,v21.16b,v22.16b
979	eor	w19,w19,w6
980	ushr	v1.4s,v24.4s,#20
981	eor	w20,w20,w7
982	ushr	v5.4s,v25.4s,#20
983	eor	w21,w21,w8
984	ushr	v9.4s,v26.4s,#20
985	ror	w17,w17,#24
986	ushr	v13.4s,v27.4s,#20
987	ror	w19,w19,#24
988	ushr	v17.4s,v28.4s,#20
989	ror	w20,w20,#24
990	ushr	v21.4s,v29.4s,#20
991	ror	w21,w21,#24
992	sli	v1.4s,v24.4s,#12
993	add	w13,w13,w17
994	sli	v5.4s,v25.4s,#12
995	add	w14,w14,w19
996	sli	v9.4s,v26.4s,#12
997	add	w15,w15,w20
998	sli	v13.4s,v27.4s,#12
999	add	w16,w16,w21
1000	sli	v17.4s,v28.4s,#12
1001	eor	w9,w9,w13
1002	sli	v21.4s,v29.4s,#12
1003	eor	w10,w10,w14
1004	add	v0.4s,v0.4s,v1.4s
1005	eor	w11,w11,w15
1006	add	v4.4s,v4.4s,v5.4s
1007	eor	w12,w12,w16
1008	add	v8.4s,v8.4s,v9.4s
1009	ror	w9,w9,#25
1010	add	v12.4s,v12.4s,v13.4s
1011	ror	w10,w10,#25
1012	add	v16.4s,v16.4s,v17.4s
1013	ror	w11,w11,#25
1014	add	v20.4s,v20.4s,v21.4s
1015	ror	w12,w12,#25
1016	eor	v24.16b,v3.16b,v0.16b
1017	add	w5,w5,w10
1018	eor	v25.16b,v7.16b,v4.16b
1019	add	w6,w6,w11
1020	eor	v26.16b,v11.16b,v8.16b
1021	add	w7,w7,w12
1022	eor	v27.16b,v15.16b,v12.16b
1023	add	w8,w8,w9
1024	eor	v28.16b,v19.16b,v16.16b
1025	eor	w21,w21,w5
1026	eor	v29.16b,v23.16b,v20.16b
1027	eor	w17,w17,w6
1028	ushr	v3.4s,v24.4s,#24
1029	eor	w19,w19,w7
1030	ushr	v7.4s,v25.4s,#24
1031	eor	w20,w20,w8
1032	ushr	v11.4s,v26.4s,#24
1033	ror	w21,w21,#16
1034	ushr	v15.4s,v27.4s,#24
1035	ror	w17,w17,#16
1036	ushr	v19.4s,v28.4s,#24
1037	ror	w19,w19,#16
1038	ushr	v23.4s,v29.4s,#24
1039	ror	w20,w20,#16
1040	sli	v3.4s,v24.4s,#8
1041	add	w15,w15,w21
1042	sli	v7.4s,v25.4s,#8
1043	add	w16,w16,w17
1044	sli	v11.4s,v26.4s,#8
1045	add	w13,w13,w19
1046	sli	v15.4s,v27.4s,#8
1047	add	w14,w14,w20
1048	sli	v19.4s,v28.4s,#8
1049	eor	w10,w10,w15
1050	sli	v23.4s,v29.4s,#8
1051	eor	w11,w11,w16
1052	add	v2.4s,v2.4s,v3.4s
1053	eor	w12,w12,w13
1054	add	v6.4s,v6.4s,v7.4s
1055	eor	w9,w9,w14
1056	add	v10.4s,v10.4s,v11.4s
1057	ror	w10,w10,#20
1058	add	v14.4s,v14.4s,v15.4s
1059	ror	w11,w11,#20
1060	add	v18.4s,v18.4s,v19.4s
1061	ror	w12,w12,#20
1062	add	v22.4s,v22.4s,v23.4s
1063	ror	w9,w9,#20
1064	eor	v24.16b,v1.16b,v2.16b
1065	add	w5,w5,w10
1066	eor	v25.16b,v5.16b,v6.16b
1067	add	w6,w6,w11
1068	eor	v26.16b,v9.16b,v10.16b
1069	add	w7,w7,w12
1070	eor	v27.16b,v13.16b,v14.16b
1071	add	w8,w8,w9
1072	eor	v28.16b,v17.16b,v18.16b
1073	eor	w21,w21,w5
1074	eor	v29.16b,v21.16b,v22.16b
1075	eor	w17,w17,w6
1076	ushr	v1.4s,v24.4s,#25
1077	eor	w19,w19,w7
1078	ushr	v5.4s,v25.4s,#25
1079	eor	w20,w20,w8
1080	ushr	v9.4s,v26.4s,#25
1081	ror	w21,w21,#24
1082	ushr	v13.4s,v27.4s,#25
1083	ror	w17,w17,#24
1084	ushr	v17.4s,v28.4s,#25
1085	ror	w19,w19,#24
1086	ushr	v21.4s,v29.4s,#25
1087	ror	w20,w20,#24
1088	sli	v1.4s,v24.4s,#7
1089	add	w15,w15,w21
1090	sli	v5.4s,v25.4s,#7
1091	add	w16,w16,w17
1092	sli	v9.4s,v26.4s,#7
1093	add	w13,w13,w19
1094	sli	v13.4s,v27.4s,#7
1095	add	w14,w14,w20
1096	sli	v17.4s,v28.4s,#7
1097	eor	w10,w10,w15
1098	sli	v21.4s,v29.4s,#7
1099	eor	w11,w11,w16
1100	ext	v2.16b,v2.16b,v2.16b,#8
1101	eor	w12,w12,w13
1102	ext	v6.16b,v6.16b,v6.16b,#8
1103	eor	w9,w9,w14
1104	ext	v10.16b,v10.16b,v10.16b,#8
1105	ror	w10,w10,#25
1106	ext	v14.16b,v14.16b,v14.16b,#8
1107	ror	w11,w11,#25
1108	ext	v18.16b,v18.16b,v18.16b,#8
1109	ror	w12,w12,#25
1110	ext	v22.16b,v22.16b,v22.16b,#8
1111	ror	w9,w9,#25
1112	ext	v3.16b,v3.16b,v3.16b,#12
1113	ext	v7.16b,v7.16b,v7.16b,#12
1114	ext	v11.16b,v11.16b,v11.16b,#12
1115	ext	v15.16b,v15.16b,v15.16b,#12
1116	ext	v19.16b,v19.16b,v19.16b,#12
1117	ext	v23.16b,v23.16b,v23.16b,#12
1118	ext	v1.16b,v1.16b,v1.16b,#4
1119	ext	v5.16b,v5.16b,v5.16b,#4
1120	ext	v9.16b,v9.16b,v9.16b,#4
1121	ext	v13.16b,v13.16b,v13.16b,#4
1122	ext	v17.16b,v17.16b,v17.16b,#4
1123	ext	v21.16b,v21.16b,v21.16b,#4
1124	add	v0.4s,v0.4s,v1.4s
1125	add	w5,w5,w9
1126	add	v4.4s,v4.4s,v5.4s
1127	add	w6,w6,w10
1128	add	v8.4s,v8.4s,v9.4s
1129	add	w7,w7,w11
1130	add	v12.4s,v12.4s,v13.4s
1131	add	w8,w8,w12
1132	add	v16.4s,v16.4s,v17.4s
1133	eor	w17,w17,w5
1134	add	v20.4s,v20.4s,v21.4s
1135	eor	w19,w19,w6
1136	eor	v3.16b,v3.16b,v0.16b
1137	eor	w20,w20,w7
1138	eor	v7.16b,v7.16b,v4.16b
1139	eor	w21,w21,w8
1140	eor	v11.16b,v11.16b,v8.16b
1141	ror	w17,w17,#16
1142	eor	v15.16b,v15.16b,v12.16b
1143	ror	w19,w19,#16
1144	eor	v19.16b,v19.16b,v16.16b
1145	ror	w20,w20,#16
1146	eor	v23.16b,v23.16b,v20.16b
1147	ror	w21,w21,#16
1148	rev32	v3.8h,v3.8h
1149	add	w13,w13,w17
1150	rev32	v7.8h,v7.8h
1151	add	w14,w14,w19
1152	rev32	v11.8h,v11.8h
1153	add	w15,w15,w20
1154	rev32	v15.8h,v15.8h
1155	add	w16,w16,w21
1156	rev32	v19.8h,v19.8h
1157	eor	w9,w9,w13
1158	rev32	v23.8h,v23.8h
1159	eor	w10,w10,w14
1160	add	v2.4s,v2.4s,v3.4s
1161	eor	w11,w11,w15
1162	add	v6.4s,v6.4s,v7.4s
1163	eor	w12,w12,w16
1164	add	v10.4s,v10.4s,v11.4s
1165	ror	w9,w9,#20
1166	add	v14.4s,v14.4s,v15.4s
1167	ror	w10,w10,#20
1168	add	v18.4s,v18.4s,v19.4s
1169	ror	w11,w11,#20
1170	add	v22.4s,v22.4s,v23.4s
1171	ror	w12,w12,#20
1172	eor	v24.16b,v1.16b,v2.16b
1173	add	w5,w5,w9
1174	eor	v25.16b,v5.16b,v6.16b
1175	add	w6,w6,w10
1176	eor	v26.16b,v9.16b,v10.16b
1177	add	w7,w7,w11
1178	eor	v27.16b,v13.16b,v14.16b
1179	add	w8,w8,w12
1180	eor	v28.16b,v17.16b,v18.16b
1181	eor	w17,w17,w5
1182	eor	v29.16b,v21.16b,v22.16b
1183	eor	w19,w19,w6
1184	ushr	v1.4s,v24.4s,#20
1185	eor	w20,w20,w7
1186	ushr	v5.4s,v25.4s,#20
1187	eor	w21,w21,w8
1188	ushr	v9.4s,v26.4s,#20
1189	ror	w17,w17,#24
1190	ushr	v13.4s,v27.4s,#20
1191	ror	w19,w19,#24
1192	ushr	v17.4s,v28.4s,#20
1193	ror	w20,w20,#24
1194	ushr	v21.4s,v29.4s,#20
1195	ror	w21,w21,#24
1196	sli	v1.4s,v24.4s,#12
1197	add	w13,w13,w17
1198	sli	v5.4s,v25.4s,#12
1199	add	w14,w14,w19
1200	sli	v9.4s,v26.4s,#12
1201	add	w15,w15,w20
1202	sli	v13.4s,v27.4s,#12
1203	add	w16,w16,w21
1204	sli	v17.4s,v28.4s,#12
1205	eor	w9,w9,w13
1206	sli	v21.4s,v29.4s,#12
1207	eor	w10,w10,w14
1208	add	v0.4s,v0.4s,v1.4s
1209	eor	w11,w11,w15
1210	add	v4.4s,v4.4s,v5.4s
1211	eor	w12,w12,w16
1212	add	v8.4s,v8.4s,v9.4s
1213	ror	w9,w9,#25
1214	add	v12.4s,v12.4s,v13.4s
1215	ror	w10,w10,#25
1216	add	v16.4s,v16.4s,v17.4s
1217	ror	w11,w11,#25
1218	add	v20.4s,v20.4s,v21.4s
1219	ror	w12,w12,#25
1220	eor	v24.16b,v3.16b,v0.16b
1221	add	w5,w5,w10
1222	eor	v25.16b,v7.16b,v4.16b
1223	add	w6,w6,w11
1224	eor	v26.16b,v11.16b,v8.16b
1225	add	w7,w7,w12
1226	eor	v27.16b,v15.16b,v12.16b
1227	add	w8,w8,w9
1228	eor	v28.16b,v19.16b,v16.16b
1229	eor	w21,w21,w5
1230	eor	v29.16b,v23.16b,v20.16b
1231	eor	w17,w17,w6
1232	ushr	v3.4s,v24.4s,#24
1233	eor	w19,w19,w7
1234	ushr	v7.4s,v25.4s,#24
1235	eor	w20,w20,w8
1236	ushr	v11.4s,v26.4s,#24
1237	ror	w21,w21,#16
1238	ushr	v15.4s,v27.4s,#24
1239	ror	w17,w17,#16
1240	ushr	v19.4s,v28.4s,#24
1241	ror	w19,w19,#16
1242	ushr	v23.4s,v29.4s,#24
1243	ror	w20,w20,#16
1244	sli	v3.4s,v24.4s,#8
1245	add	w15,w15,w21
1246	sli	v7.4s,v25.4s,#8
1247	add	w16,w16,w17
1248	sli	v11.4s,v26.4s,#8
1249	add	w13,w13,w19
1250	sli	v15.4s,v27.4s,#8
1251	add	w14,w14,w20
1252	sli	v19.4s,v28.4s,#8
1253	eor	w10,w10,w15
1254	sli	v23.4s,v29.4s,#8
1255	eor	w11,w11,w16
1256	add	v2.4s,v2.4s,v3.4s
1257	eor	w12,w12,w13
1258	add	v6.4s,v6.4s,v7.4s
1259	eor	w9,w9,w14
1260	add	v10.4s,v10.4s,v11.4s
1261	ror	w10,w10,#20
1262	add	v14.4s,v14.4s,v15.4s
1263	ror	w11,w11,#20
1264	add	v18.4s,v18.4s,v19.4s
1265	ror	w12,w12,#20
1266	add	v22.4s,v22.4s,v23.4s
1267	ror	w9,w9,#20
1268	eor	v24.16b,v1.16b,v2.16b
1269	add	w5,w5,w10
1270	eor	v25.16b,v5.16b,v6.16b
1271	add	w6,w6,w11
1272	eor	v26.16b,v9.16b,v10.16b
1273	add	w7,w7,w12
1274	eor	v27.16b,v13.16b,v14.16b
1275	add	w8,w8,w9
1276	eor	v28.16b,v17.16b,v18.16b
1277	eor	w21,w21,w5
1278	eor	v29.16b,v21.16b,v22.16b
1279	eor	w17,w17,w6
1280	ushr	v1.4s,v24.4s,#25
1281	eor	w19,w19,w7
1282	ushr	v5.4s,v25.4s,#25
1283	eor	w20,w20,w8
1284	ushr	v9.4s,v26.4s,#25
1285	ror	w21,w21,#24
1286	ushr	v13.4s,v27.4s,#25
1287	ror	w17,w17,#24
1288	ushr	v17.4s,v28.4s,#25
1289	ror	w19,w19,#24
1290	ushr	v21.4s,v29.4s,#25
1291	ror	w20,w20,#24
1292	sli	v1.4s,v24.4s,#7
1293	add	w15,w15,w21
1294	sli	v5.4s,v25.4s,#7
1295	add	w16,w16,w17
1296	sli	v9.4s,v26.4s,#7
1297	add	w13,w13,w19
1298	sli	v13.4s,v27.4s,#7
1299	add	w14,w14,w20
1300	sli	v17.4s,v28.4s,#7
1301	eor	w10,w10,w15
1302	sli	v21.4s,v29.4s,#7
1303	eor	w11,w11,w16
1304	ext	v2.16b,v2.16b,v2.16b,#8
1305	eor	w12,w12,w13
1306	ext	v6.16b,v6.16b,v6.16b,#8
1307	eor	w9,w9,w14
1308	ext	v10.16b,v10.16b,v10.16b,#8
1309	ror	w10,w10,#25
1310	ext	v14.16b,v14.16b,v14.16b,#8
1311	ror	w11,w11,#25
1312	ext	v18.16b,v18.16b,v18.16b,#8
1313	ror	w12,w12,#25
1314	ext	v22.16b,v22.16b,v22.16b,#8
1315	ror	w9,w9,#25
1316	ext	v3.16b,v3.16b,v3.16b,#4
1317	ext	v7.16b,v7.16b,v7.16b,#4
1318	ext	v11.16b,v11.16b,v11.16b,#4
1319	ext	v15.16b,v15.16b,v15.16b,#4
1320	ext	v19.16b,v19.16b,v19.16b,#4
1321	ext	v23.16b,v23.16b,v23.16b,#4
1322	ext	v1.16b,v1.16b,v1.16b,#12
1323	ext	v5.16b,v5.16b,v5.16b,#12
1324	ext	v9.16b,v9.16b,v9.16b,#12
1325	ext	v13.16b,v13.16b,v13.16b,#12
1326	ext	v17.16b,v17.16b,v17.16b,#12
1327	ext	v21.16b,v21.16b,v21.16b,#12
1328	cbnz	x4,.Loop_upper_neon
1329
1330	add	w5,w5,w22		// accumulate key block
1331	add	x6,x6,x22,lsr#32
1332	add	w7,w7,w23
1333	add	x8,x8,x23,lsr#32
1334	add	w9,w9,w24
1335	add	x10,x10,x24,lsr#32
1336	add	w11,w11,w25
1337	add	x12,x12,x25,lsr#32
1338	add	w13,w13,w26
1339	add	x14,x14,x26,lsr#32
1340	add	w15,w15,w27
1341	add	x16,x16,x27,lsr#32
1342	add	w17,w17,w28
1343	add	x19,x19,x28,lsr#32
1344	add	w20,w20,w30
1345	add	x21,x21,x30,lsr#32
1346
1347	add	x5,x5,x6,lsl#32	// pack
1348	add	x7,x7,x8,lsl#32
1349	ldp	x6,x8,[x1,#0]		// load input
1350	add	x9,x9,x10,lsl#32
1351	add	x11,x11,x12,lsl#32
1352	ldp	x10,x12,[x1,#16]
1353	add	x13,x13,x14,lsl#32
1354	add	x15,x15,x16,lsl#32
1355	ldp	x14,x16,[x1,#32]
1356	add	x17,x17,x19,lsl#32
1357	add	x20,x20,x21,lsl#32
1358	ldp	x19,x21,[x1,#48]
1359	add	x1,x1,#64
1360#ifdef	__ARMEB__
1361	rev	x5,x5
1362	rev	x7,x7
1363	rev	x9,x9
1364	rev	x11,x11
1365	rev	x13,x13
1366	rev	x15,x15
1367	rev	x17,x17
1368	rev	x20,x20
1369#endif
1370	eor	x5,x5,x6
1371	eor	x7,x7,x8
1372	eor	x9,x9,x10
1373	eor	x11,x11,x12
1374	eor	x13,x13,x14
1375	eor	x15,x15,x16
1376	eor	x17,x17,x19
1377	eor	x20,x20,x21
1378
1379	stp	x5,x7,[x0,#0]		// store output
1380	add	x28,x28,#1			// increment counter
1381	mov	w5,w22			// unpack key block
1382	lsr	x6,x22,#32
1383	stp	x9,x11,[x0,#16]
1384	mov	w7,w23
1385	lsr	x8,x23,#32
1386	stp	x13,x15,[x0,#32]
1387	mov	w9,w24
1388	lsr	x10,x24,#32
1389	stp	x17,x20,[x0,#48]
1390	add	x0,x0,#64
1391	mov	w11,w25
1392	lsr	x12,x25,#32
1393	mov	w13,w26
1394	lsr	x14,x26,#32
1395	mov	w15,w27
1396	lsr	x16,x27,#32
1397	mov	w17,w28
1398	lsr	x19,x28,#32
1399	mov	w20,w30
1400	lsr	x21,x30,#32
1401
1402	mov	x4,#5
// .Loop_lower_neon: inner round loop. x4 is loaded with 5 immediately before
// this label and decremented once per iteration, so the body runs five times.
// Two independent instruction streams are interleaved line by line:
//   * NEON: one column round followed by one diagonal round per iteration on
//     six four-register groups (v0-v3, v4-v7, v8-v11, v12-v15, v16-v19,
//     v20-v23); v24-v29 hold the XOR results for the two-instruction rotates.
//   * scalar: two column+diagonal round pairs per iteration on the sixteen
//     words held in w5-w17 and w19-w21.
// Rotations: the scalar stream uses ror #16/#20/#24/#25 (left-rotate by
// 16/12/8/7); the NEON stream uses rev32 on halfwords for 16 and ushr+sli
// pairs for 12, 8 and 7.
1403.Loop_lower_neon:
1404	sub	x4,x4,#1		// count down; tested by cbnz at the bottom (sub leaves the flags alone)
// ---- first half: column round on both streams ----
1405	add	v0.4s,v0.4s,v1.4s
1406	add	w5,w5,w9
1407	add	v4.4s,v4.4s,v5.4s
1408	add	w6,w6,w10
1409	add	v8.4s,v8.4s,v9.4s
1410	add	w7,w7,w11
1411	add	v12.4s,v12.4s,v13.4s
1412	add	w8,w8,w12
1413	add	v16.4s,v16.4s,v17.4s
1414	eor	w17,w17,w5
1415	add	v20.4s,v20.4s,v21.4s
1416	eor	w19,w19,w6
1417	eor	v3.16b,v3.16b,v0.16b
1418	eor	w20,w20,w7
1419	eor	v7.16b,v7.16b,v4.16b
1420	eor	w21,w21,w8
1421	eor	v11.16b,v11.16b,v8.16b
1422	ror	w17,w17,#16
1423	eor	v15.16b,v15.16b,v12.16b
1424	ror	w19,w19,#16
1425	eor	v19.16b,v19.16b,v16.16b
1426	ror	w20,w20,#16
1427	eor	v23.16b,v23.16b,v20.16b
1428	ror	w21,w21,#16
1429	rev32	v3.8h,v3.8h		// swap the halfwords of each 32-bit lane = rotate by 16
1430	add	w13,w13,w17
1431	rev32	v7.8h,v7.8h
1432	add	w14,w14,w19
1433	rev32	v11.8h,v11.8h
1434	add	w15,w15,w20
1435	rev32	v15.8h,v15.8h
1436	add	w16,w16,w21
1437	rev32	v19.8h,v19.8h
1438	eor	w9,w9,w13
1439	rev32	v23.8h,v23.8h
1440	eor	w10,w10,w14
1441	add	v2.4s,v2.4s,v3.4s
1442	eor	w11,w11,w15
1443	add	v6.4s,v6.4s,v7.4s
1444	eor	w12,w12,w16
1445	add	v10.4s,v10.4s,v11.4s
1446	ror	w9,w9,#20
1447	add	v14.4s,v14.4s,v15.4s
1448	ror	w10,w10,#20
1449	add	v18.4s,v18.4s,v19.4s
1450	ror	w11,w11,#20
1451	add	v22.4s,v22.4s,v23.4s
1452	ror	w12,w12,#20
1453	eor	v24.16b,v1.16b,v2.16b
1454	add	w5,w5,w9
1455	eor	v25.16b,v5.16b,v6.16b
1456	add	w6,w6,w10
1457	eor	v26.16b,v9.16b,v10.16b
1458	add	w7,w7,w11
1459	eor	v27.16b,v13.16b,v14.16b
1460	add	w8,w8,w12
1461	eor	v28.16b,v17.16b,v18.16b
1462	eor	w17,w17,w5
1463	eor	v29.16b,v21.16b,v22.16b
1464	eor	w19,w19,w6
1465	ushr	v1.4s,v24.4s,#20		// together with the sli #12 below: left-rotate by 12
1466	eor	w20,w20,w7
1467	ushr	v5.4s,v25.4s,#20
1468	eor	w21,w21,w8
1469	ushr	v9.4s,v26.4s,#20
1470	ror	w17,w17,#24
1471	ushr	v13.4s,v27.4s,#20
1472	ror	w19,w19,#24
1473	ushr	v17.4s,v28.4s,#20
1474	ror	w20,w20,#24
1475	ushr	v21.4s,v29.4s,#20
1476	ror	w21,w21,#24
1477	sli	v1.4s,v24.4s,#12
1478	add	w13,w13,w17
1479	sli	v5.4s,v25.4s,#12
1480	add	w14,w14,w19
1481	sli	v9.4s,v26.4s,#12
1482	add	w15,w15,w20
1483	sli	v13.4s,v27.4s,#12
1484	add	w16,w16,w21
1485	sli	v17.4s,v28.4s,#12
1486	eor	w9,w9,w13
1487	sli	v21.4s,v29.4s,#12
1488	eor	w10,w10,w14
1489	add	v0.4s,v0.4s,v1.4s
1490	eor	w11,w11,w15
1491	add	v4.4s,v4.4s,v5.4s
1492	eor	w12,w12,w16
1493	add	v8.4s,v8.4s,v9.4s
1494	ror	w9,w9,#25
1495	add	v12.4s,v12.4s,v13.4s
1496	ror	w10,w10,#25
1497	add	v16.4s,v16.4s,v17.4s
1498	ror	w11,w11,#25
1499	add	v20.4s,v20.4s,v21.4s
1500	ror	w12,w12,#25
1501	eor	v24.16b,v3.16b,v0.16b
1502	add	w5,w5,w10		// scalar stream switches to its diagonal round here
1503	eor	v25.16b,v7.16b,v4.16b
1504	add	w6,w6,w11
1505	eor	v26.16b,v11.16b,v8.16b
1506	add	w7,w7,w12
1507	eor	v27.16b,v15.16b,v12.16b
1508	add	w8,w8,w9
1509	eor	v28.16b,v19.16b,v16.16b
1510	eor	w21,w21,w5
1511	eor	v29.16b,v23.16b,v20.16b
1512	eor	w17,w17,w6
1513	ushr	v3.4s,v24.4s,#24		// together with the sli #8 below: left-rotate by 8
1514	eor	w19,w19,w7
1515	ushr	v7.4s,v25.4s,#24
1516	eor	w20,w20,w8
1517	ushr	v11.4s,v26.4s,#24
1518	ror	w21,w21,#16
1519	ushr	v15.4s,v27.4s,#24
1520	ror	w17,w17,#16
1521	ushr	v19.4s,v28.4s,#24
1522	ror	w19,w19,#16
1523	ushr	v23.4s,v29.4s,#24
1524	ror	w20,w20,#16
1525	sli	v3.4s,v24.4s,#8
1526	add	w15,w15,w21
1527	sli	v7.4s,v25.4s,#8
1528	add	w16,w16,w17
1529	sli	v11.4s,v26.4s,#8
1530	add	w13,w13,w19
1531	sli	v15.4s,v27.4s,#8
1532	add	w14,w14,w20
1533	sli	v19.4s,v28.4s,#8
1534	eor	w10,w10,w15
1535	sli	v23.4s,v29.4s,#8
1536	eor	w11,w11,w16
1537	add	v2.4s,v2.4s,v3.4s
1538	eor	w12,w12,w13
1539	add	v6.4s,v6.4s,v7.4s
1540	eor	w9,w9,w14
1541	add	v10.4s,v10.4s,v11.4s
1542	ror	w10,w10,#20
1543	add	v14.4s,v14.4s,v15.4s
1544	ror	w11,w11,#20
1545	add	v18.4s,v18.4s,v19.4s
1546	ror	w12,w12,#20
1547	add	v22.4s,v22.4s,v23.4s
1548	ror	w9,w9,#20
1549	eor	v24.16b,v1.16b,v2.16b
1550	add	w5,w5,w10
1551	eor	v25.16b,v5.16b,v6.16b
1552	add	w6,w6,w11
1553	eor	v26.16b,v9.16b,v10.16b
1554	add	w7,w7,w12
1555	eor	v27.16b,v13.16b,v14.16b
1556	add	w8,w8,w9
1557	eor	v28.16b,v17.16b,v18.16b
1558	eor	w21,w21,w5
1559	eor	v29.16b,v21.16b,v22.16b
1560	eor	w17,w17,w6
1561	ushr	v1.4s,v24.4s,#25		// together with the sli #7 below: left-rotate by 7
1562	eor	w19,w19,w7
1563	ushr	v5.4s,v25.4s,#25
1564	eor	w20,w20,w8
1565	ushr	v9.4s,v26.4s,#25
1566	ror	w21,w21,#24
1567	ushr	v13.4s,v27.4s,#25
1568	ror	w17,w17,#24
1569	ushr	v17.4s,v28.4s,#25
1570	ror	w19,w19,#24
1571	ushr	v21.4s,v29.4s,#25
1572	ror	w20,w20,#24
1573	sli	v1.4s,v24.4s,#7
1574	add	w15,w15,w21
1575	sli	v5.4s,v25.4s,#7
1576	add	w16,w16,w17
1577	sli	v9.4s,v26.4s,#7
1578	add	w13,w13,w19
1579	sli	v13.4s,v27.4s,#7
1580	add	w14,w14,w20
1581	sli	v17.4s,v28.4s,#7
1582	eor	w10,w10,w15
1583	sli	v21.4s,v29.4s,#7
1584	eor	w11,w11,w16
// Rotate the rows of every NEON group (by 8, 12 and 4 bytes) so that the next
// NEON round operates on diagonals instead of columns.
1585	ext	v2.16b,v2.16b,v2.16b,#8
1586	eor	w12,w12,w13
1587	ext	v6.16b,v6.16b,v6.16b,#8
1588	eor	w9,w9,w14
1589	ext	v10.16b,v10.16b,v10.16b,#8
1590	ror	w10,w10,#25
1591	ext	v14.16b,v14.16b,v14.16b,#8
1592	ror	w11,w11,#25
1593	ext	v18.16b,v18.16b,v18.16b,#8
1594	ror	w12,w12,#25
1595	ext	v22.16b,v22.16b,v22.16b,#8
1596	ror	w9,w9,#25
1597	ext	v3.16b,v3.16b,v3.16b,#12
1598	ext	v7.16b,v7.16b,v7.16b,#12
1599	ext	v11.16b,v11.16b,v11.16b,#12
1600	ext	v15.16b,v15.16b,v15.16b,#12
1601	ext	v19.16b,v19.16b,v19.16b,#12
1602	ext	v23.16b,v23.16b,v23.16b,#12
1603	ext	v1.16b,v1.16b,v1.16b,#4
1604	ext	v5.16b,v5.16b,v5.16b,#4
1605	ext	v9.16b,v9.16b,v9.16b,#4
1606	ext	v13.16b,v13.16b,v13.16b,#4
1607	ext	v17.16b,v17.16b,v17.16b,#4
1608	ext	v21.16b,v21.16b,v21.16b,#4
// ---- second half: NEON diagonal round; scalar stream starts its second
// ---- column+diagonal pair ----
1609	add	v0.4s,v0.4s,v1.4s
1610	add	w5,w5,w9
1611	add	v4.4s,v4.4s,v5.4s
1612	add	w6,w6,w10
1613	add	v8.4s,v8.4s,v9.4s
1614	add	w7,w7,w11
1615	add	v12.4s,v12.4s,v13.4s
1616	add	w8,w8,w12
1617	add	v16.4s,v16.4s,v17.4s
1618	eor	w17,w17,w5
1619	add	v20.4s,v20.4s,v21.4s
1620	eor	w19,w19,w6
1621	eor	v3.16b,v3.16b,v0.16b
1622	eor	w20,w20,w7
1623	eor	v7.16b,v7.16b,v4.16b
1624	eor	w21,w21,w8
1625	eor	v11.16b,v11.16b,v8.16b
1626	ror	w17,w17,#16
1627	eor	v15.16b,v15.16b,v12.16b
1628	ror	w19,w19,#16
1629	eor	v19.16b,v19.16b,v16.16b
1630	ror	w20,w20,#16
1631	eor	v23.16b,v23.16b,v20.16b
1632	ror	w21,w21,#16
1633	rev32	v3.8h,v3.8h
1634	add	w13,w13,w17
1635	rev32	v7.8h,v7.8h
1636	add	w14,w14,w19
1637	rev32	v11.8h,v11.8h
1638	add	w15,w15,w20
1639	rev32	v15.8h,v15.8h
1640	add	w16,w16,w21
1641	rev32	v19.8h,v19.8h
1642	eor	w9,w9,w13
1643	rev32	v23.8h,v23.8h
1644	eor	w10,w10,w14
1645	add	v2.4s,v2.4s,v3.4s
1646	eor	w11,w11,w15
1647	add	v6.4s,v6.4s,v7.4s
1648	eor	w12,w12,w16
1649	add	v10.4s,v10.4s,v11.4s
1650	ror	w9,w9,#20
1651	add	v14.4s,v14.4s,v15.4s
1652	ror	w10,w10,#20
1653	add	v18.4s,v18.4s,v19.4s
1654	ror	w11,w11,#20
1655	add	v22.4s,v22.4s,v23.4s
1656	ror	w12,w12,#20
1657	eor	v24.16b,v1.16b,v2.16b
1658	add	w5,w5,w9
1659	eor	v25.16b,v5.16b,v6.16b
1660	add	w6,w6,w10
1661	eor	v26.16b,v9.16b,v10.16b
1662	add	w7,w7,w11
1663	eor	v27.16b,v13.16b,v14.16b
1664	add	w8,w8,w12
1665	eor	v28.16b,v17.16b,v18.16b
1666	eor	w17,w17,w5
1667	eor	v29.16b,v21.16b,v22.16b
1668	eor	w19,w19,w6
1669	ushr	v1.4s,v24.4s,#20
1670	eor	w20,w20,w7
1671	ushr	v5.4s,v25.4s,#20
1672	eor	w21,w21,w8
1673	ushr	v9.4s,v26.4s,#20
1674	ror	w17,w17,#24
1675	ushr	v13.4s,v27.4s,#20
1676	ror	w19,w19,#24
1677	ushr	v17.4s,v28.4s,#20
1678	ror	w20,w20,#24
1679	ushr	v21.4s,v29.4s,#20
1680	ror	w21,w21,#24
1681	sli	v1.4s,v24.4s,#12
1682	add	w13,w13,w17
1683	sli	v5.4s,v25.4s,#12
1684	add	w14,w14,w19
1685	sli	v9.4s,v26.4s,#12
1686	add	w15,w15,w20
1687	sli	v13.4s,v27.4s,#12
1688	add	w16,w16,w21
1689	sli	v17.4s,v28.4s,#12
1690	eor	w9,w9,w13
1691	sli	v21.4s,v29.4s,#12
1692	eor	w10,w10,w14
1693	add	v0.4s,v0.4s,v1.4s
1694	eor	w11,w11,w15
1695	add	v4.4s,v4.4s,v5.4s
1696	eor	w12,w12,w16
1697	add	v8.4s,v8.4s,v9.4s
1698	ror	w9,w9,#25
1699	add	v12.4s,v12.4s,v13.4s
1700	ror	w10,w10,#25
1701	add	v16.4s,v16.4s,v17.4s
1702	ror	w11,w11,#25
1703	add	v20.4s,v20.4s,v21.4s
1704	ror	w12,w12,#25
1705	eor	v24.16b,v3.16b,v0.16b
1706	add	w5,w5,w10		// scalar stream: second diagonal round of this iteration
1707	eor	v25.16b,v7.16b,v4.16b
1708	add	w6,w6,w11
1709	eor	v26.16b,v11.16b,v8.16b
1710	add	w7,w7,w12
1711	eor	v27.16b,v15.16b,v12.16b
1712	add	w8,w8,w9
1713	eor	v28.16b,v19.16b,v16.16b
1714	eor	w21,w21,w5
1715	eor	v29.16b,v23.16b,v20.16b
1716	eor	w17,w17,w6
1717	ushr	v3.4s,v24.4s,#24
1718	eor	w19,w19,w7
1719	ushr	v7.4s,v25.4s,#24
1720	eor	w20,w20,w8
1721	ushr	v11.4s,v26.4s,#24
1722	ror	w21,w21,#16
1723	ushr	v15.4s,v27.4s,#24
1724	ror	w17,w17,#16
1725	ushr	v19.4s,v28.4s,#24
1726	ror	w19,w19,#16
1727	ushr	v23.4s,v29.4s,#24
1728	ror	w20,w20,#16
1729	sli	v3.4s,v24.4s,#8
1730	add	w15,w15,w21
1731	sli	v7.4s,v25.4s,#8
1732	add	w16,w16,w17
1733	sli	v11.4s,v26.4s,#8
1734	add	w13,w13,w19
1735	sli	v15.4s,v27.4s,#8
1736	add	w14,w14,w20
1737	sli	v19.4s,v28.4s,#8
1738	eor	w10,w10,w15
1739	sli	v23.4s,v29.4s,#8
1740	eor	w11,w11,w16
1741	add	v2.4s,v2.4s,v3.4s
1742	eor	w12,w12,w13
1743	add	v6.4s,v6.4s,v7.4s
1744	eor	w9,w9,w14
1745	add	v10.4s,v10.4s,v11.4s
1746	ror	w10,w10,#20
1747	add	v14.4s,v14.4s,v15.4s
1748	ror	w11,w11,#20
1749	add	v18.4s,v18.4s,v19.4s
1750	ror	w12,w12,#20
1751	add	v22.4s,v22.4s,v23.4s
1752	ror	w9,w9,#20
1753	eor	v24.16b,v1.16b,v2.16b
1754	add	w5,w5,w10
1755	eor	v25.16b,v5.16b,v6.16b
1756	add	w6,w6,w11
1757	eor	v26.16b,v9.16b,v10.16b
1758	add	w7,w7,w12
1759	eor	v27.16b,v13.16b,v14.16b
1760	add	w8,w8,w9
1761	eor	v28.16b,v17.16b,v18.16b
1762	eor	w21,w21,w5
1763	eor	v29.16b,v21.16b,v22.16b
1764	eor	w17,w17,w6
1765	ushr	v1.4s,v24.4s,#25
1766	eor	w19,w19,w7
1767	ushr	v5.4s,v25.4s,#25
1768	eor	w20,w20,w8
1769	ushr	v9.4s,v26.4s,#25
1770	ror	w21,w21,#24
1771	ushr	v13.4s,v27.4s,#25
1772	ror	w17,w17,#24
1773	ushr	v17.4s,v28.4s,#25
1774	ror	w19,w19,#24
1775	ushr	v21.4s,v29.4s,#25
1776	ror	w20,w20,#24
1777	sli	v1.4s,v24.4s,#7
1778	add	w15,w15,w21
1779	sli	v5.4s,v25.4s,#7
1780	add	w16,w16,w17
1781	sli	v9.4s,v26.4s,#7
1782	add	w13,w13,w19
1783	sli	v13.4s,v27.4s,#7
1784	add	w14,w14,w20
1785	sli	v17.4s,v28.4s,#7
1786	eor	w10,w10,w15
1787	sli	v21.4s,v29.4s,#7
1788	eor	w11,w11,w16
// Rotate the rows back (by 8, 4 and 12 bytes; each row has now been rotated by
// a full 16 bytes in total) so the groups are in column order again.
1789	ext	v2.16b,v2.16b,v2.16b,#8
1790	eor	w12,w12,w13
1791	ext	v6.16b,v6.16b,v6.16b,#8
1792	eor	w9,w9,w14
1793	ext	v10.16b,v10.16b,v10.16b,#8
1794	ror	w10,w10,#25
1795	ext	v14.16b,v14.16b,v14.16b,#8
1796	ror	w11,w11,#25
1797	ext	v18.16b,v18.16b,v18.16b,#8
1798	ror	w12,w12,#25
1799	ext	v22.16b,v22.16b,v22.16b,#8
1800	ror	w9,w9,#25
1801	ext	v3.16b,v3.16b,v3.16b,#4
1802	ext	v7.16b,v7.16b,v7.16b,#4
1803	ext	v11.16b,v11.16b,v11.16b,#4
1804	ext	v15.16b,v15.16b,v15.16b,#4
1805	ext	v19.16b,v19.16b,v19.16b,#4
1806	ext	v23.16b,v23.16b,v23.16b,#4
1807	ext	v1.16b,v1.16b,v1.16b,#12
1808	ext	v5.16b,v5.16b,v5.16b,#12
1809	ext	v9.16b,v9.16b,v9.16b,#12
1810	ext	v13.16b,v13.16b,v13.16b,#12
1811	ext	v17.16b,v17.16b,v17.16b,#12
1812	ext	v21.16b,v21.16b,v21.16b,#12
1813	cbnz	x4,.Loop_lower_neon		// repeat until the x4 counter reaches zero
1814
1815	add	w5,w5,w22		// accumulate key block
1816	ldp	q24,q25,[sp,#0]
1817	add	x6,x6,x22,lsr#32
1818	ldp	q26,q27,[sp,#32]
1819	add	w7,w7,w23
1820	ldp	q28,q29,[sp,#64]
1821	add	x8,x8,x23,lsr#32
1822	add	v0.4s,v0.4s,v24.4s
1823	add	w9,w9,w24
1824	add	v4.4s,v4.4s,v24.4s
1825	add	x10,x10,x24,lsr#32
1826	add	v8.4s,v8.4s,v24.4s
1827	add	w11,w11,w25
1828	add	v12.4s,v12.4s,v24.4s
1829	add	x12,x12,x25,lsr#32
1830	add	v16.4s,v16.4s,v24.4s
1831	add	w13,w13,w26
1832	add	v20.4s,v20.4s,v24.4s
1833	add	x14,x14,x26,lsr#32
1834	add	v2.4s,v2.4s,v26.4s
1835	add	w15,w15,w27
1836	add	v6.4s,v6.4s,v26.4s
1837	add	x16,x16,x27,lsr#32
1838	add	v10.4s,v10.4s,v26.4s
1839	add	w17,w17,w28
1840	add	v14.4s,v14.4s,v26.4s
1841	add	x19,x19,x28,lsr#32
1842	add	v18.4s,v18.4s,v26.4s
1843	add	w20,w20,w30
1844	add	v22.4s,v22.4s,v26.4s
1845	add	x21,x21,x30,lsr#32
1846	add	v19.4s,v19.4s,v31.4s			// +4
1847	add	x5,x5,x6,lsl#32	// pack
1848	add	v23.4s,v23.4s,v31.4s			// +4
1849	add	x7,x7,x8,lsl#32
1850	add	v3.4s,v3.4s,v27.4s
1851	ldp	x6,x8,[x1,#0]		// load input
1852	add	v7.4s,v7.4s,v28.4s
1853	add	x9,x9,x10,lsl#32
1854	add	v11.4s,v11.4s,v29.4s
1855	add	x11,x11,x12,lsl#32
1856	add	v15.4s,v15.4s,v30.4s
1857	ldp	x10,x12,[x1,#16]
1858	add	v19.4s,v19.4s,v27.4s
1859	add	x13,x13,x14,lsl#32
1860	add	v23.4s,v23.4s,v28.4s
1861	add	x15,x15,x16,lsl#32
1862	add	v1.4s,v1.4s,v25.4s
1863	ldp	x14,x16,[x1,#32]
1864	add	v5.4s,v5.4s,v25.4s
1865	add	x17,x17,x19,lsl#32
1866	add	v9.4s,v9.4s,v25.4s
1867	add	x20,x20,x21,lsl#32
1868	add	v13.4s,v13.4s,v25.4s
1869	ldp	x19,x21,[x1,#48]
1870	add	v17.4s,v17.4s,v25.4s
1871	add	x1,x1,#64
1872	add	v21.4s,v21.4s,v25.4s
1873
1874#ifdef	__ARMEB__
1875	rev	x5,x5
1876	rev	x7,x7
1877	rev	x9,x9
1878	rev	x11,x11
1879	rev	x13,x13
1880	rev	x15,x15
1881	rev	x17,x17
1882	rev	x20,x20
1883#endif
1884	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1885	eor	x5,x5,x6
1886	eor	x7,x7,x8
1887	eor	x9,x9,x10
1888	eor	x11,x11,x12
1889	eor	x13,x13,x14
1890	eor	v0.16b,v0.16b,v24.16b
1891	eor	x15,x15,x16
1892	eor	v1.16b,v1.16b,v25.16b
1893	eor	x17,x17,x19
1894	eor	v2.16b,v2.16b,v26.16b
1895	eor	x20,x20,x21
1896	eor	v3.16b,v3.16b,v27.16b
1897	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1898
1899	stp	x5,x7,[x0,#0]		// store output
1900	add	x28,x28,#7			// increment counter
1901	stp	x9,x11,[x0,#16]
1902	stp	x13,x15,[x0,#32]
1903	stp	x17,x20,[x0,#48]
1904	add	x0,x0,#64
1905	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1906
1907	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1908	eor	v4.16b,v4.16b,v24.16b
1909	eor	v5.16b,v5.16b,v25.16b
1910	eor	v6.16b,v6.16b,v26.16b
1911	eor	v7.16b,v7.16b,v27.16b
1912	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1913
1914	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1915	eor	v8.16b,v8.16b,v0.16b
1916	ldp	q24,q25,[sp,#0]
1917	eor	v9.16b,v9.16b,v1.16b
1918	ldp	q26,q27,[sp,#32]
1919	eor	v10.16b,v10.16b,v2.16b
1920	eor	v11.16b,v11.16b,v3.16b
1921	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1922
1923	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1924	eor	v12.16b,v12.16b,v4.16b
1925	eor	v13.16b,v13.16b,v5.16b
1926	eor	v14.16b,v14.16b,v6.16b
1927	eor	v15.16b,v15.16b,v7.16b
1928	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1929
1930	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1931	eor	v16.16b,v16.16b,v8.16b
1932	eor	v17.16b,v17.16b,v9.16b
1933	eor	v18.16b,v18.16b,v10.16b
1934	eor	v19.16b,v19.16b,v11.16b
1935	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1936
1937	shl	v0.4s,v31.4s,#1			// 4 -> 8
1938	eor	v20.16b,v20.16b,v12.16b
1939	eor	v21.16b,v21.16b,v13.16b
1940	eor	v22.16b,v22.16b,v14.16b
1941	eor	v23.16b,v23.16b,v15.16b
1942	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1943
1944	add	v27.4s,v27.4s,v0.4s			// += 8
1945	add	v28.4s,v28.4s,v0.4s
1946	add	v29.4s,v29.4s,v0.4s
1947	add	v30.4s,v30.4s,v0.4s
1948
// Bottom of the 512-byte outer loop. b.hs tests flags that are still live from
// before the round loops: nothing from .Loop_lower_neon down to here writes NZCV.
// NOTE(review): presumably they come from a `subs` on the remaining length in
// x2 near the top of .Loop_outer_512_neon - confirm against the loop head.
1949	b.hs	.Loop_outer_512_neon
1950
1951	adds	x2,x2,#512		// x2 += 512; the Z flag produced here is tested by b.eq below
1952	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1953
// Reload d8-d15 (the low halves of v8-v15, which AAPCS64 makes callee-saved)
// from the 64 bytes starting at sp+128. None of these loads/stores touch NZCV.
1954	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1955	ldp	d10,d11,[sp,#128+16]
1956	ldp	d12,d13,[sp,#128+32]
1957	ldp	d14,d15,[sp,#128+48]
1958
// Overwrite the 96 bytes at sp+0..sp+95 with the current q24/q31 contents.
1959	stp	q24,q31,[sp,#0]		// wipe off-load area
1960	stp	q24,q31,[sp,#32]
1961	stp	q24,q31,[sp,#64]
1962
1963	b.eq	.Ldone_512_neon		// the adds above produced zero: go to the epilogue
1964
1965	cmp	x2,#192		// same 192-byte threshold that ChaCha20_ctr32 uses on entry
1966	sub	v27.4s,v27.4s,v0.4s			// -= 1
1967	sub	v28.4s,v28.4s,v0.4s
1968	sub	v29.4s,v29.4s,v0.4s
1969	add	sp,sp,#128		// pop the 128-byte area; sp now points at the slots d8-d15 were reloaded from
1970	b.hs	.Loop_outer_neon		// x2 >= 192 (unsigned, from the cmp above): continue at .Loop_outer_neon
1971
// Fewer than 192 bytes remain: zero v25-v30 and continue at .Loop_outer
// (in ChaCha20_ctr32).
// NOTE(review): the zeroing presumably scrubs key/counter material from the
// vector registers before leaving the NEON path - confirm intent.
1972	eor	v25.16b,v25.16b,v25.16b
1973	eor	v26.16b,v26.16b,v26.16b
1974	eor	v27.16b,v27.16b,v27.16b
1975	eor	v28.16b,v28.16b,v28.16b
1976	eor	v29.16b,v29.16b,v29.16b
1977	eor	v30.16b,v30.16b,v30.16b
1978	b	.Loop_outer
1979
// Epilogue of ChaCha20_512_neon, reached via b.eq when the `adds x2,x2,#512`
// at the bottom of the outer loop produced zero. d8-d15 have already been
// reloaded on that path; what remains is to restore x19-x28 (callee-saved
// under AAPCS64), release the stack and return.
1980.Ldone_512_neon:
1981	ldp	x19,x20,[x29,#16]		// general registers are reloaded relative to the frame pointer x29
1982	add	sp,sp,#128+64		// drop 192 bytes: the 128-byte area at sp plus the 64 bytes d8-d15 were reloaded from
1983	ldp	x21,x22,[x29,#32]
1984	ldp	x23,x24,[x29,#48]
1985	ldp	x25,x26,[x29,#64]
1986	ldp	x27,x28,[x29,#80]
1987	ldp	x29,x30,[sp],#96		// restore FP/LR and pop the 96-byte register save frame (post-index)
// NOTE(review): macro from arm_arch.h; presumably the counterpart of
// AARCH64_SIGN_LINK_REGISTER used in function prologues - confirm.
1988	AARCH64_VALIDATE_LINK_REGISTER
1989	ret
1990.size	ChaCha20_512_neon,.-ChaCha20_512_neon
1991#endif
1992#endif  // !OPENSSL_NO_ASM
1993.section	.note.GNU-stack,"",%progbits
1994