• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#include <GFp/arm_arch.h>
14
15
16.hidden	GFp_armcap_P
17
// Read-only constants used by the ChaCha20 routines below.
18.section	.rodata
19
20.align	5
// Read as little-endian byte sequences, the two 64-bit values below spell the
// ASCII text "expand 32-byte k". The code loads them with ldp/ld1 and
// unpacks them into the first four 32-bit words of each block.
21.Lsigma:
22.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Vector {1,0,0,0}. It sits 16 bytes after .Lsigma, which is how the NEON
// code reaches it (post-incremented x5); it is added to the counter vector
// so that only lane 0 changes.
23.Lone:
24.long	1,0,0,0
// NUL-terminated ASCII banner:
// "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
25.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
26.align	2
27
28.text
29
// GFp_ChaCha20_ctr32: XOR a byte stream with ChaCha20 keystream.
//
// Registers on entry, as consumed by the code below:
//   x0 = output pointer       (written with stp / strb)
//   x1 = input pointer        (read with ldp / ldrb)
//   x2 = length in bytes      (zero returns at once through .Labort)
//   x3 = pointer to 32 bytes of key            ("load key")
//   x4 = pointer to 16 bytes of counter state  ("load counter")
//
// Lengths of 192 bytes or more are passed to ChaCha20_neon when the
// ARMV7_NEON bit is set in GFp_armcap_P. Everything else runs the scalar
// code here, one 64-byte block per .Loop_outer iteration.
//
// Scalar register roles inside the block function:
//   x22-x28,x30 = the 16 input words, packed two per 64-bit register
//   w5-w8 = row a, w9-w12 = row b, w13-w16 = row c, w17,w19,w20,w21 = row d
//   x4 = round-loop counter (the counter pointer is dead once loaded)
// x19-x28, x29 and x30 are saved in the 96-byte frame and restored on exit;
// a further 64 bytes of stack are reserved as scratch for the final
// partial block.
30.globl	GFp_ChaCha20_ctr32
31.hidden	GFp_ChaCha20_ctr32
32.type	GFp_ChaCha20_ctr32,%function
33.align	5
34GFp_ChaCha20_ctr32:
35	AARCH64_VALID_CALL_TARGET
36	cbz	x2,.Labort
37#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
38	adrp	x5,:pg_hi21_nc:GFp_armcap_P
39#else
40	adrp	x5,GFp_armcap_P
41#endif
// Short inputs never consult the capability word; they go straight to the
// scalar path.
42	cmp	x2,#192
43	b.lo	.Lshort
44	ldr	w17,[x5,:lo12:GFp_armcap_P]
45	tst	w17,#ARMV7_NEON
// Plain branch, not a call: ChaCha20_neon sees x0-x4 and x30 unchanged and
// returns directly to our caller.
46	b.ne	ChaCha20_neon
47
48.Lshort:
49	AARCH64_SIGN_LINK_REGISTER
50	stp	x29,x30,[sp,#-96]!
51	add	x29,sp,#0
52
53	adrp	x5,.Lsigma
54	add	x5,x5,:lo12:.Lsigma
55	stp	x19,x20,[sp,#16]
56	stp	x21,x22,[sp,#32]
57	stp	x23,x24,[sp,#48]
58	stp	x25,x26,[sp,#64]
59	stp	x27,x28,[sp,#80]
// 64-byte scratch area; only the partial-block tail writes to it.
60	sub	sp,sp,#64
61
62	ldp	x22,x23,[x5]		// load sigma
63	ldp	x24,x25,[x3]		// load key
64	ldp	x26,x27,[x3,#16]
// x30 is used as a data register from here on; the saved copy is reloaded
// in the epilogue.
65	ldp	x28,x30,[x4]		// load counter
// On big-endian builds swap the 32-bit halves of each loaded register.
66#ifdef	__ARMEB__
67	ror	x24,x24,#32
68	ror	x25,x25,#32
69	ror	x26,x26,#32
70	ror	x27,x27,#32
71	ror	x28,x28,#32
72	ror	x30,x30,#32
73#endif
74
// One iteration per 64-byte block: split each packed register into its low
// (w) and high (lsr #32) word to form the 16 working words.
75.Loop_outer:
76	mov	w5,w22			// unpack key block
77	lsr	x6,x22,#32
78	mov	w7,w23
79	lsr	x8,x23,#32
80	mov	w9,w24
81	lsr	x10,x24,#32
82	mov	w11,w25
83	lsr	x12,x25,#32
84	mov	w13,w26
85	lsr	x14,x26,#32
86	mov	w15,w27
87	lsr	x16,x27,#32
88	mov	w17,w28
89	lsr	x19,x28,#32
90	mov	w20,w30
91	lsr	x21,x30,#32
92
// x4 counts 10 passes of .Loop; each pass is a column round followed by a
// diagonal round.
93	mov	x4,#10
// The flags set here are consumed much later by "b.lo .Ltail" and
// "b.hi .Loop_outer"; no instruction in between writes the flags.
94	subs	x2,x2,#64
95.Loop:
96	sub	x4,x4,#1
// Column round, four quarter-rounds in parallel. ror by 16/20/24/25 is a
// left rotate by 16/12/8/7.
97	add	w5,w5,w9
98	add	w6,w6,w10
99	add	w7,w7,w11
100	add	w8,w8,w12
101	eor	w17,w17,w5
102	eor	w19,w19,w6
103	eor	w20,w20,w7
104	eor	w21,w21,w8
105	ror	w17,w17,#16
106	ror	w19,w19,#16
107	ror	w20,w20,#16
108	ror	w21,w21,#16
109	add	w13,w13,w17
110	add	w14,w14,w19
111	add	w15,w15,w20
112	add	w16,w16,w21
113	eor	w9,w9,w13
114	eor	w10,w10,w14
115	eor	w11,w11,w15
116	eor	w12,w12,w16
117	ror	w9,w9,#20
118	ror	w10,w10,#20
119	ror	w11,w11,#20
120	ror	w12,w12,#20
121	add	w5,w5,w9
122	add	w6,w6,w10
123	add	w7,w7,w11
124	add	w8,w8,w12
125	eor	w17,w17,w5
126	eor	w19,w19,w6
127	eor	w20,w20,w7
128	eor	w21,w21,w8
129	ror	w17,w17,#24
130	ror	w19,w19,#24
131	ror	w20,w20,#24
132	ror	w21,w21,#24
133	add	w13,w13,w17
134	add	w14,w14,w19
135	add	w15,w15,w20
136	add	w16,w16,w21
137	eor	w9,w9,w13
138	eor	w10,w10,w14
139	eor	w11,w11,w15
140	eor	w12,w12,w16
141	ror	w9,w9,#25
142	ror	w10,w10,#25
143	ror	w11,w11,#25
144	ror	w12,w12,#25
// Diagonal round: same operations with the b, c and d operands shifted by
// one, two and three positions relative to row a.
145	add	w5,w5,w10
146	add	w6,w6,w11
147	add	w7,w7,w12
148	add	w8,w8,w9
149	eor	w21,w21,w5
150	eor	w17,w17,w6
151	eor	w19,w19,w7
152	eor	w20,w20,w8
153	ror	w21,w21,#16
154	ror	w17,w17,#16
155	ror	w19,w19,#16
156	ror	w20,w20,#16
157	add	w15,w15,w21
158	add	w16,w16,w17
159	add	w13,w13,w19
160	add	w14,w14,w20
161	eor	w10,w10,w15
162	eor	w11,w11,w16
163	eor	w12,w12,w13
164	eor	w9,w9,w14
165	ror	w10,w10,#20
166	ror	w11,w11,#20
167	ror	w12,w12,#20
168	ror	w9,w9,#20
169	add	w5,w5,w10
170	add	w6,w6,w11
171	add	w7,w7,w12
172	add	w8,w8,w9
173	eor	w21,w21,w5
174	eor	w17,w17,w6
175	eor	w19,w19,w7
176	eor	w20,w20,w8
177	ror	w21,w21,#24
178	ror	w17,w17,#24
179	ror	w19,w19,#24
180	ror	w20,w20,#24
181	add	w15,w15,w21
182	add	w16,w16,w17
183	add	w13,w13,w19
184	add	w14,w14,w20
185	eor	w10,w10,w15
186	eor	w11,w11,w16
187	eor	w12,w12,w13
188	eor	w9,w9,w14
189	ror	w10,w10,#25
190	ror	w11,w11,#25
191	ror	w12,w12,#25
192	ror	w9,w9,#25
193	cbnz	x4,.Loop
194
// Add the original input words back in. The w-form adds leave the upper
// 32 bits zero; the x-form adds may carry past bit 31, and that excess is
// shifted out of the register by the lsl #32 in the pack step.
195	add	w5,w5,w22		// accumulate key block
196	add	x6,x6,x22,lsr#32
197	add	w7,w7,w23
198	add	x8,x8,x23,lsr#32
199	add	w9,w9,w24
200	add	x10,x10,x24,lsr#32
201	add	w11,w11,w25
202	add	x12,x12,x25,lsr#32
203	add	w13,w13,w26
204	add	x14,x14,x26,lsr#32
205	add	w15,w15,w27
206	add	x16,x16,x27,lsr#32
207	add	w17,w17,w28
208	add	x19,x19,x28,lsr#32
209	add	w20,w20,w30
210	add	x21,x21,x30,lsr#32
211
// Fewer than 64 bytes were left before the subs above: finish bytewise.
212	b.lo	.Ltail
213
// Full block: re-pack word pairs into 64-bit registers while loading 64
// input bytes into the registers that just became free.
214	add	x5,x5,x6,lsl#32	// pack
215	add	x7,x7,x8,lsl#32
216	ldp	x6,x8,[x1,#0]		// load input
217	add	x9,x9,x10,lsl#32
218	add	x11,x11,x12,lsl#32
219	ldp	x10,x12,[x1,#16]
220	add	x13,x13,x14,lsl#32
221	add	x15,x15,x16,lsl#32
222	ldp	x14,x16,[x1,#32]
223	add	x17,x17,x19,lsl#32
224	add	x20,x20,x21,lsl#32
225	ldp	x19,x21,[x1,#48]
226	add	x1,x1,#64
227#ifdef	__ARMEB__
228	rev	x5,x5
229	rev	x7,x7
230	rev	x9,x9
231	rev	x11,x11
232	rev	x13,x13
233	rev	x15,x15
234	rev	x17,x17
235	rev	x20,x20
236#endif
237	eor	x5,x5,x6
238	eor	x7,x7,x8
239	eor	x9,x9,x10
240	eor	x11,x11,x12
241	eor	x13,x13,x14
242	eor	x15,x15,x16
243	eor	x17,x17,x19
244	eor	x20,x20,x21
245
246	stp	x5,x7,[x0,#0]		// store output
// 64-bit add on the packed register whose low half is the word copied into
// w17 at the top of .Loop_outer.
247	add	x28,x28,#1			// increment counter
248	stp	x9,x11,[x0,#16]
249	stp	x13,x15,[x0,#32]
250	stp	x17,x20,[x0,#48]
251	add	x0,x0,#64
252
// Still using the flags from "subs x2,x2,#64": loop while bytes remain.
253	b.hi	.Loop_outer
254
// Length was an exact multiple of 64: release the scratch area, restore
// the saved registers and return.
255	ldp	x19,x20,[x29,#16]
256	add	sp,sp,#64
257	ldp	x21,x22,[x29,#32]
258	ldp	x23,x24,[x29,#48]
259	ldp	x25,x26,[x29,#64]
260	ldp	x27,x28,[x29,#80]
261	ldp	x29,x30,[sp],#96
262	AARCH64_VALIDATE_LINK_REGISTER
263.Labort:
264	ret
265
266.align	4
// Partial final block. Undo the subtraction so x2 is the byte count left.
267.Ltail:
268	add	x2,x2,#64
// Also entered from .Ltail_neon in ChaCha20_neon, with x2 already holding
// the remaining byte count and the scalar block still unpacked in
// w5-w21.
269.Less_than_64:
// Point x1, x0 and x4 (scratch) at the END of their buffers and negate x2,
// so the byte loop can index upward from -len to 0. x0 is biased by -1
// because the store happens after x2 has been incremented.
270	sub	x0,x0,#1
271	add	x1,x1,x2
272	add	x0,x0,x2
273	add	x4,sp,x2
274	neg	x2,x2
275
276	add	x5,x5,x6,lsl#32	// pack
277	add	x7,x7,x8,lsl#32
278	add	x9,x9,x10,lsl#32
279	add	x11,x11,x12,lsl#32
280	add	x13,x13,x14,lsl#32
281	add	x15,x15,x16,lsl#32
282	add	x17,x17,x19,lsl#32
283	add	x20,x20,x21,lsl#32
284#ifdef	__ARMEB__
285	rev	x5,x5
286	rev	x7,x7
287	rev	x9,x9
288	rev	x11,x11
289	rev	x13,x13
290	rev	x15,x15
291	rev	x17,x17
292	rev	x20,x20
293#endif
// Spill the whole 64-byte keystream block to the stack scratch area.
294	stp	x5,x7,[sp,#0]
295	stp	x9,x11,[sp,#16]
296	stp	x13,x15,[sp,#32]
297	stp	x17,x20,[sp,#48]
298
// out[i] = in[i] ^ keystream[i], one byte at a time.
299.Loop_tail:
300	ldrb	w10,[x1,x2]
301	ldrb	w11,[x4,x2]
302	add	x2,x2,#1
303	eor	w10,w10,w11
304	strb	w10,[x0,x2]
305	cbnz	x2,.Loop_tail
306
// Overwrite the spilled keystream with zeros before releasing the stack.
307	stp	xzr,xzr,[sp,#0]
308	stp	xzr,xzr,[sp,#16]
309	stp	xzr,xzr,[sp,#32]
310	stp	xzr,xzr,[sp,#48]
311
312	ldp	x19,x20,[x29,#16]
313	add	sp,sp,#64
314	ldp	x21,x22,[x29,#32]
315	ldp	x23,x24,[x29,#48]
316	ldp	x25,x26,[x29,#64]
317	ldp	x27,x28,[x29,#80]
318	ldp	x29,x30,[sp],#96
319	AARCH64_VALIDATE_LINK_REGISTER
320	ret
321.size	GFp_ChaCha20_ctr32,.-GFp_ChaCha20_ctr32
322
// ChaCha20_neon: NEON-assisted path, entered by a plain branch from
// GFp_ChaCha20_ctr32 with x0-x4 unchanged (x0 = out, x1 = in, x2 = length,
// x3 = key pointer, x4 = counter pointer).
//
// Each .Loop_outer_neon iteration produces 256 bytes of keystream: one
// 64-byte block in general-purpose registers (same layout as the scalar
// routine) interleaved with three blocks held row-wise in NEON registers:
//   block 1: v0-v3     block 2: v4-v7     block 3: v16-v19
//   v24 = sigma row, v25/v26 = key rows, v27/v28/v29 = counter rows for the
//   three NEON blocks, v31 = counter step, v20-v23 = temporaries.
// Lengths of 512 bytes or more branch to .L512_or_more_neon, a label
// inside ChaCha20_512_neon that follows an identical register-save
// prologue. A final remainder shorter than 64 bytes may branch to
// .Less_than_64 inside GFp_ChaCha20_ctr32, which shares this frame layout.
323.type	ChaCha20_neon,%function
324.align	5
325ChaCha20_neon:
326	AARCH64_SIGN_LINK_REGISTER
327	stp	x29,x30,[sp,#-96]!
328	add	x29,sp,#0
329
330	adrp	x5,.Lsigma
331	add	x5,x5,:lo12:.Lsigma
332	stp	x19,x20,[sp,#16]
333	stp	x21,x22,[sp,#32]
334	stp	x23,x24,[sp,#48]
335	stp	x25,x26,[sp,#64]
336	stp	x27,x28,[sp,#80]
337	cmp	x2,#512
338	b.hs	.L512_or_more_neon
339
// 64-byte scratch area used only when a partial block must be finished
// bytewise.
340	sub	sp,sp,#64
341
// Every input is loaded twice: packed into x registers for the scalar
// block and as a 4x32-bit row for the NEON blocks. The post-increment on
// x5 leaves it pointing at .Lone for the v31 load.
342	ldp	x22,x23,[x5]		// load sigma
343	ld1	{v24.4s},[x5],#16
344	ldp	x24,x25,[x3]		// load key
345	ldp	x26,x27,[x3,#16]
346	ld1	{v25.4s,v26.4s},[x3]
347	ldp	x28,x30,[x4]		// load counter
348	ld1	{v27.4s},[x4]
349	ld1	{v31.4s},[x5]
350#ifdef	__ARMEB__
351	rev64	v24.4s,v24.4s
352	ror	x24,x24,#32
353	ror	x25,x25,#32
354	ror	x26,x26,#32
355	ror	x27,x27,#32
356	ror	x28,x28,#32
357	ror	x30,x30,#32
358#endif
// The scalar block uses the counter as loaded (x28); the NEON blocks use
// counter+1, +2 and +3 in lane 0 of v27, v28 and v29. v31 then becomes
// {4,0,0,0}, the per-iteration step.
359	add	v27.4s,v27.4s,v31.4s		// += 1
360	add	v28.4s,v27.4s,v31.4s
361	add	v29.4s,v28.4s,v31.4s
362	shl	v31.4s,v31.4s,#2			// 1 -> 4
363
// Initialise the working state of all four blocks from the saved inputs.
364.Loop_outer_neon:
365	mov	w5,w22			// unpack key block
366	lsr	x6,x22,#32
367	mov	v0.16b,v24.16b
368	mov	w7,w23
369	lsr	x8,x23,#32
370	mov	v4.16b,v24.16b
371	mov	w9,w24
372	lsr	x10,x24,#32
373	mov	v16.16b,v24.16b
374	mov	w11,w25
375	mov	v1.16b,v25.16b
376	lsr	x12,x25,#32
377	mov	v5.16b,v25.16b
378	mov	w13,w26
379	mov	v17.16b,v25.16b
380	lsr	x14,x26,#32
381	mov	v3.16b,v27.16b
382	mov	w15,w27
383	mov	v7.16b,v28.16b
384	lsr	x16,x27,#32
385	mov	v19.16b,v29.16b
386	mov	w17,w28
387	mov	v2.16b,v26.16b
388	lsr	x19,x28,#32
389	mov	v6.16b,v26.16b
390	mov	w20,w30
391	mov	v18.16b,v26.16b
392	lsr	x21,x30,#32
393
394	mov	x4,#10
// The flags set here are consumed after the round loop by
// "b.lo .Ltail_neon" and "b.hi .Loop_outer_neon"; nothing in between
// writes the flags.
395	subs	x2,x2,#256
// 10 passes, each a column round plus a diagonal round, with scalar and
// NEON instructions interleaved. NEON rotates: rev32 on .8h lanes is a
// rotate by 16; the ushr #20 / sli #12 pair is a left rotate by 12, and
// likewise #24/#8 and #25/#7 rotate left by 8 and 7.
396.Loop_neon:
397	sub	x4,x4,#1
398	add	v0.4s,v0.4s,v1.4s
399	add	w5,w5,w9
400	add	v4.4s,v4.4s,v5.4s
401	add	w6,w6,w10
402	add	v16.4s,v16.4s,v17.4s
403	add	w7,w7,w11
404	eor	v3.16b,v3.16b,v0.16b
405	add	w8,w8,w12
406	eor	v7.16b,v7.16b,v4.16b
407	eor	w17,w17,w5
408	eor	v19.16b,v19.16b,v16.16b
409	eor	w19,w19,w6
410	rev32	v3.8h,v3.8h
411	eor	w20,w20,w7
412	rev32	v7.8h,v7.8h
413	eor	w21,w21,w8
414	rev32	v19.8h,v19.8h
415	ror	w17,w17,#16
416	add	v2.4s,v2.4s,v3.4s
417	ror	w19,w19,#16
418	add	v6.4s,v6.4s,v7.4s
419	ror	w20,w20,#16
420	add	v18.4s,v18.4s,v19.4s
421	ror	w21,w21,#16
422	eor	v20.16b,v1.16b,v2.16b
423	add	w13,w13,w17
424	eor	v21.16b,v5.16b,v6.16b
425	add	w14,w14,w19
426	eor	v22.16b,v17.16b,v18.16b
427	add	w15,w15,w20
428	ushr	v1.4s,v20.4s,#20
429	add	w16,w16,w21
430	ushr	v5.4s,v21.4s,#20
431	eor	w9,w9,w13
432	ushr	v17.4s,v22.4s,#20
433	eor	w10,w10,w14
434	sli	v1.4s,v20.4s,#12
435	eor	w11,w11,w15
436	sli	v5.4s,v21.4s,#12
437	eor	w12,w12,w16
438	sli	v17.4s,v22.4s,#12
439	ror	w9,w9,#20
440	add	v0.4s,v0.4s,v1.4s
441	ror	w10,w10,#20
442	add	v4.4s,v4.4s,v5.4s
443	ror	w11,w11,#20
444	add	v16.4s,v16.4s,v17.4s
445	ror	w12,w12,#20
446	eor	v20.16b,v3.16b,v0.16b
447	add	w5,w5,w9
448	eor	v21.16b,v7.16b,v4.16b
449	add	w6,w6,w10
450	eor	v22.16b,v19.16b,v16.16b
451	add	w7,w7,w11
452	ushr	v3.4s,v20.4s,#24
453	add	w8,w8,w12
454	ushr	v7.4s,v21.4s,#24
455	eor	w17,w17,w5
456	ushr	v19.4s,v22.4s,#24
457	eor	w19,w19,w6
458	sli	v3.4s,v20.4s,#8
459	eor	w20,w20,w7
460	sli	v7.4s,v21.4s,#8
461	eor	w21,w21,w8
462	sli	v19.4s,v22.4s,#8
463	ror	w17,w17,#24
464	add	v2.4s,v2.4s,v3.4s
465	ror	w19,w19,#24
466	add	v6.4s,v6.4s,v7.4s
467	ror	w20,w20,#24
468	add	v18.4s,v18.4s,v19.4s
469	ror	w21,w21,#24
470	eor	v20.16b,v1.16b,v2.16b
471	add	w13,w13,w17
472	eor	v21.16b,v5.16b,v6.16b
473	add	w14,w14,w19
474	eor	v22.16b,v17.16b,v18.16b
475	add	w15,w15,w20
476	ushr	v1.4s,v20.4s,#25
477	add	w16,w16,w21
478	ushr	v5.4s,v21.4s,#25
479	eor	w9,w9,w13
480	ushr	v17.4s,v22.4s,#25
481	eor	w10,w10,w14
482	sli	v1.4s,v20.4s,#7
483	eor	w11,w11,w15
484	sli	v5.4s,v21.4s,#7
485	eor	w12,w12,w16
486	sli	v17.4s,v22.4s,#7
487	ror	w9,w9,#25
// Rotate rows c, d and b by 8, 12 and 4 bytes so that the next round,
// still operating lane-wise, works on the diagonals.
488	ext	v2.16b,v2.16b,v2.16b,#8
489	ror	w10,w10,#25
490	ext	v6.16b,v6.16b,v6.16b,#8
491	ror	w11,w11,#25
492	ext	v18.16b,v18.16b,v18.16b,#8
493	ror	w12,w12,#25
494	ext	v3.16b,v3.16b,v3.16b,#12
495	ext	v7.16b,v7.16b,v7.16b,#12
496	ext	v19.16b,v19.16b,v19.16b,#12
497	ext	v1.16b,v1.16b,v1.16b,#4
498	ext	v5.16b,v5.16b,v5.16b,#4
499	ext	v17.16b,v17.16b,v17.16b,#4
// Diagonal round. The scalar side selects the diagonal by register choice;
// the NEON side relies on the row rotation done just above.
500	add	v0.4s,v0.4s,v1.4s
501	add	w5,w5,w10
502	add	v4.4s,v4.4s,v5.4s
503	add	w6,w6,w11
504	add	v16.4s,v16.4s,v17.4s
505	add	w7,w7,w12
506	eor	v3.16b,v3.16b,v0.16b
507	add	w8,w8,w9
508	eor	v7.16b,v7.16b,v4.16b
509	eor	w21,w21,w5
510	eor	v19.16b,v19.16b,v16.16b
511	eor	w17,w17,w6
512	rev32	v3.8h,v3.8h
513	eor	w19,w19,w7
514	rev32	v7.8h,v7.8h
515	eor	w20,w20,w8
516	rev32	v19.8h,v19.8h
517	ror	w21,w21,#16
518	add	v2.4s,v2.4s,v3.4s
519	ror	w17,w17,#16
520	add	v6.4s,v6.4s,v7.4s
521	ror	w19,w19,#16
522	add	v18.4s,v18.4s,v19.4s
523	ror	w20,w20,#16
524	eor	v20.16b,v1.16b,v2.16b
525	add	w15,w15,w21
526	eor	v21.16b,v5.16b,v6.16b
527	add	w16,w16,w17
528	eor	v22.16b,v17.16b,v18.16b
529	add	w13,w13,w19
530	ushr	v1.4s,v20.4s,#20
531	add	w14,w14,w20
532	ushr	v5.4s,v21.4s,#20
533	eor	w10,w10,w15
534	ushr	v17.4s,v22.4s,#20
535	eor	w11,w11,w16
536	sli	v1.4s,v20.4s,#12
537	eor	w12,w12,w13
538	sli	v5.4s,v21.4s,#12
539	eor	w9,w9,w14
540	sli	v17.4s,v22.4s,#12
541	ror	w10,w10,#20
542	add	v0.4s,v0.4s,v1.4s
543	ror	w11,w11,#20
544	add	v4.4s,v4.4s,v5.4s
545	ror	w12,w12,#20
546	add	v16.4s,v16.4s,v17.4s
547	ror	w9,w9,#20
548	eor	v20.16b,v3.16b,v0.16b
549	add	w5,w5,w10
550	eor	v21.16b,v7.16b,v4.16b
551	add	w6,w6,w11
552	eor	v22.16b,v19.16b,v16.16b
553	add	w7,w7,w12
554	ushr	v3.4s,v20.4s,#24
555	add	w8,w8,w9
556	ushr	v7.4s,v21.4s,#24
557	eor	w21,w21,w5
558	ushr	v19.4s,v22.4s,#24
559	eor	w17,w17,w6
560	sli	v3.4s,v20.4s,#8
561	eor	w19,w19,w7
562	sli	v7.4s,v21.4s,#8
563	eor	w20,w20,w8
564	sli	v19.4s,v22.4s,#8
565	ror	w21,w21,#24
566	add	v2.4s,v2.4s,v3.4s
567	ror	w17,w17,#24
568	add	v6.4s,v6.4s,v7.4s
569	ror	w19,w19,#24
570	add	v18.4s,v18.4s,v19.4s
571	ror	w20,w20,#24
572	eor	v20.16b,v1.16b,v2.16b
573	add	w15,w15,w21
574	eor	v21.16b,v5.16b,v6.16b
575	add	w16,w16,w17
576	eor	v22.16b,v17.16b,v18.16b
577	add	w13,w13,w19
578	ushr	v1.4s,v20.4s,#25
579	add	w14,w14,w20
580	ushr	v5.4s,v21.4s,#25
581	eor	w10,w10,w15
582	ushr	v17.4s,v22.4s,#25
583	eor	w11,w11,w16
584	sli	v1.4s,v20.4s,#7
585	eor	w12,w12,w13
586	sli	v5.4s,v21.4s,#7
587	eor	w9,w9,w14
588	sli	v17.4s,v22.4s,#7
589	ror	w10,w10,#25
// Undo the row rotation (c by 8, d by 4, b by 12) so the rows are back in
// column order for the next pass.
590	ext	v2.16b,v2.16b,v2.16b,#8
591	ror	w11,w11,#25
592	ext	v6.16b,v6.16b,v6.16b,#8
593	ror	w12,w12,#25
594	ext	v18.16b,v18.16b,v18.16b,#8
595	ror	w9,w9,#25
596	ext	v3.16b,v3.16b,v3.16b,#4
597	ext	v7.16b,v7.16b,v7.16b,#4
598	ext	v19.16b,v19.16b,v19.16b,#4
599	ext	v1.16b,v1.16b,v1.16b,#12
600	ext	v5.16b,v5.16b,v5.16b,#12
601	ext	v17.16b,v17.16b,v17.16b,#12
602	cbnz	x4,.Loop_neon
603
// Add the original inputs back into all four blocks. Each NEON block adds
// its own counter row (v27, v28 or v29).
604	add	w5,w5,w22		// accumulate key block
605	add	v0.4s,v0.4s,v24.4s
606	add	x6,x6,x22,lsr#32
607	add	v4.4s,v4.4s,v24.4s
608	add	w7,w7,w23
609	add	v16.4s,v16.4s,v24.4s
610	add	x8,x8,x23,lsr#32
611	add	v2.4s,v2.4s,v26.4s
612	add	w9,w9,w24
613	add	v6.4s,v6.4s,v26.4s
614	add	x10,x10,x24,lsr#32
615	add	v18.4s,v18.4s,v26.4s
616	add	w11,w11,w25
617	add	v3.4s,v3.4s,v27.4s
618	add	x12,x12,x25,lsr#32
619	add	w13,w13,w26
620	add	v7.4s,v7.4s,v28.4s
621	add	x14,x14,x26,lsr#32
622	add	w15,w15,w27
623	add	v19.4s,v19.4s,v29.4s
624	add	x16,x16,x27,lsr#32
625	add	w17,w17,w28
626	add	v1.4s,v1.4s,v25.4s
627	add	x19,x19,x28,lsr#32
628	add	w20,w20,w30
629	add	v5.4s,v5.4s,v25.4s
630	add	x21,x21,x30,lsr#32
631	add	v17.4s,v17.4s,v25.4s
632
// Fewer than 256 bytes were left before the subs above.
633	b.lo	.Ltail_neon
634
// Full 256 bytes: the scalar block covers input bytes 0-63, then the NEON
// blocks cover 64-127, 128-191 and 192-255.
635	add	x5,x5,x6,lsl#32	// pack
636	add	x7,x7,x8,lsl#32
637	ldp	x6,x8,[x1,#0]		// load input
638	add	x9,x9,x10,lsl#32
639	add	x11,x11,x12,lsl#32
640	ldp	x10,x12,[x1,#16]
641	add	x13,x13,x14,lsl#32
642	add	x15,x15,x16,lsl#32
643	ldp	x14,x16,[x1,#32]
644	add	x17,x17,x19,lsl#32
645	add	x20,x20,x21,lsl#32
646	ldp	x19,x21,[x1,#48]
647	add	x1,x1,#64
648#ifdef	__ARMEB__
649	rev	x5,x5
650	rev	x7,x7
651	rev	x9,x9
652	rev	x11,x11
653	rev	x13,x13
654	rev	x15,x15
655	rev	x17,x17
656	rev	x20,x20
657#endif
658	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
659	eor	x5,x5,x6
660	eor	x7,x7,x8
661	eor	x9,x9,x10
662	eor	x11,x11,x12
663	eor	x13,x13,x14
664	eor	v0.16b,v0.16b,v20.16b
665	eor	x15,x15,x16
666	eor	v1.16b,v1.16b,v21.16b
667	eor	x17,x17,x19
668	eor	v2.16b,v2.16b,v22.16b
669	eor	x20,x20,x21
670	eor	v3.16b,v3.16b,v23.16b
671	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
672
673	stp	x5,x7,[x0,#0]		// store output
// Four blocks were consumed, so every counter copy advances by 4.
674	add	x28,x28,#4			// increment counter
675	stp	x9,x11,[x0,#16]
676	add	v27.4s,v27.4s,v31.4s		// += 4
677	stp	x13,x15,[x0,#32]
678	add	v28.4s,v28.4s,v31.4s
679	stp	x17,x20,[x0,#48]
680	add	v29.4s,v29.4s,v31.4s
681	add	x0,x0,#64
682
// v0-v3 are reused as input buffers for the last 64 bytes once their
// keystream block has been stored.
683	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
684	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
685
686	eor	v4.16b,v4.16b,v20.16b
687	eor	v5.16b,v5.16b,v21.16b
688	eor	v6.16b,v6.16b,v22.16b
689	eor	v7.16b,v7.16b,v23.16b
690	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
691
692	eor	v16.16b,v16.16b,v0.16b
693	eor	v17.16b,v17.16b,v1.16b
694	eor	v18.16b,v18.16b,v2.16b
695	eor	v19.16b,v19.16b,v3.16b
696	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
697
// Still using the flags from "subs x2,x2,#256": loop while bytes remain.
698	b.hi	.Loop_outer_neon
699
700	ldp	x19,x20,[x29,#16]
701	add	sp,sp,#64
702	ldp	x21,x22,[x29,#32]
703	ldp	x23,x24,[x29,#48]
704	ldp	x25,x26,[x29,#64]
705	ldp	x27,x28,[x29,#80]
706	ldp	x29,x30,[sp],#96
707	AARCH64_VALIDATE_LINK_REGISTER
708	ret
709
// Final stretch of fewer than 256 bytes. Undo the subtraction so x2 is the
// byte count left, then consume the four keystream blocks in order
// (scalar, v0-v3, v4-v7, v16-v19) until fewer than 64 bytes remain.
710.Ltail_neon:
711	add	x2,x2,#256
712	cmp	x2,#64
// Under 64 bytes: the scalar block is enough. Reuse the bytewise tail in
// GFp_ChaCha20_ctr32, which also performs the epilogue for this frame.
713	b.lo	.Less_than_64
714
715	add	x5,x5,x6,lsl#32	// pack
716	add	x7,x7,x8,lsl#32
717	ldp	x6,x8,[x1,#0]		// load input
718	add	x9,x9,x10,lsl#32
719	add	x11,x11,x12,lsl#32
720	ldp	x10,x12,[x1,#16]
721	add	x13,x13,x14,lsl#32
722	add	x15,x15,x16,lsl#32
723	ldp	x14,x16,[x1,#32]
724	add	x17,x17,x19,lsl#32
725	add	x20,x20,x21,lsl#32
726	ldp	x19,x21,[x1,#48]
727	add	x1,x1,#64
728#ifdef	__ARMEB__
729	rev	x5,x5
730	rev	x7,x7
731	rev	x9,x9
732	rev	x11,x11
733	rev	x13,x13
734	rev	x15,x15
735	rev	x17,x17
736	rev	x20,x20
737#endif
738	eor	x5,x5,x6
739	eor	x7,x7,x8
740	eor	x9,x9,x10
741	eor	x11,x11,x12
742	eor	x13,x13,x14
743	eor	x15,x15,x16
744	eor	x17,x17,x19
745	eor	x20,x20,x21
746
747	stp	x5,x7,[x0,#0]		// store output
748	add	x28,x28,#4			// increment counter
749	stp	x9,x11,[x0,#16]
750	stp	x13,x15,[x0,#32]
751	stp	x17,x20,[x0,#48]
752	add	x0,x0,#64
// Each b.eq below tests the most recent "cmp x2,#64": exactly 64 bytes
// were left, so the block just written was the last one.
753	b.eq	.Ldone_neon
754	sub	x2,x2,#64
755	cmp	x2,#64
756	b.lo	.Less_than_128
757
758	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
759	eor	v0.16b,v0.16b,v20.16b
760	eor	v1.16b,v1.16b,v21.16b
761	eor	v2.16b,v2.16b,v22.16b
762	eor	v3.16b,v3.16b,v23.16b
763	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
764	b.eq	.Ldone_neon
765	sub	x2,x2,#64
766	cmp	x2,#64
767	b.lo	.Less_than_192
768
769	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
770	eor	v4.16b,v4.16b,v20.16b
771	eor	v5.16b,v5.16b,v21.16b
772	eor	v6.16b,v6.16b,v22.16b
773	eor	v7.16b,v7.16b,v23.16b
774	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
775	b.eq	.Ldone_neon
776	sub	x2,x2,#64
777
// In all three cases below, the next unused keystream block is spilled to
// the stack scratch area for the bytewise loop.
778	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
779	b	.Last_neon
780
781.Less_than_128:
782	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
783	b	.Last_neon
784.Less_than_192:
785	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
786	b	.Last_neon
787
788.align	4
// x2 = remaining bytes (fewer than 64). Point x1, x0 and x4 (scratch) at
// the END of their buffers and negate x2 so the loop indexes from -len up
// to 0; x0 is biased by -1 because the store follows the increment.
789.Last_neon:
790	sub	x0,x0,#1
791	add	x1,x1,x2
792	add	x0,x0,x2
793	add	x4,sp,x2
794	neg	x2,x2
795
796.Loop_tail_neon:
797	ldrb	w10,[x1,x2]
798	ldrb	w11,[x4,x2]
799	add	x2,x2,#1
800	eor	w10,w10,w11
801	strb	w10,[x0,x2]
802	cbnz	x2,.Loop_tail_neon
803
// Overwrite the spilled keystream with zeros before releasing the stack.
804	stp	xzr,xzr,[sp,#0]
805	stp	xzr,xzr,[sp,#16]
806	stp	xzr,xzr,[sp,#32]
807	stp	xzr,xzr,[sp,#48]
808
809.Ldone_neon:
810	ldp	x19,x20,[x29,#16]
811	add	sp,sp,#64
812	ldp	x21,x22,[x29,#32]
813	ldp	x23,x24,[x29,#48]
814	ldp	x25,x26,[x29,#64]
815	ldp	x27,x28,[x29,#80]
816	ldp	x29,x30,[sp],#96
817	AARCH64_VALIDATE_LINK_REGISTER
818	ret
819.size	ChaCha20_neon,.-ChaCha20_neon
820.type	ChaCha20_512_neon,%function
821.align	5
822ChaCha20_512_neon:
823	AARCH64_SIGN_LINK_REGISTER
824	stp	x29,x30,[sp,#-96]!
825	add	x29,sp,#0
826
827	adrp	x5,.Lsigma
828	add	x5,x5,:lo12:.Lsigma
829	stp	x19,x20,[sp,#16]
830	stp	x21,x22,[sp,#32]
831	stp	x23,x24,[sp,#48]
832	stp	x25,x26,[sp,#64]
833	stp	x27,x28,[sp,#80]
834
835.L512_or_more_neon:
836	sub	sp,sp,#128+64
837
838	ldp	x22,x23,[x5]		// load sigma
839	ld1	{v24.4s},[x5],#16
840	ldp	x24,x25,[x3]		// load key
841	ldp	x26,x27,[x3,#16]
842	ld1	{v25.4s,v26.4s},[x3]
843	ldp	x28,x30,[x4]		// load counter
844	ld1	{v27.4s},[x4]
845	ld1	{v31.4s},[x5]
846#ifdef	__ARMEB__
847	rev64	v24.4s,v24.4s
848	ror	x24,x24,#32
849	ror	x25,x25,#32
850	ror	x26,x26,#32
851	ror	x27,x27,#32
852	ror	x28,x28,#32
853	ror	x30,x30,#32
854#endif
855	add	v27.4s,v27.4s,v31.4s		// += 1
856	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
857	add	v27.4s,v27.4s,v31.4s		// not typo
858	str	q26,[sp,#32]
859	add	v28.4s,v27.4s,v31.4s
860	add	v29.4s,v28.4s,v31.4s
861	add	v30.4s,v29.4s,v31.4s
862	shl	v31.4s,v31.4s,#2			// 1 -> 4
863
864	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
865	stp	d10,d11,[sp,#128+16]
866	stp	d12,d13,[sp,#128+32]
867	stp	d14,d15,[sp,#128+48]
868
869	sub	x2,x2,#512			// not typo
870
871.Loop_outer_512_neon:
872	mov	v0.16b,v24.16b
873	mov	v4.16b,v24.16b
874	mov	v8.16b,v24.16b
875	mov	v12.16b,v24.16b
876	mov	v16.16b,v24.16b
877	mov	v20.16b,v24.16b
878	mov	v1.16b,v25.16b
879	mov	w5,w22			// unpack key block
880	mov	v5.16b,v25.16b
881	lsr	x6,x22,#32
882	mov	v9.16b,v25.16b
883	mov	w7,w23
884	mov	v13.16b,v25.16b
885	lsr	x8,x23,#32
886	mov	v17.16b,v25.16b
887	mov	w9,w24
888	mov	v21.16b,v25.16b
889	lsr	x10,x24,#32
890	mov	v3.16b,v27.16b
891	mov	w11,w25
892	mov	v7.16b,v28.16b
893	lsr	x12,x25,#32
894	mov	v11.16b,v29.16b
895	mov	w13,w26
896	mov	v15.16b,v30.16b
897	lsr	x14,x26,#32
898	mov	v2.16b,v26.16b
899	mov	w15,w27
900	mov	v6.16b,v26.16b
901	lsr	x16,x27,#32
902	add	v19.4s,v3.4s,v31.4s			// +4
903	mov	w17,w28
904	add	v23.4s,v7.4s,v31.4s			// +4
905	lsr	x19,x28,#32
906	mov	v10.16b,v26.16b
907	mov	w20,w30
908	mov	v14.16b,v26.16b
909	lsr	x21,x30,#32
910	mov	v18.16b,v26.16b
911	stp	q27,q28,[sp,#48]		// off-load key block, variable part
912	mov	v22.16b,v26.16b
913	str	q29,[sp,#80]
914
915	mov	x4,#5
916	subs	x2,x2,#512
917.Loop_upper_neon:
918	sub	x4,x4,#1
919	add	v0.4s,v0.4s,v1.4s
920	add	w5,w5,w9
921	add	v4.4s,v4.4s,v5.4s
922	add	w6,w6,w10
923	add	v8.4s,v8.4s,v9.4s
924	add	w7,w7,w11
925	add	v12.4s,v12.4s,v13.4s
926	add	w8,w8,w12
927	add	v16.4s,v16.4s,v17.4s
928	eor	w17,w17,w5
929	add	v20.4s,v20.4s,v21.4s
930	eor	w19,w19,w6
931	eor	v3.16b,v3.16b,v0.16b
932	eor	w20,w20,w7
933	eor	v7.16b,v7.16b,v4.16b
934	eor	w21,w21,w8
935	eor	v11.16b,v11.16b,v8.16b
936	ror	w17,w17,#16
937	eor	v15.16b,v15.16b,v12.16b
938	ror	w19,w19,#16
939	eor	v19.16b,v19.16b,v16.16b
940	ror	w20,w20,#16
941	eor	v23.16b,v23.16b,v20.16b
942	ror	w21,w21,#16
943	rev32	v3.8h,v3.8h
944	add	w13,w13,w17
945	rev32	v7.8h,v7.8h
946	add	w14,w14,w19
947	rev32	v11.8h,v11.8h
948	add	w15,w15,w20
949	rev32	v15.8h,v15.8h
950	add	w16,w16,w21
951	rev32	v19.8h,v19.8h
952	eor	w9,w9,w13
953	rev32	v23.8h,v23.8h
954	eor	w10,w10,w14
955	add	v2.4s,v2.4s,v3.4s
956	eor	w11,w11,w15
957	add	v6.4s,v6.4s,v7.4s
958	eor	w12,w12,w16
959	add	v10.4s,v10.4s,v11.4s
960	ror	w9,w9,#20
961	add	v14.4s,v14.4s,v15.4s
962	ror	w10,w10,#20
963	add	v18.4s,v18.4s,v19.4s
964	ror	w11,w11,#20
965	add	v22.4s,v22.4s,v23.4s
966	ror	w12,w12,#20
967	eor	v24.16b,v1.16b,v2.16b
968	add	w5,w5,w9
969	eor	v25.16b,v5.16b,v6.16b
970	add	w6,w6,w10
971	eor	v26.16b,v9.16b,v10.16b
972	add	w7,w7,w11
973	eor	v27.16b,v13.16b,v14.16b
974	add	w8,w8,w12
975	eor	v28.16b,v17.16b,v18.16b
976	eor	w17,w17,w5
977	eor	v29.16b,v21.16b,v22.16b
978	eor	w19,w19,w6
979	ushr	v1.4s,v24.4s,#20
980	eor	w20,w20,w7
981	ushr	v5.4s,v25.4s,#20
982	eor	w21,w21,w8
983	ushr	v9.4s,v26.4s,#20
984	ror	w17,w17,#24
985	ushr	v13.4s,v27.4s,#20
986	ror	w19,w19,#24
987	ushr	v17.4s,v28.4s,#20
988	ror	w20,w20,#24
989	ushr	v21.4s,v29.4s,#20
990	ror	w21,w21,#24
991	sli	v1.4s,v24.4s,#12
992	add	w13,w13,w17
993	sli	v5.4s,v25.4s,#12
994	add	w14,w14,w19
995	sli	v9.4s,v26.4s,#12
996	add	w15,w15,w20
997	sli	v13.4s,v27.4s,#12
998	add	w16,w16,w21
999	sli	v17.4s,v28.4s,#12
1000	eor	w9,w9,w13
1001	sli	v21.4s,v29.4s,#12
1002	eor	w10,w10,w14
1003	add	v0.4s,v0.4s,v1.4s
1004	eor	w11,w11,w15
1005	add	v4.4s,v4.4s,v5.4s
1006	eor	w12,w12,w16
1007	add	v8.4s,v8.4s,v9.4s
1008	ror	w9,w9,#25
1009	add	v12.4s,v12.4s,v13.4s
1010	ror	w10,w10,#25
1011	add	v16.4s,v16.4s,v17.4s
1012	ror	w11,w11,#25
1013	add	v20.4s,v20.4s,v21.4s
1014	ror	w12,w12,#25
1015	eor	v24.16b,v3.16b,v0.16b
1016	add	w5,w5,w10
1017	eor	v25.16b,v7.16b,v4.16b
1018	add	w6,w6,w11
1019	eor	v26.16b,v11.16b,v8.16b
1020	add	w7,w7,w12
1021	eor	v27.16b,v15.16b,v12.16b
1022	add	w8,w8,w9
1023	eor	v28.16b,v19.16b,v16.16b
1024	eor	w21,w21,w5
1025	eor	v29.16b,v23.16b,v20.16b
1026	eor	w17,w17,w6
1027	ushr	v3.4s,v24.4s,#24
1028	eor	w19,w19,w7
1029	ushr	v7.4s,v25.4s,#24
1030	eor	w20,w20,w8
1031	ushr	v11.4s,v26.4s,#24
1032	ror	w21,w21,#16
1033	ushr	v15.4s,v27.4s,#24
1034	ror	w17,w17,#16
1035	ushr	v19.4s,v28.4s,#24
1036	ror	w19,w19,#16
1037	ushr	v23.4s,v29.4s,#24
1038	ror	w20,w20,#16
1039	sli	v3.4s,v24.4s,#8
1040	add	w15,w15,w21
1041	sli	v7.4s,v25.4s,#8
1042	add	w16,w16,w17
1043	sli	v11.4s,v26.4s,#8
1044	add	w13,w13,w19
1045	sli	v15.4s,v27.4s,#8
1046	add	w14,w14,w20
1047	sli	v19.4s,v28.4s,#8
1048	eor	w10,w10,w15
1049	sli	v23.4s,v29.4s,#8
1050	eor	w11,w11,w16
1051	add	v2.4s,v2.4s,v3.4s
1052	eor	w12,w12,w13
1053	add	v6.4s,v6.4s,v7.4s
1054	eor	w9,w9,w14
1055	add	v10.4s,v10.4s,v11.4s
1056	ror	w10,w10,#20
1057	add	v14.4s,v14.4s,v15.4s
1058	ror	w11,w11,#20
1059	add	v18.4s,v18.4s,v19.4s
1060	ror	w12,w12,#20
1061	add	v22.4s,v22.4s,v23.4s
1062	ror	w9,w9,#20
1063	eor	v24.16b,v1.16b,v2.16b
1064	add	w5,w5,w10
1065	eor	v25.16b,v5.16b,v6.16b
1066	add	w6,w6,w11
1067	eor	v26.16b,v9.16b,v10.16b
1068	add	w7,w7,w12
1069	eor	v27.16b,v13.16b,v14.16b
1070	add	w8,w8,w9
1071	eor	v28.16b,v17.16b,v18.16b
1072	eor	w21,w21,w5
1073	eor	v29.16b,v21.16b,v22.16b
1074	eor	w17,w17,w6
1075	ushr	v1.4s,v24.4s,#25
1076	eor	w19,w19,w7
1077	ushr	v5.4s,v25.4s,#25
1078	eor	w20,w20,w8
1079	ushr	v9.4s,v26.4s,#25
1080	ror	w21,w21,#24
1081	ushr	v13.4s,v27.4s,#25
1082	ror	w17,w17,#24
1083	ushr	v17.4s,v28.4s,#25
1084	ror	w19,w19,#24
1085	ushr	v21.4s,v29.4s,#25
1086	ror	w20,w20,#24
1087	sli	v1.4s,v24.4s,#7
1088	add	w15,w15,w21
1089	sli	v5.4s,v25.4s,#7
1090	add	w16,w16,w17
1091	sli	v9.4s,v26.4s,#7
1092	add	w13,w13,w19
1093	sli	v13.4s,v27.4s,#7
1094	add	w14,w14,w20
1095	sli	v17.4s,v28.4s,#7
1096	eor	w10,w10,w15
1097	sli	v21.4s,v29.4s,#7
1098	eor	w11,w11,w16
1099	ext	v2.16b,v2.16b,v2.16b,#8
1100	eor	w12,w12,w13
1101	ext	v6.16b,v6.16b,v6.16b,#8
1102	eor	w9,w9,w14
1103	ext	v10.16b,v10.16b,v10.16b,#8
1104	ror	w10,w10,#25
1105	ext	v14.16b,v14.16b,v14.16b,#8
1106	ror	w11,w11,#25
1107	ext	v18.16b,v18.16b,v18.16b,#8
1108	ror	w12,w12,#25
1109	ext	v22.16b,v22.16b,v22.16b,#8
1110	ror	w9,w9,#25
1111	ext	v3.16b,v3.16b,v3.16b,#12
1112	ext	v7.16b,v7.16b,v7.16b,#12
1113	ext	v11.16b,v11.16b,v11.16b,#12
1114	ext	v15.16b,v15.16b,v15.16b,#12
1115	ext	v19.16b,v19.16b,v19.16b,#12
1116	ext	v23.16b,v23.16b,v23.16b,#12
1117	ext	v1.16b,v1.16b,v1.16b,#4
1118	ext	v5.16b,v5.16b,v5.16b,#4
1119	ext	v9.16b,v9.16b,v9.16b,#4
1120	ext	v13.16b,v13.16b,v13.16b,#4
1121	ext	v17.16b,v17.16b,v17.16b,#4
1122	ext	v21.16b,v21.16b,v21.16b,#4
1123	add	v0.4s,v0.4s,v1.4s
1124	add	w5,w5,w9
1125	add	v4.4s,v4.4s,v5.4s
1126	add	w6,w6,w10
1127	add	v8.4s,v8.4s,v9.4s
1128	add	w7,w7,w11
1129	add	v12.4s,v12.4s,v13.4s
1130	add	w8,w8,w12
1131	add	v16.4s,v16.4s,v17.4s
1132	eor	w17,w17,w5
1133	add	v20.4s,v20.4s,v21.4s
1134	eor	w19,w19,w6
1135	eor	v3.16b,v3.16b,v0.16b
1136	eor	w20,w20,w7
1137	eor	v7.16b,v7.16b,v4.16b
1138	eor	w21,w21,w8
1139	eor	v11.16b,v11.16b,v8.16b
1140	ror	w17,w17,#16
1141	eor	v15.16b,v15.16b,v12.16b
1142	ror	w19,w19,#16
1143	eor	v19.16b,v19.16b,v16.16b
1144	ror	w20,w20,#16
1145	eor	v23.16b,v23.16b,v20.16b
1146	ror	w21,w21,#16
1147	rev32	v3.8h,v3.8h
1148	add	w13,w13,w17
1149	rev32	v7.8h,v7.8h
1150	add	w14,w14,w19
1151	rev32	v11.8h,v11.8h
1152	add	w15,w15,w20
1153	rev32	v15.8h,v15.8h
1154	add	w16,w16,w21
1155	rev32	v19.8h,v19.8h
1156	eor	w9,w9,w13
1157	rev32	v23.8h,v23.8h
1158	eor	w10,w10,w14
1159	add	v2.4s,v2.4s,v3.4s
1160	eor	w11,w11,w15
1161	add	v6.4s,v6.4s,v7.4s
1162	eor	w12,w12,w16
1163	add	v10.4s,v10.4s,v11.4s
1164	ror	w9,w9,#20
1165	add	v14.4s,v14.4s,v15.4s
1166	ror	w10,w10,#20
1167	add	v18.4s,v18.4s,v19.4s
1168	ror	w11,w11,#20
1169	add	v22.4s,v22.4s,v23.4s
1170	ror	w12,w12,#20
1171	eor	v24.16b,v1.16b,v2.16b
1172	add	w5,w5,w9
1173	eor	v25.16b,v5.16b,v6.16b
1174	add	w6,w6,w10
1175	eor	v26.16b,v9.16b,v10.16b
1176	add	w7,w7,w11
1177	eor	v27.16b,v13.16b,v14.16b
1178	add	w8,w8,w12
1179	eor	v28.16b,v17.16b,v18.16b
1180	eor	w17,w17,w5
1181	eor	v29.16b,v21.16b,v22.16b
1182	eor	w19,w19,w6
1183	ushr	v1.4s,v24.4s,#20
1184	eor	w20,w20,w7
1185	ushr	v5.4s,v25.4s,#20
1186	eor	w21,w21,w8
1187	ushr	v9.4s,v26.4s,#20
1188	ror	w17,w17,#24
1189	ushr	v13.4s,v27.4s,#20
1190	ror	w19,w19,#24
1191	ushr	v17.4s,v28.4s,#20
1192	ror	w20,w20,#24
1193	ushr	v21.4s,v29.4s,#20
1194	ror	w21,w21,#24
1195	sli	v1.4s,v24.4s,#12
1196	add	w13,w13,w17
1197	sli	v5.4s,v25.4s,#12
1198	add	w14,w14,w19
1199	sli	v9.4s,v26.4s,#12
1200	add	w15,w15,w20
1201	sli	v13.4s,v27.4s,#12
1202	add	w16,w16,w21
1203	sli	v17.4s,v28.4s,#12
1204	eor	w9,w9,w13
1205	sli	v21.4s,v29.4s,#12
1206	eor	w10,w10,w14
1207	add	v0.4s,v0.4s,v1.4s
1208	eor	w11,w11,w15
1209	add	v4.4s,v4.4s,v5.4s
1210	eor	w12,w12,w16
1211	add	v8.4s,v8.4s,v9.4s
1212	ror	w9,w9,#25
1213	add	v12.4s,v12.4s,v13.4s
1214	ror	w10,w10,#25
1215	add	v16.4s,v16.4s,v17.4s
1216	ror	w11,w11,#25
1217	add	v20.4s,v20.4s,v21.4s
1218	ror	w12,w12,#25
1219	eor	v24.16b,v3.16b,v0.16b
1220	add	w5,w5,w10
1221	eor	v25.16b,v7.16b,v4.16b
1222	add	w6,w6,w11
1223	eor	v26.16b,v11.16b,v8.16b
1224	add	w7,w7,w12
1225	eor	v27.16b,v15.16b,v12.16b
1226	add	w8,w8,w9
1227	eor	v28.16b,v19.16b,v16.16b
1228	eor	w21,w21,w5
1229	eor	v29.16b,v23.16b,v20.16b
1230	eor	w17,w17,w6
1231	ushr	v3.4s,v24.4s,#24
1232	eor	w19,w19,w7
1233	ushr	v7.4s,v25.4s,#24
1234	eor	w20,w20,w8
1235	ushr	v11.4s,v26.4s,#24
1236	ror	w21,w21,#16
1237	ushr	v15.4s,v27.4s,#24
1238	ror	w17,w17,#16
1239	ushr	v19.4s,v28.4s,#24
1240	ror	w19,w19,#16
1241	ushr	v23.4s,v29.4s,#24
1242	ror	w20,w20,#16
1243	sli	v3.4s,v24.4s,#8
1244	add	w15,w15,w21
1245	sli	v7.4s,v25.4s,#8
1246	add	w16,w16,w17
1247	sli	v11.4s,v26.4s,#8
1248	add	w13,w13,w19
1249	sli	v15.4s,v27.4s,#8
1250	add	w14,w14,w20
1251	sli	v19.4s,v28.4s,#8
1252	eor	w10,w10,w15
1253	sli	v23.4s,v29.4s,#8
1254	eor	w11,w11,w16
1255	add	v2.4s,v2.4s,v3.4s
1256	eor	w12,w12,w13
1257	add	v6.4s,v6.4s,v7.4s
1258	eor	w9,w9,w14
1259	add	v10.4s,v10.4s,v11.4s
1260	ror	w10,w10,#20
1261	add	v14.4s,v14.4s,v15.4s
1262	ror	w11,w11,#20
1263	add	v18.4s,v18.4s,v19.4s
1264	ror	w12,w12,#20
1265	add	v22.4s,v22.4s,v23.4s
1266	ror	w9,w9,#20
1267	eor	v24.16b,v1.16b,v2.16b
1268	add	w5,w5,w10
1269	eor	v25.16b,v5.16b,v6.16b
1270	add	w6,w6,w11
1271	eor	v26.16b,v9.16b,v10.16b
1272	add	w7,w7,w12
1273	eor	v27.16b,v13.16b,v14.16b
1274	add	w8,w8,w9
1275	eor	v28.16b,v17.16b,v18.16b
1276	eor	w21,w21,w5
1277	eor	v29.16b,v21.16b,v22.16b
1278	eor	w17,w17,w6
1279	ushr	v1.4s,v24.4s,#25
1280	eor	w19,w19,w7
1281	ushr	v5.4s,v25.4s,#25
1282	eor	w20,w20,w8
1283	ushr	v9.4s,v26.4s,#25
1284	ror	w21,w21,#24
1285	ushr	v13.4s,v27.4s,#25
1286	ror	w17,w17,#24
1287	ushr	v17.4s,v28.4s,#25
1288	ror	w19,w19,#24
1289	ushr	v21.4s,v29.4s,#25
1290	ror	w20,w20,#24
1291	sli	v1.4s,v24.4s,#7
1292	add	w15,w15,w21
1293	sli	v5.4s,v25.4s,#7
1294	add	w16,w16,w17
1295	sli	v9.4s,v26.4s,#7
1296	add	w13,w13,w19
1297	sli	v13.4s,v27.4s,#7
1298	add	w14,w14,w20
1299	sli	v17.4s,v28.4s,#7
1300	eor	w10,w10,w15
1301	sli	v21.4s,v29.4s,#7
1302	eor	w11,w11,w16
1303	ext	v2.16b,v2.16b,v2.16b,#8
1304	eor	w12,w12,w13
1305	ext	v6.16b,v6.16b,v6.16b,#8
1306	eor	w9,w9,w14
1307	ext	v10.16b,v10.16b,v10.16b,#8
1308	ror	w10,w10,#25
1309	ext	v14.16b,v14.16b,v14.16b,#8
1310	ror	w11,w11,#25
1311	ext	v18.16b,v18.16b,v18.16b,#8
1312	ror	w12,w12,#25
1313	ext	v22.16b,v22.16b,v22.16b,#8
1314	ror	w9,w9,#25
1315	ext	v3.16b,v3.16b,v3.16b,#4
1316	ext	v7.16b,v7.16b,v7.16b,#4
1317	ext	v11.16b,v11.16b,v11.16b,#4
1318	ext	v15.16b,v15.16b,v15.16b,#4
1319	ext	v19.16b,v19.16b,v19.16b,#4
1320	ext	v23.16b,v23.16b,v23.16b,#4
1321	ext	v1.16b,v1.16b,v1.16b,#12
1322	ext	v5.16b,v5.16b,v5.16b,#12
1323	ext	v9.16b,v9.16b,v9.16b,#12
1324	ext	v13.16b,v13.16b,v13.16b,#12
1325	ext	v17.16b,v17.16b,v17.16b,#12
1326	ext	v21.16b,v21.16b,v21.16b,#12
1327	cbnz	x4,.Loop_upper_neon
1328
// Output stage for the scalar block produced by the round loop above.
// The round results in w5-w17,w19-w21 are added to the saved input state
// (x22-x28,x30, two 32-bit words per register), packed pairwise into eight
// 64-bit registers, XORed with 64 bytes read from x1 and written to x0.
// Afterwards the counter in x28 is bumped by one, the scalar working
// registers are re-seeded from the saved state and x4 is set up for
// .Loop_lower_neon. No NEON register is touched in this stretch.
1329	add	w5,w5,w22		// accumulate key block
1330	add	x6,x6,x22,lsr#32		// odd-numbered words come from the upper halves
1331	add	w7,w7,w23
1332	add	x8,x8,x23,lsr#32
1333	add	w9,w9,w24
1334	add	x10,x10,x24,lsr#32
1335	add	w11,w11,w25
1336	add	x12,x12,x25,lsr#32
1337	add	w13,w13,w26
1338	add	x14,x14,x26,lsr#32
1339	add	w15,w15,w27
1340	add	x16,x16,x27,lsr#32
1341	add	w17,w17,w28
1342	add	x19,x19,x28,lsr#32
1343	add	w20,w20,w30
1344	add	x21,x21,x30,lsr#32
1345
1346	add	x5,x5,x6,lsl#32	// pack
1347	add	x7,x7,x8,lsl#32
1348	ldp	x6,x8,[x1,#0]		// load input
1349	add	x9,x9,x10,lsl#32
1350	add	x11,x11,x12,lsl#32
1351	ldp	x10,x12,[x1,#16]
1352	add	x13,x13,x14,lsl#32
1353	add	x15,x15,x16,lsl#32
1354	ldp	x14,x16,[x1,#32]
1355	add	x17,x17,x19,lsl#32
1356	add	x20,x20,x21,lsl#32
1357	ldp	x19,x21,[x1,#48]
1358	add	x1,x1,#64		// input pointer moves past the 64 bytes just loaded
1359#ifdef	__ARMEB__
1360	rev	x5,x5		// big-endian builds only: byte-reverse each packed register
1361	rev	x7,x7
1362	rev	x9,x9
1363	rev	x11,x11
1364	rev	x13,x13
1365	rev	x15,x15
1366	rev	x17,x17
1367	rev	x20,x20
1368#endif
1369	eor	x5,x5,x6		// XOR the packed block with the input
1370	eor	x7,x7,x8
1371	eor	x9,x9,x10
1372	eor	x11,x11,x12
1373	eor	x13,x13,x14
1374	eor	x15,x15,x16
1375	eor	x17,x17,x19
1376	eor	x20,x20,x21
1377
1378	stp	x5,x7,[x0,#0]		// store output
1379	add	x28,x28,#1			// increment counter
1380	mov	w5,w22			// unpack key block
1381	lsr	x6,x22,#32
1382	stp	x9,x11,[x0,#16]
1383	mov	w7,w23
1384	lsr	x8,x23,#32
1385	stp	x13,x15,[x0,#32]
1386	mov	w9,w24
1387	lsr	x10,x24,#32
1388	stp	x17,x20,[x0,#48]
1389	add	x0,x0,#64		// output pointer moves past the 64 bytes just stored
1390	mov	w11,w25
1391	lsr	x12,x25,#32
1392	mov	w13,w26
1393	lsr	x14,x26,#32
1394	mov	w15,w27
1395	lsr	x16,x27,#32
1396	mov	w17,w28		// picks up the counter value incremented above
1397	lsr	x19,x28,#32
1398	mov	w20,w30
1399	lsr	x21,x30,#32
1400
1401	mov	x4,#5		// iteration count for .Loop_lower_neon
// Round loop, second pass. x4 is the iteration counter (preset to 5 right
// before this label) and the loop exits when it reaches zero.
// Two independent instruction streams are interleaved line by line:
//   NEON   - six blocks in v0-v23, four registers per block, used as rows
//            a = v0,v4,..,v20  b = v1,v5,..,v21  c = v2,v6,..,v22
//            d = v3,v7,..,v23; v24-v29 are temporaries for the rotates.
//            One iteration performs two rounds, with the b/c/d rows
//            lane-rotated in between and rotated back at the end.
//   scalar - one block in w5-w8 (a), w9-w12 (b), w13-w16 (c),
//            w17,w19,w20,w21 (d). One iteration performs two column rounds
//            and two diagonal rounds.
// Left rotations by 16/12/8/7 are written as ror #16/#20/#24/#25 on the
// scalar side and as rev32 or ushr+sli pairs on the NEON side.
1402.Loop_lower_neon:
1403	sub	x4,x4,#1		// count this iteration; tested by cbnz at the bottom
1404	add	v0.4s,v0.4s,v1.4s		// NEON round 1: a += b
1405	add	w5,w5,w9		// scalar column round: a += b
1406	add	v4.4s,v4.4s,v5.4s
1407	add	w6,w6,w10
1408	add	v8.4s,v8.4s,v9.4s
1409	add	w7,w7,w11
1410	add	v12.4s,v12.4s,v13.4s
1411	add	w8,w8,w12
1412	add	v16.4s,v16.4s,v17.4s
1413	eor	w17,w17,w5		// scalar: d ^= a
1414	add	v20.4s,v20.4s,v21.4s
1415	eor	w19,w19,w6
1416	eor	v3.16b,v3.16b,v0.16b		// NEON: d ^= a
1417	eor	w20,w20,w7
1418	eor	v7.16b,v7.16b,v4.16b
1419	eor	w21,w21,w8
1420	eor	v11.16b,v11.16b,v8.16b
1421	ror	w17,w17,#16		// scalar: d = rotl(d,16)
1422	eor	v15.16b,v15.16b,v12.16b
1423	ror	w19,w19,#16
1424	eor	v19.16b,v19.16b,v16.16b
1425	ror	w20,w20,#16
1426	eor	v23.16b,v23.16b,v20.16b
1427	ror	w21,w21,#16
1428	rev32	v3.8h,v3.8h		// NEON: d = rotl(d,16) by swapping the halfwords of each word
1429	add	w13,w13,w17		// scalar: c += d
1430	rev32	v7.8h,v7.8h
1431	add	w14,w14,w19
1432	rev32	v11.8h,v11.8h
1433	add	w15,w15,w20
1434	rev32	v15.8h,v15.8h
1435	add	w16,w16,w21
1436	rev32	v19.8h,v19.8h
1437	eor	w9,w9,w13		// scalar: b ^= c
1438	rev32	v23.8h,v23.8h
1439	eor	w10,w10,w14
1440	add	v2.4s,v2.4s,v3.4s		// NEON: c += d
1441	eor	w11,w11,w15
1442	add	v6.4s,v6.4s,v7.4s
1443	eor	w12,w12,w16
1444	add	v10.4s,v10.4s,v11.4s
1445	ror	w9,w9,#20		// scalar: b = rotl(b,12)
1446	add	v14.4s,v14.4s,v15.4s
1447	ror	w10,w10,#20
1448	add	v18.4s,v18.4s,v19.4s
1449	ror	w11,w11,#20
1450	add	v22.4s,v22.4s,v23.4s
1451	ror	w12,w12,#20
1452	eor	v24.16b,v1.16b,v2.16b		// NEON: t = b ^ c, held in v24-v29
1453	add	w5,w5,w9		// scalar: a += b
1454	eor	v25.16b,v5.16b,v6.16b
1455	add	w6,w6,w10
1456	eor	v26.16b,v9.16b,v10.16b
1457	add	w7,w7,w11
1458	eor	v27.16b,v13.16b,v14.16b
1459	add	w8,w8,w12
1460	eor	v28.16b,v17.16b,v18.16b
1461	eor	w17,w17,w5
1462	eor	v29.16b,v21.16b,v22.16b
1463	eor	w19,w19,w6
1464	ushr	v1.4s,v24.4s,#20		// NEON: b = rotl(t,12), as ushr #20 then sli #12
1465	eor	w20,w20,w7
1466	ushr	v5.4s,v25.4s,#20
1467	eor	w21,w21,w8
1468	ushr	v9.4s,v26.4s,#20
1469	ror	w17,w17,#24		// scalar: d = rotl(d,8)
1470	ushr	v13.4s,v27.4s,#20
1471	ror	w19,w19,#24
1472	ushr	v17.4s,v28.4s,#20
1473	ror	w20,w20,#24
1474	ushr	v21.4s,v29.4s,#20
1475	ror	w21,w21,#24
1476	sli	v1.4s,v24.4s,#12
1477	add	w13,w13,w17
1478	sli	v5.4s,v25.4s,#12
1479	add	w14,w14,w19
1480	sli	v9.4s,v26.4s,#12
1481	add	w15,w15,w20
1482	sli	v13.4s,v27.4s,#12
1483	add	w16,w16,w21
1484	sli	v17.4s,v28.4s,#12
1485	eor	w9,w9,w13
1486	sli	v21.4s,v29.4s,#12
1487	eor	w10,w10,w14
1488	add	v0.4s,v0.4s,v1.4s		// NEON: a += b
1489	eor	w11,w11,w15
1490	add	v4.4s,v4.4s,v5.4s
1491	eor	w12,w12,w16
1492	add	v8.4s,v8.4s,v9.4s
1493	ror	w9,w9,#25		// scalar: b = rotl(b,7)
1494	add	v12.4s,v12.4s,v13.4s
1495	ror	w10,w10,#25
1496	add	v16.4s,v16.4s,v17.4s
1497	ror	w11,w11,#25
1498	add	v20.4s,v20.4s,v21.4s
1499	ror	w12,w12,#25
1500	eor	v24.16b,v3.16b,v0.16b		// NEON: t = d ^ a
1501	add	w5,w5,w10		// scalar diagonal round: a0 += b1, a1 += b2, a2 += b3, a3 += b0
1502	eor	v25.16b,v7.16b,v4.16b
1503	add	w6,w6,w11
1504	eor	v26.16b,v11.16b,v8.16b
1505	add	w7,w7,w12
1506	eor	v27.16b,v15.16b,v12.16b
1507	add	w8,w8,w9
1508	eor	v28.16b,v19.16b,v16.16b
1509	eor	w21,w21,w5
1510	eor	v29.16b,v23.16b,v20.16b
1511	eor	w17,w17,w6
1512	ushr	v3.4s,v24.4s,#24		// NEON: d = rotl(t,8), as ushr #24 then sli #8
1513	eor	w19,w19,w7
1514	ushr	v7.4s,v25.4s,#24
1515	eor	w20,w20,w8
1516	ushr	v11.4s,v26.4s,#24
1517	ror	w21,w21,#16
1518	ushr	v15.4s,v27.4s,#24
1519	ror	w17,w17,#16
1520	ushr	v19.4s,v28.4s,#24
1521	ror	w19,w19,#16
1522	ushr	v23.4s,v29.4s,#24
1523	ror	w20,w20,#16
1524	sli	v3.4s,v24.4s,#8
1525	add	w15,w15,w21
1526	sli	v7.4s,v25.4s,#8
1527	add	w16,w16,w17
1528	sli	v11.4s,v26.4s,#8
1529	add	w13,w13,w19
1530	sli	v15.4s,v27.4s,#8
1531	add	w14,w14,w20
1532	sli	v19.4s,v28.4s,#8
1533	eor	w10,w10,w15
1534	sli	v23.4s,v29.4s,#8
1535	eor	w11,w11,w16
1536	add	v2.4s,v2.4s,v3.4s		// NEON: c += d
1537	eor	w12,w12,w13
1538	add	v6.4s,v6.4s,v7.4s
1539	eor	w9,w9,w14
1540	add	v10.4s,v10.4s,v11.4s
1541	ror	w10,w10,#20
1542	add	v14.4s,v14.4s,v15.4s
1543	ror	w11,w11,#20
1544	add	v18.4s,v18.4s,v19.4s
1545	ror	w12,w12,#20
1546	add	v22.4s,v22.4s,v23.4s
1547	ror	w9,w9,#20
1548	eor	v24.16b,v1.16b,v2.16b		// NEON: t = b ^ c
1549	add	w5,w5,w10
1550	eor	v25.16b,v5.16b,v6.16b
1551	add	w6,w6,w11
1552	eor	v26.16b,v9.16b,v10.16b
1553	add	w7,w7,w12
1554	eor	v27.16b,v13.16b,v14.16b
1555	add	w8,w8,w9
1556	eor	v28.16b,v17.16b,v18.16b
1557	eor	w21,w21,w5
1558	eor	v29.16b,v21.16b,v22.16b
1559	eor	w17,w17,w6
1560	ushr	v1.4s,v24.4s,#25		// NEON: b = rotl(t,7), as ushr #25 then sli #7
1561	eor	w19,w19,w7
1562	ushr	v5.4s,v25.4s,#25
1563	eor	w20,w20,w8
1564	ushr	v9.4s,v26.4s,#25
1565	ror	w21,w21,#24
1566	ushr	v13.4s,v27.4s,#25
1567	ror	w17,w17,#24
1568	ushr	v17.4s,v28.4s,#25
1569	ror	w19,w19,#24
1570	ushr	v21.4s,v29.4s,#25
1571	ror	w20,w20,#24
1572	sli	v1.4s,v24.4s,#7
1573	add	w15,w15,w21
1574	sli	v5.4s,v25.4s,#7
1575	add	w16,w16,w17
1576	sli	v9.4s,v26.4s,#7
1577	add	w13,w13,w19
1578	sli	v13.4s,v27.4s,#7
1579	add	w14,w14,w20
1580	sli	v17.4s,v28.4s,#7
1581	eor	w10,w10,w15
1582	sli	v21.4s,v29.4s,#7
1583	eor	w11,w11,w16
1584	ext	v2.16b,v2.16b,v2.16b,#8		// NEON: rotate every c row by two lanes
1585	eor	w12,w12,w13
1586	ext	v6.16b,v6.16b,v6.16b,#8
1587	eor	w9,w9,w14
1588	ext	v10.16b,v10.16b,v10.16b,#8
1589	ror	w10,w10,#25
1590	ext	v14.16b,v14.16b,v14.16b,#8
1591	ror	w11,w11,#25
1592	ext	v18.16b,v18.16b,v18.16b,#8
1593	ror	w12,w12,#25
1594	ext	v22.16b,v22.16b,v22.16b,#8
1595	ror	w9,w9,#25
1596	ext	v3.16b,v3.16b,v3.16b,#12		// NEON: rotate every d row by three lanes
1597	ext	v7.16b,v7.16b,v7.16b,#12
1598	ext	v11.16b,v11.16b,v11.16b,#12
1599	ext	v15.16b,v15.16b,v15.16b,#12
1600	ext	v19.16b,v19.16b,v19.16b,#12
1601	ext	v23.16b,v23.16b,v23.16b,#12
1602	ext	v1.16b,v1.16b,v1.16b,#4		// NEON: rotate every b row by one lane
1603	ext	v5.16b,v5.16b,v5.16b,#4
1604	ext	v9.16b,v9.16b,v9.16b,#4
1605	ext	v13.16b,v13.16b,v13.16b,#4
1606	ext	v17.16b,v17.16b,v17.16b,#4
1607	ext	v21.16b,v21.16b,v21.16b,#4
1608	add	v0.4s,v0.4s,v1.4s		// NEON round 2, on the lane-rotated rows: a += b
1609	add	w5,w5,w9		// scalar: second column round of this iteration
1610	add	v4.4s,v4.4s,v5.4s
1611	add	w6,w6,w10
1612	add	v8.4s,v8.4s,v9.4s
1613	add	w7,w7,w11
1614	add	v12.4s,v12.4s,v13.4s
1615	add	w8,w8,w12
1616	add	v16.4s,v16.4s,v17.4s
1617	eor	w17,w17,w5
1618	add	v20.4s,v20.4s,v21.4s
1619	eor	w19,w19,w6
1620	eor	v3.16b,v3.16b,v0.16b
1621	eor	w20,w20,w7
1622	eor	v7.16b,v7.16b,v4.16b
1623	eor	w21,w21,w8
1624	eor	v11.16b,v11.16b,v8.16b
1625	ror	w17,w17,#16
1626	eor	v15.16b,v15.16b,v12.16b
1627	ror	w19,w19,#16
1628	eor	v19.16b,v19.16b,v16.16b
1629	ror	w20,w20,#16
1630	eor	v23.16b,v23.16b,v20.16b
1631	ror	w21,w21,#16
1632	rev32	v3.8h,v3.8h
1633	add	w13,w13,w17
1634	rev32	v7.8h,v7.8h
1635	add	w14,w14,w19
1636	rev32	v11.8h,v11.8h
1637	add	w15,w15,w20
1638	rev32	v15.8h,v15.8h
1639	add	w16,w16,w21
1640	rev32	v19.8h,v19.8h
1641	eor	w9,w9,w13
1642	rev32	v23.8h,v23.8h
1643	eor	w10,w10,w14
1644	add	v2.4s,v2.4s,v3.4s
1645	eor	w11,w11,w15
1646	add	v6.4s,v6.4s,v7.4s
1647	eor	w12,w12,w16
1648	add	v10.4s,v10.4s,v11.4s
1649	ror	w9,w9,#20
1650	add	v14.4s,v14.4s,v15.4s
1651	ror	w10,w10,#20
1652	add	v18.4s,v18.4s,v19.4s
1653	ror	w11,w11,#20
1654	add	v22.4s,v22.4s,v23.4s
1655	ror	w12,w12,#20
1656	eor	v24.16b,v1.16b,v2.16b
1657	add	w5,w5,w9
1658	eor	v25.16b,v5.16b,v6.16b
1659	add	w6,w6,w10
1660	eor	v26.16b,v9.16b,v10.16b
1661	add	w7,w7,w11
1662	eor	v27.16b,v13.16b,v14.16b
1663	add	w8,w8,w12
1664	eor	v28.16b,v17.16b,v18.16b
1665	eor	w17,w17,w5
1666	eor	v29.16b,v21.16b,v22.16b
1667	eor	w19,w19,w6
1668	ushr	v1.4s,v24.4s,#20
1669	eor	w20,w20,w7
1670	ushr	v5.4s,v25.4s,#20
1671	eor	w21,w21,w8
1672	ushr	v9.4s,v26.4s,#20
1673	ror	w17,w17,#24
1674	ushr	v13.4s,v27.4s,#20
1675	ror	w19,w19,#24
1676	ushr	v17.4s,v28.4s,#20
1677	ror	w20,w20,#24
1678	ushr	v21.4s,v29.4s,#20
1679	ror	w21,w21,#24
1680	sli	v1.4s,v24.4s,#12
1681	add	w13,w13,w17
1682	sli	v5.4s,v25.4s,#12
1683	add	w14,w14,w19
1684	sli	v9.4s,v26.4s,#12
1685	add	w15,w15,w20
1686	sli	v13.4s,v27.4s,#12
1687	add	w16,w16,w21
1688	sli	v17.4s,v28.4s,#12
1689	eor	w9,w9,w13
1690	sli	v21.4s,v29.4s,#12
1691	eor	w10,w10,w14
1692	add	v0.4s,v0.4s,v1.4s
1693	eor	w11,w11,w15
1694	add	v4.4s,v4.4s,v5.4s
1695	eor	w12,w12,w16
1696	add	v8.4s,v8.4s,v9.4s
1697	ror	w9,w9,#25
1698	add	v12.4s,v12.4s,v13.4s
1699	ror	w10,w10,#25
1700	add	v16.4s,v16.4s,v17.4s
1701	ror	w11,w11,#25
1702	add	v20.4s,v20.4s,v21.4s
1703	ror	w12,w12,#25
1704	eor	v24.16b,v3.16b,v0.16b
1705	add	w5,w5,w10		// scalar: second diagonal round of this iteration
1706	eor	v25.16b,v7.16b,v4.16b
1707	add	w6,w6,w11
1708	eor	v26.16b,v11.16b,v8.16b
1709	add	w7,w7,w12
1710	eor	v27.16b,v15.16b,v12.16b
1711	add	w8,w8,w9
1712	eor	v28.16b,v19.16b,v16.16b
1713	eor	w21,w21,w5
1714	eor	v29.16b,v23.16b,v20.16b
1715	eor	w17,w17,w6
1716	ushr	v3.4s,v24.4s,#24
1717	eor	w19,w19,w7
1718	ushr	v7.4s,v25.4s,#24
1719	eor	w20,w20,w8
1720	ushr	v11.4s,v26.4s,#24
1721	ror	w21,w21,#16
1722	ushr	v15.4s,v27.4s,#24
1723	ror	w17,w17,#16
1724	ushr	v19.4s,v28.4s,#24
1725	ror	w19,w19,#16
1726	ushr	v23.4s,v29.4s,#24
1727	ror	w20,w20,#16
1728	sli	v3.4s,v24.4s,#8
1729	add	w15,w15,w21
1730	sli	v7.4s,v25.4s,#8
1731	add	w16,w16,w17
1732	sli	v11.4s,v26.4s,#8
1733	add	w13,w13,w19
1734	sli	v15.4s,v27.4s,#8
1735	add	w14,w14,w20
1736	sli	v19.4s,v28.4s,#8
1737	eor	w10,w10,w15
1738	sli	v23.4s,v29.4s,#8
1739	eor	w11,w11,w16
1740	add	v2.4s,v2.4s,v3.4s
1741	eor	w12,w12,w13
1742	add	v6.4s,v6.4s,v7.4s
1743	eor	w9,w9,w14
1744	add	v10.4s,v10.4s,v11.4s
1745	ror	w10,w10,#20
1746	add	v14.4s,v14.4s,v15.4s
1747	ror	w11,w11,#20
1748	add	v18.4s,v18.4s,v19.4s
1749	ror	w12,w12,#20
1750	add	v22.4s,v22.4s,v23.4s
1751	ror	w9,w9,#20
1752	eor	v24.16b,v1.16b,v2.16b
1753	add	w5,w5,w10
1754	eor	v25.16b,v5.16b,v6.16b
1755	add	w6,w6,w11
1756	eor	v26.16b,v9.16b,v10.16b
1757	add	w7,w7,w12
1758	eor	v27.16b,v13.16b,v14.16b
1759	add	w8,w8,w9
1760	eor	v28.16b,v17.16b,v18.16b
1761	eor	w21,w21,w5
1762	eor	v29.16b,v21.16b,v22.16b
1763	eor	w17,w17,w6
1764	ushr	v1.4s,v24.4s,#25
1765	eor	w19,w19,w7
1766	ushr	v5.4s,v25.4s,#25
1767	eor	w20,w20,w8
1768	ushr	v9.4s,v26.4s,#25
1769	ror	w21,w21,#24
1770	ushr	v13.4s,v27.4s,#25
1771	ror	w17,w17,#24
1772	ushr	v17.4s,v28.4s,#25
1773	ror	w19,w19,#24
1774	ushr	v21.4s,v29.4s,#25
1775	ror	w20,w20,#24
1776	sli	v1.4s,v24.4s,#7
1777	add	w15,w15,w21
1778	sli	v5.4s,v25.4s,#7
1779	add	w16,w16,w17
1780	sli	v9.4s,v26.4s,#7
1781	add	w13,w13,w19
1782	sli	v13.4s,v27.4s,#7
1783	add	w14,w14,w20
1784	sli	v17.4s,v28.4s,#7
1785	eor	w10,w10,w15
1786	sli	v21.4s,v29.4s,#7
1787	eor	w11,w11,w16
1788	ext	v2.16b,v2.16b,v2.16b,#8		// NEON: c rows rotated by two more lanes, back to their starting order
1789	eor	w12,w12,w13
1790	ext	v6.16b,v6.16b,v6.16b,#8
1791	eor	w9,w9,w14
1792	ext	v10.16b,v10.16b,v10.16b,#8
1793	ror	w10,w10,#25
1794	ext	v14.16b,v14.16b,v14.16b,#8
1795	ror	w11,w11,#25
1796	ext	v18.16b,v18.16b,v18.16b,#8
1797	ror	w12,w12,#25
1798	ext	v22.16b,v22.16b,v22.16b,#8
1799	ror	w9,w9,#25
1800	ext	v3.16b,v3.16b,v3.16b,#4		// NEON: d rows by one lane, undoing the earlier #12
1801	ext	v7.16b,v7.16b,v7.16b,#4
1802	ext	v11.16b,v11.16b,v11.16b,#4
1803	ext	v15.16b,v15.16b,v15.16b,#4
1804	ext	v19.16b,v19.16b,v19.16b,#4
1805	ext	v23.16b,v23.16b,v23.16b,#4
1806	ext	v1.16b,v1.16b,v1.16b,#12		// NEON: b rows by three lanes, undoing the earlier #4
1807	ext	v5.16b,v5.16b,v5.16b,#12
1808	ext	v9.16b,v9.16b,v9.16b,#12
1809	ext	v13.16b,v13.16b,v13.16b,#12
1810	ext	v17.16b,v17.16b,v17.16b,#12
1811	ext	v21.16b,v21.16b,v21.16b,#12
1812	cbnz	x4,.Loop_lower_neon		// repeat until x4 reaches zero
1813
// Output stage after .Loop_lower_neon: finishes one scalar block and six
// NEON blocks, consuming 448 bytes from x1 and producing 448 bytes at x0.
//   scalar - same accumulate / pack / XOR / store sequence as used for
//            the other scalar block, then the counter in x28 advances by 7.
//   NEON   - v24, v25 and v26 (reloaded from the stack off-load area) are
//            added to every a, b and c row respectively; each d row gets
//            its own addend from v27-v30, with the last two blocks taking
//            v31 ("+4") plus v27/v28. Blocks are then XORed with input and
//            stored 64 bytes at a time, using registers that have already
//            been stored as buffers for the next input chunk.
// No instruction in this stretch sets the condition flags, so the final
// b.hs consumes flags produced earlier in the function.
1814	add	w5,w5,w22		// accumulate key block
1815	ldp	q24,q25,[sp,#0]		// reload saved vectors from the off-load area
1816	add	x6,x6,x22,lsr#32
1817	ldp	q26,q27,[sp,#32]
1818	add	w7,w7,w23
1819	ldp	q28,q29,[sp,#64]
1820	add	x8,x8,x23,lsr#32
1821	add	v0.4s,v0.4s,v24.4s		// NEON: v24 goes into every a row
1822	add	w9,w9,w24
1823	add	v4.4s,v4.4s,v24.4s
1824	add	x10,x10,x24,lsr#32
1825	add	v8.4s,v8.4s,v24.4s
1826	add	w11,w11,w25
1827	add	v12.4s,v12.4s,v24.4s
1828	add	x12,x12,x25,lsr#32
1829	add	v16.4s,v16.4s,v24.4s
1830	add	w13,w13,w26
1831	add	v20.4s,v20.4s,v24.4s
1832	add	x14,x14,x26,lsr#32
1833	add	v2.4s,v2.4s,v26.4s		// NEON: v26 goes into every c row
1834	add	w15,w15,w27
1835	add	v6.4s,v6.4s,v26.4s
1836	add	x16,x16,x27,lsr#32
1837	add	v10.4s,v10.4s,v26.4s
1838	add	w17,w17,w28
1839	add	v14.4s,v14.4s,v26.4s
1840	add	x19,x19,x28,lsr#32
1841	add	v18.4s,v18.4s,v26.4s
1842	add	w20,w20,w30
1843	add	v22.4s,v22.4s,v26.4s
1844	add	x21,x21,x30,lsr#32
1845	add	v19.4s,v19.4s,v31.4s			// +4
1846	add	x5,x5,x6,lsl#32	// pack
1847	add	v23.4s,v23.4s,v31.4s			// +4
1848	add	x7,x7,x8,lsl#32
1849	add	v3.4s,v3.4s,v27.4s		// NEON: per-block addends for the d rows
1850	ldp	x6,x8,[x1,#0]		// load input
1851	add	v7.4s,v7.4s,v28.4s
1852	add	x9,x9,x10,lsl#32
1853	add	v11.4s,v11.4s,v29.4s
1854	add	x11,x11,x12,lsl#32
1855	add	v15.4s,v15.4s,v30.4s
1856	ldp	x10,x12,[x1,#16]
1857	add	v19.4s,v19.4s,v27.4s
1858	add	x13,x13,x14,lsl#32
1859	add	v23.4s,v23.4s,v28.4s
1860	add	x15,x15,x16,lsl#32
1861	add	v1.4s,v1.4s,v25.4s		// NEON: v25 goes into every b row
1862	ldp	x14,x16,[x1,#32]
1863	add	v5.4s,v5.4s,v25.4s
1864	add	x17,x17,x19,lsl#32
1865	add	v9.4s,v9.4s,v25.4s
1866	add	x20,x20,x21,lsl#32
1867	add	v13.4s,v13.4s,v25.4s
1868	ldp	x19,x21,[x1,#48]
1869	add	v17.4s,v17.4s,v25.4s
1870	add	x1,x1,#64
1871	add	v21.4s,v21.4s,v25.4s
1872
1873#ifdef	__ARMEB__
1874	rev	x5,x5		// big-endian builds only: byte-reverse each packed register
1875	rev	x7,x7
1876	rev	x9,x9
1877	rev	x11,x11
1878	rev	x13,x13
1879	rev	x15,x15
1880	rev	x17,x17
1881	rev	x20,x20
1882#endif
1883	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64		// input for NEON block v0-v3; x1 post-incremented
1884	eor	x5,x5,x6
1885	eor	x7,x7,x8
1886	eor	x9,x9,x10
1887	eor	x11,x11,x12
1888	eor	x13,x13,x14
1889	eor	v0.16b,v0.16b,v24.16b
1890	eor	x15,x15,x16
1891	eor	v1.16b,v1.16b,v25.16b
1892	eor	x17,x17,x19
1893	eor	v2.16b,v2.16b,v26.16b
1894	eor	x20,x20,x21
1895	eor	v3.16b,v3.16b,v27.16b
1896	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64		// input for block v4-v7
1897
1898	stp	x5,x7,[x0,#0]		// store output
1899	add	x28,x28,#7			// increment counter
1900	stp	x9,x11,[x0,#16]
1901	stp	x13,x15,[x0,#32]
1902	stp	x17,x20,[x0,#48]
1903	add	x0,x0,#64
1904	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1905
1906	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64		// v0-v3 already stored; reuse for block v8-v11 input
1907	eor	v4.16b,v4.16b,v24.16b
1908	eor	v5.16b,v5.16b,v25.16b
1909	eor	v6.16b,v6.16b,v26.16b
1910	eor	v7.16b,v7.16b,v27.16b
1911	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1912
1913	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64		// input for block v12-v15
1914	eor	v8.16b,v8.16b,v0.16b
1915	ldp	q24,q25,[sp,#0]		// v24-v27 are no longer needed as input buffers; restore them
1916	eor	v9.16b,v9.16b,v1.16b
1917	ldp	q26,q27,[sp,#32]
1918	eor	v10.16b,v10.16b,v2.16b
1919	eor	v11.16b,v11.16b,v3.16b
1920	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1921
1922	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64		// input for block v16-v19
1923	eor	v12.16b,v12.16b,v4.16b
1924	eor	v13.16b,v13.16b,v5.16b
1925	eor	v14.16b,v14.16b,v6.16b
1926	eor	v15.16b,v15.16b,v7.16b
1927	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1928
1929	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64		// input for block v20-v23
1930	eor	v16.16b,v16.16b,v8.16b
1931	eor	v17.16b,v17.16b,v9.16b
1932	eor	v18.16b,v18.16b,v10.16b
1933	eor	v19.16b,v19.16b,v11.16b
1934	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1935
1936	shl	v0.4s,v31.4s,#1			// 4 -> 8
1937	eor	v20.16b,v20.16b,v12.16b
1938	eor	v21.16b,v21.16b,v13.16b
1939	eor	v22.16b,v22.16b,v14.16b
1940	eor	v23.16b,v23.16b,v15.16b
1941	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1942
1943	add	v27.4s,v27.4s,v0.4s			// += 8
1944	add	v28.4s,v28.4s,v0.4s
1945	add	v29.4s,v29.4s,v0.4s
1946	add	v30.4s,v30.4s,v0.4s
1947
1948	b.hs	.Loop_outer_512_neon		// flags come from before the round loops (not visible in this stretch)
1949
// Reached when the 512-byte outer loop is not taken again. Adds 512 back
// to the length in x2 and picks one of three exits based on the result:
//   x2 == 0   -> .Ldone_512_neon (function epilogue)
//   x2 >= 192 -> .Loop_outer_neon
//   otherwise -> .Loop_outer (the scalar loop)
// Before leaving, d8-d15 are reloaded from the stack and the 96-byte
// off-load area is overwritten.
1950	adds	x2,x2,#512		// sets Z when no bytes remain; flags are consumed by b.eq below
1951	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1952
1953	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1954	ldp	d10,d11,[sp,#128+16]
1955	ldp	d12,d13,[sp,#128+32]
1956	ldp	d14,d15,[sp,#128+48]
1957
1958	stp	q24,q31,[sp,#0]		// wipe off-load area
1959	stp	q24,q31,[sp,#32]
1960	stp	q24,q31,[sp,#64]
1961
1962	b.eq	.Ldone_512_neon		// loads/stores above leave the flags from adds intact
1963
1964	cmp	x2,#192		// enough left for .Loop_outer_neon?
1965	sub	v27.4s,v27.4s,v0.4s			// -= 1
1966	sub	v28.4s,v28.4s,v0.4s
1967	sub	v29.4s,v29.4s,v0.4s
1968	add	sp,sp,#128		// release the lowest 128 bytes of stack, including the off-load area
1969	b.hs	.Loop_outer_neon		// unsigned x2 >= 192, from the cmp above
1970
1971	eor	v25.16b,v25.16b,v25.16b		// zero v25-v30 before handing over to the scalar loop
1972	eor	v26.16b,v26.16b,v26.16b
1973	eor	v27.16b,v27.16b,v27.16b
1974	eor	v28.16b,v28.16b,v28.16b
1975	eor	v29.16b,v29.16b,v29.16b
1976	eor	v30.16b,v30.16b,v30.16b
1977	b	.Loop_outer
1978
// Epilogue of ChaCha20_512_neon, entered when no input remains.
// Restores x19-x28 from the frame addressed by x29, releases the 128+64
// bytes of local stack, pops x29/x30 together with the 96-byte register
// save frame and returns. d8-d15 were already reloaded before the branch
// to this label.
// NOTE(review): the matching prologue is outside this section; the offsets
// are assumed to mirror it - confirm against the function entry.
1979.Ldone_512_neon:
1980	ldp	x19,x20,[x29,#16]		// callee-saved registers, read via the frame pointer
1981	add	sp,sp,#128+64		// free local stack; x29-relative loads do not depend on sp
1982	ldp	x21,x22,[x29,#32]
1983	ldp	x23,x24,[x29,#48]
1984	ldp	x25,x26,[x29,#64]
1985	ldp	x27,x28,[x29,#80]
1986	ldp	x29,x30,[sp],#96		// restore frame pointer and link register, then drop the 96-byte frame
1987	AARCH64_VALIDATE_LINK_REGISTER		// macro defined outside this file (presumably arm_arch.h) - TODO confirm expansion
1988	ret
1989.size	ChaCha20_512_neon,.-ChaCha20_512_neon
1990#endif
1991#endif  // !OPENSSL_NO_ASM
1992.section	.note.GNU-stack,"",%progbits
1993