• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// This file is generated from a similarly-named Perl script in the BoringSSL
2// source tree. Do not edit by hand.
3
4#if !defined(__has_feature)
5#define __has_feature(x) 0
6#endif
7#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
8#define OPENSSL_NO_ASM
9#endif
10
11#if !defined(OPENSSL_NO_ASM)
12#if defined(__aarch64__)
13#if defined(BORINGSSL_PREFIX)
14#include <boringssl_prefix_symbols_asm.h>
15#endif
16#include <openssl/arm_arch.h>
17
18
19
20
// Read-only constants shared by the ChaCha20 routines in this file.
//   Lsigma - the two quads are the ASCII text "expand 32-byte k" when laid
//            out little-endian; the routines load them as the first four
//            state words.
//   Lone   - the vector {1,0,0,0}; the NEON code loads it right after Lsigma
//            and adds it to the counter block to step the first 32-bit word.
//   The .byte run is the NUL-terminated ASCII identification string
//   "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>".
21.section	.rodata
22
23.align	5
24Lsigma:
25.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
26Lone:
27.long	1,0,0,0
28.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
29.align	2
30
31.text
32
// ChaCha20_ctr32: exported entry point.
// Register usage, as established by this routine and the code it reaches:
//   x0 = output pointer          x1 = input pointer
//   x2 = length in bytes         x3 = key pointer (32 bytes are read)
//   x4 = counter block pointer (16 bytes are read)
// Dispatch:
//   - length of zero: return immediately through Labort;
//   - length below 192: always take the integer-only path at Lshort;
//   - otherwise read OPENSSL_armcap_P and branch to ChaCha20_neon when the
//     ARMV7_NEON bit is set, else fall through into Lshort.
33.globl	ChaCha20_ctr32
34
35.def ChaCha20_ctr32
36   .type 32
37.endef
38.align	5
39ChaCha20_ctr32:
40	AARCH64_VALID_CALL_TARGET
41	cbz	x2,Labort		// nothing to process
42#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
43	adrp	x5,:pg_hi21_nc:OPENSSL_armcap_P
44#else
45	adrp	x5,OPENSSL_armcap_P
46#endif
47	cmp	x2,#192
48	b.lo	Lshort		// short input: skip the capability check
49	ldr	w17,[x5,:lo12:OPENSSL_armcap_P]		// capability word
50	tst	w17,#ARMV7_NEON
51	b.ne	ChaCha20_neon		// NEON bit set: use the vector path
52
// Lshort: start of the integer-only path.
// Builds a 96-byte frame holding x29/x30 and x19-x28, then reserves 64 more
// bytes below it; the tail code uses that area to hold one keystream block.
// The input state is then loaded two 32-bit words per 64-bit register:
//   x22,x23 = sigma      x24-x27 = key      x28,x30 = counter block
// x30 is reused as a data register here; the link register value was saved
// in the frame and is reloaded by the epilogues.
53Lshort:
54	AARCH64_SIGN_LINK_REGISTER
55	stp	x29,x30,[sp,#-96]!
56	add	x29,sp,#0		// x29 = frame base, used to restore registers later
57
58	adrp	x5,Lsigma
59	add	x5,x5,:lo12:Lsigma
60	stp	x19,x20,[sp,#16]
61	stp	x21,x22,[sp,#32]
62	stp	x23,x24,[sp,#48]
63	stp	x25,x26,[sp,#64]
64	stp	x27,x28,[sp,#80]
65	sub	sp,sp,#64		// scratch for one keystream block
66
67	ldp	x22,x23,[x5]		// load sigma
68	ldp	x24,x25,[x3]		// load key
69	ldp	x26,x27,[x3,#16]
70	ldp	x28,x30,[x4]		// load counter
71#ifdef	__ARMEB__
// Big-endian build: swap the two 32-bit halves of each loaded pair.
72	ror	x24,x24,#32
73	ror	x25,x25,#32
74	ror	x26,x26,#32
75	ror	x27,x27,#32
76	ror	x28,x28,#32
77	ror	x30,x30,#32
78#endif
79
// Loop_outer: executed once per 64-byte block on the integer-only path.
// Splits the packed input state (x22-x28, x30) into sixteen 32-bit working
// words. The low half of each source register goes to the first register of
// a pair and the high half to the second:
//   w5-w8   <- x22,x23 (sigma)
//   w9-w16  <- x24-x27 (key)
//   w17,w19,w20,w21 <- x28,x30 (counter block); x18 is not used.
// x4 is then set to 10 loop iterations, and 64 is subtracted from the
// remaining length with the flags kept for the branches after the rounds.
80Loop_outer:
81	mov	w5,w22			// unpack key block
82	lsr	x6,x22,#32
83	mov	w7,w23
84	lsr	x8,x23,#32
85	mov	w9,w24
86	lsr	x10,x24,#32
87	mov	w11,w25
88	lsr	x12,x25,#32
89	mov	w13,w26
90	lsr	x14,x26,#32
91	mov	w15,w27
92	lsr	x16,x27,#32
93	mov	w17,w28
94	lsr	x19,x28,#32
95	mov	w20,w30
96	lsr	x21,x30,#32
97
98	mov	x4,#10		// iteration count for the round loop
99	subs	x2,x2,#64		// flags are consumed after the round loop
// Loop: one ChaCha double round per iteration on the sixteen working words,
// repeated until x4 reaches zero.
// Each step (add, xor, rotate) is applied to four quarter-rounds at once.
// Rotations use ror, so ror by 16/20/24/25 equals a left rotate by 16/12/8/7.
// No instruction in the loop writes the condition flags, so the flags set
// before the loop are still intact when it exits.
100Loop:
101	sub	x4,x4,#1
// First half: quarter-rounds on (w5,w9,w13,w17) (w6,w10,w14,w19)
// (w7,w11,w15,w20) (w8,w12,w16,w21).
102	add	w5,w5,w9
103	add	w6,w6,w10
104	add	w7,w7,w11
105	add	w8,w8,w12
106	eor	w17,w17,w5
107	eor	w19,w19,w6
108	eor	w20,w20,w7
109	eor	w21,w21,w8
110	ror	w17,w17,#16
111	ror	w19,w19,#16
112	ror	w20,w20,#16
113	ror	w21,w21,#16
114	add	w13,w13,w17
115	add	w14,w14,w19
116	add	w15,w15,w20
117	add	w16,w16,w21
118	eor	w9,w9,w13
119	eor	w10,w10,w14
120	eor	w11,w11,w15
121	eor	w12,w12,w16
122	ror	w9,w9,#20
123	ror	w10,w10,#20
124	ror	w11,w11,#20
125	ror	w12,w12,#20
126	add	w5,w5,w9
127	add	w6,w6,w10
128	add	w7,w7,w11
129	add	w8,w8,w12
130	eor	w17,w17,w5
131	eor	w19,w19,w6
132	eor	w20,w20,w7
133	eor	w21,w21,w8
134	ror	w17,w17,#24
135	ror	w19,w19,#24
136	ror	w20,w20,#24
137	ror	w21,w21,#24
138	add	w13,w13,w17
139	add	w14,w14,w19
140	add	w15,w15,w20
141	add	w16,w16,w21
142	eor	w9,w9,w13
143	eor	w10,w10,w14
144	eor	w11,w11,w15
145	eor	w12,w12,w16
146	ror	w9,w9,#25
147	ror	w10,w10,#25
148	ror	w11,w11,#25
149	ror	w12,w12,#25
// Second half: quarter-rounds on (w5,w10,w15,w21) (w6,w11,w16,w17)
// (w7,w12,w13,w19) (w8,w9,w14,w20).
150	add	w5,w5,w10
151	add	w6,w6,w11
152	add	w7,w7,w12
153	add	w8,w8,w9
154	eor	w21,w21,w5
155	eor	w17,w17,w6
156	eor	w19,w19,w7
157	eor	w20,w20,w8
158	ror	w21,w21,#16
159	ror	w17,w17,#16
160	ror	w19,w19,#16
161	ror	w20,w20,#16
162	add	w15,w15,w21
163	add	w16,w16,w17
164	add	w13,w13,w19
165	add	w14,w14,w20
166	eor	w10,w10,w15
167	eor	w11,w11,w16
168	eor	w12,w12,w13
169	eor	w9,w9,w14
170	ror	w10,w10,#20
171	ror	w11,w11,#20
172	ror	w12,w12,#20
173	ror	w9,w9,#20
174	add	w5,w5,w10
175	add	w6,w6,w11
176	add	w7,w7,w12
177	add	w8,w8,w9
178	eor	w21,w21,w5
179	eor	w17,w17,w6
180	eor	w19,w19,w7
181	eor	w20,w20,w8
182	ror	w21,w21,#24
183	ror	w17,w17,#24
184	ror	w19,w19,#24
185	ror	w20,w20,#24
186	add	w15,w15,w21
187	add	w16,w16,w17
188	add	w13,w13,w19
189	add	w14,w14,w20
190	eor	w10,w10,w15
191	eor	w11,w11,w16
192	eor	w12,w12,w13
193	eor	w9,w9,w14
194	ror	w10,w10,#25
195	ror	w11,w11,#25
196	ror	w12,w12,#25
197	ror	w9,w9,#25
198	cbnz	x4,Loop
199
// Finish one 64-byte block on the integer-only path.
//  1. Add the original input words (x22-x28, x30) back into the working
//     words; the 64-bit adds with lsr 32 pick up the high input halves.
//  2. If the flags from the earlier length subtraction show that fewer than
//     64 bytes were left, go to Ltail for a partial block.
//  3. Otherwise repack word pairs into 64-bit registers, XOR with 64 bytes of
//     input, store 64 bytes of output and advance both pointers.
//  4. Step the block counter and loop while more input remains; when the
//     length was consumed exactly, restore registers and return.
// Labort is the bare return that the entry point also uses for zero length.
200	add	w5,w5,w22		// accumulate key block
201	add	x6,x6,x22,lsr#32
202	add	w7,w7,w23
203	add	x8,x8,x23,lsr#32
204	add	w9,w9,w24
205	add	x10,x10,x24,lsr#32
206	add	w11,w11,w25
207	add	x12,x12,x25,lsr#32
208	add	w13,w13,w26
209	add	x14,x14,x26,lsr#32
210	add	w15,w15,w27
211	add	x16,x16,x27,lsr#32
212	add	w17,w17,w28
213	add	x19,x19,x28,lsr#32
214	add	w20,w20,w30
215	add	x21,x21,x30,lsr#32
216
217	b.lo	Ltail		// remaining length was below 64
218
219	add	x5,x5,x6,lsl#32	// pack
220	add	x7,x7,x8,lsl#32
221	ldp	x6,x8,[x1,#0]		// load input
222	add	x9,x9,x10,lsl#32
223	add	x11,x11,x12,lsl#32
224	ldp	x10,x12,[x1,#16]
225	add	x13,x13,x14,lsl#32
226	add	x15,x15,x16,lsl#32
227	ldp	x14,x16,[x1,#32]
228	add	x17,x17,x19,lsl#32
229	add	x20,x20,x21,lsl#32
230	ldp	x19,x21,[x1,#48]
231	add	x1,x1,#64
232#ifdef	__ARMEB__
233	rev	x5,x5
234	rev	x7,x7
235	rev	x9,x9
236	rev	x11,x11
237	rev	x13,x13
238	rev	x15,x15
239	rev	x17,x17
240	rev	x20,x20
241#endif
242	eor	x5,x5,x6
243	eor	x7,x7,x8
244	eor	x9,x9,x10
245	eor	x11,x11,x12
246	eor	x13,x13,x14
247	eor	x15,x15,x16
248	eor	x17,x17,x19
249	eor	x20,x20,x21
250
251	stp	x5,x7,[x0,#0]		// store output
252	add	x28,x28,#1			// increment counter
253	stp	x9,x11,[x0,#16]
254	stp	x13,x15,[x0,#32]
255	stp	x17,x20,[x0,#48]
256	add	x0,x0,#64
257
258	b.hi	Loop_outer		// flags still come from the length subtraction
259
// Epilogue: callee-saved registers are reloaded relative to x29, the 64-byte
// scratch area is dropped, then the frame itself is popped.
260	ldp	x19,x20,[x29,#16]
261	add	sp,sp,#64
262	ldp	x21,x22,[x29,#32]
263	ldp	x23,x24,[x29,#48]
264	ldp	x25,x26,[x29,#64]
265	ldp	x27,x28,[x29,#80]
266	ldp	x29,x30,[sp],#96
267	AARCH64_VALIDATE_LINK_REGISTER
268Labort:
269	ret
270
// Ltail / Less_than_64: handle a final partial block of fewer than 64 bytes.
// Ltail first adds 64 back to x2 to recover the real remaining byte count.
// Less_than_64 is also branched to from Ltail_neon, which arrives with x2
// already holding that count.
// The keystream block in the working registers is packed and written to the
// 64-byte scratch area at sp. Input is then XORed with it one byte at a time.
// The pointers are biased so that one negative index in x2 counts up to zero:
//   x1 = input end, x4 = scratch + count, x0 = output end - 1, x2 = -count
// The store happens after x2 is incremented, which cancels the -1 in x0.
// Before returning, the keystream copy on the stack is overwritten with zeros.
271.align	4
272Ltail:
273	add	x2,x2,#64		// recover the remaining byte count
274Less_than_64:
275	sub	x0,x0,#1
276	add	x1,x1,x2
277	add	x0,x0,x2
278	add	x4,sp,x2
279	neg	x2,x2
280
281	add	x5,x5,x6,lsl#32	// pack
282	add	x7,x7,x8,lsl#32
283	add	x9,x9,x10,lsl#32
284	add	x11,x11,x12,lsl#32
285	add	x13,x13,x14,lsl#32
286	add	x15,x15,x16,lsl#32
287	add	x17,x17,x19,lsl#32
288	add	x20,x20,x21,lsl#32
289#ifdef	__ARMEB__
290	rev	x5,x5
291	rev	x7,x7
292	rev	x9,x9
293	rev	x11,x11
294	rev	x13,x13
295	rev	x15,x15
296	rev	x17,x17
297	rev	x20,x20
298#endif
299	stp	x5,x7,[sp,#0]		// spill the keystream block to scratch
300	stp	x9,x11,[sp,#16]
301	stp	x13,x15,[sp,#32]
302	stp	x17,x20,[sp,#48]
303
304Loop_tail:
305	ldrb	w10,[x1,x2]		// input byte
306	ldrb	w11,[x4,x2]		// keystream byte
307	add	x2,x2,#1
308	eor	w10,w10,w11
309	strb	w10,[x0,x2]		// output byte (x0 is biased by -1)
310	cbnz	x2,Loop_tail
311
312	stp	xzr,xzr,[sp,#0]		// wipe the keystream copy
313	stp	xzr,xzr,[sp,#16]
314	stp	xzr,xzr,[sp,#32]
315	stp	xzr,xzr,[sp,#48]
316
317	ldp	x19,x20,[x29,#16]
318	add	sp,sp,#64
319	ldp	x21,x22,[x29,#32]
320	ldp	x23,x24,[x29,#48]
321	ldp	x25,x26,[x29,#64]
322	ldp	x27,x28,[x29,#80]
323	ldp	x29,x30,[sp],#96
324	AARCH64_VALIDATE_LINK_REGISTER
325	ret
326
327
// ChaCha20_neon: vector path, reached by a branch from ChaCha20_ctr32 with
// x0-x4 unchanged (output, input, length, key pointer, counter pointer).
// It builds the same 96-byte frame as the integer path. Lengths of 512 or
// more continue at L512_or_more_neon; shorter ones reserve a 64-byte scratch
// area and process four blocks per outer iteration: one in general-purpose
// registers and three in vector registers.
// State after this prologue:
//   x22,x23 / v24     = sigma
//   x24-x27 / v25,v26 = key
//   x28,x30           = counter block for the integer block
//   v27,v28,v29       = counter block with its first word stepped by 1, 2, 3
//   v31               = {4,0,0,0}, the per-iteration counter step
328.def ChaCha20_neon
329   .type 32
330.endef
331.align	5
332ChaCha20_neon:
333	AARCH64_SIGN_LINK_REGISTER
334	stp	x29,x30,[sp,#-96]!
335	add	x29,sp,#0
336
337	adrp	x5,Lsigma
338	add	x5,x5,:lo12:Lsigma
339	stp	x19,x20,[sp,#16]
340	stp	x21,x22,[sp,#32]
341	stp	x23,x24,[sp,#48]
342	stp	x25,x26,[sp,#64]
343	stp	x27,x28,[sp,#80]
344	cmp	x2,#512
345	b.hs	L512_or_more_neon
346
347	sub	sp,sp,#64
348
349	ldp	x22,x23,[x5]		// load sigma
350	ld1	{v24.4s},[x5],#16		// x5 now points just past sigma, at Lone
351	ldp	x24,x25,[x3]		// load key
352	ldp	x26,x27,[x3,#16]
353	ld1	{v25.4s,v26.4s},[x3]
354	ldp	x28,x30,[x4]		// load counter
355	ld1	{v27.4s},[x4]
356	ld1	{v31.4s},[x5]		// v31 = {1,0,0,0}
357#ifdef	__ARMEB__
358	rev64	v24.4s,v24.4s
359	ror	x24,x24,#32
360	ror	x25,x25,#32
361	ror	x26,x26,#32
362	ror	x27,x27,#32
363	ror	x28,x28,#32
364	ror	x30,x30,#32
365#endif
366	add	v27.4s,v27.4s,v31.4s		// += 1
367	add	v28.4s,v27.4s,v31.4s
368	add	v29.4s,v28.4s,v31.4s
369	shl	v31.4s,v31.4s,#2			// 1 -> 4
370
// Loop_outer_neon: executed once per group of four blocks (256 bytes).
// Initialises the working state for all four blocks from the saved input:
//   integer block : w5-w17, w19-w21 unpacked from x22-x28, x30
//   vector block A: v0-v3    (rows: sigma, key, key, counter v27)
//   vector block B: v4-v7    (counter v28)
//   vector block C: v16-v19  (counter v29)
// x4 is then set to 10 loop iterations, and 256 is subtracted from the
// remaining length with the flags kept for the branches after the rounds.
371Loop_outer_neon:
372	mov	w5,w22			// unpack key block
373	lsr	x6,x22,#32
374	mov	v0.16b,v24.16b
375	mov	w7,w23
376	lsr	x8,x23,#32
377	mov	v4.16b,v24.16b
378	mov	w9,w24
379	lsr	x10,x24,#32
380	mov	v16.16b,v24.16b
381	mov	w11,w25
382	mov	v1.16b,v25.16b
383	lsr	x12,x25,#32
384	mov	v5.16b,v25.16b
385	mov	w13,w26
386	mov	v17.16b,v25.16b
387	lsr	x14,x26,#32
388	mov	v3.16b,v27.16b
389	mov	w15,w27
390	mov	v7.16b,v28.16b
391	lsr	x16,x27,#32
392	mov	v19.16b,v29.16b
393	mov	w17,w28
394	mov	v2.16b,v26.16b
395	lsr	x19,x28,#32
396	mov	v6.16b,v26.16b
397	mov	w20,w30
398	mov	v18.16b,v26.16b
399	lsr	x21,x30,#32
400
401	mov	x4,#10		// iteration count for the round loop
402	subs	x2,x2,#256		// flags are consumed after the round loop
// Loop_neon: one ChaCha double round per iteration, applied to four blocks
// at once, repeated until x4 reaches zero. Integer instructions for the
// register block are interleaved with vector instructions for the three
// NEON blocks (v0-v3, v4-v7, v16-v19); v20-v22 are temporaries.
// Vector rotations: rev32 on .8h lanes rotates each 32-bit word by 16, and
// each ushr/sli pair with shifts (20,12), (24,8) or (25,7) is a left rotate
// by 12, 8 or 7. The ext instructions rotate rows b, c and d by whole words
// between the two halves, and rotate them back at the end of the iteration.
// No instruction in the loop writes the condition flags.
403Loop_neon:
404	sub	x4,x4,#1
// First half of the double round.
405	add	v0.4s,v0.4s,v1.4s
406	add	w5,w5,w9
407	add	v4.4s,v4.4s,v5.4s
408	add	w6,w6,w10
409	add	v16.4s,v16.4s,v17.4s
410	add	w7,w7,w11
411	eor	v3.16b,v3.16b,v0.16b
412	add	w8,w8,w12
413	eor	v7.16b,v7.16b,v4.16b
414	eor	w17,w17,w5
415	eor	v19.16b,v19.16b,v16.16b
416	eor	w19,w19,w6
417	rev32	v3.8h,v3.8h
418	eor	w20,w20,w7
419	rev32	v7.8h,v7.8h
420	eor	w21,w21,w8
421	rev32	v19.8h,v19.8h
422	ror	w17,w17,#16
423	add	v2.4s,v2.4s,v3.4s
424	ror	w19,w19,#16
425	add	v6.4s,v6.4s,v7.4s
426	ror	w20,w20,#16
427	add	v18.4s,v18.4s,v19.4s
428	ror	w21,w21,#16
429	eor	v20.16b,v1.16b,v2.16b
430	add	w13,w13,w17
431	eor	v21.16b,v5.16b,v6.16b
432	add	w14,w14,w19
433	eor	v22.16b,v17.16b,v18.16b
434	add	w15,w15,w20
435	ushr	v1.4s,v20.4s,#20
436	add	w16,w16,w21
437	ushr	v5.4s,v21.4s,#20
438	eor	w9,w9,w13
439	ushr	v17.4s,v22.4s,#20
440	eor	w10,w10,w14
441	sli	v1.4s,v20.4s,#12
442	eor	w11,w11,w15
443	sli	v5.4s,v21.4s,#12
444	eor	w12,w12,w16
445	sli	v17.4s,v22.4s,#12
446	ror	w9,w9,#20
447	add	v0.4s,v0.4s,v1.4s
448	ror	w10,w10,#20
449	add	v4.4s,v4.4s,v5.4s
450	ror	w11,w11,#20
451	add	v16.4s,v16.4s,v17.4s
452	ror	w12,w12,#20
453	eor	v20.16b,v3.16b,v0.16b
454	add	w5,w5,w9
455	eor	v21.16b,v7.16b,v4.16b
456	add	w6,w6,w10
457	eor	v22.16b,v19.16b,v16.16b
458	add	w7,w7,w11
459	ushr	v3.4s,v20.4s,#24
460	add	w8,w8,w12
461	ushr	v7.4s,v21.4s,#24
462	eor	w17,w17,w5
463	ushr	v19.4s,v22.4s,#24
464	eor	w19,w19,w6
465	sli	v3.4s,v20.4s,#8
466	eor	w20,w20,w7
467	sli	v7.4s,v21.4s,#8
468	eor	w21,w21,w8
469	sli	v19.4s,v22.4s,#8
470	ror	w17,w17,#24
471	add	v2.4s,v2.4s,v3.4s
472	ror	w19,w19,#24
473	add	v6.4s,v6.4s,v7.4s
474	ror	w20,w20,#24
475	add	v18.4s,v18.4s,v19.4s
476	ror	w21,w21,#24
477	eor	v20.16b,v1.16b,v2.16b
478	add	w13,w13,w17
479	eor	v21.16b,v5.16b,v6.16b
480	add	w14,w14,w19
481	eor	v22.16b,v17.16b,v18.16b
482	add	w15,w15,w20
483	ushr	v1.4s,v20.4s,#25
484	add	w16,w16,w21
485	ushr	v5.4s,v21.4s,#25
486	eor	w9,w9,w13
487	ushr	v17.4s,v22.4s,#25
488	eor	w10,w10,w14
489	sli	v1.4s,v20.4s,#7
490	eor	w11,w11,w15
491	sli	v5.4s,v21.4s,#7
492	eor	w12,w12,w16
493	sli	v17.4s,v22.4s,#7
494	ror	w9,w9,#25
495	ext	v2.16b,v2.16b,v2.16b,#8
496	ror	w10,w10,#25
497	ext	v6.16b,v6.16b,v6.16b,#8
498	ror	w11,w11,#25
499	ext	v18.16b,v18.16b,v18.16b,#8
500	ror	w12,w12,#25
501	ext	v3.16b,v3.16b,v3.16b,#12
502	ext	v7.16b,v7.16b,v7.16b,#12
503	ext	v19.16b,v19.16b,v19.16b,#12
504	ext	v1.16b,v1.16b,v1.16b,#4
505	ext	v5.16b,v5.16b,v5.16b,#4
506	ext	v17.16b,v17.16b,v17.16b,#4
// Second half of the double round (rows rotated for the vector blocks,
// shifted register pairing for the integer block).
507	add	v0.4s,v0.4s,v1.4s
508	add	w5,w5,w10
509	add	v4.4s,v4.4s,v5.4s
510	add	w6,w6,w11
511	add	v16.4s,v16.4s,v17.4s
512	add	w7,w7,w12
513	eor	v3.16b,v3.16b,v0.16b
514	add	w8,w8,w9
515	eor	v7.16b,v7.16b,v4.16b
516	eor	w21,w21,w5
517	eor	v19.16b,v19.16b,v16.16b
518	eor	w17,w17,w6
519	rev32	v3.8h,v3.8h
520	eor	w19,w19,w7
521	rev32	v7.8h,v7.8h
522	eor	w20,w20,w8
523	rev32	v19.8h,v19.8h
524	ror	w21,w21,#16
525	add	v2.4s,v2.4s,v3.4s
526	ror	w17,w17,#16
527	add	v6.4s,v6.4s,v7.4s
528	ror	w19,w19,#16
529	add	v18.4s,v18.4s,v19.4s
530	ror	w20,w20,#16
531	eor	v20.16b,v1.16b,v2.16b
532	add	w15,w15,w21
533	eor	v21.16b,v5.16b,v6.16b
534	add	w16,w16,w17
535	eor	v22.16b,v17.16b,v18.16b
536	add	w13,w13,w19
537	ushr	v1.4s,v20.4s,#20
538	add	w14,w14,w20
539	ushr	v5.4s,v21.4s,#20
540	eor	w10,w10,w15
541	ushr	v17.4s,v22.4s,#20
542	eor	w11,w11,w16
543	sli	v1.4s,v20.4s,#12
544	eor	w12,w12,w13
545	sli	v5.4s,v21.4s,#12
546	eor	w9,w9,w14
547	sli	v17.4s,v22.4s,#12
548	ror	w10,w10,#20
549	add	v0.4s,v0.4s,v1.4s
550	ror	w11,w11,#20
551	add	v4.4s,v4.4s,v5.4s
552	ror	w12,w12,#20
553	add	v16.4s,v16.4s,v17.4s
554	ror	w9,w9,#20
555	eor	v20.16b,v3.16b,v0.16b
556	add	w5,w5,w10
557	eor	v21.16b,v7.16b,v4.16b
558	add	w6,w6,w11
559	eor	v22.16b,v19.16b,v16.16b
560	add	w7,w7,w12
561	ushr	v3.4s,v20.4s,#24
562	add	w8,w8,w9
563	ushr	v7.4s,v21.4s,#24
564	eor	w21,w21,w5
565	ushr	v19.4s,v22.4s,#24
566	eor	w17,w17,w6
567	sli	v3.4s,v20.4s,#8
568	eor	w19,w19,w7
569	sli	v7.4s,v21.4s,#8
570	eor	w20,w20,w8
571	sli	v19.4s,v22.4s,#8
572	ror	w21,w21,#24
573	add	v2.4s,v2.4s,v3.4s
574	ror	w17,w17,#24
575	add	v6.4s,v6.4s,v7.4s
576	ror	w19,w19,#24
577	add	v18.4s,v18.4s,v19.4s
578	ror	w20,w20,#24
579	eor	v20.16b,v1.16b,v2.16b
580	add	w15,w15,w21
581	eor	v21.16b,v5.16b,v6.16b
582	add	w16,w16,w17
583	eor	v22.16b,v17.16b,v18.16b
584	add	w13,w13,w19
585	ushr	v1.4s,v20.4s,#25
586	add	w14,w14,w20
587	ushr	v5.4s,v21.4s,#25
588	eor	w10,w10,w15
589	ushr	v17.4s,v22.4s,#25
590	eor	w11,w11,w16
591	sli	v1.4s,v20.4s,#7
592	eor	w12,w12,w13
593	sli	v5.4s,v21.4s,#7
594	eor	w9,w9,w14
595	sli	v17.4s,v22.4s,#7
596	ror	w10,w10,#25
597	ext	v2.16b,v2.16b,v2.16b,#8
598	ror	w11,w11,#25
599	ext	v6.16b,v6.16b,v6.16b,#8
600	ror	w12,w12,#25
601	ext	v18.16b,v18.16b,v18.16b,#8
602	ror	w9,w9,#25
603	ext	v3.16b,v3.16b,v3.16b,#4
604	ext	v7.16b,v7.16b,v7.16b,#4
605	ext	v19.16b,v19.16b,v19.16b,#4
606	ext	v1.16b,v1.16b,v1.16b,#12
607	ext	v5.16b,v5.16b,v5.16b,#12
608	ext	v17.16b,v17.16b,v17.16b,#12
609	cbnz	x4,Loop_neon
610
// Finish one group of four blocks on the NEON path.
//  1. Add the saved input state back into all four working states (integer
//     registers from x22-x28, x30; vector rows from v24-v29).
//  2. If the flags from the earlier length subtraction show that fewer than
//     256 bytes were left, go to Ltail_neon for a partial group.
//  3. Otherwise XOR 256 bytes of input with the four keystream blocks and
//     store them in order: integer block, v0-v3, v4-v7, v16-v19.
//  4. Step the integer counter and the three vector counters by 4, then loop
//     while more input remains; when the length was consumed exactly,
//     restore registers and return.
611	add	w5,w5,w22		// accumulate key block
612	add	v0.4s,v0.4s,v24.4s
613	add	x6,x6,x22,lsr#32
614	add	v4.4s,v4.4s,v24.4s
615	add	w7,w7,w23
616	add	v16.4s,v16.4s,v24.4s
617	add	x8,x8,x23,lsr#32
618	add	v2.4s,v2.4s,v26.4s
619	add	w9,w9,w24
620	add	v6.4s,v6.4s,v26.4s
621	add	x10,x10,x24,lsr#32
622	add	v18.4s,v18.4s,v26.4s
623	add	w11,w11,w25
624	add	v3.4s,v3.4s,v27.4s
625	add	x12,x12,x25,lsr#32
626	add	w13,w13,w26
627	add	v7.4s,v7.4s,v28.4s
628	add	x14,x14,x26,lsr#32
629	add	w15,w15,w27
630	add	v19.4s,v19.4s,v29.4s
631	add	x16,x16,x27,lsr#32
632	add	w17,w17,w28
633	add	v1.4s,v1.4s,v25.4s
634	add	x19,x19,x28,lsr#32
635	add	w20,w20,w30
636	add	v5.4s,v5.4s,v25.4s
637	add	x21,x21,x30,lsr#32
638	add	v17.4s,v17.4s,v25.4s
639
640	b.lo	Ltail_neon		// remaining length was below 256
641
642	add	x5,x5,x6,lsl#32	// pack
643	add	x7,x7,x8,lsl#32
644	ldp	x6,x8,[x1,#0]		// load input
645	add	x9,x9,x10,lsl#32
646	add	x11,x11,x12,lsl#32
647	ldp	x10,x12,[x1,#16]
648	add	x13,x13,x14,lsl#32
649	add	x15,x15,x16,lsl#32
650	ldp	x14,x16,[x1,#32]
651	add	x17,x17,x19,lsl#32
652	add	x20,x20,x21,lsl#32
653	ldp	x19,x21,[x1,#48]
654	add	x1,x1,#64
655#ifdef	__ARMEB__
656	rev	x5,x5
657	rev	x7,x7
658	rev	x9,x9
659	rev	x11,x11
660	rev	x13,x13
661	rev	x15,x15
662	rev	x17,x17
663	rev	x20,x20
664#endif
665	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64		// input for block v0-v3
666	eor	x5,x5,x6
667	eor	x7,x7,x8
668	eor	x9,x9,x10
669	eor	x11,x11,x12
670	eor	x13,x13,x14
671	eor	v0.16b,v0.16b,v20.16b
672	eor	x15,x15,x16
673	eor	v1.16b,v1.16b,v21.16b
674	eor	x17,x17,x19
675	eor	v2.16b,v2.16b,v22.16b
676	eor	x20,x20,x21
677	eor	v3.16b,v3.16b,v23.16b
678	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64		// input for block v4-v7
679
680	stp	x5,x7,[x0,#0]		// store output
681	add	x28,x28,#4			// increment counter
682	stp	x9,x11,[x0,#16]
683	add	v27.4s,v27.4s,v31.4s		// += 4
684	stp	x13,x15,[x0,#32]
685	add	v28.4s,v28.4s,v31.4s
686	stp	x17,x20,[x0,#48]
687	add	v29.4s,v29.4s,v31.4s
688	add	x0,x0,#64
689
690	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
691	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64		// input for block v16-v19 (v0-v3 are free again)
692
693	eor	v4.16b,v4.16b,v20.16b
694	eor	v5.16b,v5.16b,v21.16b
695	eor	v6.16b,v6.16b,v22.16b
696	eor	v7.16b,v7.16b,v23.16b
697	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
698
699	eor	v16.16b,v16.16b,v0.16b
700	eor	v17.16b,v17.16b,v1.16b
701	eor	v18.16b,v18.16b,v2.16b
702	eor	v19.16b,v19.16b,v3.16b
703	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
704
705	b.hi	Loop_outer_neon		// flags still come from the length subtraction
706
707	ldp	x19,x20,[x29,#16]
708	add	sp,sp,#64
709	ldp	x21,x22,[x29,#32]
710	ldp	x23,x24,[x29,#48]
711	ldp	x25,x26,[x29,#64]
712	ldp	x27,x28,[x29,#80]
713	ldp	x29,x30,[sp],#96
714	AARCH64_VALIDATE_LINK_REGISTER
715	ret
716
// Ltail_neon: final group on the NEON path, with fewer than 256 bytes left.
// Adds 256 back to x2 to recover the real remaining byte count, then uses the
// four keystream blocks already computed, in order, 64 bytes at a time:
//   integer block, then v0-v3, then v4-v7, then v16-v19.
// With fewer than 64 bytes in total, control goes to Less_than_64, the scalar
// byte tail. After each full 64-byte step, b.eq tests the flags of the most
// recent cmp against 64: an exact fit means all input is done (Ldone_neon).
// Otherwise the count is reduced by 64; when what is left is a partial block,
// the next unused keystream block is stored to the scratch area at sp and
// Last_neon XORs it byte by byte.
717Ltail_neon:
718	add	x2,x2,#256		// recover the remaining byte count
719	cmp	x2,#64
720	b.lo	Less_than_64
721
722	add	x5,x5,x6,lsl#32	// pack
723	add	x7,x7,x8,lsl#32
724	ldp	x6,x8,[x1,#0]		// load input
725	add	x9,x9,x10,lsl#32
726	add	x11,x11,x12,lsl#32
727	ldp	x10,x12,[x1,#16]
728	add	x13,x13,x14,lsl#32
729	add	x15,x15,x16,lsl#32
730	ldp	x14,x16,[x1,#32]
731	add	x17,x17,x19,lsl#32
732	add	x20,x20,x21,lsl#32
733	ldp	x19,x21,[x1,#48]
734	add	x1,x1,#64
735#ifdef	__ARMEB__
736	rev	x5,x5
737	rev	x7,x7
738	rev	x9,x9
739	rev	x11,x11
740	rev	x13,x13
741	rev	x15,x15
742	rev	x17,x17
743	rev	x20,x20
744#endif
745	eor	x5,x5,x6
746	eor	x7,x7,x8
747	eor	x9,x9,x10
748	eor	x11,x11,x12
749	eor	x13,x13,x14
750	eor	x15,x15,x16
751	eor	x17,x17,x19
752	eor	x20,x20,x21
753
754	stp	x5,x7,[x0,#0]		// store output
755	add	x28,x28,#4			// increment counter
756	stp	x9,x11,[x0,#16]
757	stp	x13,x15,[x0,#32]
758	stp	x17,x20,[x0,#48]
759	add	x0,x0,#64
760	b.eq	Ldone_neon		// exactly 64 bytes were left
761	sub	x2,x2,#64
762	cmp	x2,#64
763	b.lo	Less_than_128
764
765	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
766	eor	v0.16b,v0.16b,v20.16b
767	eor	v1.16b,v1.16b,v21.16b
768	eor	v2.16b,v2.16b,v22.16b
769	eor	v3.16b,v3.16b,v23.16b
770	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
771	b.eq	Ldone_neon		// exactly 128 bytes were left
772	sub	x2,x2,#64
773	cmp	x2,#64
774	b.lo	Less_than_192
775
776	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
777	eor	v4.16b,v4.16b,v20.16b
778	eor	v5.16b,v5.16b,v21.16b
779	eor	v6.16b,v6.16b,v22.16b
780	eor	v7.16b,v7.16b,v23.16b
781	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
782	b.eq	Ldone_neon		// exactly 192 bytes were left
783	sub	x2,x2,#64
784
785	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]		// fourth block covers the remainder
786	b	Last_neon
787
788Less_than_128:
789	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]		// second block covers the remainder
790	b	Last_neon
791Less_than_192:
792	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]		// third block covers the remainder
793	b	Last_neon
794
// Last_neon: byte-wise tail for the NEON path. On entry x2 is the number of
// bytes still to process and the 64-byte scratch area at sp holds the
// keystream block stored by the code that branched here.
// The pointers are biased so that one negative index in x2 counts up to zero:
//   x1 = input end, x4 = scratch + count, x0 = output end - 1, x2 = -count
// The store happens after x2 is incremented, which cancels the -1 in x0.
// After the loop the keystream copy on the stack is overwritten with zeros.
// Ldone_neon is the shared epilogue: it restores x19-x28 relative to x29,
// drops the scratch area, pops the frame and returns.
795.align	4
796Last_neon:
797	sub	x0,x0,#1
798	add	x1,x1,x2
799	add	x0,x0,x2
800	add	x4,sp,x2
801	neg	x2,x2
802
803Loop_tail_neon:
804	ldrb	w10,[x1,x2]		// input byte
805	ldrb	w11,[x4,x2]		// keystream byte
806	add	x2,x2,#1
807	eor	w10,w10,w11
808	strb	w10,[x0,x2]		// output byte (x0 is biased by -1)
809	cbnz	x2,Loop_tail_neon
810
811	stp	xzr,xzr,[sp,#0]		// wipe the keystream copy
812	stp	xzr,xzr,[sp,#16]
813	stp	xzr,xzr,[sp,#32]
814	stp	xzr,xzr,[sp,#48]
815
816Ldone_neon:
817	ldp	x19,x20,[x29,#16]
818	add	sp,sp,#64
819	ldp	x21,x22,[x29,#32]
820	ldp	x23,x24,[x29,#48]
821	ldp	x25,x26,[x29,#64]
822	ldp	x27,x28,[x29,#80]
823	ldp	x29,x30,[sp],#96
824	AARCH64_VALIDATE_LINK_REGISTER
825	ret
826
827.def ChaCha20_512_neon
828   .type 32
829.endef
830.align	5
831ChaCha20_512_neon:
832	AARCH64_SIGN_LINK_REGISTER
833	stp	x29,x30,[sp,#-96]!
834	add	x29,sp,#0
835
836	adrp	x5,Lsigma
837	add	x5,x5,:lo12:Lsigma
838	stp	x19,x20,[sp,#16]
839	stp	x21,x22,[sp,#32]
840	stp	x23,x24,[sp,#48]
841	stp	x25,x26,[sp,#64]
842	stp	x27,x28,[sp,#80]
843
844L512_or_more_neon:
845	sub	sp,sp,#128+64
846
847	ldp	x22,x23,[x5]		// load sigma
848	ld1	{v24.4s},[x5],#16
849	ldp	x24,x25,[x3]		// load key
850	ldp	x26,x27,[x3,#16]
851	ld1	{v25.4s,v26.4s},[x3]
852	ldp	x28,x30,[x4]		// load counter
853	ld1	{v27.4s},[x4]
854	ld1	{v31.4s},[x5]
855#ifdef	__ARMEB__
856	rev64	v24.4s,v24.4s
857	ror	x24,x24,#32
858	ror	x25,x25,#32
859	ror	x26,x26,#32
860	ror	x27,x27,#32
861	ror	x28,x28,#32
862	ror	x30,x30,#32
863#endif
864	add	v27.4s,v27.4s,v31.4s		// += 1
865	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
866	add	v27.4s,v27.4s,v31.4s		// not typo
867	str	q26,[sp,#32]
868	add	v28.4s,v27.4s,v31.4s
869	add	v29.4s,v28.4s,v31.4s
870	add	v30.4s,v29.4s,v31.4s
871	shl	v31.4s,v31.4s,#2			// 1 -> 4
872
873	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
874	stp	d10,d11,[sp,#128+16]
875	stp	d12,d13,[sp,#128+32]
876	stp	d14,d15,[sp,#128+48]
877
878	sub	x2,x2,#512			// not typo
879
880Loop_outer_512_neon:
881	mov	v0.16b,v24.16b
882	mov	v4.16b,v24.16b
883	mov	v8.16b,v24.16b
884	mov	v12.16b,v24.16b
885	mov	v16.16b,v24.16b
886	mov	v20.16b,v24.16b
887	mov	v1.16b,v25.16b
888	mov	w5,w22			// unpack key block
889	mov	v5.16b,v25.16b
890	lsr	x6,x22,#32
891	mov	v9.16b,v25.16b
892	mov	w7,w23
893	mov	v13.16b,v25.16b
894	lsr	x8,x23,#32
895	mov	v17.16b,v25.16b
896	mov	w9,w24
897	mov	v21.16b,v25.16b
898	lsr	x10,x24,#32
899	mov	v3.16b,v27.16b
900	mov	w11,w25
901	mov	v7.16b,v28.16b
902	lsr	x12,x25,#32
903	mov	v11.16b,v29.16b
904	mov	w13,w26
905	mov	v15.16b,v30.16b
906	lsr	x14,x26,#32
907	mov	v2.16b,v26.16b
908	mov	w15,w27
909	mov	v6.16b,v26.16b
910	lsr	x16,x27,#32
911	add	v19.4s,v3.4s,v31.4s			// +4
912	mov	w17,w28
913	add	v23.4s,v7.4s,v31.4s			// +4
914	lsr	x19,x28,#32
915	mov	v10.16b,v26.16b
916	mov	w20,w30
917	mov	v14.16b,v26.16b
918	lsr	x21,x30,#32
919	mov	v18.16b,v26.16b
920	stp	q27,q28,[sp,#48]		// off-load key block, variable part
921	mov	v22.16b,v26.16b
922	str	q29,[sp,#80]
923
924	mov	x4,#5
925	subs	x2,x2,#512
926Loop_upper_neon:
927	sub	x4,x4,#1
928	add	v0.4s,v0.4s,v1.4s
929	add	w5,w5,w9
930	add	v4.4s,v4.4s,v5.4s
931	add	w6,w6,w10
932	add	v8.4s,v8.4s,v9.4s
933	add	w7,w7,w11
934	add	v12.4s,v12.4s,v13.4s
935	add	w8,w8,w12
936	add	v16.4s,v16.4s,v17.4s
937	eor	w17,w17,w5
938	add	v20.4s,v20.4s,v21.4s
939	eor	w19,w19,w6
940	eor	v3.16b,v3.16b,v0.16b
941	eor	w20,w20,w7
942	eor	v7.16b,v7.16b,v4.16b
943	eor	w21,w21,w8
944	eor	v11.16b,v11.16b,v8.16b
945	ror	w17,w17,#16
946	eor	v15.16b,v15.16b,v12.16b
947	ror	w19,w19,#16
948	eor	v19.16b,v19.16b,v16.16b
949	ror	w20,w20,#16
950	eor	v23.16b,v23.16b,v20.16b
951	ror	w21,w21,#16
952	rev32	v3.8h,v3.8h
953	add	w13,w13,w17
954	rev32	v7.8h,v7.8h
955	add	w14,w14,w19
956	rev32	v11.8h,v11.8h
957	add	w15,w15,w20
958	rev32	v15.8h,v15.8h
959	add	w16,w16,w21
960	rev32	v19.8h,v19.8h
961	eor	w9,w9,w13
962	rev32	v23.8h,v23.8h
963	eor	w10,w10,w14
964	add	v2.4s,v2.4s,v3.4s
965	eor	w11,w11,w15
966	add	v6.4s,v6.4s,v7.4s
967	eor	w12,w12,w16
968	add	v10.4s,v10.4s,v11.4s
969	ror	w9,w9,#20
970	add	v14.4s,v14.4s,v15.4s
971	ror	w10,w10,#20
972	add	v18.4s,v18.4s,v19.4s
973	ror	w11,w11,#20
974	add	v22.4s,v22.4s,v23.4s
975	ror	w12,w12,#20
976	eor	v24.16b,v1.16b,v2.16b
977	add	w5,w5,w9
978	eor	v25.16b,v5.16b,v6.16b
979	add	w6,w6,w10
980	eor	v26.16b,v9.16b,v10.16b
981	add	w7,w7,w11
982	eor	v27.16b,v13.16b,v14.16b
983	add	w8,w8,w12
984	eor	v28.16b,v17.16b,v18.16b
985	eor	w17,w17,w5
986	eor	v29.16b,v21.16b,v22.16b
987	eor	w19,w19,w6
988	ushr	v1.4s,v24.4s,#20
989	eor	w20,w20,w7
990	ushr	v5.4s,v25.4s,#20
991	eor	w21,w21,w8
992	ushr	v9.4s,v26.4s,#20
993	ror	w17,w17,#24
994	ushr	v13.4s,v27.4s,#20
995	ror	w19,w19,#24
996	ushr	v17.4s,v28.4s,#20
997	ror	w20,w20,#24
998	ushr	v21.4s,v29.4s,#20
999	ror	w21,w21,#24
1000	sli	v1.4s,v24.4s,#12
1001	add	w13,w13,w17
1002	sli	v5.4s,v25.4s,#12
1003	add	w14,w14,w19
1004	sli	v9.4s,v26.4s,#12
1005	add	w15,w15,w20
1006	sli	v13.4s,v27.4s,#12
1007	add	w16,w16,w21
1008	sli	v17.4s,v28.4s,#12
1009	eor	w9,w9,w13
1010	sli	v21.4s,v29.4s,#12
1011	eor	w10,w10,w14
1012	add	v0.4s,v0.4s,v1.4s
1013	eor	w11,w11,w15
1014	add	v4.4s,v4.4s,v5.4s
1015	eor	w12,w12,w16
1016	add	v8.4s,v8.4s,v9.4s
1017	ror	w9,w9,#25
1018	add	v12.4s,v12.4s,v13.4s
1019	ror	w10,w10,#25
1020	add	v16.4s,v16.4s,v17.4s
1021	ror	w11,w11,#25
1022	add	v20.4s,v20.4s,v21.4s
1023	ror	w12,w12,#25
1024	eor	v24.16b,v3.16b,v0.16b
1025	add	w5,w5,w10
1026	eor	v25.16b,v7.16b,v4.16b
1027	add	w6,w6,w11
1028	eor	v26.16b,v11.16b,v8.16b
1029	add	w7,w7,w12
1030	eor	v27.16b,v15.16b,v12.16b
1031	add	w8,w8,w9
1032	eor	v28.16b,v19.16b,v16.16b
1033	eor	w21,w21,w5
1034	eor	v29.16b,v23.16b,v20.16b
1035	eor	w17,w17,w6
1036	ushr	v3.4s,v24.4s,#24
1037	eor	w19,w19,w7
1038	ushr	v7.4s,v25.4s,#24
1039	eor	w20,w20,w8
1040	ushr	v11.4s,v26.4s,#24
1041	ror	w21,w21,#16
1042	ushr	v15.4s,v27.4s,#24
1043	ror	w17,w17,#16
1044	ushr	v19.4s,v28.4s,#24
1045	ror	w19,w19,#16
1046	ushr	v23.4s,v29.4s,#24
1047	ror	w20,w20,#16
1048	sli	v3.4s,v24.4s,#8
1049	add	w15,w15,w21
1050	sli	v7.4s,v25.4s,#8
1051	add	w16,w16,w17
1052	sli	v11.4s,v26.4s,#8
1053	add	w13,w13,w19
1054	sli	v15.4s,v27.4s,#8
1055	add	w14,w14,w20
1056	sli	v19.4s,v28.4s,#8
1057	eor	w10,w10,w15
1058	sli	v23.4s,v29.4s,#8
1059	eor	w11,w11,w16
1060	add	v2.4s,v2.4s,v3.4s
1061	eor	w12,w12,w13
1062	add	v6.4s,v6.4s,v7.4s
1063	eor	w9,w9,w14
1064	add	v10.4s,v10.4s,v11.4s
1065	ror	w10,w10,#20
1066	add	v14.4s,v14.4s,v15.4s
1067	ror	w11,w11,#20
1068	add	v18.4s,v18.4s,v19.4s
1069	ror	w12,w12,#20
1070	add	v22.4s,v22.4s,v23.4s
1071	ror	w9,w9,#20
1072	eor	v24.16b,v1.16b,v2.16b
1073	add	w5,w5,w10
1074	eor	v25.16b,v5.16b,v6.16b
1075	add	w6,w6,w11
1076	eor	v26.16b,v9.16b,v10.16b
1077	add	w7,w7,w12
1078	eor	v27.16b,v13.16b,v14.16b
1079	add	w8,w8,w9
1080	eor	v28.16b,v17.16b,v18.16b
1081	eor	w21,w21,w5
1082	eor	v29.16b,v21.16b,v22.16b
1083	eor	w17,w17,w6
1084	ushr	v1.4s,v24.4s,#25
1085	eor	w19,w19,w7
1086	ushr	v5.4s,v25.4s,#25
1087	eor	w20,w20,w8
1088	ushr	v9.4s,v26.4s,#25
1089	ror	w21,w21,#24
1090	ushr	v13.4s,v27.4s,#25
1091	ror	w17,w17,#24
1092	ushr	v17.4s,v28.4s,#25
1093	ror	w19,w19,#24
1094	ushr	v21.4s,v29.4s,#25
1095	ror	w20,w20,#24
1096	sli	v1.4s,v24.4s,#7
1097	add	w15,w15,w21
1098	sli	v5.4s,v25.4s,#7
1099	add	w16,w16,w17
1100	sli	v9.4s,v26.4s,#7
1101	add	w13,w13,w19
1102	sli	v13.4s,v27.4s,#7
1103	add	w14,w14,w20
1104	sli	v17.4s,v28.4s,#7
1105	eor	w10,w10,w15
1106	sli	v21.4s,v29.4s,#7
1107	eor	w11,w11,w16
1108	ext	v2.16b,v2.16b,v2.16b,#8
1109	eor	w12,w12,w13
1110	ext	v6.16b,v6.16b,v6.16b,#8
1111	eor	w9,w9,w14
1112	ext	v10.16b,v10.16b,v10.16b,#8
1113	ror	w10,w10,#25
1114	ext	v14.16b,v14.16b,v14.16b,#8
1115	ror	w11,w11,#25
1116	ext	v18.16b,v18.16b,v18.16b,#8
1117	ror	w12,w12,#25
1118	ext	v22.16b,v22.16b,v22.16b,#8
1119	ror	w9,w9,#25
1120	ext	v3.16b,v3.16b,v3.16b,#12
1121	ext	v7.16b,v7.16b,v7.16b,#12
1122	ext	v11.16b,v11.16b,v11.16b,#12
1123	ext	v15.16b,v15.16b,v15.16b,#12
1124	ext	v19.16b,v19.16b,v19.16b,#12
1125	ext	v23.16b,v23.16b,v23.16b,#12
1126	ext	v1.16b,v1.16b,v1.16b,#4
1127	ext	v5.16b,v5.16b,v5.16b,#4
1128	ext	v9.16b,v9.16b,v9.16b,#4
1129	ext	v13.16b,v13.16b,v13.16b,#4
1130	ext	v17.16b,v17.16b,v17.16b,#4
1131	ext	v21.16b,v21.16b,v21.16b,#4
1132	add	v0.4s,v0.4s,v1.4s
1133	add	w5,w5,w9
1134	add	v4.4s,v4.4s,v5.4s
1135	add	w6,w6,w10
1136	add	v8.4s,v8.4s,v9.4s
1137	add	w7,w7,w11
1138	add	v12.4s,v12.4s,v13.4s
1139	add	w8,w8,w12
1140	add	v16.4s,v16.4s,v17.4s
1141	eor	w17,w17,w5
1142	add	v20.4s,v20.4s,v21.4s
1143	eor	w19,w19,w6
1144	eor	v3.16b,v3.16b,v0.16b
1145	eor	w20,w20,w7
1146	eor	v7.16b,v7.16b,v4.16b
1147	eor	w21,w21,w8
1148	eor	v11.16b,v11.16b,v8.16b
1149	ror	w17,w17,#16
1150	eor	v15.16b,v15.16b,v12.16b
1151	ror	w19,w19,#16
1152	eor	v19.16b,v19.16b,v16.16b
1153	ror	w20,w20,#16
1154	eor	v23.16b,v23.16b,v20.16b
1155	ror	w21,w21,#16
1156	rev32	v3.8h,v3.8h
1157	add	w13,w13,w17
1158	rev32	v7.8h,v7.8h
1159	add	w14,w14,w19
1160	rev32	v11.8h,v11.8h
1161	add	w15,w15,w20
1162	rev32	v15.8h,v15.8h
1163	add	w16,w16,w21
1164	rev32	v19.8h,v19.8h
1165	eor	w9,w9,w13
1166	rev32	v23.8h,v23.8h
1167	eor	w10,w10,w14
1168	add	v2.4s,v2.4s,v3.4s
1169	eor	w11,w11,w15
1170	add	v6.4s,v6.4s,v7.4s
1171	eor	w12,w12,w16
1172	add	v10.4s,v10.4s,v11.4s
1173	ror	w9,w9,#20
1174	add	v14.4s,v14.4s,v15.4s
1175	ror	w10,w10,#20
1176	add	v18.4s,v18.4s,v19.4s
1177	ror	w11,w11,#20
1178	add	v22.4s,v22.4s,v23.4s
1179	ror	w12,w12,#20
1180	eor	v24.16b,v1.16b,v2.16b
1181	add	w5,w5,w9
1182	eor	v25.16b,v5.16b,v6.16b
1183	add	w6,w6,w10
1184	eor	v26.16b,v9.16b,v10.16b
1185	add	w7,w7,w11
1186	eor	v27.16b,v13.16b,v14.16b
1187	add	w8,w8,w12
1188	eor	v28.16b,v17.16b,v18.16b
1189	eor	w17,w17,w5
1190	eor	v29.16b,v21.16b,v22.16b
1191	eor	w19,w19,w6
1192	ushr	v1.4s,v24.4s,#20
1193	eor	w20,w20,w7
1194	ushr	v5.4s,v25.4s,#20
1195	eor	w21,w21,w8
1196	ushr	v9.4s,v26.4s,#20
1197	ror	w17,w17,#24
1198	ushr	v13.4s,v27.4s,#20
1199	ror	w19,w19,#24
1200	ushr	v17.4s,v28.4s,#20
1201	ror	w20,w20,#24
1202	ushr	v21.4s,v29.4s,#20
1203	ror	w21,w21,#24
1204	sli	v1.4s,v24.4s,#12
1205	add	w13,w13,w17
1206	sli	v5.4s,v25.4s,#12
1207	add	w14,w14,w19
1208	sli	v9.4s,v26.4s,#12
1209	add	w15,w15,w20
1210	sli	v13.4s,v27.4s,#12
1211	add	w16,w16,w21
1212	sli	v17.4s,v28.4s,#12
1213	eor	w9,w9,w13
1214	sli	v21.4s,v29.4s,#12
1215	eor	w10,w10,w14
1216	add	v0.4s,v0.4s,v1.4s
1217	eor	w11,w11,w15
1218	add	v4.4s,v4.4s,v5.4s
1219	eor	w12,w12,w16
1220	add	v8.4s,v8.4s,v9.4s
1221	ror	w9,w9,#25
1222	add	v12.4s,v12.4s,v13.4s
1223	ror	w10,w10,#25
1224	add	v16.4s,v16.4s,v17.4s
1225	ror	w11,w11,#25
1226	add	v20.4s,v20.4s,v21.4s
1227	ror	w12,w12,#25
1228	eor	v24.16b,v3.16b,v0.16b
1229	add	w5,w5,w10
1230	eor	v25.16b,v7.16b,v4.16b
1231	add	w6,w6,w11
1232	eor	v26.16b,v11.16b,v8.16b
1233	add	w7,w7,w12
1234	eor	v27.16b,v15.16b,v12.16b
1235	add	w8,w8,w9
1236	eor	v28.16b,v19.16b,v16.16b
1237	eor	w21,w21,w5
1238	eor	v29.16b,v23.16b,v20.16b
1239	eor	w17,w17,w6
1240	ushr	v3.4s,v24.4s,#24
1241	eor	w19,w19,w7
1242	ushr	v7.4s,v25.4s,#24
1243	eor	w20,w20,w8
1244	ushr	v11.4s,v26.4s,#24
1245	ror	w21,w21,#16
1246	ushr	v15.4s,v27.4s,#24
1247	ror	w17,w17,#16
1248	ushr	v19.4s,v28.4s,#24
1249	ror	w19,w19,#16
1250	ushr	v23.4s,v29.4s,#24
1251	ror	w20,w20,#16
1252	sli	v3.4s,v24.4s,#8
1253	add	w15,w15,w21
1254	sli	v7.4s,v25.4s,#8
1255	add	w16,w16,w17
1256	sli	v11.4s,v26.4s,#8
1257	add	w13,w13,w19
1258	sli	v15.4s,v27.4s,#8
1259	add	w14,w14,w20
1260	sli	v19.4s,v28.4s,#8
1261	eor	w10,w10,w15
1262	sli	v23.4s,v29.4s,#8
1263	eor	w11,w11,w16
1264	add	v2.4s,v2.4s,v3.4s
1265	eor	w12,w12,w13
1266	add	v6.4s,v6.4s,v7.4s
1267	eor	w9,w9,w14
1268	add	v10.4s,v10.4s,v11.4s
1269	ror	w10,w10,#20
1270	add	v14.4s,v14.4s,v15.4s
1271	ror	w11,w11,#20
1272	add	v18.4s,v18.4s,v19.4s
1273	ror	w12,w12,#20
1274	add	v22.4s,v22.4s,v23.4s
1275	ror	w9,w9,#20
1276	eor	v24.16b,v1.16b,v2.16b
1277	add	w5,w5,w10
1278	eor	v25.16b,v5.16b,v6.16b
1279	add	w6,w6,w11
1280	eor	v26.16b,v9.16b,v10.16b
1281	add	w7,w7,w12
1282	eor	v27.16b,v13.16b,v14.16b
1283	add	w8,w8,w9
1284	eor	v28.16b,v17.16b,v18.16b
1285	eor	w21,w21,w5
1286	eor	v29.16b,v21.16b,v22.16b
1287	eor	w17,w17,w6
1288	ushr	v1.4s,v24.4s,#25
1289	eor	w19,w19,w7
1290	ushr	v5.4s,v25.4s,#25
1291	eor	w20,w20,w8
1292	ushr	v9.4s,v26.4s,#25
1293	ror	w21,w21,#24
1294	ushr	v13.4s,v27.4s,#25
1295	ror	w17,w17,#24
1296	ushr	v17.4s,v28.4s,#25
1297	ror	w19,w19,#24
1298	ushr	v21.4s,v29.4s,#25
1299	ror	w20,w20,#24
1300	sli	v1.4s,v24.4s,#7
1301	add	w15,w15,w21
1302	sli	v5.4s,v25.4s,#7
1303	add	w16,w16,w17
1304	sli	v9.4s,v26.4s,#7
1305	add	w13,w13,w19
1306	sli	v13.4s,v27.4s,#7
1307	add	w14,w14,w20
1308	sli	v17.4s,v28.4s,#7
1309	eor	w10,w10,w15
1310	sli	v21.4s,v29.4s,#7
1311	eor	w11,w11,w16
1312	ext	v2.16b,v2.16b,v2.16b,#8
1313	eor	w12,w12,w13
1314	ext	v6.16b,v6.16b,v6.16b,#8
1315	eor	w9,w9,w14
1316	ext	v10.16b,v10.16b,v10.16b,#8
1317	ror	w10,w10,#25
1318	ext	v14.16b,v14.16b,v14.16b,#8
1319	ror	w11,w11,#25
1320	ext	v18.16b,v18.16b,v18.16b,#8
1321	ror	w12,w12,#25
1322	ext	v22.16b,v22.16b,v22.16b,#8
1323	ror	w9,w9,#25
1324	ext	v3.16b,v3.16b,v3.16b,#4
1325	ext	v7.16b,v7.16b,v7.16b,#4
1326	ext	v11.16b,v11.16b,v11.16b,#4
1327	ext	v15.16b,v15.16b,v15.16b,#4
1328	ext	v19.16b,v19.16b,v19.16b,#4
1329	ext	v23.16b,v23.16b,v23.16b,#4
1330	ext	v1.16b,v1.16b,v1.16b,#12
1331	ext	v5.16b,v5.16b,v5.16b,#12
1332	ext	v9.16b,v9.16b,v9.16b,#12
1333	ext	v13.16b,v13.16b,v13.16b,#12
1334	ext	v17.16b,v17.16b,v17.16b,#12
1335	ext	v21.16b,v21.16b,v21.16b,#12
1336	cbnz	x4,Loop_upper_neon
1337
// Finish the 64-byte block that was computed in the general-purpose
// registers: add the saved input state back in, XOR with 64 bytes of input,
// store 64 bytes of output, then re-seed the scalar working registers for
// the next block.
//
// x22-x28 and x30 each hold two 32-bit state words (low word added with a
// w-register add, high word added via lsr#32).  The sixteen working words
// live in w5-w17 and w19-w21.
1338	add	w5,w5,w22		// accumulate key block
1339	add	x6,x6,x22,lsr#32
1340	add	w7,w7,w23
1341	add	x8,x8,x23,lsr#32
1342	add	w9,w9,w24
1343	add	x10,x10,x24,lsr#32
1344	add	w11,w11,w25
1345	add	x12,x12,x25,lsr#32
1346	add	w13,w13,w26
1347	add	x14,x14,x26,lsr#32
1348	add	w15,w15,w27
1349	add	x16,x16,x27,lsr#32
1350	add	w17,w17,w28
1351	add	x19,x19,x28,lsr#32
1352	add	w20,w20,w30
1353	add	x21,x21,x30,lsr#32
1354
// Merge each pair of 32-bit words into one 64-bit register (the lsl#32 also
// discards any carry the 64-bit adds above left in bit 32), interleaved with
// loading 64 bytes of input from x1.  x6/x8/x10/... are free to receive the
// input as soon as their value has been folded into the neighbouring register.
1355	add	x5,x5,x6,lsl#32	// pack
1356	add	x7,x7,x8,lsl#32
1357	ldp	x6,x8,[x1,#0]		// load input
1358	add	x9,x9,x10,lsl#32
1359	add	x11,x11,x12,lsl#32
1360	ldp	x10,x12,[x1,#16]
1361	add	x13,x13,x14,lsl#32
1362	add	x15,x15,x16,lsl#32
1363	ldp	x14,x16,[x1,#32]
1364	add	x17,x17,x19,lsl#32
1365	add	x20,x20,x21,lsl#32
1366	ldp	x19,x21,[x1,#48]
1367	add	x1,x1,#64
// Big-endian builds only: byte-reverse each packed 64-bit value before it is
// XORed with the input loaded above.
1368#ifdef	__ARMEB__
1369	rev	x5,x5
1370	rev	x7,x7
1371	rev	x9,x9
1372	rev	x11,x11
1373	rev	x13,x13
1374	rev	x15,x15
1375	rev	x17,x17
1376	rev	x20,x20
1377#endif
// output = computed block XOR input
1378	eor	x5,x5,x6
1379	eor	x7,x7,x8
1380	eor	x9,x9,x10
1381	eor	x11,x11,x12
1382	eor	x13,x13,x14
1383	eor	x15,x15,x16
1384	eor	x17,x17,x19
1385	eor	x20,x20,x21
1386
// Store the 64 output bytes to x0 and advance x0.  The stores are interleaved
// with reloading w5..w21 from the saved state so the scalar pipeline can
// start its next block; x28 is bumped by one first, so that block sees the
// next counter value.
1387	stp	x5,x7,[x0,#0]		// store output
1388	add	x28,x28,#1			// increment counter
1389	mov	w5,w22			// unpack key block
1390	lsr	x6,x22,#32
1391	stp	x9,x11,[x0,#16]
1392	mov	w7,w23
1393	lsr	x8,x23,#32
1394	stp	x13,x15,[x0,#32]
1395	mov	w9,w24
1396	lsr	x10,x24,#32
1397	stp	x17,x20,[x0,#48]
1398	add	x0,x0,#64
1399	mov	w11,w25
1400	lsr	x12,x25,#32
1401	mov	w13,w26
1402	lsr	x14,x26,#32
1403	mov	w15,w27
1404	lsr	x16,x27,#32
1405	mov	w17,w28
1406	lsr	x19,x28,#32
1407	mov	w20,w30
1408	lsr	x21,x30,#32
1409
// x4 = iteration count for the loop that follows (decremented at its head,
// tested by cbnz at its foot).
1410	mov	x4,#5
// Loop_lower_neon: runs while x4 != 0 (x4 is decremented once per pass).
//
// Two independent instruction streams are interleaved line by line:
//   * NEON stream: six 4x4 states held as (v0..v3), (v4..v7), (v8..v11),
//     (v12..v15), (v16..v19), (v20..v23); v24..v29 are scratch.  One pass
//     performs ONE double round (column round + diagonal round) on all six.
//   * Scalar stream: one 4x4 state in w5..w8 / w9..w12 / w13..w16 /
//     w17,w19,w20,w21.  One pass performs TWO double rounds.
//
// Rotations: the scalar side uses ror by 16/20/24/25 (= rotate left by
// 16/12/8/7).  The NEON side uses rev32 on .8h for the 16-bit rotate and
// ushr+sli pairs for the 12/8/7-bit rotates.
// No instruction in the loop writes the condition flags.
1411Loop_lower_neon:
1412	sub	x4,x4,#1
// NEON: column round.  Scalar: column quarter-rounds of double round #1,
// i.e. (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21).
1413	add	v0.4s,v0.4s,v1.4s
1414	add	w5,w5,w9
1415	add	v4.4s,v4.4s,v5.4s
1416	add	w6,w6,w10
1417	add	v8.4s,v8.4s,v9.4s
1418	add	w7,w7,w11
1419	add	v12.4s,v12.4s,v13.4s
1420	add	w8,w8,w12
1421	add	v16.4s,v16.4s,v17.4s
1422	eor	w17,w17,w5
1423	add	v20.4s,v20.4s,v21.4s
1424	eor	w19,w19,w6
1425	eor	v3.16b,v3.16b,v0.16b
1426	eor	w20,w20,w7
1427	eor	v7.16b,v7.16b,v4.16b
1428	eor	w21,w21,w8
1429	eor	v11.16b,v11.16b,v8.16b
1430	ror	w17,w17,#16
1431	eor	v15.16b,v15.16b,v12.16b
1432	ror	w19,w19,#16
1433	eor	v19.16b,v19.16b,v16.16b
1434	ror	w20,w20,#16
1435	eor	v23.16b,v23.16b,v20.16b
1436	ror	w21,w21,#16
1437	rev32	v3.8h,v3.8h
1438	add	w13,w13,w17
1439	rev32	v7.8h,v7.8h
1440	add	w14,w14,w19
1441	rev32	v11.8h,v11.8h
1442	add	w15,w15,w20
1443	rev32	v15.8h,v15.8h
1444	add	w16,w16,w21
1445	rev32	v19.8h,v19.8h
1446	eor	w9,w9,w13
1447	rev32	v23.8h,v23.8h
1448	eor	w10,w10,w14
1449	add	v2.4s,v2.4s,v3.4s
1450	eor	w11,w11,w15
1451	add	v6.4s,v6.4s,v7.4s
1452	eor	w12,w12,w16
1453	add	v10.4s,v10.4s,v11.4s
1454	ror	w9,w9,#20
1455	add	v14.4s,v14.4s,v15.4s
1456	ror	w10,w10,#20
1457	add	v18.4s,v18.4s,v19.4s
1458	ror	w11,w11,#20
1459	add	v22.4s,v22.4s,v23.4s
1460	ror	w12,w12,#20
1461	eor	v24.16b,v1.16b,v2.16b
1462	add	w5,w5,w9
1463	eor	v25.16b,v5.16b,v6.16b
1464	add	w6,w6,w10
1465	eor	v26.16b,v9.16b,v10.16b
1466	add	w7,w7,w11
1467	eor	v27.16b,v13.16b,v14.16b
1468	add	w8,w8,w12
1469	eor	v28.16b,v17.16b,v18.16b
1470	eor	w17,w17,w5
1471	eor	v29.16b,v21.16b,v22.16b
1472	eor	w19,w19,w6
1473	ushr	v1.4s,v24.4s,#20
1474	eor	w20,w20,w7
1475	ushr	v5.4s,v25.4s,#20
1476	eor	w21,w21,w8
1477	ushr	v9.4s,v26.4s,#20
1478	ror	w17,w17,#24
1479	ushr	v13.4s,v27.4s,#20
1480	ror	w19,w19,#24
1481	ushr	v17.4s,v28.4s,#20
1482	ror	w20,w20,#24
1483	ushr	v21.4s,v29.4s,#20
1484	ror	w21,w21,#24
1485	sli	v1.4s,v24.4s,#12
1486	add	w13,w13,w17
1487	sli	v5.4s,v25.4s,#12
1488	add	w14,w14,w19
1489	sli	v9.4s,v26.4s,#12
1490	add	w15,w15,w20
1491	sli	v13.4s,v27.4s,#12
1492	add	w16,w16,w21
1493	sli	v17.4s,v28.4s,#12
1494	eor	w9,w9,w13
1495	sli	v21.4s,v29.4s,#12
1496	eor	w10,w10,w14
1497	add	v0.4s,v0.4s,v1.4s
1498	eor	w11,w11,w15
1499	add	v4.4s,v4.4s,v5.4s
1500	eor	w12,w12,w16
1501	add	v8.4s,v8.4s,v9.4s
1502	ror	w9,w9,#25
1503	add	v12.4s,v12.4s,v13.4s
1504	ror	w10,w10,#25
1505	add	v16.4s,v16.4s,v17.4s
1506	ror	w11,w11,#25
1507	add	v20.4s,v20.4s,v21.4s
1508	ror	w12,w12,#25
// Scalar: diagonal quarter-rounds of double round #1 start on the next scalar
// line: (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
// NEON is still in the second half of its column round.
1509	eor	v24.16b,v3.16b,v0.16b
1510	add	w5,w5,w10
1511	eor	v25.16b,v7.16b,v4.16b
1512	add	w6,w6,w11
1513	eor	v26.16b,v11.16b,v8.16b
1514	add	w7,w7,w12
1515	eor	v27.16b,v15.16b,v12.16b
1516	add	w8,w8,w9
1517	eor	v28.16b,v19.16b,v16.16b
1518	eor	w21,w21,w5
1519	eor	v29.16b,v23.16b,v20.16b
1520	eor	w17,w17,w6
1521	ushr	v3.4s,v24.4s,#24
1522	eor	w19,w19,w7
1523	ushr	v7.4s,v25.4s,#24
1524	eor	w20,w20,w8
1525	ushr	v11.4s,v26.4s,#24
1526	ror	w21,w21,#16
1527	ushr	v15.4s,v27.4s,#24
1528	ror	w17,w17,#16
1529	ushr	v19.4s,v28.4s,#24
1530	ror	w19,w19,#16
1531	ushr	v23.4s,v29.4s,#24
1532	ror	w20,w20,#16
1533	sli	v3.4s,v24.4s,#8
1534	add	w15,w15,w21
1535	sli	v7.4s,v25.4s,#8
1536	add	w16,w16,w17
1537	sli	v11.4s,v26.4s,#8
1538	add	w13,w13,w19
1539	sli	v15.4s,v27.4s,#8
1540	add	w14,w14,w20
1541	sli	v19.4s,v28.4s,#8
1542	eor	w10,w10,w15
1543	sli	v23.4s,v29.4s,#8
1544	eor	w11,w11,w16
1545	add	v2.4s,v2.4s,v3.4s
1546	eor	w12,w12,w13
1547	add	v6.4s,v6.4s,v7.4s
1548	eor	w9,w9,w14
1549	add	v10.4s,v10.4s,v11.4s
1550	ror	w10,w10,#20
1551	add	v14.4s,v14.4s,v15.4s
1552	ror	w11,w11,#20
1553	add	v18.4s,v18.4s,v19.4s
1554	ror	w12,w12,#20
1555	add	v22.4s,v22.4s,v23.4s
1556	ror	w9,w9,#20
1557	eor	v24.16b,v1.16b,v2.16b
1558	add	w5,w5,w10
1559	eor	v25.16b,v5.16b,v6.16b
1560	add	w6,w6,w11
1561	eor	v26.16b,v9.16b,v10.16b
1562	add	w7,w7,w12
1563	eor	v27.16b,v13.16b,v14.16b
1564	add	w8,w8,w9
1565	eor	v28.16b,v17.16b,v18.16b
1566	eor	w21,w21,w5
1567	eor	v29.16b,v21.16b,v22.16b
1568	eor	w17,w17,w6
1569	ushr	v1.4s,v24.4s,#25
1570	eor	w19,w19,w7
1571	ushr	v5.4s,v25.4s,#25
1572	eor	w20,w20,w8
1573	ushr	v9.4s,v26.4s,#25
1574	ror	w21,w21,#24
1575	ushr	v13.4s,v27.4s,#25
1576	ror	w17,w17,#24
1577	ushr	v17.4s,v28.4s,#25
1578	ror	w19,w19,#24
1579	ushr	v21.4s,v29.4s,#25
1580	ror	w20,w20,#24
1581	sli	v1.4s,v24.4s,#7
1582	add	w15,w15,w21
1583	sli	v5.4s,v25.4s,#7
1584	add	w16,w16,w17
1585	sli	v9.4s,v26.4s,#7
1586	add	w13,w13,w19
1587	sli	v13.4s,v27.4s,#7
1588	add	w14,w14,w20
1589	sli	v17.4s,v28.4s,#7
1590	eor	w10,w10,w15
1591	sli	v21.4s,v29.4s,#7
1592	eor	w11,w11,w16
// NEON: rotate the rows so the diagonals line up as columns -- third row
// (v2,v6,...) by 8 bytes, fourth row (v3,v7,...) by 12, second row
// (v1,v5,...) by 4.  The scalar stream finishes double round #1 alongside.
1593	ext	v2.16b,v2.16b,v2.16b,#8
1594	eor	w12,w12,w13
1595	ext	v6.16b,v6.16b,v6.16b,#8
1596	eor	w9,w9,w14
1597	ext	v10.16b,v10.16b,v10.16b,#8
1598	ror	w10,w10,#25
1599	ext	v14.16b,v14.16b,v14.16b,#8
1600	ror	w11,w11,#25
1601	ext	v18.16b,v18.16b,v18.16b,#8
1602	ror	w12,w12,#25
1603	ext	v22.16b,v22.16b,v22.16b,#8
1604	ror	w9,w9,#25
1605	ext	v3.16b,v3.16b,v3.16b,#12
1606	ext	v7.16b,v7.16b,v7.16b,#12
1607	ext	v11.16b,v11.16b,v11.16b,#12
1608	ext	v15.16b,v15.16b,v15.16b,#12
1609	ext	v19.16b,v19.16b,v19.16b,#12
1610	ext	v23.16b,v23.16b,v23.16b,#12
1611	ext	v1.16b,v1.16b,v1.16b,#4
1612	ext	v5.16b,v5.16b,v5.16b,#4
1613	ext	v9.16b,v9.16b,v9.16b,#4
1614	ext	v13.16b,v13.16b,v13.16b,#4
1615	ext	v17.16b,v17.16b,v17.16b,#4
1616	ext	v21.16b,v21.16b,v21.16b,#4
// NEON: diagonal round (same quarter-round sequence on the rotated rows).
// Scalar: column quarter-rounds of double round #2.
1617	add	v0.4s,v0.4s,v1.4s
1618	add	w5,w5,w9
1619	add	v4.4s,v4.4s,v5.4s
1620	add	w6,w6,w10
1621	add	v8.4s,v8.4s,v9.4s
1622	add	w7,w7,w11
1623	add	v12.4s,v12.4s,v13.4s
1624	add	w8,w8,w12
1625	add	v16.4s,v16.4s,v17.4s
1626	eor	w17,w17,w5
1627	add	v20.4s,v20.4s,v21.4s
1628	eor	w19,w19,w6
1629	eor	v3.16b,v3.16b,v0.16b
1630	eor	w20,w20,w7
1631	eor	v7.16b,v7.16b,v4.16b
1632	eor	w21,w21,w8
1633	eor	v11.16b,v11.16b,v8.16b
1634	ror	w17,w17,#16
1635	eor	v15.16b,v15.16b,v12.16b
1636	ror	w19,w19,#16
1637	eor	v19.16b,v19.16b,v16.16b
1638	ror	w20,w20,#16
1639	eor	v23.16b,v23.16b,v20.16b
1640	ror	w21,w21,#16
1641	rev32	v3.8h,v3.8h
1642	add	w13,w13,w17
1643	rev32	v7.8h,v7.8h
1644	add	w14,w14,w19
1645	rev32	v11.8h,v11.8h
1646	add	w15,w15,w20
1647	rev32	v15.8h,v15.8h
1648	add	w16,w16,w21
1649	rev32	v19.8h,v19.8h
1650	eor	w9,w9,w13
1651	rev32	v23.8h,v23.8h
1652	eor	w10,w10,w14
1653	add	v2.4s,v2.4s,v3.4s
1654	eor	w11,w11,w15
1655	add	v6.4s,v6.4s,v7.4s
1656	eor	w12,w12,w16
1657	add	v10.4s,v10.4s,v11.4s
1658	ror	w9,w9,#20
1659	add	v14.4s,v14.4s,v15.4s
1660	ror	w10,w10,#20
1661	add	v18.4s,v18.4s,v19.4s
1662	ror	w11,w11,#20
1663	add	v22.4s,v22.4s,v23.4s
1664	ror	w12,w12,#20
1665	eor	v24.16b,v1.16b,v2.16b
1666	add	w5,w5,w9
1667	eor	v25.16b,v5.16b,v6.16b
1668	add	w6,w6,w10
1669	eor	v26.16b,v9.16b,v10.16b
1670	add	w7,w7,w11
1671	eor	v27.16b,v13.16b,v14.16b
1672	add	w8,w8,w12
1673	eor	v28.16b,v17.16b,v18.16b
1674	eor	w17,w17,w5
1675	eor	v29.16b,v21.16b,v22.16b
1676	eor	w19,w19,w6
1677	ushr	v1.4s,v24.4s,#20
1678	eor	w20,w20,w7
1679	ushr	v5.4s,v25.4s,#20
1680	eor	w21,w21,w8
1681	ushr	v9.4s,v26.4s,#20
1682	ror	w17,w17,#24
1683	ushr	v13.4s,v27.4s,#20
1684	ror	w19,w19,#24
1685	ushr	v17.4s,v28.4s,#20
1686	ror	w20,w20,#24
1687	ushr	v21.4s,v29.4s,#20
1688	ror	w21,w21,#24
1689	sli	v1.4s,v24.4s,#12
1690	add	w13,w13,w17
1691	sli	v5.4s,v25.4s,#12
1692	add	w14,w14,w19
1693	sli	v9.4s,v26.4s,#12
1694	add	w15,w15,w20
1695	sli	v13.4s,v27.4s,#12
1696	add	w16,w16,w21
1697	sli	v17.4s,v28.4s,#12
1698	eor	w9,w9,w13
1699	sli	v21.4s,v29.4s,#12
1700	eor	w10,w10,w14
1701	add	v0.4s,v0.4s,v1.4s
1702	eor	w11,w11,w15
1703	add	v4.4s,v4.4s,v5.4s
1704	eor	w12,w12,w16
1705	add	v8.4s,v8.4s,v9.4s
1706	ror	w9,w9,#25
1707	add	v12.4s,v12.4s,v13.4s
1708	ror	w10,w10,#25
1709	add	v16.4s,v16.4s,v17.4s
1710	ror	w11,w11,#25
1711	add	v20.4s,v20.4s,v21.4s
1712	ror	w12,w12,#25
// Scalar: diagonal quarter-rounds of double round #2 start on the next scalar
// line.  NEON is in the second half of its diagonal round.
1713	eor	v24.16b,v3.16b,v0.16b
1714	add	w5,w5,w10
1715	eor	v25.16b,v7.16b,v4.16b
1716	add	w6,w6,w11
1717	eor	v26.16b,v11.16b,v8.16b
1718	add	w7,w7,w12
1719	eor	v27.16b,v15.16b,v12.16b
1720	add	w8,w8,w9
1721	eor	v28.16b,v19.16b,v16.16b
1722	eor	w21,w21,w5
1723	eor	v29.16b,v23.16b,v20.16b
1724	eor	w17,w17,w6
1725	ushr	v3.4s,v24.4s,#24
1726	eor	w19,w19,w7
1727	ushr	v7.4s,v25.4s,#24
1728	eor	w20,w20,w8
1729	ushr	v11.4s,v26.4s,#24
1730	ror	w21,w21,#16
1731	ushr	v15.4s,v27.4s,#24
1732	ror	w17,w17,#16
1733	ushr	v19.4s,v28.4s,#24
1734	ror	w19,w19,#16
1735	ushr	v23.4s,v29.4s,#24
1736	ror	w20,w20,#16
1737	sli	v3.4s,v24.4s,#8
1738	add	w15,w15,w21
1739	sli	v7.4s,v25.4s,#8
1740	add	w16,w16,w17
1741	sli	v11.4s,v26.4s,#8
1742	add	w13,w13,w19
1743	sli	v15.4s,v27.4s,#8
1744	add	w14,w14,w20
1745	sli	v19.4s,v28.4s,#8
1746	eor	w10,w10,w15
1747	sli	v23.4s,v29.4s,#8
1748	eor	w11,w11,w16
1749	add	v2.4s,v2.4s,v3.4s
1750	eor	w12,w12,w13
1751	add	v6.4s,v6.4s,v7.4s
1752	eor	w9,w9,w14
1753	add	v10.4s,v10.4s,v11.4s
1754	ror	w10,w10,#20
1755	add	v14.4s,v14.4s,v15.4s
1756	ror	w11,w11,#20
1757	add	v18.4s,v18.4s,v19.4s
1758	ror	w12,w12,#20
1759	add	v22.4s,v22.4s,v23.4s
1760	ror	w9,w9,#20
1761	eor	v24.16b,v1.16b,v2.16b
1762	add	w5,w5,w10
1763	eor	v25.16b,v5.16b,v6.16b
1764	add	w6,w6,w11
1765	eor	v26.16b,v9.16b,v10.16b
1766	add	w7,w7,w12
1767	eor	v27.16b,v13.16b,v14.16b
1768	add	w8,w8,w9
1769	eor	v28.16b,v17.16b,v18.16b
1770	eor	w21,w21,w5
1771	eor	v29.16b,v21.16b,v22.16b
1772	eor	w17,w17,w6
1773	ushr	v1.4s,v24.4s,#25
1774	eor	w19,w19,w7
1775	ushr	v5.4s,v25.4s,#25
1776	eor	w20,w20,w8
1777	ushr	v9.4s,v26.4s,#25
1778	ror	w21,w21,#24
1779	ushr	v13.4s,v27.4s,#25
1780	ror	w17,w17,#24
1781	ushr	v17.4s,v28.4s,#25
1782	ror	w19,w19,#24
1783	ushr	v21.4s,v29.4s,#25
1784	ror	w20,w20,#24
1785	sli	v1.4s,v24.4s,#7
1786	add	w15,w15,w21
1787	sli	v5.4s,v25.4s,#7
1788	add	w16,w16,w17
1789	sli	v9.4s,v26.4s,#7
1790	add	w13,w13,w19
1791	sli	v13.4s,v27.4s,#7
1792	add	w14,w14,w20
1793	sli	v17.4s,v28.4s,#7
1794	eor	w10,w10,w15
1795	sli	v21.4s,v29.4s,#7
1796	eor	w11,w11,w16
// NEON: undo the earlier row rotation (third row by 8 bytes again, fourth
// row by 4, second row by 12) so the rows are back in column order for the
// next pass.  The scalar stream finishes double round #2 alongside.
1797	ext	v2.16b,v2.16b,v2.16b,#8
1798	eor	w12,w12,w13
1799	ext	v6.16b,v6.16b,v6.16b,#8
1800	eor	w9,w9,w14
1801	ext	v10.16b,v10.16b,v10.16b,#8
1802	ror	w10,w10,#25
1803	ext	v14.16b,v14.16b,v14.16b,#8
1804	ror	w11,w11,#25
1805	ext	v18.16b,v18.16b,v18.16b,#8
1806	ror	w12,w12,#25
1807	ext	v22.16b,v22.16b,v22.16b,#8
1808	ror	w9,w9,#25
1809	ext	v3.16b,v3.16b,v3.16b,#4
1810	ext	v7.16b,v7.16b,v7.16b,#4
1811	ext	v11.16b,v11.16b,v11.16b,#4
1812	ext	v15.16b,v15.16b,v15.16b,#4
1813	ext	v19.16b,v19.16b,v19.16b,#4
1814	ext	v23.16b,v23.16b,v23.16b,#4
1815	ext	v1.16b,v1.16b,v1.16b,#12
1816	ext	v5.16b,v5.16b,v5.16b,#12
1817	ext	v9.16b,v9.16b,v9.16b,#12
1818	ext	v13.16b,v13.16b,v13.16b,#12
1819	ext	v17.16b,v17.16b,v17.16b,#12
1820	ext	v21.16b,v21.16b,v21.16b,#12
// Repeat until the pass counter reaches zero.
1821	cbnz	x4,Loop_lower_neon
1822
// Finalise and emit one 512-byte batch: one more scalar block (64 bytes via
// x-registers) followed by the six NEON blocks (6 x 64 bytes via st1).
// Together with the 64 bytes already written before Loop_lower_neon this
// advances x0 and x1 by 512 per trip.
//
// The scalar and NEON streams are still interleaved line by line.
// q24..q29 are reloaded from the off-load area at [sp,#0..#95]:
//   v24 is added to the first row of every NEON block  (v0,v4,...,v20),
//   v25 to the second row (v1,v5,...,v21),
//   v26 to the third row  (v2,v6,...,v22),
//   the fourth rows get a per-block value: v27,v28,v29,v30 for blocks 0..3,
//   and v27+v31, v28+v31 for blocks 4 and 5 (the "+4" lines).
1823	add	w5,w5,w22		// accumulate key block
1824	ldp	q24,q25,[sp,#0]
1825	add	x6,x6,x22,lsr#32
1826	ldp	q26,q27,[sp,#32]
1827	add	w7,w7,w23
1828	ldp	q28,q29,[sp,#64]
1829	add	x8,x8,x23,lsr#32
1830	add	v0.4s,v0.4s,v24.4s
1831	add	w9,w9,w24
1832	add	v4.4s,v4.4s,v24.4s
1833	add	x10,x10,x24,lsr#32
1834	add	v8.4s,v8.4s,v24.4s
1835	add	w11,w11,w25
1836	add	v12.4s,v12.4s,v24.4s
1837	add	x12,x12,x25,lsr#32
1838	add	v16.4s,v16.4s,v24.4s
1839	add	w13,w13,w26
1840	add	v20.4s,v20.4s,v24.4s
1841	add	x14,x14,x26,lsr#32
1842	add	v2.4s,v2.4s,v26.4s
1843	add	w15,w15,w27
1844	add	v6.4s,v6.4s,v26.4s
1845	add	x16,x16,x27,lsr#32
1846	add	v10.4s,v10.4s,v26.4s
1847	add	w17,w17,w28
1848	add	v14.4s,v14.4s,v26.4s
1849	add	x19,x19,x28,lsr#32
1850	add	v18.4s,v18.4s,v26.4s
1851	add	w20,w20,w30
1852	add	v22.4s,v22.4s,v26.4s
1853	add	x21,x21,x30,lsr#32
1854	add	v19.4s,v19.4s,v31.4s			// +4
1855	add	x5,x5,x6,lsl#32	// pack
1856	add	v23.4s,v23.4s,v31.4s			// +4
1857	add	x7,x7,x8,lsl#32
1858	add	v3.4s,v3.4s,v27.4s
1859	ldp	x6,x8,[x1,#0]		// load input
1860	add	v7.4s,v7.4s,v28.4s
1861	add	x9,x9,x10,lsl#32
1862	add	v11.4s,v11.4s,v29.4s
1863	add	x11,x11,x12,lsl#32
1864	add	v15.4s,v15.4s,v30.4s
1865	ldp	x10,x12,[x1,#16]
1866	add	v19.4s,v19.4s,v27.4s
1867	add	x13,x13,x14,lsl#32
1868	add	v23.4s,v23.4s,v28.4s
1869	add	x15,x15,x16,lsl#32
1870	add	v1.4s,v1.4s,v25.4s
1871	ldp	x14,x16,[x1,#32]
1872	add	v5.4s,v5.4s,v25.4s
1873	add	x17,x17,x19,lsl#32
1874	add	v9.4s,v9.4s,v25.4s
1875	add	x20,x20,x21,lsl#32
1876	add	v13.4s,v13.4s,v25.4s
1877	ldp	x19,x21,[x1,#48]
1878	add	v17.4s,v17.4s,v25.4s
1879	add	x1,x1,#64
1880	add	v21.4s,v21.4s,v25.4s
1881
// Big-endian builds only: byte-reverse the packed scalar words.  No
// corresponding swap is applied to the NEON registers in this span.
1882#ifdef	__ARMEB__
1883	rev	x5,x5
1884	rev	x7,x7
1885	rev	x9,x9
1886	rev	x11,x11
1887	rev	x13,x13
1888	rev	x15,x15
1889	rev	x17,x17
1890	rev	x20,x20
1891#endif
// From here v24..v27 double as input buffers (they are restored from the
// stack further down).  Each ld1 post-increments x1 by 64 and each st1
// post-increments x0 by 64.
1892	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1893	eor	x5,x5,x6
1894	eor	x7,x7,x8
1895	eor	x9,x9,x10
1896	eor	x11,x11,x12
1897	eor	x13,x13,x14
1898	eor	v0.16b,v0.16b,v24.16b
1899	eor	x15,x15,x16
1900	eor	v1.16b,v1.16b,v25.16b
1901	eor	x17,x17,x19
1902	eor	v2.16b,v2.16b,v26.16b
1903	eor	x20,x20,x21
1904	eor	v3.16b,v3.16b,v27.16b
1905	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1906
// x28 += 7: together with the +1 applied before Loop_lower_neon this advances
// the counter word by 8 per 512-byte trip (2 scalar + 6 NEON blocks).
1907	stp	x5,x7,[x0,#0]		// store output
1908	add	x28,x28,#7			// increment counter
1909	stp	x9,x11,[x0,#16]
1910	stp	x13,x15,[x0,#32]
1911	stp	x17,x20,[x0,#48]
1912	add	x0,x0,#64
1913	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1914
// Software-pipelined XOR/store: once a block's registers have been stored
// they are reused as the input buffer for the block after next.
1915	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1916	eor	v4.16b,v4.16b,v24.16b
1917	eor	v5.16b,v5.16b,v25.16b
1918	eor	v6.16b,v6.16b,v26.16b
1919	eor	v7.16b,v7.16b,v27.16b
1920	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1921
// v24..v27 are no longer needed as input buffers; reload their saved values
// from the off-load area so they are valid for the next trip.
1922	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1923	eor	v8.16b,v8.16b,v0.16b
1924	ldp	q24,q25,[sp,#0]
1925	eor	v9.16b,v9.16b,v1.16b
1926	ldp	q26,q27,[sp,#32]
1927	eor	v10.16b,v10.16b,v2.16b
1928	eor	v11.16b,v11.16b,v3.16b
1929	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1930
1931	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1932	eor	v12.16b,v12.16b,v4.16b
1933	eor	v13.16b,v13.16b,v5.16b
1934	eor	v14.16b,v14.16b,v6.16b
1935	eor	v15.16b,v15.16b,v7.16b
1936	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1937
1938	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1939	eor	v16.16b,v16.16b,v8.16b
1940	eor	v17.16b,v17.16b,v9.16b
1941	eor	v18.16b,v18.16b,v10.16b
1942	eor	v19.16b,v19.16b,v11.16b
1943	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1944
// v0 is free now; build the per-trip step in it by doubling v31.
1945	shl	v0.4s,v31.4s,#1			// 4 -> 8
1946	eor	v20.16b,v20.16b,v12.16b
1947	eor	v21.16b,v21.16b,v13.16b
1948	eor	v22.16b,v22.16b,v14.16b
1949	eor	v23.16b,v23.16b,v15.16b
1950	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1951
// Step the four per-block fourth-row vectors for the next trip.
1952	add	v27.4s,v27.4s,v0.4s			// += 8
1953	add	v28.4s,v28.4s,v0.4s
1954	add	v29.4s,v29.4s,v0.4s
1955	add	v30.4s,v30.4s,v0.4s
1956
// No instruction in the visible span above writes NZCV, so this branch
// consumes flags produced before Loop_lower_neon -- presumably a subs on the
// remaining length at the head of Loop_outer_512_neon (outside this view;
// TODO confirm).
1957	b.hs	Loop_outer_512_neon
1958
// The 512-byte loop has fallen through.  Add 512 back to x2 and set the
// flags; Z is set when x2 becomes exactly zero.  The ushr/ldp/stp that follow
// do not touch the flags, so b.eq below still tests this adds.
1959	adds	x2,x2,#512
1960	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1961
// Reload d8-d15 (the low halves of v8-v15, callee-saved under AAPCS64) from
// the 64-byte area at sp+128.
1962	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1963	ldp	d10,d11,[sp,#128+16]
1964	ldp	d12,d13,[sp,#128+32]
1965	ldp	d14,d15,[sp,#128+48]
1966
// Overwrite the 96 bytes at [sp,#0..#95] that held the off-loaded vectors
// with the current contents of q24/q31.
1967	stp	q24,q31,[sp,#0]		// wipe off-load area
1968	stp	q24,q31,[sp,#32]
1969	stp	q24,q31,[sp,#64]
1970
// x2 == 0: everything has been processed.
1971	b.eq	Ldone_512_neon
1972
// Bytes remain.  Step v27..v29 back by one (v0 = v31 >> 2), release the
// 128-byte area at the bottom of the stack, and pick the follow-up path: the
// sub/add instructions do not write flags, so b.hs tests the cmp against 192
// (unsigned x2 >= 192 -> Loop_outer_neon).
1973	cmp	x2,#192
1974	sub	v27.4s,v27.4s,v0.4s			// -= 1
1975	sub	v28.4s,v28.4s,v0.4s
1976	sub	v29.4s,v29.4s,v0.4s
1977	add	sp,sp,#128
1978	b.hs	Loop_outer_neon
1979
// Fewer than 192 bytes left: zero v25..v30 (x XOR x == 0) and continue in the
// general-purpose-register loop.
1980	eor	v25.16b,v25.16b,v25.16b
1981	eor	v26.16b,v26.16b,v26.16b
1982	eor	v27.16b,v27.16b,v27.16b
1983	eor	v28.16b,v28.16b,v28.16b
1984	eor	v29.16b,v29.16b,v29.16b
1985	eor	v30.16b,v30.16b,v30.16b
1986	b	Loop_outer
1987
// Ldone_512_neon: function epilogue, reached when no input remains.
// Reloads x19-x28 from offsets 16..80 off the frame pointer x29, releases
// the 128+64 bytes of stack still allocated on this path, then pops x29/x30
// together with the 96-byte frame and returns.
1988Ldone_512_neon:
1989	ldp	x19,x20,[x29,#16]
1990	add	sp,sp,#128+64
1991	ldp	x21,x22,[x29,#32]
1992	ldp	x23,x24,[x29,#48]
1993	ldp	x25,x26,[x29,#64]
1994	ldp	x27,x28,[x29,#80]
1995	ldp	x29,x30,[sp],#96
// Macro from <openssl/arm_arch.h>; presumably authenticates the return
// address when pointer authentication is enabled -- confirm in that header.
1996	AARCH64_VALIDATE_LINK_REGISTER
1997	ret
1998
1999#endif
2000#endif  // !OPENSSL_NO_ASM
2001