// [Extraction residue: code-browser navigation links (Home, Line#, Scopes#, Navigate#, Raw, Download) -- not part of the assembly source.]
1#include "arm_arch.h"
2
// Read-only constants used by the ChaCha20 routines below. They are placed
// in .text so the code can reach them with a short PC-relative `adr`.
3.text
4
5
// OPENSSL_armcap_P is not defined in the visible part of this file; it is
// given hidden visibility here and only reached through the offset below.
6.hidden	OPENSSL_armcap_P
7
8.align	5
// ChaCha constant row. The low/high 32-bit halves of the two quadwords are
// the words 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, i.e. the ASCII
// text "expand 32-byte k".
9.Lsigma:
10.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Vector {1,0,0,0}: the NEON paths load it into v31 and add it to the counter
// row, so only lane 0 is bumped. It must stay directly after .Lsigma: the
// NEON code reaches it by post-incrementing the .Lsigma pointer by 16.
11.Lone:
12.long	1,0,0,0
// Offset from this label to OPENSSL_armcap_P (32-bit under __ILP32__, 64-bit
// otherwise). ChaCha20_ctr32 adds it to the address of this label to form
// the address of the capability word PC-relatively.
13.LOPENSSL_armcap_P:
14#ifdef	__ILP32__
15.long	OPENSSL_armcap_P-.
16#else
17.quad	OPENSSL_armcap_P-.
18#endif
// ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated.
19.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Restore 4-byte alignment after the byte string.
20.align	2
21
//----------------------------------------------------------------------
// ChaCha20_ctr32 -- XOR the input with a ChaCha20 key stream, one 64-byte
// block at a time, using general-purpose registers only. Inputs of 192
// bytes or more are handed to ChaCha20_neon when the NEON capability bit
// is set.
//
// Register arguments, as used by the code below:
//   x0  output pointer (advanced by 64 per full block)
//   x1  input pointer  (advanced by 64 per full block)
//   x2  length in bytes; 0 returns immediately
//   x3  pointer to 32 bytes of key
//   x4  pointer to the 16-byte counter block; x4 is then reused as the
//       round counter and as a scratch pointer in the tail
// Presumably the C prototype is
//   void ChaCha20_ctr32(out, inp, len, key[8], counter[4])
// -- confirm against the C declaration.
//
// Stack: 96 bytes for x29/x30 and x19-x28, plus a 64-byte scratch area at
// sp that holds the last key-stream block when the length is not a multiple
// of 64. x30 is used as a data register, hence its save/restore.
//----------------------------------------------------------------------
22.globl	ChaCha20_ctr32
23.type	ChaCha20_ctr32,%function
24.align	5
25ChaCha20_ctr32:
// Nothing to do for len == 0. This path returns before paciasp, and .Labort
// sits after autiasp, so the two stay balanced.
26	cbz	x2,.Labort
27	adr	x5,.LOPENSSL_armcap_P
// Inputs shorter than 192 bytes always take the scalar path.
28	cmp	x2,#192
29	b.lo	.Lshort
// x6 = stored offset, x5 = address of the offset => [x6,x5] is the capability word.
30#ifdef	__ILP32__
31	ldrsw	x6,[x5]
32#else
33	ldr	x6,[x5]
34#endif
35	ldr	w17,[x6,x5]
36	tst	w17,#ARMV7_NEON
// Plain branch (not a call): ChaCha20_neon builds its own frame and returns
// straight to our caller.
37	b.ne	ChaCha20_neon
38
39.Lshort:
40.inst	0xd503233f			// paciasp
41	stp	x29,x30,[sp,#-96]!
42	add	x29,sp,#0
43
44	adr	x5,.Lsigma
45	stp	x19,x20,[sp,#16]
46	stp	x21,x22,[sp,#32]
47	stp	x23,x24,[sp,#48]
48	stp	x25,x26,[sp,#64]
49	stp	x27,x28,[sp,#80]
// 64-byte scratch area for the partial-block key stream (see .Less_than_64).
50	sub	sp,sp,#64
51
// The 16-word input state is kept packed, two 32-bit words per register:
//   x22,x23 = constants, x24-x27 = key, x28,x30 = counter block.
52	ldp	x22,x23,[x5]		// load sigma
53	ldp	x24,x25,[x3]		// load key
54	ldp	x26,x27,[x3,#16]
55	ldp	x28,x30,[x4]		// load counter
// Big-endian builds: swap the 32-bit halves so the unpack code below sees
// the first word in memory in the low half of each register.
56#ifdef	__ARMEB__
57	ror	x24,x24,#32
58	ror	x25,x25,#32
59	ror	x26,x26,#32
60	ror	x27,x27,#32
61	ror	x28,x28,#32
62	ror	x30,x30,#32
63#endif
64
// One iteration per 64-byte block. Working state, one word per register
// (x18 is not used):
//   row 0 (constants)      w5  w6  w7  w8
//   row 1 (key words 0-3)  w9  w10 w11 w12
//   row 2 (key words 4-7)  w13 w14 w15 w16
//   row 3 (counter block)  w17 w19 w20 w21
65.Loop_outer:
66	mov	w5,w22			// unpack key block
67	lsr	x6,x22,#32
68	mov	w7,w23
69	lsr	x8,x23,#32
70	mov	w9,w24
71	lsr	x10,x24,#32
72	mov	w11,w25
73	lsr	x12,x25,#32
74	mov	w13,w26
75	lsr	x14,x26,#32
76	mov	w15,w27
77	lsr	x16,x27,#32
78	mov	w17,w28
79	lsr	x19,x28,#32
80	mov	w20,w30
81	lsr	x21,x30,#32
82
// x4 = 10 double rounds (column round + diagonal round each) = 20 rounds.
83	mov	x4,#10
// The flags set here are consumed after the round loop by `b.lo .Ltail` and
// `b.hi .Loop_outer`; nothing in between modifies the condition flags.
84	subs	x2,x2,#64
85.Loop:
86	sub	x4,x4,#1
// Column round: four quarter-rounds on (w5,w9,w13,w17), (w6,w10,w14,w19),
// (w7,w11,w15,w20), (w8,w12,w16,w21), interleaved one step at a time.
// ror #16/#20/#24/#25 are left-rotations by 16/12/8/7.
87	add	w5,w5,w9
88	add	w6,w6,w10
89	add	w7,w7,w11
90	add	w8,w8,w12
91	eor	w17,w17,w5
92	eor	w19,w19,w6
93	eor	w20,w20,w7
94	eor	w21,w21,w8
95	ror	w17,w17,#16
96	ror	w19,w19,#16
97	ror	w20,w20,#16
98	ror	w21,w21,#16
99	add	w13,w13,w17
100	add	w14,w14,w19
101	add	w15,w15,w20
102	add	w16,w16,w21
103	eor	w9,w9,w13
104	eor	w10,w10,w14
105	eor	w11,w11,w15
106	eor	w12,w12,w16
107	ror	w9,w9,#20
108	ror	w10,w10,#20
109	ror	w11,w11,#20
110	ror	w12,w12,#20
111	add	w5,w5,w9
112	add	w6,w6,w10
113	add	w7,w7,w11
114	add	w8,w8,w12
115	eor	w17,w17,w5
116	eor	w19,w19,w6
117	eor	w20,w20,w7
118	eor	w21,w21,w8
119	ror	w17,w17,#24
120	ror	w19,w19,#24
121	ror	w20,w20,#24
122	ror	w21,w21,#24
123	add	w13,w13,w17
124	add	w14,w14,w19
125	add	w15,w15,w20
126	add	w16,w16,w21
127	eor	w9,w9,w13
128	eor	w10,w10,w14
129	eor	w11,w11,w15
130	eor	w12,w12,w16
131	ror	w9,w9,#25
132	ror	w10,w10,#25
133	ror	w11,w11,#25
134	ror	w12,w12,#25
// Diagonal round: the same quarter-round on (w5,w10,w15,w21),
// (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20).
135	add	w5,w5,w10
136	add	w6,w6,w11
137	add	w7,w7,w12
138	add	w8,w8,w9
139	eor	w21,w21,w5
140	eor	w17,w17,w6
141	eor	w19,w19,w7
142	eor	w20,w20,w8
143	ror	w21,w21,#16
144	ror	w17,w17,#16
145	ror	w19,w19,#16
146	ror	w20,w20,#16
147	add	w15,w15,w21
148	add	w16,w16,w17
149	add	w13,w13,w19
150	add	w14,w14,w20
151	eor	w10,w10,w15
152	eor	w11,w11,w16
153	eor	w12,w12,w13
154	eor	w9,w9,w14
155	ror	w10,w10,#20
156	ror	w11,w11,#20
157	ror	w12,w12,#20
158	ror	w9,w9,#20
159	add	w5,w5,w10
160	add	w6,w6,w11
161	add	w7,w7,w12
162	add	w8,w8,w9
163	eor	w21,w21,w5
164	eor	w17,w17,w6
165	eor	w19,w19,w7
166	eor	w20,w20,w8
167	ror	w21,w21,#24
168	ror	w17,w17,#24
169	ror	w19,w19,#24
170	ror	w20,w20,#24
171	add	w15,w15,w21
172	add	w16,w16,w17
173	add	w13,w13,w19
174	add	w14,w14,w20
175	eor	w10,w10,w15
176	eor	w11,w11,w16
177	eor	w12,w12,w13
178	eor	w9,w9,w14
179	ror	w10,w10,#25
180	ror	w11,w11,#25
181	ror	w12,w12,#25
182	ror	w9,w9,#25
183	cbnz	x4,.Loop
184
// Add the original input state back in. The odd-numbered words use 64-bit
// adds; any carry into bit 32 is shifted out by the `lsl#32` in the pack step.
185	add	w5,w5,w22		// accumulate key block
186	add	x6,x6,x22,lsr#32
187	add	w7,w7,w23
188	add	x8,x8,x23,lsr#32
189	add	w9,w9,w24
190	add	x10,x10,x24,lsr#32
191	add	w11,w11,w25
192	add	x12,x12,x25,lsr#32
193	add	w13,w13,w26
194	add	x14,x14,x26,lsr#32
195	add	w15,w15,w27
196	add	x16,x16,x27,lsr#32
197	add	w17,w17,w28
198	add	x19,x19,x28,lsr#32
199	add	w20,w20,w30
200	add	x21,x21,x30,lsr#32
201
// Fewer than 64 bytes were left (flags from the `subs` before the loop).
202	b.lo	.Ltail
203
// Full block: re-pack pairs of words into 64-bit registers while loading
// 64 input bytes into the registers that have just been freed.
204	add	x5,x5,x6,lsl#32	// pack
205	add	x7,x7,x8,lsl#32
206	ldp	x6,x8,[x1,#0]		// load input
207	add	x9,x9,x10,lsl#32
208	add	x11,x11,x12,lsl#32
209	ldp	x10,x12,[x1,#16]
210	add	x13,x13,x14,lsl#32
211	add	x15,x15,x16,lsl#32
212	ldp	x14,x16,[x1,#32]
213	add	x17,x17,x19,lsl#32
214	add	x20,x20,x21,lsl#32
215	ldp	x19,x21,[x1,#48]
216	add	x1,x1,#64
217#ifdef	__ARMEB__
218	rev	x5,x5
219	rev	x7,x7
220	rev	x9,x9
221	rev	x11,x11
222	rev	x13,x13
223	rev	x15,x15
224	rev	x17,x17
225	rev	x20,x20
226#endif
227	eor	x5,x5,x6
228	eor	x7,x7,x8
229	eor	x9,x9,x10
230	eor	x11,x11,x12
231	eor	x13,x13,x14
232	eor	x15,x15,x16
233	eor	x17,x17,x19
234	eor	x20,x20,x21
235
236	stp	x5,x7,[x0,#0]		// store output
// NOTE(review): this is a 64-bit add on the packed counter block, so a wrap
// of the low 32-bit counter word would carry into the word held in the upper
// half of x28 -- presumably callers never let the counter wrap; confirm.
237	add	x28,x28,#1			// increment counter
238	stp	x9,x11,[x0,#16]
239	stp	x13,x15,[x0,#32]
240	stp	x17,x20,[x0,#48]
241	add	x0,x0,#64
242
// More input remains (flags still from the `subs` before the round loop).
243	b.hi	.Loop_outer
244
// Epilogue: callee-saved registers are reloaded relative to x29 (the frame
// pointer set in the prologue) while sp is popped past the scratch area.
245	ldp	x19,x20,[x29,#16]
246	add	sp,sp,#64
247	ldp	x21,x22,[x29,#32]
248	ldp	x23,x24,[x29,#48]
249	ldp	x25,x26,[x29,#64]
250	ldp	x27,x28,[x29,#80]
251	ldp	x29,x30,[sp],#96
252.inst	0xd50323bf			// autiasp
253.Labort:
254	ret
255
256.align	4
// Partial final block: undo the `subs` so x2 is the number of bytes left.
257.Ltail:
258	add	x2,x2,#64
// Also entered from ChaCha20_neon's .Ltail_neon, with the scalar block in
// w5-w21 and the same 64-byte scratch area at sp.
259.Less_than_64:
// Bias the pointers so one negative index (x2 = -remaining, counting up to
// 0) walks input, key stream and output. x0 gets an extra -1 because the
// index is incremented before the store.
260	sub	x0,x0,#1
261	add	x1,x1,x2
262	add	x0,x0,x2
263	add	x4,sp,x2
264	neg	x2,x2
265
266	add	x5,x5,x6,lsl#32	// pack
267	add	x7,x7,x8,lsl#32
268	add	x9,x9,x10,lsl#32
269	add	x11,x11,x12,lsl#32
270	add	x13,x13,x14,lsl#32
271	add	x15,x15,x16,lsl#32
272	add	x17,x17,x19,lsl#32
273	add	x20,x20,x21,lsl#32
274#ifdef	__ARMEB__
275	rev	x5,x5
276	rev	x7,x7
277	rev	x9,x9
278	rev	x11,x11
279	rev	x13,x13
280	rev	x15,x15
281	rev	x17,x17
282	rev	x20,x20
283#endif
// Spill the whole 64-byte key-stream block to the scratch area.
284	stp	x5,x7,[sp,#0]
285	stp	x9,x11,[sp,#16]
286	stp	x13,x15,[sp,#32]
287	stp	x17,x20,[sp,#48]
288
// Byte-wise XOR of the remaining input with the spilled key stream.
289.Loop_tail:
290	ldrb	w10,[x1,x2]
291	ldrb	w11,[x4,x2]
292	add	x2,x2,#1
293	eor	w10,w10,w11
294	strb	w10,[x0,x2]
295	cbnz	x2,.Loop_tail
296
// Overwrite the spilled key stream with zeros before leaving.
297	stp	xzr,xzr,[sp,#0]
298	stp	xzr,xzr,[sp,#16]
299	stp	xzr,xzr,[sp,#32]
300	stp	xzr,xzr,[sp,#48]
301
302	ldp	x19,x20,[x29,#16]
303	add	sp,sp,#64
304	ldp	x21,x22,[x29,#32]
305	ldp	x23,x24,[x29,#48]
306	ldp	x25,x26,[x29,#64]
307	ldp	x27,x28,[x29,#80]
308	ldp	x29,x30,[sp],#96
309.inst	0xd50323bf			// autiasp
310	ret
311.size	ChaCha20_ctr32,.-ChaCha20_ctr32
312
//----------------------------------------------------------------------
// ChaCha20_neon -- file-local routine (no .globl) reached by a plain branch
// from ChaCha20_ctr32, with that function's arguments still in x0-x4
// (x0 = output, x1 = input, x2 = length, x3 = key, x4 = counter block).
//
// Inputs of 512 bytes or more are forwarded to .L512_or_more_neon.
// Otherwise each outer iteration produces 256 bytes of key stream: one
// block in general-purpose registers (counter+0, same register layout as
// ChaCha20_ctr32) interleaved instruction-by-instruction with three blocks
// in NEON registers (counter+1, +2, +3).
//
// Vector registers:
//   v24        constant row           v25,v26  key rows
//   v27-v29    counter rows of the three NEON blocks
//   v31        counter increment ({1,0,0,0}, later {4,0,0,0})
//   v0-v3, v4-v7, v16-v19   working state of the three NEON blocks
//   v20-v23    temporaries / input data
// v8-v15 are not touched on this path.
//
// Stack frame is laid out exactly as in ChaCha20_ctr32 (96-byte save area
// plus 64-byte scratch at sp); .Ltail_neon relies on that when it jumps to
// .Less_than_64.
//----------------------------------------------------------------------
313.type	ChaCha20_neon,%function
314.align	5
315ChaCha20_neon:
316.inst	0xd503233f			// paciasp
317	stp	x29,x30,[sp,#-96]!
318	add	x29,sp,#0
319
320	adr	x5,.Lsigma
321	stp	x19,x20,[sp,#16]
322	stp	x21,x22,[sp,#32]
323	stp	x23,x24,[sp,#48]
324	stp	x25,x26,[sp,#64]
325	stp	x27,x28,[sp,#80]
// 512 bytes or more: continue in the wider routine, which shares this
// prologue and does its own stack allocation.
326	cmp	x2,#512
327	b.hs	.L512_or_more_neon
328
329	sub	sp,sp,#64
330
// Load the input state twice: packed in x22-x30 for the scalar block and in
// v24-v27 for the NEON blocks. The post-increment leaves x5 at .Lone.
331	ldp	x22,x23,[x5]		// load sigma
332	ld1	{v24.4s},[x5],#16
333	ldp	x24,x25,[x3]		// load key
334	ldp	x26,x27,[x3,#16]
335	ld1	{v25.4s,v26.4s},[x3]
336	ldp	x28,x30,[x4]		// load counter
337	ld1	{v27.4s},[x4]
338	ld1	{v31.4s},[x5]
339#ifdef	__ARMEB__
340	rev64	v24.4s,v24.4s
341	ror	x24,x24,#32
342	ror	x25,x25,#32
343	ror	x26,x26,#32
344	ror	x27,x27,#32
345	ror	x28,x28,#32
346	ror	x30,x30,#32
347#endif
// v27/v28/v29 = counter+1/+2/+3 (the scalar block uses counter+0 from x28).
348	add	v27.4s,v27.4s,v31.4s		// += 1
349	add	v28.4s,v27.4s,v31.4s
350	add	v29.4s,v28.4s,v31.4s
351	shl	v31.4s,v31.4s,#2			// 1 -> 4
352
// One iteration per 256 bytes: reload the four working states from the
// saved input state.
353.Loop_outer_neon:
354	mov	w5,w22			// unpack key block
355	lsr	x6,x22,#32
356	mov	v0.16b,v24.16b
357	mov	w7,w23
358	lsr	x8,x23,#32
359	mov	v4.16b,v24.16b
360	mov	w9,w24
361	lsr	x10,x24,#32
362	mov	v16.16b,v24.16b
363	mov	w11,w25
364	mov	v1.16b,v25.16b
365	lsr	x12,x25,#32
366	mov	v5.16b,v25.16b
367	mov	w13,w26
368	mov	v17.16b,v25.16b
369	lsr	x14,x26,#32
370	mov	v3.16b,v27.16b
371	mov	w15,w27
372	mov	v7.16b,v28.16b
373	lsr	x16,x27,#32
374	mov	v19.16b,v29.16b
375	mov	w17,w28
376	mov	v2.16b,v26.16b
377	lsr	x19,x28,#32
378	mov	v6.16b,v26.16b
379	mov	w20,w30
380	mov	v18.16b,v26.16b
381	lsr	x21,x30,#32
382
// x4 = 10 double rounds. The flags from this `subs` are consumed after the
// round loop by `b.lo .Ltail_neon` and `b.hi .Loop_outer_neon`; nothing in
// between modifies the condition flags.
383	mov	x4,#10
384	subs	x2,x2,#256
385.Loop_neon:
386	sub	x4,x4,#1
// Column round, NEON and scalar interleaved. On the NEON side each register
// is one state row, so one instruction does four quarter-round steps:
// rev32 .8h rotates every 32-bit lane by 16, and each ushr/sli pair is a
// left-rotation by 12, 8 or 7.
387	add	v0.4s,v0.4s,v1.4s
388	add	w5,w5,w9
389	add	v4.4s,v4.4s,v5.4s
390	add	w6,w6,w10
391	add	v16.4s,v16.4s,v17.4s
392	add	w7,w7,w11
393	eor	v3.16b,v3.16b,v0.16b
394	add	w8,w8,w12
395	eor	v7.16b,v7.16b,v4.16b
396	eor	w17,w17,w5
397	eor	v19.16b,v19.16b,v16.16b
398	eor	w19,w19,w6
399	rev32	v3.8h,v3.8h
400	eor	w20,w20,w7
401	rev32	v7.8h,v7.8h
402	eor	w21,w21,w8
403	rev32	v19.8h,v19.8h
404	ror	w17,w17,#16
405	add	v2.4s,v2.4s,v3.4s
406	ror	w19,w19,#16
407	add	v6.4s,v6.4s,v7.4s
408	ror	w20,w20,#16
409	add	v18.4s,v18.4s,v19.4s
410	ror	w21,w21,#16
411	eor	v20.16b,v1.16b,v2.16b
412	add	w13,w13,w17
413	eor	v21.16b,v5.16b,v6.16b
414	add	w14,w14,w19
415	eor	v22.16b,v17.16b,v18.16b
416	add	w15,w15,w20
417	ushr	v1.4s,v20.4s,#20
418	add	w16,w16,w21
419	ushr	v5.4s,v21.4s,#20
420	eor	w9,w9,w13
421	ushr	v17.4s,v22.4s,#20
422	eor	w10,w10,w14
423	sli	v1.4s,v20.4s,#12
424	eor	w11,w11,w15
425	sli	v5.4s,v21.4s,#12
426	eor	w12,w12,w16
427	sli	v17.4s,v22.4s,#12
428	ror	w9,w9,#20
429	add	v0.4s,v0.4s,v1.4s
430	ror	w10,w10,#20
431	add	v4.4s,v4.4s,v5.4s
432	ror	w11,w11,#20
433	add	v16.4s,v16.4s,v17.4s
434	ror	w12,w12,#20
435	eor	v20.16b,v3.16b,v0.16b
436	add	w5,w5,w9
437	eor	v21.16b,v7.16b,v4.16b
438	add	w6,w6,w10
439	eor	v22.16b,v19.16b,v16.16b
440	add	w7,w7,w11
441	ushr	v3.4s,v20.4s,#24
442	add	w8,w8,w12
443	ushr	v7.4s,v21.4s,#24
444	eor	w17,w17,w5
445	ushr	v19.4s,v22.4s,#24
446	eor	w19,w19,w6
447	sli	v3.4s,v20.4s,#8
448	eor	w20,w20,w7
449	sli	v7.4s,v21.4s,#8
450	eor	w21,w21,w8
451	sli	v19.4s,v22.4s,#8
452	ror	w17,w17,#24
453	add	v2.4s,v2.4s,v3.4s
454	ror	w19,w19,#24
455	add	v6.4s,v6.4s,v7.4s
456	ror	w20,w20,#24
457	add	v18.4s,v18.4s,v19.4s
458	ror	w21,w21,#24
459	eor	v20.16b,v1.16b,v2.16b
460	add	w13,w13,w17
461	eor	v21.16b,v5.16b,v6.16b
462	add	w14,w14,w19
463	eor	v22.16b,v17.16b,v18.16b
464	add	w15,w15,w20
465	ushr	v1.4s,v20.4s,#25
466	add	w16,w16,w21
467	ushr	v5.4s,v21.4s,#25
468	eor	w9,w9,w13
469	ushr	v17.4s,v22.4s,#25
470	eor	w10,w10,w14
471	sli	v1.4s,v20.4s,#7
472	eor	w11,w11,w15
473	sli	v5.4s,v21.4s,#7
474	eor	w12,w12,w16
475	sli	v17.4s,v22.4s,#7
476	ror	w9,w9,#25
// NEON: rotate the rows (third row by 8 bytes, fourth by 12, second by 4)
// so the diagonals line up as columns. The interleaved scalar `ror`s finish
// the scalar column round.
477	ext	v2.16b,v2.16b,v2.16b,#8
478	ror	w10,w10,#25
479	ext	v6.16b,v6.16b,v6.16b,#8
480	ror	w11,w11,#25
481	ext	v18.16b,v18.16b,v18.16b,#8
482	ror	w12,w12,#25
483	ext	v3.16b,v3.16b,v3.16b,#12
484	ext	v7.16b,v7.16b,v7.16b,#12
485	ext	v19.16b,v19.16b,v19.16b,#12
486	ext	v1.16b,v1.16b,v1.16b,#4
487	ext	v5.16b,v5.16b,v5.16b,#4
488	ext	v17.16b,v17.16b,v17.16b,#4
// Diagonal round: identical NEON sequence on the rotated rows; the scalar
// side switches to the diagonal register pairing (w5,w10,w15,w21), ...
489	add	v0.4s,v0.4s,v1.4s
490	add	w5,w5,w10
491	add	v4.4s,v4.4s,v5.4s
492	add	w6,w6,w11
493	add	v16.4s,v16.4s,v17.4s
494	add	w7,w7,w12
495	eor	v3.16b,v3.16b,v0.16b
496	add	w8,w8,w9
497	eor	v7.16b,v7.16b,v4.16b
498	eor	w21,w21,w5
499	eor	v19.16b,v19.16b,v16.16b
500	eor	w17,w17,w6
501	rev32	v3.8h,v3.8h
502	eor	w19,w19,w7
503	rev32	v7.8h,v7.8h
504	eor	w20,w20,w8
505	rev32	v19.8h,v19.8h
506	ror	w21,w21,#16
507	add	v2.4s,v2.4s,v3.4s
508	ror	w17,w17,#16
509	add	v6.4s,v6.4s,v7.4s
510	ror	w19,w19,#16
511	add	v18.4s,v18.4s,v19.4s
512	ror	w20,w20,#16
513	eor	v20.16b,v1.16b,v2.16b
514	add	w15,w15,w21
515	eor	v21.16b,v5.16b,v6.16b
516	add	w16,w16,w17
517	eor	v22.16b,v17.16b,v18.16b
518	add	w13,w13,w19
519	ushr	v1.4s,v20.4s,#20
520	add	w14,w14,w20
521	ushr	v5.4s,v21.4s,#20
522	eor	w10,w10,w15
523	ushr	v17.4s,v22.4s,#20
524	eor	w11,w11,w16
525	sli	v1.4s,v20.4s,#12
526	eor	w12,w12,w13
527	sli	v5.4s,v21.4s,#12
528	eor	w9,w9,w14
529	sli	v17.4s,v22.4s,#12
530	ror	w10,w10,#20
531	add	v0.4s,v0.4s,v1.4s
532	ror	w11,w11,#20
533	add	v4.4s,v4.4s,v5.4s
534	ror	w12,w12,#20
535	add	v16.4s,v16.4s,v17.4s
536	ror	w9,w9,#20
537	eor	v20.16b,v3.16b,v0.16b
538	add	w5,w5,w10
539	eor	v21.16b,v7.16b,v4.16b
540	add	w6,w6,w11
541	eor	v22.16b,v19.16b,v16.16b
542	add	w7,w7,w12
543	ushr	v3.4s,v20.4s,#24
544	add	w8,w8,w9
545	ushr	v7.4s,v21.4s,#24
546	eor	w21,w21,w5
547	ushr	v19.4s,v22.4s,#24
548	eor	w17,w17,w6
549	sli	v3.4s,v20.4s,#8
550	eor	w19,w19,w7
551	sli	v7.4s,v21.4s,#8
552	eor	w20,w20,w8
553	sli	v19.4s,v22.4s,#8
554	ror	w21,w21,#24
555	add	v2.4s,v2.4s,v3.4s
556	ror	w17,w17,#24
557	add	v6.4s,v6.4s,v7.4s
558	ror	w19,w19,#24
559	add	v18.4s,v18.4s,v19.4s
560	ror	w20,w20,#24
561	eor	v20.16b,v1.16b,v2.16b
562	add	w15,w15,w21
563	eor	v21.16b,v5.16b,v6.16b
564	add	w16,w16,w17
565	eor	v22.16b,v17.16b,v18.16b
566	add	w13,w13,w19
567	ushr	v1.4s,v20.4s,#25
568	add	w14,w14,w20
569	ushr	v5.4s,v21.4s,#25
570	eor	w10,w10,w15
571	ushr	v17.4s,v22.4s,#25
572	eor	w11,w11,w16
573	sli	v1.4s,v20.4s,#7
574	eor	w12,w12,w13
575	sli	v5.4s,v21.4s,#7
576	eor	w9,w9,w14
577	sli	v17.4s,v22.4s,#7
578	ror	w10,w10,#25
// NEON: rotate the rows back (third row by 8 bytes, fourth by 4, second by
// 12) to restore column order for the next iteration.
579	ext	v2.16b,v2.16b,v2.16b,#8
580	ror	w11,w11,#25
581	ext	v6.16b,v6.16b,v6.16b,#8
582	ror	w12,w12,#25
583	ext	v18.16b,v18.16b,v18.16b,#8
584	ror	w9,w9,#25
585	ext	v3.16b,v3.16b,v3.16b,#4
586	ext	v7.16b,v7.16b,v7.16b,#4
587	ext	v19.16b,v19.16b,v19.16b,#4
588	ext	v1.16b,v1.16b,v1.16b,#12
589	ext	v5.16b,v5.16b,v5.16b,#12
590	ext	v17.16b,v17.16b,v17.16b,#12
591	cbnz	x4,.Loop_neon
592
// Add the original input state to all four blocks. Each NEON block gets its
// own counter row (v27, v28, v29).
593	add	w5,w5,w22		// accumulate key block
594	add	v0.4s,v0.4s,v24.4s
595	add	x6,x6,x22,lsr#32
596	add	v4.4s,v4.4s,v24.4s
597	add	w7,w7,w23
598	add	v16.4s,v16.4s,v24.4s
599	add	x8,x8,x23,lsr#32
600	add	v2.4s,v2.4s,v26.4s
601	add	w9,w9,w24
602	add	v6.4s,v6.4s,v26.4s
603	add	x10,x10,x24,lsr#32
604	add	v18.4s,v18.4s,v26.4s
605	add	w11,w11,w25
606	add	v3.4s,v3.4s,v27.4s
607	add	x12,x12,x25,lsr#32
608	add	w13,w13,w26
609	add	v7.4s,v7.4s,v28.4s
610	add	x14,x14,x26,lsr#32
611	add	w15,w15,w27
612	add	v19.4s,v19.4s,v29.4s
613	add	x16,x16,x27,lsr#32
614	add	w17,w17,w28
615	add	v1.4s,v1.4s,v25.4s
616	add	x19,x19,x28,lsr#32
617	add	w20,w20,w30
618	add	v5.4s,v5.4s,v25.4s
619	add	x21,x21,x30,lsr#32
620	add	v17.4s,v17.4s,v25.4s
621
// Fewer than 256 bytes were left (flags from the `subs` before the loop).
622	b.lo	.Ltail_neon
623
// Full 256 bytes. Output order: scalar block first, then v0-v3, v4-v7,
// v16-v19.
624	add	x5,x5,x6,lsl#32	// pack
625	add	x7,x7,x8,lsl#32
626	ldp	x6,x8,[x1,#0]		// load input
627	add	x9,x9,x10,lsl#32
628	add	x11,x11,x12,lsl#32
629	ldp	x10,x12,[x1,#16]
630	add	x13,x13,x14,lsl#32
631	add	x15,x15,x16,lsl#32
632	ldp	x14,x16,[x1,#32]
633	add	x17,x17,x19,lsl#32
634	add	x20,x20,x21,lsl#32
635	ldp	x19,x21,[x1,#48]
636	add	x1,x1,#64
637#ifdef	__ARMEB__
638	rev	x5,x5
639	rev	x7,x7
640	rev	x9,x9
641	rev	x11,x11
642	rev	x13,x13
643	rev	x15,x15
644	rev	x17,x17
645	rev	x20,x20
646#endif
// Input bytes 64..127 -> v20-v23, XORed into the first NEON block.
647	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
648	eor	x5,x5,x6
649	eor	x7,x7,x8
650	eor	x9,x9,x10
651	eor	x11,x11,x12
652	eor	x13,x13,x14
653	eor	v0.16b,v0.16b,v20.16b
654	eor	x15,x15,x16
655	eor	v1.16b,v1.16b,v21.16b
656	eor	x17,x17,x19
657	eor	v2.16b,v2.16b,v22.16b
658	eor	x20,x20,x21
659	eor	v3.16b,v3.16b,v23.16b
// Input bytes 128..191 -> v20-v23 (for the second NEON block).
660	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
661
662	stp	x5,x7,[x0,#0]		// store output
// Four blocks were consumed: bump the scalar counter and all three NEON
// counter rows by 4 (v31 = {4,0,0,0}).
663	add	x28,x28,#4			// increment counter
664	stp	x9,x11,[x0,#16]
665	add	v27.4s,v27.4s,v31.4s		// += 4
666	stp	x13,x15,[x0,#32]
667	add	v28.4s,v28.4s,v31.4s
668	stp	x17,x20,[x0,#48]
669	add	v29.4s,v29.4s,v31.4s
670	add	x0,x0,#64
671
// v0-v3 are stored and then immediately reused to hold input bytes 192..255.
672	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
673	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
674
675	eor	v4.16b,v4.16b,v20.16b
676	eor	v5.16b,v5.16b,v21.16b
677	eor	v6.16b,v6.16b,v22.16b
678	eor	v7.16b,v7.16b,v23.16b
679	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
680
681	eor	v16.16b,v16.16b,v0.16b
682	eor	v17.16b,v17.16b,v1.16b
683	eor	v18.16b,v18.16b,v2.16b
684	eor	v19.16b,v19.16b,v3.16b
685	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
686
// More input remains (flags still from the `subs` before the round loop).
687	b.hi	.Loop_outer_neon
688
689	ldp	x19,x20,[x29,#16]
690	add	sp,sp,#64
691	ldp	x21,x22,[x29,#32]
692	ldp	x23,x24,[x29,#48]
693	ldp	x25,x26,[x29,#64]
694	ldp	x27,x28,[x29,#80]
695	ldp	x29,x30,[sp],#96
696.inst	0xd50323bf			// autiasp
697	ret
698
// Final 1..255 bytes. Undo the `subs` so x2 is the byte count left, then
// use the already-computed blocks 64 bytes at a time.
699.Ltail_neon:
700	add	x2,x2,#256
701	cmp	x2,#64
// Under 64 bytes: reuse ChaCha20_ctr32's tail code, which works on the
// scalar block in w5-w21 and performs the epilogue itself.
702	b.lo	.Less_than_64
703
704	add	x5,x5,x6,lsl#32	// pack
705	add	x7,x7,x8,lsl#32
706	ldp	x6,x8,[x1,#0]		// load input
707	add	x9,x9,x10,lsl#32
708	add	x11,x11,x12,lsl#32
709	ldp	x10,x12,[x1,#16]
710	add	x13,x13,x14,lsl#32
711	add	x15,x15,x16,lsl#32
712	ldp	x14,x16,[x1,#32]
713	add	x17,x17,x19,lsl#32
714	add	x20,x20,x21,lsl#32
715	ldp	x19,x21,[x1,#48]
716	add	x1,x1,#64
717#ifdef	__ARMEB__
718	rev	x5,x5
719	rev	x7,x7
720	rev	x9,x9
721	rev	x11,x11
722	rev	x13,x13
723	rev	x15,x15
724	rev	x17,x17
725	rev	x20,x20
726#endif
727	eor	x5,x5,x6
728	eor	x7,x7,x8
729	eor	x9,x9,x10
730	eor	x11,x11,x12
731	eor	x13,x13,x14
732	eor	x15,x15,x16
733	eor	x17,x17,x19
734	eor	x20,x20,x21
735
736	stp	x5,x7,[x0,#0]		// store output
737	add	x28,x28,#4			// increment counter
738	stp	x9,x11,[x0,#16]
739	stp	x13,x15,[x0,#32]
740	stp	x17,x20,[x0,#48]
741	add	x0,x0,#64
// Each `b.eq` below tests the flags of the preceding `cmp x2,#64`: exactly
// 64 bytes were left, so the block just written was the last one.
742	b.eq	.Ldone_neon
743	sub	x2,x2,#64
744	cmp	x2,#64
745	b.lo	.Less_than_128
746
747	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
748	eor	v0.16b,v0.16b,v20.16b
749	eor	v1.16b,v1.16b,v21.16b
750	eor	v2.16b,v2.16b,v22.16b
751	eor	v3.16b,v3.16b,v23.16b
752	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
753	b.eq	.Ldone_neon
754	sub	x2,x2,#64
755	cmp	x2,#64
756	b.lo	.Less_than_192
757
758	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
759	eor	v4.16b,v4.16b,v20.16b
760	eor	v5.16b,v5.16b,v21.16b
761	eor	v6.16b,v6.16b,v22.16b
762	eor	v7.16b,v7.16b,v23.16b
763	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
764	b.eq	.Ldone_neon
765	sub	x2,x2,#64
766
// For the last partial block, spill whichever key-stream block comes next
// in output order to the 64-byte scratch area at sp.
767	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
768	b	.Last_neon
769
770.Less_than_128:
771	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
772	b	.Last_neon
773.Less_than_192:
774	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
775	b	.Last_neon
776
777.align	4
// Same pointer biasing as .Less_than_64: x2 becomes a negative index that
// counts up to 0, and x0 carries an extra -1 because the index is
// incremented before the store.
778.Last_neon:
779	sub	x0,x0,#1
780	add	x1,x1,x2
781	add	x0,x0,x2
782	add	x4,sp,x2
783	neg	x2,x2
784
// Byte-wise XOR of the remaining input with the spilled key stream.
785.Loop_tail_neon:
786	ldrb	w10,[x1,x2]
787	ldrb	w11,[x4,x2]
788	add	x2,x2,#1
789	eor	w10,w10,w11
790	strb	w10,[x0,x2]
791	cbnz	x2,.Loop_tail_neon
792
// Overwrite the spilled key stream with zeros before leaving.
793	stp	xzr,xzr,[sp,#0]
794	stp	xzr,xzr,[sp,#16]
795	stp	xzr,xzr,[sp,#32]
796	stp	xzr,xzr,[sp,#48]
797
// Common epilogue: restore callee-saved registers relative to x29 and pop
// both the scratch area and the 96-byte save area.
798.Ldone_neon:
799	ldp	x19,x20,[x29,#16]
800	add	sp,sp,#64
801	ldp	x21,x22,[x29,#32]
802	ldp	x23,x24,[x29,#48]
803	ldp	x25,x26,[x29,#64]
804	ldp	x27,x28,[x29,#80]
805	ldp	x29,x30,[sp],#96
806.inst	0xd50323bf			// autiasp
807	ret
808.size	ChaCha20_neon,.-ChaCha20_neon
809.type	ChaCha20_512_neon,%function
810.align	5
811ChaCha20_512_neon:
812.inst	0xd503233f			// paciasp
813	stp	x29,x30,[sp,#-96]!
814	add	x29,sp,#0
815
816	adr	x5,.Lsigma
817	stp	x19,x20,[sp,#16]
818	stp	x21,x22,[sp,#32]
819	stp	x23,x24,[sp,#48]
820	stp	x25,x26,[sp,#64]
821	stp	x27,x28,[sp,#80]
822
823.L512_or_more_neon:
824	sub	sp,sp,#128+64
825
826	ldp	x22,x23,[x5]		// load sigma
827	ld1	{v24.4s},[x5],#16
828	ldp	x24,x25,[x3]		// load key
829	ldp	x26,x27,[x3,#16]
830	ld1	{v25.4s,v26.4s},[x3]
831	ldp	x28,x30,[x4]		// load counter
832	ld1	{v27.4s},[x4]
833	ld1	{v31.4s},[x5]
834#ifdef	__ARMEB__
835	rev64	v24.4s,v24.4s
836	ror	x24,x24,#32
837	ror	x25,x25,#32
838	ror	x26,x26,#32
839	ror	x27,x27,#32
840	ror	x28,x28,#32
841	ror	x30,x30,#32
842#endif
843	add	v27.4s,v27.4s,v31.4s		// += 1
844	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
845	add	v27.4s,v27.4s,v31.4s		// not typo
846	str	q26,[sp,#32]
847	add	v28.4s,v27.4s,v31.4s
848	add	v29.4s,v28.4s,v31.4s
849	add	v30.4s,v29.4s,v31.4s
850	shl	v31.4s,v31.4s,#2			// 1 -> 4
851
852	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
853	stp	d10,d11,[sp,#128+16]
854	stp	d12,d13,[sp,#128+32]
855	stp	d14,d15,[sp,#128+48]
856
857	sub	x2,x2,#512			// not typo
858
859.Loop_outer_512_neon:
860	mov	v0.16b,v24.16b
861	mov	v4.16b,v24.16b
862	mov	v8.16b,v24.16b
863	mov	v12.16b,v24.16b
864	mov	v16.16b,v24.16b
865	mov	v20.16b,v24.16b
866	mov	v1.16b,v25.16b
867	mov	w5,w22			// unpack key block
868	mov	v5.16b,v25.16b
869	lsr	x6,x22,#32
870	mov	v9.16b,v25.16b
871	mov	w7,w23
872	mov	v13.16b,v25.16b
873	lsr	x8,x23,#32
874	mov	v17.16b,v25.16b
875	mov	w9,w24
876	mov	v21.16b,v25.16b
877	lsr	x10,x24,#32
878	mov	v3.16b,v27.16b
879	mov	w11,w25
880	mov	v7.16b,v28.16b
881	lsr	x12,x25,#32
882	mov	v11.16b,v29.16b
883	mov	w13,w26
884	mov	v15.16b,v30.16b
885	lsr	x14,x26,#32
886	mov	v2.16b,v26.16b
887	mov	w15,w27
888	mov	v6.16b,v26.16b
889	lsr	x16,x27,#32
890	add	v19.4s,v3.4s,v31.4s			// +4
891	mov	w17,w28
892	add	v23.4s,v7.4s,v31.4s			// +4
893	lsr	x19,x28,#32
894	mov	v10.16b,v26.16b
895	mov	w20,w30
896	mov	v14.16b,v26.16b
897	lsr	x21,x30,#32
898	mov	v18.16b,v26.16b
899	stp	q27,q28,[sp,#48]		// off-load key block, variable part
900	mov	v22.16b,v26.16b
901	str	q29,[sp,#80]
902
903	mov	x4,#5
904	subs	x2,x2,#512
905.Loop_upper_neon:
906	sub	x4,x4,#1
907	add	v0.4s,v0.4s,v1.4s
908	add	w5,w5,w9
909	add	v4.4s,v4.4s,v5.4s
910	add	w6,w6,w10
911	add	v8.4s,v8.4s,v9.4s
912	add	w7,w7,w11
913	add	v12.4s,v12.4s,v13.4s
914	add	w8,w8,w12
915	add	v16.4s,v16.4s,v17.4s
916	eor	w17,w17,w5
917	add	v20.4s,v20.4s,v21.4s
918	eor	w19,w19,w6
919	eor	v3.16b,v3.16b,v0.16b
920	eor	w20,w20,w7
921	eor	v7.16b,v7.16b,v4.16b
922	eor	w21,w21,w8
923	eor	v11.16b,v11.16b,v8.16b
924	ror	w17,w17,#16
925	eor	v15.16b,v15.16b,v12.16b
926	ror	w19,w19,#16
927	eor	v19.16b,v19.16b,v16.16b
928	ror	w20,w20,#16
929	eor	v23.16b,v23.16b,v20.16b
930	ror	w21,w21,#16
931	rev32	v3.8h,v3.8h
932	add	w13,w13,w17
933	rev32	v7.8h,v7.8h
934	add	w14,w14,w19
935	rev32	v11.8h,v11.8h
936	add	w15,w15,w20
937	rev32	v15.8h,v15.8h
938	add	w16,w16,w21
939	rev32	v19.8h,v19.8h
940	eor	w9,w9,w13
941	rev32	v23.8h,v23.8h
942	eor	w10,w10,w14
943	add	v2.4s,v2.4s,v3.4s
944	eor	w11,w11,w15
945	add	v6.4s,v6.4s,v7.4s
946	eor	w12,w12,w16
947	add	v10.4s,v10.4s,v11.4s
948	ror	w9,w9,#20
949	add	v14.4s,v14.4s,v15.4s
950	ror	w10,w10,#20
951	add	v18.4s,v18.4s,v19.4s
952	ror	w11,w11,#20
953	add	v22.4s,v22.4s,v23.4s
954	ror	w12,w12,#20
955	eor	v24.16b,v1.16b,v2.16b
956	add	w5,w5,w9
957	eor	v25.16b,v5.16b,v6.16b
958	add	w6,w6,w10
959	eor	v26.16b,v9.16b,v10.16b
960	add	w7,w7,w11
961	eor	v27.16b,v13.16b,v14.16b
962	add	w8,w8,w12
963	eor	v28.16b,v17.16b,v18.16b
964	eor	w17,w17,w5
965	eor	v29.16b,v21.16b,v22.16b
966	eor	w19,w19,w6
967	ushr	v1.4s,v24.4s,#20
968	eor	w20,w20,w7
969	ushr	v5.4s,v25.4s,#20
970	eor	w21,w21,w8
971	ushr	v9.4s,v26.4s,#20
972	ror	w17,w17,#24
973	ushr	v13.4s,v27.4s,#20
974	ror	w19,w19,#24
975	ushr	v17.4s,v28.4s,#20
976	ror	w20,w20,#24
977	ushr	v21.4s,v29.4s,#20
978	ror	w21,w21,#24
979	sli	v1.4s,v24.4s,#12
980	add	w13,w13,w17
981	sli	v5.4s,v25.4s,#12
982	add	w14,w14,w19
983	sli	v9.4s,v26.4s,#12
984	add	w15,w15,w20
985	sli	v13.4s,v27.4s,#12
986	add	w16,w16,w21
987	sli	v17.4s,v28.4s,#12
988	eor	w9,w9,w13
989	sli	v21.4s,v29.4s,#12
990	eor	w10,w10,w14
991	add	v0.4s,v0.4s,v1.4s
992	eor	w11,w11,w15
993	add	v4.4s,v4.4s,v5.4s
994	eor	w12,w12,w16
995	add	v8.4s,v8.4s,v9.4s
996	ror	w9,w9,#25
997	add	v12.4s,v12.4s,v13.4s
998	ror	w10,w10,#25
999	add	v16.4s,v16.4s,v17.4s
1000	ror	w11,w11,#25
1001	add	v20.4s,v20.4s,v21.4s
1002	ror	w12,w12,#25
1003	eor	v24.16b,v3.16b,v0.16b
1004	add	w5,w5,w10
1005	eor	v25.16b,v7.16b,v4.16b
1006	add	w6,w6,w11
1007	eor	v26.16b,v11.16b,v8.16b
1008	add	w7,w7,w12
1009	eor	v27.16b,v15.16b,v12.16b
1010	add	w8,w8,w9
1011	eor	v28.16b,v19.16b,v16.16b
1012	eor	w21,w21,w5
1013	eor	v29.16b,v23.16b,v20.16b
1014	eor	w17,w17,w6
1015	ushr	v3.4s,v24.4s,#24
1016	eor	w19,w19,w7
1017	ushr	v7.4s,v25.4s,#24
1018	eor	w20,w20,w8
1019	ushr	v11.4s,v26.4s,#24
1020	ror	w21,w21,#16
1021	ushr	v15.4s,v27.4s,#24
1022	ror	w17,w17,#16
1023	ushr	v19.4s,v28.4s,#24
1024	ror	w19,w19,#16
1025	ushr	v23.4s,v29.4s,#24
1026	ror	w20,w20,#16
1027	sli	v3.4s,v24.4s,#8
1028	add	w15,w15,w21
1029	sli	v7.4s,v25.4s,#8
1030	add	w16,w16,w17
1031	sli	v11.4s,v26.4s,#8
1032	add	w13,w13,w19
1033	sli	v15.4s,v27.4s,#8
1034	add	w14,w14,w20
1035	sli	v19.4s,v28.4s,#8
1036	eor	w10,w10,w15
1037	sli	v23.4s,v29.4s,#8
1038	eor	w11,w11,w16
1039	add	v2.4s,v2.4s,v3.4s
1040	eor	w12,w12,w13
1041	add	v6.4s,v6.4s,v7.4s
1042	eor	w9,w9,w14
1043	add	v10.4s,v10.4s,v11.4s
1044	ror	w10,w10,#20
1045	add	v14.4s,v14.4s,v15.4s
1046	ror	w11,w11,#20
1047	add	v18.4s,v18.4s,v19.4s
1048	ror	w12,w12,#20
1049	add	v22.4s,v22.4s,v23.4s
1050	ror	w9,w9,#20
1051	eor	v24.16b,v1.16b,v2.16b
1052	add	w5,w5,w10
1053	eor	v25.16b,v5.16b,v6.16b
1054	add	w6,w6,w11
1055	eor	v26.16b,v9.16b,v10.16b
1056	add	w7,w7,w12
1057	eor	v27.16b,v13.16b,v14.16b
1058	add	w8,w8,w9
1059	eor	v28.16b,v17.16b,v18.16b
1060	eor	w21,w21,w5
1061	eor	v29.16b,v21.16b,v22.16b
1062	eor	w17,w17,w6
1063	ushr	v1.4s,v24.4s,#25
1064	eor	w19,w19,w7
1065	ushr	v5.4s,v25.4s,#25
1066	eor	w20,w20,w8
1067	ushr	v9.4s,v26.4s,#25
1068	ror	w21,w21,#24
1069	ushr	v13.4s,v27.4s,#25
1070	ror	w17,w17,#24
1071	ushr	v17.4s,v28.4s,#25
1072	ror	w19,w19,#24
1073	ushr	v21.4s,v29.4s,#25
1074	ror	w20,w20,#24
1075	sli	v1.4s,v24.4s,#7
1076	add	w15,w15,w21
1077	sli	v5.4s,v25.4s,#7
1078	add	w16,w16,w17
1079	sli	v9.4s,v26.4s,#7
1080	add	w13,w13,w19
1081	sli	v13.4s,v27.4s,#7
1082	add	w14,w14,w20
1083	sli	v17.4s,v28.4s,#7
1084	eor	w10,w10,w15
1085	sli	v21.4s,v29.4s,#7
1086	eor	w11,w11,w16
1087	ext	v2.16b,v2.16b,v2.16b,#8
1088	eor	w12,w12,w13
1089	ext	v6.16b,v6.16b,v6.16b,#8
1090	eor	w9,w9,w14
1091	ext	v10.16b,v10.16b,v10.16b,#8
1092	ror	w10,w10,#25
1093	ext	v14.16b,v14.16b,v14.16b,#8
1094	ror	w11,w11,#25
1095	ext	v18.16b,v18.16b,v18.16b,#8
1096	ror	w12,w12,#25
1097	ext	v22.16b,v22.16b,v22.16b,#8
1098	ror	w9,w9,#25
1099	ext	v3.16b,v3.16b,v3.16b,#12
1100	ext	v7.16b,v7.16b,v7.16b,#12
1101	ext	v11.16b,v11.16b,v11.16b,#12
1102	ext	v15.16b,v15.16b,v15.16b,#12
1103	ext	v19.16b,v19.16b,v19.16b,#12
1104	ext	v23.16b,v23.16b,v23.16b,#12
1105	ext	v1.16b,v1.16b,v1.16b,#4
1106	ext	v5.16b,v5.16b,v5.16b,#4
1107	ext	v9.16b,v9.16b,v9.16b,#4
1108	ext	v13.16b,v13.16b,v13.16b,#4
1109	ext	v17.16b,v17.16b,v17.16b,#4
1110	ext	v21.16b,v21.16b,v21.16b,#4
1111	add	v0.4s,v0.4s,v1.4s
1112	add	w5,w5,w9
1113	add	v4.4s,v4.4s,v5.4s
1114	add	w6,w6,w10
1115	add	v8.4s,v8.4s,v9.4s
1116	add	w7,w7,w11
1117	add	v12.4s,v12.4s,v13.4s
1118	add	w8,w8,w12
1119	add	v16.4s,v16.4s,v17.4s
1120	eor	w17,w17,w5
1121	add	v20.4s,v20.4s,v21.4s
1122	eor	w19,w19,w6
1123	eor	v3.16b,v3.16b,v0.16b
1124	eor	w20,w20,w7
1125	eor	v7.16b,v7.16b,v4.16b
1126	eor	w21,w21,w8
1127	eor	v11.16b,v11.16b,v8.16b
1128	ror	w17,w17,#16
1129	eor	v15.16b,v15.16b,v12.16b
1130	ror	w19,w19,#16
1131	eor	v19.16b,v19.16b,v16.16b
1132	ror	w20,w20,#16
1133	eor	v23.16b,v23.16b,v20.16b
1134	ror	w21,w21,#16
1135	rev32	v3.8h,v3.8h
1136	add	w13,w13,w17
1137	rev32	v7.8h,v7.8h
1138	add	w14,w14,w19
1139	rev32	v11.8h,v11.8h
1140	add	w15,w15,w20
1141	rev32	v15.8h,v15.8h
1142	add	w16,w16,w21
1143	rev32	v19.8h,v19.8h
1144	eor	w9,w9,w13
1145	rev32	v23.8h,v23.8h
1146	eor	w10,w10,w14
1147	add	v2.4s,v2.4s,v3.4s
1148	eor	w11,w11,w15
1149	add	v6.4s,v6.4s,v7.4s
1150	eor	w12,w12,w16
1151	add	v10.4s,v10.4s,v11.4s
1152	ror	w9,w9,#20
1153	add	v14.4s,v14.4s,v15.4s
1154	ror	w10,w10,#20
1155	add	v18.4s,v18.4s,v19.4s
1156	ror	w11,w11,#20
1157	add	v22.4s,v22.4s,v23.4s
1158	ror	w12,w12,#20
1159	eor	v24.16b,v1.16b,v2.16b
1160	add	w5,w5,w9
1161	eor	v25.16b,v5.16b,v6.16b
1162	add	w6,w6,w10
1163	eor	v26.16b,v9.16b,v10.16b
1164	add	w7,w7,w11
1165	eor	v27.16b,v13.16b,v14.16b
1166	add	w8,w8,w12
1167	eor	v28.16b,v17.16b,v18.16b
1168	eor	w17,w17,w5
1169	eor	v29.16b,v21.16b,v22.16b
1170	eor	w19,w19,w6
1171	ushr	v1.4s,v24.4s,#20
1172	eor	w20,w20,w7
1173	ushr	v5.4s,v25.4s,#20
1174	eor	w21,w21,w8
1175	ushr	v9.4s,v26.4s,#20
1176	ror	w17,w17,#24
1177	ushr	v13.4s,v27.4s,#20
1178	ror	w19,w19,#24
1179	ushr	v17.4s,v28.4s,#20
1180	ror	w20,w20,#24
1181	ushr	v21.4s,v29.4s,#20
1182	ror	w21,w21,#24
1183	sli	v1.4s,v24.4s,#12
1184	add	w13,w13,w17
1185	sli	v5.4s,v25.4s,#12
1186	add	w14,w14,w19
1187	sli	v9.4s,v26.4s,#12
1188	add	w15,w15,w20
1189	sli	v13.4s,v27.4s,#12
1190	add	w16,w16,w21
1191	sli	v17.4s,v28.4s,#12
1192	eor	w9,w9,w13
1193	sli	v21.4s,v29.4s,#12
1194	eor	w10,w10,w14
1195	add	v0.4s,v0.4s,v1.4s
1196	eor	w11,w11,w15
1197	add	v4.4s,v4.4s,v5.4s
1198	eor	w12,w12,w16
1199	add	v8.4s,v8.4s,v9.4s
1200	ror	w9,w9,#25
1201	add	v12.4s,v12.4s,v13.4s
1202	ror	w10,w10,#25
1203	add	v16.4s,v16.4s,v17.4s
1204	ror	w11,w11,#25
1205	add	v20.4s,v20.4s,v21.4s
1206	ror	w12,w12,#25
1207	eor	v24.16b,v3.16b,v0.16b
1208	add	w5,w5,w10
1209	eor	v25.16b,v7.16b,v4.16b
1210	add	w6,w6,w11
1211	eor	v26.16b,v11.16b,v8.16b
1212	add	w7,w7,w12
1213	eor	v27.16b,v15.16b,v12.16b
1214	add	w8,w8,w9
1215	eor	v28.16b,v19.16b,v16.16b
1216	eor	w21,w21,w5
1217	eor	v29.16b,v23.16b,v20.16b
1218	eor	w17,w17,w6
1219	ushr	v3.4s,v24.4s,#24
1220	eor	w19,w19,w7
1221	ushr	v7.4s,v25.4s,#24
1222	eor	w20,w20,w8
1223	ushr	v11.4s,v26.4s,#24
1224	ror	w21,w21,#16
1225	ushr	v15.4s,v27.4s,#24
1226	ror	w17,w17,#16
1227	ushr	v19.4s,v28.4s,#24
1228	ror	w19,w19,#16
1229	ushr	v23.4s,v29.4s,#24
1230	ror	w20,w20,#16
1231	sli	v3.4s,v24.4s,#8
1232	add	w15,w15,w21
1233	sli	v7.4s,v25.4s,#8
1234	add	w16,w16,w17
1235	sli	v11.4s,v26.4s,#8
1236	add	w13,w13,w19
1237	sli	v15.4s,v27.4s,#8
1238	add	w14,w14,w20
1239	sli	v19.4s,v28.4s,#8
1240	eor	w10,w10,w15
1241	sli	v23.4s,v29.4s,#8
1242	eor	w11,w11,w16
1243	add	v2.4s,v2.4s,v3.4s
1244	eor	w12,w12,w13
1245	add	v6.4s,v6.4s,v7.4s
1246	eor	w9,w9,w14
1247	add	v10.4s,v10.4s,v11.4s
1248	ror	w10,w10,#20
1249	add	v14.4s,v14.4s,v15.4s
1250	ror	w11,w11,#20
1251	add	v18.4s,v18.4s,v19.4s
1252	ror	w12,w12,#20
1253	add	v22.4s,v22.4s,v23.4s
1254	ror	w9,w9,#20
1255	eor	v24.16b,v1.16b,v2.16b
1256	add	w5,w5,w10
1257	eor	v25.16b,v5.16b,v6.16b
1258	add	w6,w6,w11
1259	eor	v26.16b,v9.16b,v10.16b
1260	add	w7,w7,w12
1261	eor	v27.16b,v13.16b,v14.16b
1262	add	w8,w8,w9
1263	eor	v28.16b,v17.16b,v18.16b
1264	eor	w21,w21,w5
1265	eor	v29.16b,v21.16b,v22.16b
1266	eor	w17,w17,w6
1267	ushr	v1.4s,v24.4s,#25
1268	eor	w19,w19,w7
1269	ushr	v5.4s,v25.4s,#25
1270	eor	w20,w20,w8
1271	ushr	v9.4s,v26.4s,#25
1272	ror	w21,w21,#24
1273	ushr	v13.4s,v27.4s,#25
1274	ror	w17,w17,#24
1275	ushr	v17.4s,v28.4s,#25
1276	ror	w19,w19,#24
1277	ushr	v21.4s,v29.4s,#25
1278	ror	w20,w20,#24
1279	sli	v1.4s,v24.4s,#7
1280	add	w15,w15,w21
1281	sli	v5.4s,v25.4s,#7
1282	add	w16,w16,w17
1283	sli	v9.4s,v26.4s,#7
1284	add	w13,w13,w19
1285	sli	v13.4s,v27.4s,#7
1286	add	w14,w14,w20
1287	sli	v17.4s,v28.4s,#7
1288	eor	w10,w10,w15
1289	sli	v21.4s,v29.4s,#7
1290	eor	w11,w11,w16
1291	ext	v2.16b,v2.16b,v2.16b,#8
1292	eor	w12,w12,w13
1293	ext	v6.16b,v6.16b,v6.16b,#8
1294	eor	w9,w9,w14
1295	ext	v10.16b,v10.16b,v10.16b,#8
1296	ror	w10,w10,#25
1297	ext	v14.16b,v14.16b,v14.16b,#8
1298	ror	w11,w11,#25
1299	ext	v18.16b,v18.16b,v18.16b,#8
1300	ror	w12,w12,#25
1301	ext	v22.16b,v22.16b,v22.16b,#8
1302	ror	w9,w9,#25
1303	ext	v3.16b,v3.16b,v3.16b,#4
1304	ext	v7.16b,v7.16b,v7.16b,#4
1305	ext	v11.16b,v11.16b,v11.16b,#4
1306	ext	v15.16b,v15.16b,v15.16b,#4
1307	ext	v19.16b,v19.16b,v19.16b,#4
1308	ext	v23.16b,v23.16b,v23.16b,#4
1309	ext	v1.16b,v1.16b,v1.16b,#12
1310	ext	v5.16b,v5.16b,v5.16b,#12
1311	ext	v9.16b,v9.16b,v9.16b,#12
1312	ext	v13.16b,v13.16b,v13.16b,#12
1313	ext	v17.16b,v17.16b,v17.16b,#12
1314	ext	v21.16b,v21.16b,v21.16b,#12
1315	cbnz	x4,.Loop_upper_neon
1316
1317	add	w5,w5,w22		// accumulate key block
1318	add	x6,x6,x22,lsr#32
1319	add	w7,w7,w23
1320	add	x8,x8,x23,lsr#32
1321	add	w9,w9,w24
1322	add	x10,x10,x24,lsr#32
1323	add	w11,w11,w25
1324	add	x12,x12,x25,lsr#32
1325	add	w13,w13,w26
1326	add	x14,x14,x26,lsr#32
1327	add	w15,w15,w27
1328	add	x16,x16,x27,lsr#32
1329	add	w17,w17,w28
1330	add	x19,x19,x28,lsr#32
1331	add	w20,w20,w30
1332	add	x21,x21,x30,lsr#32
1333
1334	add	x5,x5,x6,lsl#32	// pack
1335	add	x7,x7,x8,lsl#32
1336	ldp	x6,x8,[x1,#0]		// load input
1337	add	x9,x9,x10,lsl#32
1338	add	x11,x11,x12,lsl#32
1339	ldp	x10,x12,[x1,#16]
1340	add	x13,x13,x14,lsl#32
1341	add	x15,x15,x16,lsl#32
1342	ldp	x14,x16,[x1,#32]
1343	add	x17,x17,x19,lsl#32
1344	add	x20,x20,x21,lsl#32
1345	ldp	x19,x21,[x1,#48]
1346	add	x1,x1,#64
1347#ifdef	__ARMEB__
1348	rev	x5,x5
1349	rev	x7,x7
1350	rev	x9,x9
1351	rev	x11,x11
1352	rev	x13,x13
1353	rev	x15,x15
1354	rev	x17,x17
1355	rev	x20,x20
1356#endif
1357	eor	x5,x5,x6
1358	eor	x7,x7,x8
1359	eor	x9,x9,x10
1360	eor	x11,x11,x12
1361	eor	x13,x13,x14
1362	eor	x15,x15,x16
1363	eor	x17,x17,x19
1364	eor	x20,x20,x21
1365
1366	stp	x5,x7,[x0,#0]		// store output
1367	add	x28,x28,#1			// increment counter
1368	mov	w5,w22			// unpack key block
1369	lsr	x6,x22,#32
1370	stp	x9,x11,[x0,#16]
1371	mov	w7,w23
1372	lsr	x8,x23,#32
1373	stp	x13,x15,[x0,#32]
1374	mov	w9,w24
1375	lsr	x10,x24,#32
1376	stp	x17,x20,[x0,#48]
1377	add	x0,x0,#64
1378	mov	w11,w25
1379	lsr	x12,x25,#32
1380	mov	w13,w26
1381	lsr	x14,x26,#32
1382	mov	w15,w27
1383	lsr	x16,x27,#32
1384	mov	w17,w28
1385	lsr	x19,x28,#32
1386	mov	w20,w30
1387	lsr	x21,x30,#32
1388
1389	mov	x4,#5
1390.Loop_lower_neon:
1391	sub	x4,x4,#1
1392	add	v0.4s,v0.4s,v1.4s
1393	add	w5,w5,w9
1394	add	v4.4s,v4.4s,v5.4s
1395	add	w6,w6,w10
1396	add	v8.4s,v8.4s,v9.4s
1397	add	w7,w7,w11
1398	add	v12.4s,v12.4s,v13.4s
1399	add	w8,w8,w12
1400	add	v16.4s,v16.4s,v17.4s
1401	eor	w17,w17,w5
1402	add	v20.4s,v20.4s,v21.4s
1403	eor	w19,w19,w6
1404	eor	v3.16b,v3.16b,v0.16b
1405	eor	w20,w20,w7
1406	eor	v7.16b,v7.16b,v4.16b
1407	eor	w21,w21,w8
1408	eor	v11.16b,v11.16b,v8.16b
1409	ror	w17,w17,#16
1410	eor	v15.16b,v15.16b,v12.16b
1411	ror	w19,w19,#16
1412	eor	v19.16b,v19.16b,v16.16b
1413	ror	w20,w20,#16
1414	eor	v23.16b,v23.16b,v20.16b
1415	ror	w21,w21,#16
1416	rev32	v3.8h,v3.8h
1417	add	w13,w13,w17
1418	rev32	v7.8h,v7.8h
1419	add	w14,w14,w19
1420	rev32	v11.8h,v11.8h
1421	add	w15,w15,w20
1422	rev32	v15.8h,v15.8h
1423	add	w16,w16,w21
1424	rev32	v19.8h,v19.8h
1425	eor	w9,w9,w13
1426	rev32	v23.8h,v23.8h
1427	eor	w10,w10,w14
1428	add	v2.4s,v2.4s,v3.4s
1429	eor	w11,w11,w15
1430	add	v6.4s,v6.4s,v7.4s
1431	eor	w12,w12,w16
1432	add	v10.4s,v10.4s,v11.4s
1433	ror	w9,w9,#20
1434	add	v14.4s,v14.4s,v15.4s
1435	ror	w10,w10,#20
1436	add	v18.4s,v18.4s,v19.4s
1437	ror	w11,w11,#20
1438	add	v22.4s,v22.4s,v23.4s
1439	ror	w12,w12,#20
1440	eor	v24.16b,v1.16b,v2.16b
1441	add	w5,w5,w9
1442	eor	v25.16b,v5.16b,v6.16b
1443	add	w6,w6,w10
1444	eor	v26.16b,v9.16b,v10.16b
1445	add	w7,w7,w11
1446	eor	v27.16b,v13.16b,v14.16b
1447	add	w8,w8,w12
1448	eor	v28.16b,v17.16b,v18.16b
1449	eor	w17,w17,w5
1450	eor	v29.16b,v21.16b,v22.16b
1451	eor	w19,w19,w6
1452	ushr	v1.4s,v24.4s,#20
1453	eor	w20,w20,w7
1454	ushr	v5.4s,v25.4s,#20
1455	eor	w21,w21,w8
1456	ushr	v9.4s,v26.4s,#20
1457	ror	w17,w17,#24
1458	ushr	v13.4s,v27.4s,#20
1459	ror	w19,w19,#24
1460	ushr	v17.4s,v28.4s,#20
1461	ror	w20,w20,#24
1462	ushr	v21.4s,v29.4s,#20
1463	ror	w21,w21,#24
1464	sli	v1.4s,v24.4s,#12
1465	add	w13,w13,w17
1466	sli	v5.4s,v25.4s,#12
1467	add	w14,w14,w19
1468	sli	v9.4s,v26.4s,#12
1469	add	w15,w15,w20
1470	sli	v13.4s,v27.4s,#12
1471	add	w16,w16,w21
1472	sli	v17.4s,v28.4s,#12
1473	eor	w9,w9,w13
1474	sli	v21.4s,v29.4s,#12
1475	eor	w10,w10,w14
1476	add	v0.4s,v0.4s,v1.4s
1477	eor	w11,w11,w15
1478	add	v4.4s,v4.4s,v5.4s
1479	eor	w12,w12,w16
1480	add	v8.4s,v8.4s,v9.4s
1481	ror	w9,w9,#25
1482	add	v12.4s,v12.4s,v13.4s
1483	ror	w10,w10,#25
1484	add	v16.4s,v16.4s,v17.4s
1485	ror	w11,w11,#25
1486	add	v20.4s,v20.4s,v21.4s
1487	ror	w12,w12,#25
1488	eor	v24.16b,v3.16b,v0.16b
1489	add	w5,w5,w10
1490	eor	v25.16b,v7.16b,v4.16b
1491	add	w6,w6,w11
1492	eor	v26.16b,v11.16b,v8.16b
1493	add	w7,w7,w12
1494	eor	v27.16b,v15.16b,v12.16b
1495	add	w8,w8,w9
1496	eor	v28.16b,v19.16b,v16.16b
1497	eor	w21,w21,w5
1498	eor	v29.16b,v23.16b,v20.16b
1499	eor	w17,w17,w6
1500	ushr	v3.4s,v24.4s,#24
1501	eor	w19,w19,w7
1502	ushr	v7.4s,v25.4s,#24
1503	eor	w20,w20,w8
1504	ushr	v11.4s,v26.4s,#24
1505	ror	w21,w21,#16
1506	ushr	v15.4s,v27.4s,#24
1507	ror	w17,w17,#16
1508	ushr	v19.4s,v28.4s,#24
1509	ror	w19,w19,#16
1510	ushr	v23.4s,v29.4s,#24
1511	ror	w20,w20,#16
1512	sli	v3.4s,v24.4s,#8
1513	add	w15,w15,w21
1514	sli	v7.4s,v25.4s,#8
1515	add	w16,w16,w17
1516	sli	v11.4s,v26.4s,#8
1517	add	w13,w13,w19
1518	sli	v15.4s,v27.4s,#8
1519	add	w14,w14,w20
1520	sli	v19.4s,v28.4s,#8
1521	eor	w10,w10,w15
1522	sli	v23.4s,v29.4s,#8
1523	eor	w11,w11,w16
1524	add	v2.4s,v2.4s,v3.4s
1525	eor	w12,w12,w13
1526	add	v6.4s,v6.4s,v7.4s
1527	eor	w9,w9,w14
1528	add	v10.4s,v10.4s,v11.4s
1529	ror	w10,w10,#20
1530	add	v14.4s,v14.4s,v15.4s
1531	ror	w11,w11,#20
1532	add	v18.4s,v18.4s,v19.4s
1533	ror	w12,w12,#20
1534	add	v22.4s,v22.4s,v23.4s
1535	ror	w9,w9,#20
1536	eor	v24.16b,v1.16b,v2.16b
1537	add	w5,w5,w10
1538	eor	v25.16b,v5.16b,v6.16b
1539	add	w6,w6,w11
1540	eor	v26.16b,v9.16b,v10.16b
1541	add	w7,w7,w12
1542	eor	v27.16b,v13.16b,v14.16b
1543	add	w8,w8,w9
1544	eor	v28.16b,v17.16b,v18.16b
1545	eor	w21,w21,w5
1546	eor	v29.16b,v21.16b,v22.16b
1547	eor	w17,w17,w6
1548	ushr	v1.4s,v24.4s,#25
1549	eor	w19,w19,w7
1550	ushr	v5.4s,v25.4s,#25
1551	eor	w20,w20,w8
1552	ushr	v9.4s,v26.4s,#25
1553	ror	w21,w21,#24
1554	ushr	v13.4s,v27.4s,#25
1555	ror	w17,w17,#24
1556	ushr	v17.4s,v28.4s,#25
1557	ror	w19,w19,#24
1558	ushr	v21.4s,v29.4s,#25
1559	ror	w20,w20,#24
1560	sli	v1.4s,v24.4s,#7
1561	add	w15,w15,w21
1562	sli	v5.4s,v25.4s,#7
1563	add	w16,w16,w17
1564	sli	v9.4s,v26.4s,#7
1565	add	w13,w13,w19
1566	sli	v13.4s,v27.4s,#7
1567	add	w14,w14,w20
1568	sli	v17.4s,v28.4s,#7
1569	eor	w10,w10,w15
1570	sli	v21.4s,v29.4s,#7
1571	eor	w11,w11,w16
1572	ext	v2.16b,v2.16b,v2.16b,#8
1573	eor	w12,w12,w13
1574	ext	v6.16b,v6.16b,v6.16b,#8
1575	eor	w9,w9,w14
1576	ext	v10.16b,v10.16b,v10.16b,#8
1577	ror	w10,w10,#25
1578	ext	v14.16b,v14.16b,v14.16b,#8
1579	ror	w11,w11,#25
1580	ext	v18.16b,v18.16b,v18.16b,#8
1581	ror	w12,w12,#25
1582	ext	v22.16b,v22.16b,v22.16b,#8
1583	ror	w9,w9,#25
1584	ext	v3.16b,v3.16b,v3.16b,#12
1585	ext	v7.16b,v7.16b,v7.16b,#12
1586	ext	v11.16b,v11.16b,v11.16b,#12
1587	ext	v15.16b,v15.16b,v15.16b,#12
1588	ext	v19.16b,v19.16b,v19.16b,#12
1589	ext	v23.16b,v23.16b,v23.16b,#12
1590	ext	v1.16b,v1.16b,v1.16b,#4
1591	ext	v5.16b,v5.16b,v5.16b,#4
1592	ext	v9.16b,v9.16b,v9.16b,#4
1593	ext	v13.16b,v13.16b,v13.16b,#4
1594	ext	v17.16b,v17.16b,v17.16b,#4
1595	ext	v21.16b,v21.16b,v21.16b,#4
1596	add	v0.4s,v0.4s,v1.4s
1597	add	w5,w5,w9
1598	add	v4.4s,v4.4s,v5.4s
1599	add	w6,w6,w10
1600	add	v8.4s,v8.4s,v9.4s
1601	add	w7,w7,w11
1602	add	v12.4s,v12.4s,v13.4s
1603	add	w8,w8,w12
1604	add	v16.4s,v16.4s,v17.4s
1605	eor	w17,w17,w5
1606	add	v20.4s,v20.4s,v21.4s
1607	eor	w19,w19,w6
1608	eor	v3.16b,v3.16b,v0.16b
1609	eor	w20,w20,w7
1610	eor	v7.16b,v7.16b,v4.16b
1611	eor	w21,w21,w8
1612	eor	v11.16b,v11.16b,v8.16b
1613	ror	w17,w17,#16
1614	eor	v15.16b,v15.16b,v12.16b
1615	ror	w19,w19,#16
1616	eor	v19.16b,v19.16b,v16.16b
1617	ror	w20,w20,#16
1618	eor	v23.16b,v23.16b,v20.16b
1619	ror	w21,w21,#16
1620	rev32	v3.8h,v3.8h
1621	add	w13,w13,w17
1622	rev32	v7.8h,v7.8h
1623	add	w14,w14,w19
1624	rev32	v11.8h,v11.8h
1625	add	w15,w15,w20
1626	rev32	v15.8h,v15.8h
1627	add	w16,w16,w21
1628	rev32	v19.8h,v19.8h
1629	eor	w9,w9,w13
1630	rev32	v23.8h,v23.8h
1631	eor	w10,w10,w14
1632	add	v2.4s,v2.4s,v3.4s
1633	eor	w11,w11,w15
1634	add	v6.4s,v6.4s,v7.4s
1635	eor	w12,w12,w16
1636	add	v10.4s,v10.4s,v11.4s
1637	ror	w9,w9,#20
1638	add	v14.4s,v14.4s,v15.4s
1639	ror	w10,w10,#20
1640	add	v18.4s,v18.4s,v19.4s
1641	ror	w11,w11,#20
1642	add	v22.4s,v22.4s,v23.4s
1643	ror	w12,w12,#20
1644	eor	v24.16b,v1.16b,v2.16b
1645	add	w5,w5,w9
1646	eor	v25.16b,v5.16b,v6.16b
1647	add	w6,w6,w10
1648	eor	v26.16b,v9.16b,v10.16b
1649	add	w7,w7,w11
1650	eor	v27.16b,v13.16b,v14.16b
1651	add	w8,w8,w12
1652	eor	v28.16b,v17.16b,v18.16b
1653	eor	w17,w17,w5
1654	eor	v29.16b,v21.16b,v22.16b
1655	eor	w19,w19,w6
1656	ushr	v1.4s,v24.4s,#20
1657	eor	w20,w20,w7
1658	ushr	v5.4s,v25.4s,#20
1659	eor	w21,w21,w8
1660	ushr	v9.4s,v26.4s,#20
1661	ror	w17,w17,#24
1662	ushr	v13.4s,v27.4s,#20
1663	ror	w19,w19,#24
1664	ushr	v17.4s,v28.4s,#20
1665	ror	w20,w20,#24
1666	ushr	v21.4s,v29.4s,#20
1667	ror	w21,w21,#24
1668	sli	v1.4s,v24.4s,#12
1669	add	w13,w13,w17
1670	sli	v5.4s,v25.4s,#12
1671	add	w14,w14,w19
1672	sli	v9.4s,v26.4s,#12
1673	add	w15,w15,w20
1674	sli	v13.4s,v27.4s,#12
1675	add	w16,w16,w21
1676	sli	v17.4s,v28.4s,#12
1677	eor	w9,w9,w13
1678	sli	v21.4s,v29.4s,#12
1679	eor	w10,w10,w14
1680	add	v0.4s,v0.4s,v1.4s
1681	eor	w11,w11,w15
1682	add	v4.4s,v4.4s,v5.4s
1683	eor	w12,w12,w16
1684	add	v8.4s,v8.4s,v9.4s
1685	ror	w9,w9,#25
1686	add	v12.4s,v12.4s,v13.4s
1687	ror	w10,w10,#25
1688	add	v16.4s,v16.4s,v17.4s
1689	ror	w11,w11,#25
1690	add	v20.4s,v20.4s,v21.4s
1691	ror	w12,w12,#25
1692	eor	v24.16b,v3.16b,v0.16b
1693	add	w5,w5,w10
1694	eor	v25.16b,v7.16b,v4.16b
1695	add	w6,w6,w11
1696	eor	v26.16b,v11.16b,v8.16b
1697	add	w7,w7,w12
1698	eor	v27.16b,v15.16b,v12.16b
1699	add	w8,w8,w9
1700	eor	v28.16b,v19.16b,v16.16b
1701	eor	w21,w21,w5
1702	eor	v29.16b,v23.16b,v20.16b
1703	eor	w17,w17,w6
1704	ushr	v3.4s,v24.4s,#24
1705	eor	w19,w19,w7
1706	ushr	v7.4s,v25.4s,#24
1707	eor	w20,w20,w8
1708	ushr	v11.4s,v26.4s,#24
1709	ror	w21,w21,#16
1710	ushr	v15.4s,v27.4s,#24
1711	ror	w17,w17,#16
1712	ushr	v19.4s,v28.4s,#24
1713	ror	w19,w19,#16
1714	ushr	v23.4s,v29.4s,#24
1715	ror	w20,w20,#16
1716	sli	v3.4s,v24.4s,#8
1717	add	w15,w15,w21
1718	sli	v7.4s,v25.4s,#8
1719	add	w16,w16,w17
1720	sli	v11.4s,v26.4s,#8
1721	add	w13,w13,w19
1722	sli	v15.4s,v27.4s,#8
1723	add	w14,w14,w20
1724	sli	v19.4s,v28.4s,#8
1725	eor	w10,w10,w15
1726	sli	v23.4s,v29.4s,#8
1727	eor	w11,w11,w16
1728	add	v2.4s,v2.4s,v3.4s
1729	eor	w12,w12,w13
1730	add	v6.4s,v6.4s,v7.4s
1731	eor	w9,w9,w14
1732	add	v10.4s,v10.4s,v11.4s
1733	ror	w10,w10,#20
1734	add	v14.4s,v14.4s,v15.4s
1735	ror	w11,w11,#20
1736	add	v18.4s,v18.4s,v19.4s
1737	ror	w12,w12,#20
1738	add	v22.4s,v22.4s,v23.4s
1739	ror	w9,w9,#20
1740	eor	v24.16b,v1.16b,v2.16b
1741	add	w5,w5,w10
1742	eor	v25.16b,v5.16b,v6.16b
1743	add	w6,w6,w11
1744	eor	v26.16b,v9.16b,v10.16b
1745	add	w7,w7,w12
1746	eor	v27.16b,v13.16b,v14.16b
1747	add	w8,w8,w9
1748	eor	v28.16b,v17.16b,v18.16b
1749	eor	w21,w21,w5
1750	eor	v29.16b,v21.16b,v22.16b
1751	eor	w17,w17,w6
1752	ushr	v1.4s,v24.4s,#25
1753	eor	w19,w19,w7
1754	ushr	v5.4s,v25.4s,#25
1755	eor	w20,w20,w8
1756	ushr	v9.4s,v26.4s,#25
1757	ror	w21,w21,#24
1758	ushr	v13.4s,v27.4s,#25
1759	ror	w17,w17,#24
1760	ushr	v17.4s,v28.4s,#25
1761	ror	w19,w19,#24
1762	ushr	v21.4s,v29.4s,#25
1763	ror	w20,w20,#24
1764	sli	v1.4s,v24.4s,#7
1765	add	w15,w15,w21
1766	sli	v5.4s,v25.4s,#7
1767	add	w16,w16,w17
1768	sli	v9.4s,v26.4s,#7
1769	add	w13,w13,w19
1770	sli	v13.4s,v27.4s,#7
1771	add	w14,w14,w20
1772	sli	v17.4s,v28.4s,#7
1773	eor	w10,w10,w15
1774	sli	v21.4s,v29.4s,#7
1775	eor	w11,w11,w16
1776	ext	v2.16b,v2.16b,v2.16b,#8
1777	eor	w12,w12,w13
1778	ext	v6.16b,v6.16b,v6.16b,#8
1779	eor	w9,w9,w14
1780	ext	v10.16b,v10.16b,v10.16b,#8
1781	ror	w10,w10,#25
1782	ext	v14.16b,v14.16b,v14.16b,#8
1783	ror	w11,w11,#25
1784	ext	v18.16b,v18.16b,v18.16b,#8
1785	ror	w12,w12,#25
1786	ext	v22.16b,v22.16b,v22.16b,#8
1787	ror	w9,w9,#25
1788	ext	v3.16b,v3.16b,v3.16b,#4
1789	ext	v7.16b,v7.16b,v7.16b,#4
1790	ext	v11.16b,v11.16b,v11.16b,#4
1791	ext	v15.16b,v15.16b,v15.16b,#4
1792	ext	v19.16b,v19.16b,v19.16b,#4
1793	ext	v23.16b,v23.16b,v23.16b,#4
1794	ext	v1.16b,v1.16b,v1.16b,#12
1795	ext	v5.16b,v5.16b,v5.16b,#12
1796	ext	v9.16b,v9.16b,v9.16b,#12
1797	ext	v13.16b,v13.16b,v13.16b,#12
1798	ext	v17.16b,v17.16b,v17.16b,#12
1799	ext	v21.16b,v21.16b,v21.16b,#12
1800	cbnz	x4,.Loop_lower_neon
1801
1802	add	w5,w5,w22		// accumulate key block
1803	ldp	q24,q25,[sp,#0]
1804	add	x6,x6,x22,lsr#32
1805	ldp	q26,q27,[sp,#32]
1806	add	w7,w7,w23
1807	ldp	q28,q29,[sp,#64]
1808	add	x8,x8,x23,lsr#32
1809	add	v0.4s,v0.4s,v24.4s
1810	add	w9,w9,w24
1811	add	v4.4s,v4.4s,v24.4s
1812	add	x10,x10,x24,lsr#32
1813	add	v8.4s,v8.4s,v24.4s
1814	add	w11,w11,w25
1815	add	v12.4s,v12.4s,v24.4s
1816	add	x12,x12,x25,lsr#32
1817	add	v16.4s,v16.4s,v24.4s
1818	add	w13,w13,w26
1819	add	v20.4s,v20.4s,v24.4s
1820	add	x14,x14,x26,lsr#32
1821	add	v2.4s,v2.4s,v26.4s
1822	add	w15,w15,w27
1823	add	v6.4s,v6.4s,v26.4s
1824	add	x16,x16,x27,lsr#32
1825	add	v10.4s,v10.4s,v26.4s
1826	add	w17,w17,w28
1827	add	v14.4s,v14.4s,v26.4s
1828	add	x19,x19,x28,lsr#32
1829	add	v18.4s,v18.4s,v26.4s
1830	add	w20,w20,w30
1831	add	v22.4s,v22.4s,v26.4s
1832	add	x21,x21,x30,lsr#32
1833	add	v19.4s,v19.4s,v31.4s			// +4
1834	add	x5,x5,x6,lsl#32	// pack
1835	add	v23.4s,v23.4s,v31.4s			// +4
1836	add	x7,x7,x8,lsl#32
1837	add	v3.4s,v3.4s,v27.4s
1838	ldp	x6,x8,[x1,#0]		// load input
1839	add	v7.4s,v7.4s,v28.4s
1840	add	x9,x9,x10,lsl#32
1841	add	v11.4s,v11.4s,v29.4s
1842	add	x11,x11,x12,lsl#32
1843	add	v15.4s,v15.4s,v30.4s
1844	ldp	x10,x12,[x1,#16]
1845	add	v19.4s,v19.4s,v27.4s
1846	add	x13,x13,x14,lsl#32
1847	add	v23.4s,v23.4s,v28.4s
1848	add	x15,x15,x16,lsl#32
1849	add	v1.4s,v1.4s,v25.4s
1850	ldp	x14,x16,[x1,#32]
1851	add	v5.4s,v5.4s,v25.4s
1852	add	x17,x17,x19,lsl#32
1853	add	v9.4s,v9.4s,v25.4s
1854	add	x20,x20,x21,lsl#32
1855	add	v13.4s,v13.4s,v25.4s
1856	ldp	x19,x21,[x1,#48]
1857	add	v17.4s,v17.4s,v25.4s
1858	add	x1,x1,#64
1859	add	v21.4s,v21.4s,v25.4s
1860
1861#ifdef	__ARMEB__
1862	rev	x5,x5
1863	rev	x7,x7
1864	rev	x9,x9
1865	rev	x11,x11
1866	rev	x13,x13
1867	rev	x15,x15
1868	rev	x17,x17
1869	rev	x20,x20
1870#endif
1871	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1872	eor	x5,x5,x6
1873	eor	x7,x7,x8
1874	eor	x9,x9,x10
1875	eor	x11,x11,x12
1876	eor	x13,x13,x14
1877	eor	v0.16b,v0.16b,v24.16b
1878	eor	x15,x15,x16
1879	eor	v1.16b,v1.16b,v25.16b
1880	eor	x17,x17,x19
1881	eor	v2.16b,v2.16b,v26.16b
1882	eor	x20,x20,x21
1883	eor	v3.16b,v3.16b,v27.16b
1884	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
1885
1886	stp	x5,x7,[x0,#0]		// store output
1887	add	x28,x28,#7			// increment counter
1888	stp	x9,x11,[x0,#16]
1889	stp	x13,x15,[x0,#32]
1890	stp	x17,x20,[x0,#48]
1891	add	x0,x0,#64
1892	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
1893
1894	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1895	eor	v4.16b,v4.16b,v24.16b
1896	eor	v5.16b,v5.16b,v25.16b
1897	eor	v6.16b,v6.16b,v26.16b
1898	eor	v7.16b,v7.16b,v27.16b
1899	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
1900
1901	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
1902	eor	v8.16b,v8.16b,v0.16b
1903	ldp	q24,q25,[sp,#0]
1904	eor	v9.16b,v9.16b,v1.16b
1905	ldp	q26,q27,[sp,#32]
1906	eor	v10.16b,v10.16b,v2.16b
1907	eor	v11.16b,v11.16b,v3.16b
1908	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1909
1910	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1911	eor	v12.16b,v12.16b,v4.16b
1912	eor	v13.16b,v13.16b,v5.16b
1913	eor	v14.16b,v14.16b,v6.16b
1914	eor	v15.16b,v15.16b,v7.16b
1915	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1916
1917	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1918	eor	v16.16b,v16.16b,v8.16b
1919	eor	v17.16b,v17.16b,v9.16b
1920	eor	v18.16b,v18.16b,v10.16b
1921	eor	v19.16b,v19.16b,v11.16b
1922	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1923
1924	shl	v0.4s,v31.4s,#1			// 4 -> 8
1925	eor	v20.16b,v20.16b,v12.16b
1926	eor	v21.16b,v21.16b,v13.16b
1927	eor	v22.16b,v22.16b,v14.16b
1928	eor	v23.16b,v23.16b,v15.16b
1929	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1930
1931	add	v27.4s,v27.4s,v0.4s			// += 8
1932	add	v28.4s,v28.4s,v0.4s
1933	add	v29.4s,v29.4s,v0.4s
1934	add	v30.4s,v30.4s,v0.4s
1935
1936	b.hs	.Loop_outer_512_neon
1937
1938	adds	x2,x2,#512
1939	ushr	v0.4s,v31.4s,#2			// 4 -> 1
1940
1941	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
1942	ldp	d10,d11,[sp,#128+16]
1943	ldp	d12,d13,[sp,#128+32]
1944	ldp	d14,d15,[sp,#128+48]
1945
1946	stp	q24,q31,[sp,#0]		// wipe off-load area
1947	stp	q24,q31,[sp,#32]
1948	stp	q24,q31,[sp,#64]
1949
1950	b.eq	.Ldone_512_neon
1951
1952	cmp	x2,#192
1953	sub	v27.4s,v27.4s,v0.4s			// -= 1
1954	sub	v28.4s,v28.4s,v0.4s
1955	sub	v29.4s,v29.4s,v0.4s
1956	add	sp,sp,#128
1957	b.hs	.Loop_outer_neon
1958
1959	eor	v25.16b,v25.16b,v25.16b
1960	eor	v26.16b,v26.16b,v26.16b
1961	eor	v27.16b,v27.16b,v27.16b
1962	eor	v28.16b,v28.16b,v28.16b
1963	eor	v29.16b,v29.16b,v29.16b
1964	eor	v30.16b,v30.16b,v30.16b
1965	b	.Loop_outer
1966
1967.Ldone_512_neon:
1968	ldp	x19,x20,[x29,#16]
1969	add	sp,sp,#128+64
1970	ldp	x21,x22,[x29,#32]
1971	ldp	x23,x24,[x29,#48]
1972	ldp	x25,x26,[x29,#64]
1973	ldp	x27,x28,[x29,#80]
1974	ldp	x29,x30,[sp],#96
1975.inst	0xd50323bf			// autiasp
1976	ret
1977.size	ChaCha20_512_neon,.-ChaCha20_512_neon
1978