• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#ifndef	__KERNEL__
2# include "arm_arch.h"
3
4.private_extern	_OPENSSL_armcap_P
5#endif
6
7.text
8
// Read-only constants shared by the scalar and NEON code paths.
9.align	5
10Lsigma:
// The 16 ASCII bytes "expand 32-byte k" expressed as two 64-bit words.
// The code loads them as the first four 32-bit words of the key block.
11.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
12Lone:
// Four 32-bit words 1,2,3,4. The 4-lane NEON path loads all four and adds
// them to the broadcast counter word, giving each lane its own block
// counter; the 512-byte path loads only the first word (1).
13.long	1,2,3,4
14Lrot24:
// Byte-index table for TBL. Within every 32-bit lane the result bytes
// (least-significant first) are taken from source bytes 3,0,1,2 of that
// lane, i.e. each 32-bit word is rotated right by 24 bits (left by 8).
15.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
// NUL-terminated ASCII string "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm".
16.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
17.align	2
18
// _ChaCha20_ctr32: entry point, path selection and scalar-path prologue.
//
// Register roles, as used by the code in this file:
//   x0 = output pointer      x1 = input pointer      x2 = length in bytes
//   x3 = key pointer (32 bytes are read)
//   x4 = counter-block pointer (16 bytes are read)
//
// Path selection:
//   len == 0                 -> return immediately (Labort)
//   len <  192               -> scalar code (Lshort)
//   len >= 192, !__KERNEL__  -> NEON code when the ARMV7_NEON bit is set in
//                               _OPENSSL_armcap_P, scalar code otherwise
//   len >= 192, __KERNEL__   -> scalar code (no capability check is compiled)
19.globl	_ChaCha20_ctr32
20
21.align	5
22_ChaCha20_ctr32:
23	cbz	x2,Labort
24	cmp	x2,#192
25	b.lo	Lshort
26
27#ifndef	__KERNEL__
// Load the 32-bit capability word and test the NEON bit. ARMV7_NEON is not
// defined in this file (presumably it comes from arm_arch.h).
28	adrp	x17,_OPENSSL_armcap_P@PAGE
29	ldr	w17,[x17,_OPENSSL_armcap_P@PAGEOFF]
30	tst	w17,#ARMV7_NEON
31	b.ne	LChaCha20_neon
32#endif
33
34Lshort:
// Frame layout: 96 bytes holding x29/x30 and x19..x28, addressed through
// x29, with a further 64 bytes of scratch below it (sp after the "sub").
// The scratch area is used only for a partial final block.
35.long	0xd503233f			// paciasp
36	stp	x29,x30,[sp,#-96]!
37	add	x29,sp,#0
38
39	adr	x5,Lsigma
40	stp	x19,x20,[sp,#16]
41	stp	x21,x22,[sp,#32]
42	stp	x23,x24,[sp,#48]
43	stp	x25,x26,[sp,#64]
44	stp	x27,x28,[sp,#80]
45	sub	sp,sp,#64
46
// The 64-byte key block is kept as eight 64-bit registers, two 32-bit words
// per register: x22,x23 = sigma, x24..x27 = key, x28,x30 = counter block.
// x30 (the link register) is reused as a data register here; its saved copy
// is reloaded from the frame before returning.
47	ldp	x22,x23,[x5]		// load sigma
48	ldp	x24,x25,[x3]		// load key
49	ldp	x26,x27,[x3,#16]
50	ldp	x28,x30,[x4]		// load counter
51#ifdef	__AARCH64EB__
// Big-endian builds: swap the 32-bit halves of each key/counter register so
// that the low half holds the lower-addressed 32-bit word, which is what the
// unpack code (mov wN / lsr #32) expects.
52	ror	x24,x24,#32
53	ror	x25,x25,#32
54	ror	x26,x26,#32
55	ror	x27,x27,#32
56	ror	x28,x28,#32
57	ror	x30,x30,#32
58#endif
59
// Scalar main loop: each pass produces one 64-byte keystream block and XORs
// it with 64 bytes of input.
//
// The key block lives in x22..x28,x30 (two 32-bit words per register). The
// working copy is unpacked into sixteen 32-bit registers (x18 is not used):
//   words  0..3  : w5  w6  w7  w8
//   words  4..7  : w9  w10 w11 w12
//   words  8..11 : w13 w14 w15 w16
//   words 12..15 : w17 w19 w20 w21
60Loop_outer:
61	mov	w5,w22			// unpack key block
62	lsr	x6,x22,#32
63	mov	w7,w23
64	lsr	x8,x23,#32
65	mov	w9,w24
66	lsr	x10,x24,#32
67	mov	w11,w25
68	lsr	x12,x25,#32
69	mov	w13,w26
70	lsr	x14,x26,#32
71	mov	w15,w27
72	lsr	x16,x27,#32
73	mov	w17,w28
74	lsr	x19,x28,#32
75	mov	w20,w30
76	lsr	x21,x30,#32
77
// x4 (its counter pointer has already been consumed) becomes the iteration
// counter: 10 passes through Loop, each containing two round halves.
78	mov	x4,#10
// The flags set by this "subs" are consumed by "b.lo Ltail" and
// "b.hi Loop_outer" further down; no instruction in between writes the
// condition flags.
79	subs	x2,x2,#64
80Loop:
81	sub	x4,x4,#1
// First half: four quarter-rounds on the column groups
//   (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21),
// written as a,b,c,d below and processed four-abreast.
// a += b; d ^= a; d = ror(d,16)
82	add	w5,w5,w9
83	add	w6,w6,w10
84	add	w7,w7,w11
85	add	w8,w8,w12
86	eor	w17,w17,w5
87	eor	w19,w19,w6
88	eor	w20,w20,w7
89	eor	w21,w21,w8
90	ror	w17,w17,#16
91	ror	w19,w19,#16
92	ror	w20,w20,#16
93	ror	w21,w21,#16
// c += d; b ^= c; b = ror(b,20)
94	add	w13,w13,w17
95	add	w14,w14,w19
96	add	w15,w15,w20
97	add	w16,w16,w21
98	eor	w9,w9,w13
99	eor	w10,w10,w14
100	eor	w11,w11,w15
101	eor	w12,w12,w16
102	ror	w9,w9,#20
103	ror	w10,w10,#20
104	ror	w11,w11,#20
105	ror	w12,w12,#20
// a += b; d ^= a; d = ror(d,24)
106	add	w5,w5,w9
107	add	w6,w6,w10
108	add	w7,w7,w11
109	add	w8,w8,w12
110	eor	w17,w17,w5
111	eor	w19,w19,w6
112	eor	w20,w20,w7
113	eor	w21,w21,w8
114	ror	w17,w17,#24
115	ror	w19,w19,#24
116	ror	w20,w20,#24
117	ror	w21,w21,#24
// c += d; b ^= c; b = ror(b,25)
118	add	w13,w13,w17
119	add	w14,w14,w19
120	add	w15,w15,w20
121	add	w16,w16,w21
122	eor	w9,w9,w13
123	eor	w10,w10,w14
124	eor	w11,w11,w15
125	eor	w12,w12,w16
126	ror	w9,w9,#25
127	ror	w10,w10,#25
128	ror	w11,w11,#25
129	ror	w12,w12,#25
// Second half: the same quarter-round on the diagonal groups
//   (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
// a += b; d ^= a; d = ror(d,16)
130	add	w5,w5,w10
131	add	w6,w6,w11
132	add	w7,w7,w12
133	add	w8,w8,w9
134	eor	w21,w21,w5
135	eor	w17,w17,w6
136	eor	w19,w19,w7
137	eor	w20,w20,w8
138	ror	w21,w21,#16
139	ror	w17,w17,#16
140	ror	w19,w19,#16
141	ror	w20,w20,#16
// c += d; b ^= c; b = ror(b,20)
142	add	w15,w15,w21
143	add	w16,w16,w17
144	add	w13,w13,w19
145	add	w14,w14,w20
146	eor	w10,w10,w15
147	eor	w11,w11,w16
148	eor	w12,w12,w13
149	eor	w9,w9,w14
150	ror	w10,w10,#20
151	ror	w11,w11,#20
152	ror	w12,w12,#20
153	ror	w9,w9,#20
// a += b; d ^= a; d = ror(d,24)
154	add	w5,w5,w10
155	add	w6,w6,w11
156	add	w7,w7,w12
157	add	w8,w8,w9
158	eor	w21,w21,w5
159	eor	w17,w17,w6
160	eor	w19,w19,w7
161	eor	w20,w20,w8
162	ror	w21,w21,#24
163	ror	w17,w17,#24
164	ror	w19,w19,#24
165	ror	w20,w20,#24
// c += d; b ^= c; b = ror(b,25)
166	add	w15,w15,w21
167	add	w16,w16,w17
168	add	w13,w13,w19
169	add	w14,w14,w20
170	eor	w10,w10,w15
171	eor	w11,w11,w16
172	eor	w12,w12,w13
173	eor	w9,w9,w14
174	ror	w10,w10,#25
175	ror	w11,w11,#25
176	ror	w12,w12,#25
177	ror	w9,w9,#25
178	cbnz	x4,Loop
179
// Add the original key block to the working state. Odd-numbered words use a
// 64-bit add with the source shifted down by 32; a carry into bit 32 is
// harmless because the later "lsl #32" during packing shifts it out.
180	add	w5,w5,w22		// accumulate key block
181	add	x6,x6,x22,lsr#32
182	add	w7,w7,w23
183	add	x8,x8,x23,lsr#32
184	add	w9,w9,w24
185	add	x10,x10,x24,lsr#32
186	add	w11,w11,w25
187	add	x12,x12,x25,lsr#32
188	add	w13,w13,w26
189	add	x14,x14,x26,lsr#32
190	add	w15,w15,w27
191	add	x16,x16,x27,lsr#32
192	add	w17,w17,w28
193	add	x19,x19,x28,lsr#32
194	add	w20,w20,w30
195	add	x21,x21,x30,lsr#32
196
// Flags are still those of "subs x2,x2,#64": fewer than 64 bytes were left,
// so the block is finished byte-by-byte in Ltail.
197	b.lo	Ltail
198
// Pack word pairs back into 64-bit registers (even word in the low half),
// interleaved with loading 64 bytes of input into the registers just freed.
199	add	x5,x5,x6,lsl#32	// pack
200	add	x7,x7,x8,lsl#32
201	ldp	x6,x8,[x1,#0]		// load input
202	add	x9,x9,x10,lsl#32
203	add	x11,x11,x12,lsl#32
204	ldp	x10,x12,[x1,#16]
205	add	x13,x13,x14,lsl#32
206	add	x15,x15,x16,lsl#32
207	ldp	x14,x16,[x1,#32]
208	add	x17,x17,x19,lsl#32
209	add	x20,x20,x21,lsl#32
210	ldp	x19,x21,[x1,#48]
211	add	x1,x1,#64
212#ifdef	__AARCH64EB__
// Big-endian builds: byte-reverse each packed keystream register so its
// bytes line up with the natively loaded input in the same order as on
// little-endian builds.
213	rev	x5,x5
214	rev	x7,x7
215	rev	x9,x9
216	rev	x11,x11
217	rev	x13,x13
218	rev	x15,x15
219	rev	x17,x17
220	rev	x20,x20
221#endif
// output = keystream XOR input
222	eor	x5,x5,x6
223	eor	x7,x7,x8
224	eor	x9,x9,x10
225	eor	x11,x11,x12
226	eor	x13,x13,x14
227	eor	x15,x15,x16
228	eor	x17,x17,x19
229	eor	x20,x20,x21
230
// NOTE(review): the counter increment is a 64-bit add on x28, whose low half
// is word 12 and high half is word 13; a wrap of the low 32 bits would carry
// into word 13. Presumably callers never let the 32-bit counter wrap within
// one call -- confirm against the caller.
231	stp	x5,x7,[x0,#0]		// store output
232	add	x28,x28,#1			// increment counter
233	stp	x9,x11,[x0,#16]
234	stp	x13,x15,[x0,#32]
235	stp	x17,x20,[x0,#48]
236	add	x0,x0,#64
237
// Still the "subs" flags: loop while bytes remain; fall through when the
// length was an exact multiple of 64.
238	b.hi	Loop_outer
239
// Epilogue: restore callee-saved registers through x29, drop the 64-byte
// scratch area, then pop the 96-byte frame (restoring x29 and x30).
240	ldp	x19,x20,[x29,#16]
241	add	sp,sp,#64
242	ldp	x21,x22,[x29,#32]
243	ldp	x23,x24,[x29,#48]
244	ldp	x25,x26,[x29,#64]
245	ldp	x27,x28,[x29,#80]
246	ldp	x29,x30,[sp],#96
247.long	0xd50323bf			// autiasp
// Labort is also the target of the len == 0 early exit, which is taken
// before any frame or pointer-authentication setup.
248Labort:
249	ret
250
// Partial final block (1..63 bytes).
// Ltail is reached from the scalar loop with x2 = remaining - 64; the add
// below restores the true remaining byte count. Less_than_64 is also entered
// directly from the NEON tail code with x2 already holding that count. In
// both cases the keystream block is in the scalar working registers
// (w5..w17, w19..w21), already accumulated but not yet packed.
251.align	4
252Ltail:
253	add	x2,x2,#64
254Less_than_64:
// Set up end-relative addressing with a negative index that counts up to 0:
//   x1 = input end, x4 = scratch end (sp + len), x2 = -len.
// x0 is biased by -1 because the store in Loop_tail uses x2 after it has
// been incremented.
255	sub	x0,x0,#1
256	add	x1,x1,x2
257	add	x0,x0,x2
258	add	x4,sp,x2
259	neg	x2,x2
260
261	add	x5,x5,x6,lsl#32	// pack
262	add	x7,x7,x8,lsl#32
263	add	x9,x9,x10,lsl#32
264	add	x11,x11,x12,lsl#32
265	add	x13,x13,x14,lsl#32
266	add	x15,x15,x16,lsl#32
267	add	x17,x17,x19,lsl#32
268	add	x20,x20,x21,lsl#32
269#ifdef	__AARCH64EB__
270	rev	x5,x5
271	rev	x7,x7
272	rev	x9,x9
273	rev	x11,x11
274	rev	x13,x13
275	rev	x15,x15
276	rev	x17,x17
277	rev	x20,x20
278#endif
// Spill the full 64-byte keystream block to the stack scratch area.
279	stp	x5,x7,[sp,#0]
280	stp	x9,x11,[sp,#16]
281	stp	x13,x15,[sp,#32]
282	stp	x17,x20,[sp,#48]
283
// Byte loop: out[i] = in[i] ^ keystream[i] for the remaining bytes.
284Loop_tail:
285	ldrb	w10,[x1,x2]
286	ldrb	w11,[x4,x2]
287	add	x2,x2,#1
288	eor	w10,w10,w11
289	strb	w10,[x0,x2]
290	cbnz	x2,Loop_tail
291
// Overwrite the spilled keystream with zeros before releasing the stack.
292	stp	xzr,xzr,[sp,#0]
293	stp	xzr,xzr,[sp,#16]
294	stp	xzr,xzr,[sp,#32]
295	stp	xzr,xzr,[sp,#48]
296
// Epilogue: same frame layout as the main scalar exit path.
297	ldp	x19,x20,[x29,#16]
298	add	sp,sp,#64
299	ldp	x21,x22,[x29,#32]
300	ldp	x23,x24,[x29,#48]
301	ldp	x25,x26,[x29,#64]
302	ldp	x27,x28,[x29,#80]
303	ldp	x29,x30,[sp],#96
304.long	0xd50323bf			// autiasp
305	ret
306
307
// _ChaCha20_neon / LChaCha20_neon: NEON entry point and prologue.
// LChaCha20_neon is the branch target used by _ChaCha20_ctr32 (taken only
// for len >= 192 when the NEON capability bit is set). The symbol
// _ChaCha20_neon is exported only when __KERNEL__ is defined.
// Register roles on entry are the same as for _ChaCha20_ctr32:
//   x0 = output, x1 = input, x2 = length in bytes, x3 = key, x4 = counter block.
308#ifdef	__KERNEL__
309.globl	_ChaCha20_neon
310#endif
311
312.align	5
313_ChaCha20_neon:
314LChaCha20_neon:
315.long	0xd503233f			// paciasp
316	stp	x29,x30,[sp,#-96]!
317	add	x29,sp,#0
318
319	adr	x5,Lsigma
320	stp	x19,x20,[sp,#16]
321	stp	x21,x22,[sp,#32]
322	stp	x23,x24,[sp,#48]
323	stp	x25,x26,[sp,#64]
324	stp	x27,x28,[sp,#80]
// Lengths of 512 bytes or more go to L512_or_more_neon, entered with the
// 96-byte frame pushed and x5 = Lsigma but no scratch space allocated yet.
325	cmp	x2,#512
326	b.hs	L512_or_more_neon
327
// 64 bytes of scratch: d8/d9 are saved at its base, and the tail code later
// reuses it to stage a partial keystream block.
328	sub	sp,sp,#64
329
// The key block is loaded twice: into x22..x28,x30 for the scalar lane and
// into v0..v3 (one 4-word row per vector) for the NEON lanes.
// The post-increment on x5 leaves it pointing at Lone.
330	ldp	x22,x23,[x5]		// load sigma
331	ld1	{v0.4s},[x5],#16
332	ldp	x24,x25,[x3]		// load key
333	ldp	x26,x27,[x3,#16]
334	ld1	{v1.4s,v2.4s},[x3]
335	ldp	x28,x30,[x4]		// load counter
336	ld1	{v3.4s},[x4]
// v8 and v9 are about to be overwritten; their low halves are callee-saved.
337	stp	d8,d9,[sp]			// meet ABI requirements
// v8 = Lone (per-lane counter offsets 1,2,3,4), v9 = Lrot24 (TBL indices).
338	ld1	{v8.4s,v9.4s},[x5]
339#ifdef	__AARCH64EB__
// Big-endian builds: swap the 32-bit elements within each 64-bit half of
// the sigma vector, and the 32-bit halves of the scalar key/counter
// registers, so both copies present the words in the same order.
340	rev64	v0.4s,v0.4s
341	ror	x24,x24,#32
342	ror	x25,x25,#32
343	ror	x26,x26,#32
344	ror	x27,x27,#32
345	ror	x28,x28,#32
346	ror	x30,x30,#32
347#endif
348
// Start of one 320-byte step: five 64-byte blocks are computed together,
// four in NEON lanes and one in the scalar registers.
//
// NEON layout: every state word gets its own vector, broadcast to all four
// lanes (lane i belongs to vector block i):
//   words  0..3  : v16 v20 v24 v28     (from v0, sigma)
//   words  4..7  : v17 v21 v25 v29     (from v1, key)
//   words  8..11 : v18 v22 v26 v30     (from v2, key)
//   words 12..15 : v19 v23 v27 v31     (from v3, counter block)
// Scalar layout (unpacked from x22..x28,x30):
//   words 0..3 = w5..w8, 4..7 = w9..w12, 8..11 = w13..w16,
//   words 12..15 = w17,w19,w20,w21.
349Loop_outer_neon:
350	dup	v16.4s,v0.s[0]			// unpack key block
351	mov	w5,w22
352	dup	v20.4s,v0.s[1]
353	lsr	x6,x22,#32
354	dup	v24.4s,v0.s[2]
355	mov	w7,w23
356	dup	v28.4s,v0.s[3]
357	lsr	x8,x23,#32
358	dup	v17.4s,v1.s[0]
359	mov	w9,w24
360	dup	v21.4s,v1.s[1]
361	lsr	x10,x24,#32
362	dup	v25.4s,v1.s[2]
363	mov	w11,w25
364	dup	v29.4s,v1.s[3]
365	lsr	x12,x25,#32
366	dup	v19.4s,v3.s[0]
367	mov	w13,w26
368	dup	v23.4s,v3.s[1]
369	lsr	x14,x26,#32
370	dup	v27.4s,v3.s[2]
371	mov	w15,w27
372	dup	v31.4s,v3.s[3]
373	lsr	x16,x27,#32
// Word 12 of each vector lane gets that lane's counter offset from v8; the
// scalar block (w17) uses the counter value held in x28 unchanged.
374	add	v19.4s,v19.4s,v8.4s
375	mov	w17,w28
376	dup	v18.4s,v2.s[0]
377	lsr	x19,x28,#32
378	dup	v22.4s,v2.s[1]
379	mov	w20,w30
380	dup	v26.4s,v2.s[2]
381	lsr	x21,x30,#32
382	dup	v30.4s,v2.s[3]
383
// x4 becomes the iteration counter (10 passes through Loop_neon).
384	mov	x4,#10
// The flags from this "subs" are consumed after the round loop by
// "b.lo Ltail_neon" and "b.hi Loop_outer_neon"; nothing in between writes
// the condition flags.
385	subs	x2,x2,#320
// Round loop, output stage and exit of the 320-byte NEON step.
//
// Each of the ten passes applies a column half and a diagonal half to five
// blocks at once: four in NEON lanes (v16..v31, with v4..v7 as temporaries)
// and one in the scalar registers. The two instruction streams are
// interleaved line by line.
// NEON rotations: rev32 on .8h swaps the 16-bit halves of each word (rotate
// by 16); ushr #20 + sli #12 and ushr #25 + sli #7 assemble a rotate from a
// shift pair; tbl with v9 (the Lrot24 index table) rotates by whole bytes.
// Scalar rotations are the matching ror #16, #20, #24, #25.
386Loop_neon:
387	sub	x4,x4,#1
// Column half: NEON groups (v16,v17,v18,v19) (v20,v21,v22,v23)
// (v24,v25,v26,v27) (v28,v29,v30,v31); scalar groups (w5,w9,w13,w17)
// (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21).
388	add	v16.4s,v16.4s,v17.4s
389	add	w5,w5,w9
390	add	v20.4s,v20.4s,v21.4s
391	add	w6,w6,w10
392	add	v24.4s,v24.4s,v25.4s
393	add	w7,w7,w11
394	add	v28.4s,v28.4s,v29.4s
395	add	w8,w8,w12
396	eor	v19.16b,v19.16b,v16.16b
397	eor	w17,w17,w5
398	eor	v23.16b,v23.16b,v20.16b
399	eor	w19,w19,w6
400	eor	v27.16b,v27.16b,v24.16b
401	eor	w20,w20,w7
402	eor	v31.16b,v31.16b,v28.16b
403	eor	w21,w21,w8
404	rev32	v19.8h,v19.8h
405	ror	w17,w17,#16
406	rev32	v23.8h,v23.8h
407	ror	w19,w19,#16
408	rev32	v27.8h,v27.8h
409	ror	w20,w20,#16
410	rev32	v31.8h,v31.8h
411	ror	w21,w21,#16
412	add	v18.4s,v18.4s,v19.4s
413	add	w13,w13,w17
414	add	v22.4s,v22.4s,v23.4s
415	add	w14,w14,w19
416	add	v26.4s,v26.4s,v27.4s
417	add	w15,w15,w20
418	add	v30.4s,v30.4s,v31.4s
419	add	w16,w16,w21
420	eor	v4.16b,v17.16b,v18.16b
421	eor	w9,w9,w13
422	eor	v5.16b,v21.16b,v22.16b
423	eor	w10,w10,w14
424	eor	v6.16b,v25.16b,v26.16b
425	eor	w11,w11,w15
426	eor	v7.16b,v29.16b,v30.16b
427	eor	w12,w12,w16
428	ushr	v17.4s,v4.4s,#20
429	ror	w9,w9,#20
430	ushr	v21.4s,v5.4s,#20
431	ror	w10,w10,#20
432	ushr	v25.4s,v6.4s,#20
433	ror	w11,w11,#20
434	ushr	v29.4s,v7.4s,#20
435	ror	w12,w12,#20
436	sli	v17.4s,v4.4s,#12
437	add	w5,w5,w9
438	sli	v21.4s,v5.4s,#12
439	add	w6,w6,w10
440	sli	v25.4s,v6.4s,#12
441	add	w7,w7,w11
442	sli	v29.4s,v7.4s,#12
443	add	w8,w8,w12
444	add	v16.4s,v16.4s,v17.4s
445	eor	w17,w17,w5
446	add	v20.4s,v20.4s,v21.4s
447	eor	w19,w19,w6
448	add	v24.4s,v24.4s,v25.4s
449	eor	w20,w20,w7
450	add	v28.4s,v28.4s,v29.4s
451	eor	w21,w21,w8
452	eor	v4.16b,v19.16b,v16.16b
453	ror	w17,w17,#24
454	eor	v5.16b,v23.16b,v20.16b
455	ror	w19,w19,#24
456	eor	v6.16b,v27.16b,v24.16b
457	ror	w20,w20,#24
458	eor	v7.16b,v31.16b,v28.16b
459	ror	w21,w21,#24
460	tbl	v19.16b,{v4.16b},v9.16b
461	add	w13,w13,w17
462	tbl	v23.16b,{v5.16b},v9.16b
463	add	w14,w14,w19
464	tbl	v27.16b,{v6.16b},v9.16b
465	add	w15,w15,w20
466	tbl	v31.16b,{v7.16b},v9.16b
467	add	w16,w16,w21
468	add	v18.4s,v18.4s,v19.4s
469	eor	w9,w9,w13
470	add	v22.4s,v22.4s,v23.4s
471	eor	w10,w10,w14
472	add	v26.4s,v26.4s,v27.4s
473	eor	w11,w11,w15
474	add	v30.4s,v30.4s,v31.4s
475	eor	w12,w12,w16
476	eor	v4.16b,v17.16b,v18.16b
477	ror	w9,w9,#25
478	eor	v5.16b,v21.16b,v22.16b
479	ror	w10,w10,#25
480	eor	v6.16b,v25.16b,v26.16b
481	ror	w11,w11,#25
482	eor	v7.16b,v29.16b,v30.16b
483	ror	w12,w12,#25
484	ushr	v17.4s,v4.4s,#25
485	ushr	v21.4s,v5.4s,#25
486	ushr	v25.4s,v6.4s,#25
487	ushr	v29.4s,v7.4s,#25
488	sli	v17.4s,v4.4s,#7
489	sli	v21.4s,v5.4s,#7
490	sli	v25.4s,v6.4s,#7
491	sli	v29.4s,v7.4s,#7
// Diagonal half: NEON groups (v16,v21,v26,v31) (v20,v25,v30,v19)
// (v24,v29,v18,v23) (v28,v17,v22,v27); scalar groups (w5,w10,w15,w21)
// (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
492	add	v16.4s,v16.4s,v21.4s
493	add	w5,w5,w10
494	add	v20.4s,v20.4s,v25.4s
495	add	w6,w6,w11
496	add	v24.4s,v24.4s,v29.4s
497	add	w7,w7,w12
498	add	v28.4s,v28.4s,v17.4s
499	add	w8,w8,w9
500	eor	v31.16b,v31.16b,v16.16b
501	eor	w21,w21,w5
502	eor	v19.16b,v19.16b,v20.16b
503	eor	w17,w17,w6
504	eor	v23.16b,v23.16b,v24.16b
505	eor	w19,w19,w7
506	eor	v27.16b,v27.16b,v28.16b
507	eor	w20,w20,w8
508	rev32	v31.8h,v31.8h
509	ror	w21,w21,#16
510	rev32	v19.8h,v19.8h
511	ror	w17,w17,#16
512	rev32	v23.8h,v23.8h
513	ror	w19,w19,#16
514	rev32	v27.8h,v27.8h
515	ror	w20,w20,#16
516	add	v26.4s,v26.4s,v31.4s
517	add	w15,w15,w21
518	add	v30.4s,v30.4s,v19.4s
519	add	w16,w16,w17
520	add	v18.4s,v18.4s,v23.4s
521	add	w13,w13,w19
522	add	v22.4s,v22.4s,v27.4s
523	add	w14,w14,w20
524	eor	v4.16b,v21.16b,v26.16b
525	eor	w10,w10,w15
526	eor	v5.16b,v25.16b,v30.16b
527	eor	w11,w11,w16
528	eor	v6.16b,v29.16b,v18.16b
529	eor	w12,w12,w13
530	eor	v7.16b,v17.16b,v22.16b
531	eor	w9,w9,w14
532	ushr	v21.4s,v4.4s,#20
533	ror	w10,w10,#20
534	ushr	v25.4s,v5.4s,#20
535	ror	w11,w11,#20
536	ushr	v29.4s,v6.4s,#20
537	ror	w12,w12,#20
538	ushr	v17.4s,v7.4s,#20
539	ror	w9,w9,#20
540	sli	v21.4s,v4.4s,#12
541	add	w5,w5,w10
542	sli	v25.4s,v5.4s,#12
543	add	w6,w6,w11
544	sli	v29.4s,v6.4s,#12
545	add	w7,w7,w12
546	sli	v17.4s,v7.4s,#12
547	add	w8,w8,w9
548	add	v16.4s,v16.4s,v21.4s
549	eor	w21,w21,w5
550	add	v20.4s,v20.4s,v25.4s
551	eor	w17,w17,w6
552	add	v24.4s,v24.4s,v29.4s
553	eor	w19,w19,w7
554	add	v28.4s,v28.4s,v17.4s
555	eor	w20,w20,w8
556	eor	v4.16b,v31.16b,v16.16b
557	ror	w21,w21,#24
558	eor	v5.16b,v19.16b,v20.16b
559	ror	w17,w17,#24
560	eor	v6.16b,v23.16b,v24.16b
561	ror	w19,w19,#24
562	eor	v7.16b,v27.16b,v28.16b
563	ror	w20,w20,#24
564	tbl	v31.16b,{v4.16b},v9.16b
565	add	w15,w15,w21
566	tbl	v19.16b,{v5.16b},v9.16b
567	add	w16,w16,w17
568	tbl	v23.16b,{v6.16b},v9.16b
569	add	w13,w13,w19
570	tbl	v27.16b,{v7.16b},v9.16b
571	add	w14,w14,w20
572	add	v26.4s,v26.4s,v31.4s
573	eor	w10,w10,w15
574	add	v30.4s,v30.4s,v19.4s
575	eor	w11,w11,w16
576	add	v18.4s,v18.4s,v23.4s
577	eor	w12,w12,w13
578	add	v22.4s,v22.4s,v27.4s
579	eor	w9,w9,w14
580	eor	v4.16b,v21.16b,v26.16b
581	ror	w10,w10,#25
582	eor	v5.16b,v25.16b,v30.16b
583	ror	w11,w11,#25
584	eor	v6.16b,v29.16b,v18.16b
585	ror	w12,w12,#25
586	eor	v7.16b,v17.16b,v22.16b
587	ror	w9,w9,#25
588	ushr	v21.4s,v4.4s,#25
589	ushr	v25.4s,v5.4s,#25
590	ushr	v29.4s,v6.4s,#25
591	ushr	v17.4s,v7.4s,#25
592	sli	v21.4s,v4.4s,#7
593	sli	v25.4s,v5.4s,#7
594	sli	v29.4s,v6.4s,#7
595	sli	v17.4s,v7.4s,#7
596	cbnz	x4,Loop_neon
597
// Re-apply the per-lane counter offsets to word 12 while the data is still
// in word-per-vector form. The base counter row (v3) is added after the
// transpose, so together they reproduce the initial word 12 of each lane.
598	add	v19.4s,v19.4s,v8.4s
599
// 4x4 transposes (zip1/zip2 on .4s, then on .2d) turn the word-per-vector
// layout into row-per-vector layout. Afterwards v16..v19 are rows 0..3 of
// vector block 0, v20..v23 of block 1, v24..v27 of block 2 and v28..v31 of
// block 3.
600	zip1	v4.4s,v16.4s,v20.4s			// transpose data
601	zip1	v5.4s,v24.4s,v28.4s
602	zip2	v6.4s,v16.4s,v20.4s
603	zip2	v7.4s,v24.4s,v28.4s
604	zip1	v16.2d,v4.2d,v5.2d
605	zip2	v20.2d,v4.2d,v5.2d
606	zip1	v24.2d,v6.2d,v7.2d
607	zip2	v28.2d,v6.2d,v7.2d
608
609	zip1	v4.4s,v17.4s,v21.4s
610	zip1	v5.4s,v25.4s,v29.4s
611	zip2	v6.4s,v17.4s,v21.4s
612	zip2	v7.4s,v25.4s,v29.4s
613	zip1	v17.2d,v4.2d,v5.2d
614	zip2	v21.2d,v4.2d,v5.2d
615	zip1	v25.2d,v6.2d,v7.2d
616	zip2	v29.2d,v6.2d,v7.2d
617
// The remaining two transposes are interleaved with adding the original key
// block to the scalar working state.
618	zip1	v4.4s,v18.4s,v22.4s
619	add	w5,w5,w22		// accumulate key block
620	zip1	v5.4s,v26.4s,v30.4s
621	add	x6,x6,x22,lsr#32
622	zip2	v6.4s,v18.4s,v22.4s
623	add	w7,w7,w23
624	zip2	v7.4s,v26.4s,v30.4s
625	add	x8,x8,x23,lsr#32
626	zip1	v18.2d,v4.2d,v5.2d
627	add	w9,w9,w24
628	zip2	v22.2d,v4.2d,v5.2d
629	add	x10,x10,x24,lsr#32
630	zip1	v26.2d,v6.2d,v7.2d
631	add	w11,w11,w25
632	zip2	v30.2d,v6.2d,v7.2d
633	add	x12,x12,x25,lsr#32
634
635	zip1	v4.4s,v19.4s,v23.4s
636	add	w13,w13,w26
637	zip1	v5.4s,v27.4s,v31.4s
638	add	x14,x14,x26,lsr#32
639	zip2	v6.4s,v19.4s,v23.4s
640	add	w15,w15,w27
641	zip2	v7.4s,v27.4s,v31.4s
642	add	x16,x16,x27,lsr#32
643	zip1	v19.2d,v4.2d,v5.2d
644	add	w17,w17,w28
645	zip2	v23.2d,v4.2d,v5.2d
646	add	x19,x19,x28,lsr#32
647	zip1	v27.2d,v6.2d,v7.2d
648	add	w20,w20,w30
649	zip2	v31.2d,v6.2d,v7.2d
650	add	x21,x21,x30,lsr#32
651
// Flags are still those of "subs x2,x2,#320": fewer than 320 bytes were
// left, so the tail code hands out the five blocks one at a time.
652	b.lo	Ltail_neon
653
// Full 320-byte step. Output order is: the scalar block first, then vector
// blocks 0..3. Packing, input loads, key-block accumulation, XOR and stores
// are interleaved; v4..v7 and then v16..v23 are recycled as input buffers
// once their keystream has been stored.
654	add	x5,x5,x6,lsl#32	// pack
655	add	x7,x7,x8,lsl#32
656	ldp	x6,x8,[x1,#0]		// load input
657	add	v16.4s,v16.4s,v0.4s			// accumulate key block
658	add	x9,x9,x10,lsl#32
659	add	x11,x11,x12,lsl#32
660	ldp	x10,x12,[x1,#16]
661	add	v17.4s,v17.4s,v1.4s
662	add	x13,x13,x14,lsl#32
663	add	x15,x15,x16,lsl#32
664	ldp	x14,x16,[x1,#32]
665	add	v18.4s,v18.4s,v2.4s
666	add	x17,x17,x19,lsl#32
667	add	x20,x20,x21,lsl#32
668	ldp	x19,x21,[x1,#48]
669	add	v19.4s,v19.4s,v3.4s
670	add	x1,x1,#64
671#ifdef	__AARCH64EB__
672	rev	x5,x5
673	rev	x7,x7
674	rev	x9,x9
675	rev	x11,x11
676	rev	x13,x13
677	rev	x15,x15
678	rev	x17,x17
679	rev	x20,x20
680#endif
681	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
682	eor	x5,x5,x6
683	add	v20.4s,v20.4s,v0.4s
684	eor	x7,x7,x8
685	add	v21.4s,v21.4s,v1.4s
686	eor	x9,x9,x10
687	add	v22.4s,v22.4s,v2.4s
688	eor	x11,x11,x12
689	add	v23.4s,v23.4s,v3.4s
690	eor	x13,x13,x14
691	eor	v16.16b,v16.16b,v4.16b
// v4 is free after the XOR above; it briefly holds the constant 5 used to
// advance the per-lane counter offsets in v8 for the next 320-byte step.
692	movi	v4.4s,#5
693	eor	x15,x15,x16
694	eor	v17.16b,v17.16b,v5.16b
695	eor	x17,x17,x19
696	eor	v18.16b,v18.16b,v6.16b
697	eor	x20,x20,x21
698	eor	v19.16b,v19.16b,v7.16b
699	add	v8.4s,v8.4s,v4.4s			// += 5
700	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
701
// NOTE(review): the scalar counter is advanced with a 64-bit add on x28
// (low half = word 12, high half = word 13), while the vector lanes use
// 32-bit lane adds. Presumably callers never let the 32-bit counter wrap
// within one call -- confirm against the caller.
702	stp	x5,x7,[x0,#0]		// store output
703	add	x28,x28,#5			// increment counter
704	stp	x9,x11,[x0,#16]
705	stp	x13,x15,[x0,#32]
706	stp	x17,x20,[x0,#48]
707	add	x0,x0,#64
708
709	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
710	add	v24.4s,v24.4s,v0.4s
711	add	v25.4s,v25.4s,v1.4s
712	add	v26.4s,v26.4s,v2.4s
713	add	v27.4s,v27.4s,v3.4s
714	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
715
716	eor	v20.16b,v20.16b,v4.16b
717	eor	v21.16b,v21.16b,v5.16b
718	eor	v22.16b,v22.16b,v6.16b
719	eor	v23.16b,v23.16b,v7.16b
720	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
721	add	v28.4s,v28.4s,v0.4s
722	add	v29.4s,v29.4s,v1.4s
723	add	v30.4s,v30.4s,v2.4s
724	add	v31.4s,v31.4s,v3.4s
725	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
726
727	eor	v24.16b,v24.16b,v16.16b
728	eor	v25.16b,v25.16b,v17.16b
729	eor	v26.16b,v26.16b,v18.16b
730	eor	v27.16b,v27.16b,v19.16b
731	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
732
733	eor	v28.16b,v28.16b,v20.16b
734	eor	v29.16b,v29.16b,v21.16b
735	eor	v30.16b,v30.16b,v22.16b
736	eor	v31.16b,v31.16b,v23.16b
737	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
738
// Still the "subs" flags: loop while bytes remain; fall through when the
// length was an exact multiple of 320.
739	b.hi	Loop_outer_neon
740
// Exit: restore d8/d9 from the base of the scratch area, then unwind the
// same frame layout as the scalar path.
741	ldp	d8,d9,[sp]			// meet ABI requirements
742
743	ldp	x19,x20,[x29,#16]
744	add	sp,sp,#64
745	ldp	x21,x22,[x29,#32]
746	ldp	x23,x24,[x29,#48]
747	ldp	x25,x26,[x29,#64]
748	ldp	x27,x28,[x29,#80]
749	ldp	x29,x30,[sp],#96
750.long	0xd50323bf			// autiasp
751	ret
752
// Tail of the 4-lane NEON path: fewer than 320 bytes remain.
// On entry the scalar block is accumulated but unpacked (w5..w21), the four
// vector blocks are transposed but have not yet had the key block (v0..v3)
// added, and x2 = remaining - 320. The blocks are consumed in output order:
// scalar block first, then v16..v19, v20..v23, v24..v27, v28..v31.
753.align	4
754Ltail_neon:
755	add	x2,x2,#320
// d8/d9 are restored now because the scratch area they were saved in is
// reused below (and by Less_than_64) to stage a partial keystream block.
756	ldp	d8,d9,[sp]			// meet ABI requirements
// The flags from this compare also feed the "b.eq Ldone_neon" after the
// scalar block has been stored; nothing in between writes the flags.
757	cmp	x2,#64
758	b.lo	Less_than_64
759
760	add	x5,x5,x6,lsl#32	// pack
761	add	x7,x7,x8,lsl#32
762	ldp	x6,x8,[x1,#0]		// load input
763	add	x9,x9,x10,lsl#32
764	add	x11,x11,x12,lsl#32
765	ldp	x10,x12,[x1,#16]
766	add	x13,x13,x14,lsl#32
767	add	x15,x15,x16,lsl#32
768	ldp	x14,x16,[x1,#32]
769	add	x17,x17,x19,lsl#32
770	add	x20,x20,x21,lsl#32
771	ldp	x19,x21,[x1,#48]
772	add	x1,x1,#64
773#ifdef	__AARCH64EB__
774	rev	x5,x5
775	rev	x7,x7
776	rev	x9,x9
777	rev	x11,x11
778	rev	x13,x13
779	rev	x15,x15
780	rev	x17,x17
781	rev	x20,x20
782#endif
783	eor	x5,x5,x6
784	eor	x7,x7,x8
785	eor	x9,x9,x10
786	eor	x11,x11,x12
787	eor	x13,x13,x14
788	eor	x15,x15,x16
789	eor	x17,x17,x19
790	eor	x20,x20,x21
791
// Store the scalar block while preparing vector block 0 (v16..v19) as the
// next 64 bytes of keystream.
792	stp	x5,x7,[x0,#0]		// store output
793	add	v16.4s,v16.4s,v0.4s			// accumulate key block
794	stp	x9,x11,[x0,#16]
795	add	v17.4s,v17.4s,v1.4s
796	stp	x13,x15,[x0,#32]
797	add	v18.4s,v18.4s,v2.4s
798	stp	x17,x20,[x0,#48]
799	add	v19.4s,v19.4s,v3.4s
800	add	x0,x0,#64
// Exactly 64 bytes remained: done.
801	b.eq	Ldone_neon
// From here on each stage follows the same pattern: subtract the 64 bytes
// just written, and if fewer than 64 remain go to Last_neon with the next
// keystream block in v16..v19; otherwise XOR/store a full block and stop if
// that was the last byte.
802	sub	x2,x2,#64
803	cmp	x2,#64
804	b.lo	Last_neon
805
806	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
807	eor	v16.16b,v16.16b,v4.16b
808	eor	v17.16b,v17.16b,v5.16b
809	eor	v18.16b,v18.16b,v6.16b
810	eor	v19.16b,v19.16b,v7.16b
811	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
812	b.eq	Ldone_neon
813
// Vector block 1 (v20..v23) + key block -> v16..v19.
814	add	v16.4s,v20.4s,v0.4s
815	add	v17.4s,v21.4s,v1.4s
816	sub	x2,x2,#64
817	add	v18.4s,v22.4s,v2.4s
818	cmp	x2,#64
819	add	v19.4s,v23.4s,v3.4s
820	b.lo	Last_neon
821
822	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
823	eor	v20.16b,v16.16b,v4.16b
824	eor	v21.16b,v17.16b,v5.16b
825	eor	v22.16b,v18.16b,v6.16b
826	eor	v23.16b,v19.16b,v7.16b
827	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
828	b.eq	Ldone_neon
829
// Vector block 2 (v24..v27) + key block -> v16..v19.
830	add	v16.4s,v24.4s,v0.4s
831	add	v17.4s,v25.4s,v1.4s
832	sub	x2,x2,#64
833	add	v18.4s,v26.4s,v2.4s
834	cmp	x2,#64
835	add	v19.4s,v27.4s,v3.4s
836	b.lo	Last_neon
837
838	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
839	eor	v24.16b,v16.16b,v4.16b
840	eor	v25.16b,v17.16b,v5.16b
841	eor	v26.16b,v18.16b,v6.16b
842	eor	v27.16b,v19.16b,v7.16b
843	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
844	b.eq	Ldone_neon
845
// Vector block 3 (v28..v31) + key block -> v16..v19. No compare is needed:
// fewer than 320 bytes remained on entry, so what is left now is a partial
// block and control falls through into Last_neon.
846	add	v16.4s,v28.4s,v0.4s
847	add	v17.4s,v29.4s,v1.4s
848	add	v18.4s,v30.4s,v2.4s
849	add	v19.4s,v31.4s,v3.4s
850	sub	x2,x2,#64
851
// Partial final block: spill the keystream in v16..v19 to the 64-byte stack
// scratch area and finish byte by byte.
852Last_neon:
853	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
854
// End-relative addressing with a negative index counting up to 0:
//   x1 = input end, x4 = scratch end (sp + len), x2 = -len.
// x0 is biased by -1 because the store below uses x2 after its increment.
855	sub	x0,x0,#1
856	add	x1,x1,x2
857	add	x0,x0,x2
858	add	x4,sp,x2
859	neg	x2,x2
860
861Loop_tail_neon:
862	ldrb	w10,[x1,x2]
863	ldrb	w11,[x4,x2]
864	add	x2,x2,#1
865	eor	w10,w10,w11
866	strb	w10,[x0,x2]
867	cbnz	x2,Loop_tail_neon
868
// Overwrite the spilled keystream with zeros before releasing the stack.
869	stp	xzr,xzr,[sp,#0]
870	stp	xzr,xzr,[sp,#16]
871	stp	xzr,xzr,[sp,#32]
872	stp	xzr,xzr,[sp,#48]
873
// Common exit for the NEON tail (d8/d9 were already restored at Ltail_neon).
874Ldone_neon:
875	ldp	x19,x20,[x29,#16]
876	add	sp,sp,#64
877	ldp	x21,x22,[x29,#32]
878	ldp	x23,x24,[x29,#48]
879	ldp	x25,x26,[x29,#64]
880	ldp	x27,x28,[x29,#80]
881	ldp	x29,x30,[sp],#96
882.long	0xd50323bf			// autiasp
883	ret
884
885
886.align	5
887ChaCha20_512_neon:
888.long	0xd503233f			// paciasp
889	stp	x29,x30,[sp,#-96]!
890	add	x29,sp,#0
891
892	adr	x5,Lsigma
893	stp	x19,x20,[sp,#16]
894	stp	x21,x22,[sp,#32]
895	stp	x23,x24,[sp,#48]
896	stp	x25,x26,[sp,#64]
897	stp	x27,x28,[sp,#80]
898
899L512_or_more_neon:
900	sub	sp,sp,#128+64
901
902	eor	v7.16b,v7.16b,v7.16b
903	ldp	x22,x23,[x5]		// load sigma
904	ld1	{v0.4s},[x5],#16
905	ldp	x24,x25,[x3]		// load key
906	ldp	x26,x27,[x3,#16]
907	ld1	{v1.4s,v2.4s},[x3]
908	ldp	x28,x30,[x4]		// load counter
909	ld1	{v3.4s},[x4]
910	ld1	{v7.s}[0],[x5]
911	add	x3,x5,#16			// Lrot24
912#ifdef	__AARCH64EB__
913	rev64	v0.4s,v0.4s
914	ror	x24,x24,#32
915	ror	x25,x25,#32
916	ror	x26,x26,#32
917	ror	x27,x27,#32
918	ror	x28,x28,#32
919	ror	x30,x30,#32
920#endif
921	add	v3.4s,v3.4s,v7.4s		// += 1
922	stp	q0,q1,[sp,#0]		// off-load key block, invariant part
923	add	v3.4s,v3.4s,v7.4s		// not typo
924	str	q2,[sp,#32]
925	add	v4.4s,v3.4s,v7.4s
926	add	v5.4s,v4.4s,v7.4s
927	add	v6.4s,v5.4s,v7.4s
928	shl	v7.4s,v7.4s,#2			// 1 -> 4
929
930	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
931	stp	d10,d11,[sp,#128+16]
932	stp	d12,d13,[sp,#128+32]
933	stp	d14,d15,[sp,#128+48]
934
935	sub	x2,x2,#512			// not typo
936
937Loop_outer_512_neon:
938	mov	v8.16b,v0.16b
939	mov	v12.16b,v0.16b
940	mov	v16.16b,v0.16b
941	mov	v20.16b,v0.16b
942	mov	v24.16b,v0.16b
943	mov	v28.16b,v0.16b
944	mov	v9.16b,v1.16b
945	mov	w5,w22			// unpack key block
946	mov	v13.16b,v1.16b
947	lsr	x6,x22,#32
948	mov	v17.16b,v1.16b
949	mov	w7,w23
950	mov	v21.16b,v1.16b
951	lsr	x8,x23,#32
952	mov	v25.16b,v1.16b
953	mov	w9,w24
954	mov	v29.16b,v1.16b
955	lsr	x10,x24,#32
956	mov	v11.16b,v3.16b
957	mov	w11,w25
958	mov	v15.16b,v4.16b
959	lsr	x12,x25,#32
960	mov	v19.16b,v5.16b
961	mov	w13,w26
962	mov	v23.16b,v6.16b
963	lsr	x14,x26,#32
964	mov	v10.16b,v2.16b
965	mov	w15,w27
966	mov	v14.16b,v2.16b
967	lsr	x16,x27,#32
968	add	v27.4s,v11.4s,v7.4s			// +4
969	mov	w17,w28
970	add	v31.4s,v15.4s,v7.4s			// +4
971	lsr	x19,x28,#32
972	mov	v18.16b,v2.16b
973	mov	w20,w30
974	mov	v22.16b,v2.16b
975	lsr	x21,x30,#32
976	mov	v26.16b,v2.16b
977	stp	q3,q4,[sp,#48]		// off-load key block, variable part
978	mov	v30.16b,v2.16b
979	stp	q5,q6,[sp,#80]
980
981	mov	x4,#5
982	ld1	{v6.4s},[x3]
983	subs	x2,x2,#512
984Loop_upper_neon:
985	sub	x4,x4,#1
986	add	v8.4s,v8.4s,v9.4s
987	add	w5,w5,w9
988	add	v12.4s,v12.4s,v13.4s
989	add	w6,w6,w10
990	add	v16.4s,v16.4s,v17.4s
991	add	w7,w7,w11
992	add	v20.4s,v20.4s,v21.4s
993	add	w8,w8,w12
994	add	v24.4s,v24.4s,v25.4s
995	eor	w17,w17,w5
996	add	v28.4s,v28.4s,v29.4s
997	eor	w19,w19,w6
998	eor	v11.16b,v11.16b,v8.16b
999	eor	w20,w20,w7
1000	eor	v15.16b,v15.16b,v12.16b
1001	eor	w21,w21,w8
1002	eor	v19.16b,v19.16b,v16.16b
1003	ror	w17,w17,#16
1004	eor	v23.16b,v23.16b,v20.16b
1005	ror	w19,w19,#16
1006	eor	v27.16b,v27.16b,v24.16b
1007	ror	w20,w20,#16
1008	eor	v31.16b,v31.16b,v28.16b
1009	ror	w21,w21,#16
1010	rev32	v11.8h,v11.8h
1011	add	w13,w13,w17
1012	rev32	v15.8h,v15.8h
1013	add	w14,w14,w19
1014	rev32	v19.8h,v19.8h
1015	add	w15,w15,w20
1016	rev32	v23.8h,v23.8h
1017	add	w16,w16,w21
1018	rev32	v27.8h,v27.8h
1019	eor	w9,w9,w13
1020	rev32	v31.8h,v31.8h
1021	eor	w10,w10,w14
1022	add	v10.4s,v10.4s,v11.4s
1023	eor	w11,w11,w15
1024	add	v14.4s,v14.4s,v15.4s
1025	eor	w12,w12,w16
1026	add	v18.4s,v18.4s,v19.4s
1027	ror	w9,w9,#20
1028	add	v22.4s,v22.4s,v23.4s
1029	ror	w10,w10,#20
1030	add	v26.4s,v26.4s,v27.4s
1031	ror	w11,w11,#20
1032	add	v30.4s,v30.4s,v31.4s
1033	ror	w12,w12,#20
1034	eor	v0.16b,v9.16b,v10.16b
1035	add	w5,w5,w9
1036	eor	v1.16b,v13.16b,v14.16b
1037	add	w6,w6,w10
1038	eor	v2.16b,v17.16b,v18.16b
1039	add	w7,w7,w11
1040	eor	v3.16b,v21.16b,v22.16b
1041	add	w8,w8,w12
1042	eor	v4.16b,v25.16b,v26.16b
1043	eor	w17,w17,w5
1044	eor	v5.16b,v29.16b,v30.16b
1045	eor	w19,w19,w6
1046	ushr	v9.4s,v0.4s,#20
1047	eor	w20,w20,w7
1048	ushr	v13.4s,v1.4s,#20
1049	eor	w21,w21,w8
1050	ushr	v17.4s,v2.4s,#20
1051	ror	w17,w17,#24
1052	ushr	v21.4s,v3.4s,#20
1053	ror	w19,w19,#24
1054	ushr	v25.4s,v4.4s,#20
1055	ror	w20,w20,#24
1056	ushr	v29.4s,v5.4s,#20
1057	ror	w21,w21,#24
1058	sli	v9.4s,v0.4s,#12
1059	add	w13,w13,w17
1060	sli	v13.4s,v1.4s,#12
1061	add	w14,w14,w19
1062	sli	v17.4s,v2.4s,#12
1063	add	w15,w15,w20
1064	sli	v21.4s,v3.4s,#12
1065	add	w16,w16,w21
1066	sli	v25.4s,v4.4s,#12
1067	eor	w9,w9,w13
1068	sli	v29.4s,v5.4s,#12
1069	eor	w10,w10,w14
1070	add	v8.4s,v8.4s,v9.4s
1071	eor	w11,w11,w15
1072	add	v12.4s,v12.4s,v13.4s
1073	eor	w12,w12,w16
1074	add	v16.4s,v16.4s,v17.4s
1075	ror	w9,w9,#25
1076	add	v20.4s,v20.4s,v21.4s
1077	ror	w10,w10,#25
1078	add	v24.4s,v24.4s,v25.4s
1079	ror	w11,w11,#25
1080	add	v28.4s,v28.4s,v29.4s
1081	ror	w12,w12,#25
1082	eor	v11.16b,v11.16b,v8.16b
1083	add	w5,w5,w10
1084	eor	v15.16b,v15.16b,v12.16b
1085	add	w6,w6,w11
1086	eor	v19.16b,v19.16b,v16.16b
1087	add	w7,w7,w12
1088	eor	v23.16b,v23.16b,v20.16b
1089	add	w8,w8,w9
1090	eor	v27.16b,v27.16b,v24.16b
1091	eor	w21,w21,w5
1092	eor	v31.16b,v31.16b,v28.16b
1093	eor	w17,w17,w6
1094	tbl	v11.16b,{v11.16b},v6.16b
1095	eor	w19,w19,w7
1096	tbl	v15.16b,{v15.16b},v6.16b
1097	eor	w20,w20,w8
1098	tbl	v19.16b,{v19.16b},v6.16b
1099	ror	w21,w21,#16
1100	tbl	v23.16b,{v23.16b},v6.16b
1101	ror	w17,w17,#16
1102	tbl	v27.16b,{v27.16b},v6.16b
1103	ror	w19,w19,#16
1104	tbl	v31.16b,{v31.16b},v6.16b
1105	ror	w20,w20,#16
1106	add	v10.4s,v10.4s,v11.4s
1107	add	w15,w15,w21
1108	add	v14.4s,v14.4s,v15.4s
1109	add	w16,w16,w17
1110	add	v18.4s,v18.4s,v19.4s
1111	add	w13,w13,w19
1112	add	v22.4s,v22.4s,v23.4s
1113	add	w14,w14,w20
1114	add	v26.4s,v26.4s,v27.4s
1115	eor	w10,w10,w15
1116	add	v30.4s,v30.4s,v31.4s
1117	eor	w11,w11,w16
1118	eor	v0.16b,v9.16b,v10.16b
1119	eor	w12,w12,w13
1120	eor	v1.16b,v13.16b,v14.16b
1121	eor	w9,w9,w14
1122	eor	v2.16b,v17.16b,v18.16b
1123	ror	w10,w10,#20
1124	eor	v3.16b,v21.16b,v22.16b
1125	ror	w11,w11,#20
1126	eor	v4.16b,v25.16b,v26.16b
1127	ror	w12,w12,#20
1128	eor	v5.16b,v29.16b,v30.16b
1129	ror	w9,w9,#20
1130	ushr	v9.4s,v0.4s,#25
1131	add	w5,w5,w10
1132	ushr	v13.4s,v1.4s,#25
1133	add	w6,w6,w11
1134	ushr	v17.4s,v2.4s,#25
1135	add	w7,w7,w12
1136	ushr	v21.4s,v3.4s,#25
1137	add	w8,w8,w9
1138	ushr	v25.4s,v4.4s,#25
1139	eor	w21,w21,w5
1140	ushr	v29.4s,v5.4s,#25
1141	eor	w17,w17,w6
1142	sli	v9.4s,v0.4s,#7
1143	eor	w19,w19,w7
1144	sli	v13.4s,v1.4s,#7
1145	eor	w20,w20,w8
1146	sli	v17.4s,v2.4s,#7
1147	ror	w21,w21,#24
1148	sli	v21.4s,v3.4s,#7
1149	ror	w17,w17,#24
1150	sli	v25.4s,v4.4s,#7
1151	ror	w19,w19,#24
1152	sli	v29.4s,v5.4s,#7
1153	ror	w20,w20,#24
1154	ext	v10.16b,v10.16b,v10.16b,#8
1155	add	w15,w15,w21
1156	ext	v14.16b,v14.16b,v14.16b,#8
1157	add	w16,w16,w17
1158	ext	v18.16b,v18.16b,v18.16b,#8
1159	add	w13,w13,w19
1160	ext	v22.16b,v22.16b,v22.16b,#8
1161	add	w14,w14,w20
1162	ext	v26.16b,v26.16b,v26.16b,#8
1163	eor	w10,w10,w15
1164	ext	v30.16b,v30.16b,v30.16b,#8
1165	eor	w11,w11,w16
1166	ext	v11.16b,v11.16b,v11.16b,#12
1167	eor	w12,w12,w13
1168	ext	v15.16b,v15.16b,v15.16b,#12
1169	eor	w9,w9,w14
1170	ext	v19.16b,v19.16b,v19.16b,#12
1171	ror	w10,w10,#25
1172	ext	v23.16b,v23.16b,v23.16b,#12
1173	ror	w11,w11,#25
1174	ext	v27.16b,v27.16b,v27.16b,#12
1175	ror	w12,w12,#25
1176	ext	v31.16b,v31.16b,v31.16b,#12
1177	ror	w9,w9,#25
1178	ext	v9.16b,v9.16b,v9.16b,#4
1179	ext	v13.16b,v13.16b,v13.16b,#4
1180	ext	v17.16b,v17.16b,v17.16b,#4
1181	ext	v21.16b,v21.16b,v21.16b,#4
1182	ext	v25.16b,v25.16b,v25.16b,#4
1183	ext	v29.16b,v29.16b,v29.16b,#4
1184	add	v8.4s,v8.4s,v9.4s
1185	add	w5,w5,w9
1186	add	v12.4s,v12.4s,v13.4s
1187	add	w6,w6,w10
1188	add	v16.4s,v16.4s,v17.4s
1189	add	w7,w7,w11
1190	add	v20.4s,v20.4s,v21.4s
1191	add	w8,w8,w12
1192	add	v24.4s,v24.4s,v25.4s
1193	eor	w17,w17,w5
1194	add	v28.4s,v28.4s,v29.4s
1195	eor	w19,w19,w6
1196	eor	v11.16b,v11.16b,v8.16b
1197	eor	w20,w20,w7
1198	eor	v15.16b,v15.16b,v12.16b
1199	eor	w21,w21,w8
1200	eor	v19.16b,v19.16b,v16.16b
1201	ror	w17,w17,#16
1202	eor	v23.16b,v23.16b,v20.16b
1203	ror	w19,w19,#16
1204	eor	v27.16b,v27.16b,v24.16b
1205	ror	w20,w20,#16
1206	eor	v31.16b,v31.16b,v28.16b
1207	ror	w21,w21,#16
1208	rev32	v11.8h,v11.8h
1209	add	w13,w13,w17
1210	rev32	v15.8h,v15.8h
1211	add	w14,w14,w19
1212	rev32	v19.8h,v19.8h
1213	add	w15,w15,w20
1214	rev32	v23.8h,v23.8h
1215	add	w16,w16,w21
1216	rev32	v27.8h,v27.8h
1217	eor	w9,w9,w13
1218	rev32	v31.8h,v31.8h
1219	eor	w10,w10,w14
1220	add	v10.4s,v10.4s,v11.4s
1221	eor	w11,w11,w15
1222	add	v14.4s,v14.4s,v15.4s
1223	eor	w12,w12,w16
1224	add	v18.4s,v18.4s,v19.4s
1225	ror	w9,w9,#20
1226	add	v22.4s,v22.4s,v23.4s
1227	ror	w10,w10,#20
1228	add	v26.4s,v26.4s,v27.4s
1229	ror	w11,w11,#20
1230	add	v30.4s,v30.4s,v31.4s
1231	ror	w12,w12,#20
1232	eor	v0.16b,v9.16b,v10.16b
1233	add	w5,w5,w9
1234	eor	v1.16b,v13.16b,v14.16b
1235	add	w6,w6,w10
1236	eor	v2.16b,v17.16b,v18.16b
1237	add	w7,w7,w11
1238	eor	v3.16b,v21.16b,v22.16b
1239	add	w8,w8,w12
1240	eor	v4.16b,v25.16b,v26.16b
1241	eor	w17,w17,w5
1242	eor	v5.16b,v29.16b,v30.16b
1243	eor	w19,w19,w6
1244	ushr	v9.4s,v0.4s,#20
1245	eor	w20,w20,w7
1246	ushr	v13.4s,v1.4s,#20
1247	eor	w21,w21,w8
1248	ushr	v17.4s,v2.4s,#20
1249	ror	w17,w17,#24
1250	ushr	v21.4s,v3.4s,#20
1251	ror	w19,w19,#24
1252	ushr	v25.4s,v4.4s,#20
1253	ror	w20,w20,#24
1254	ushr	v29.4s,v5.4s,#20
1255	ror	w21,w21,#24
1256	sli	v9.4s,v0.4s,#12
1257	add	w13,w13,w17
1258	sli	v13.4s,v1.4s,#12
1259	add	w14,w14,w19
1260	sli	v17.4s,v2.4s,#12
1261	add	w15,w15,w20
1262	sli	v21.4s,v3.4s,#12
1263	add	w16,w16,w21
1264	sli	v25.4s,v4.4s,#12
1265	eor	w9,w9,w13
1266	sli	v29.4s,v5.4s,#12
1267	eor	w10,w10,w14
1268	add	v8.4s,v8.4s,v9.4s
1269	eor	w11,w11,w15
1270	add	v12.4s,v12.4s,v13.4s
1271	eor	w12,w12,w16
1272	add	v16.4s,v16.4s,v17.4s
1273	ror	w9,w9,#25
1274	add	v20.4s,v20.4s,v21.4s
1275	ror	w10,w10,#25
1276	add	v24.4s,v24.4s,v25.4s
1277	ror	w11,w11,#25
1278	add	v28.4s,v28.4s,v29.4s
1279	ror	w12,w12,#25
1280	eor	v11.16b,v11.16b,v8.16b
1281	add	w5,w5,w10
1282	eor	v15.16b,v15.16b,v12.16b
1283	add	w6,w6,w11
1284	eor	v19.16b,v19.16b,v16.16b
1285	add	w7,w7,w12
1286	eor	v23.16b,v23.16b,v20.16b
1287	add	w8,w8,w9
1288	eor	v27.16b,v27.16b,v24.16b
1289	eor	w21,w21,w5
1290	eor	v31.16b,v31.16b,v28.16b
1291	eor	w17,w17,w6
1292	tbl	v11.16b,{v11.16b},v6.16b
1293	eor	w19,w19,w7
1294	tbl	v15.16b,{v15.16b},v6.16b
1295	eor	w20,w20,w8
1296	tbl	v19.16b,{v19.16b},v6.16b
1297	ror	w21,w21,#16
1298	tbl	v23.16b,{v23.16b},v6.16b
1299	ror	w17,w17,#16
1300	tbl	v27.16b,{v27.16b},v6.16b
1301	ror	w19,w19,#16
1302	tbl	v31.16b,{v31.16b},v6.16b
1303	ror	w20,w20,#16
1304	add	v10.4s,v10.4s,v11.4s
1305	add	w15,w15,w21
1306	add	v14.4s,v14.4s,v15.4s
1307	add	w16,w16,w17
1308	add	v18.4s,v18.4s,v19.4s
1309	add	w13,w13,w19
1310	add	v22.4s,v22.4s,v23.4s
1311	add	w14,w14,w20
1312	add	v26.4s,v26.4s,v27.4s
1313	eor	w10,w10,w15
1314	add	v30.4s,v30.4s,v31.4s
1315	eor	w11,w11,w16
1316	eor	v0.16b,v9.16b,v10.16b
1317	eor	w12,w12,w13
1318	eor	v1.16b,v13.16b,v14.16b
1319	eor	w9,w9,w14
1320	eor	v2.16b,v17.16b,v18.16b
1321	ror	w10,w10,#20
1322	eor	v3.16b,v21.16b,v22.16b
1323	ror	w11,w11,#20
1324	eor	v4.16b,v25.16b,v26.16b
1325	ror	w12,w12,#20
1326	eor	v5.16b,v29.16b,v30.16b
1327	ror	w9,w9,#20
1328	ushr	v9.4s,v0.4s,#25
1329	add	w5,w5,w10
1330	ushr	v13.4s,v1.4s,#25
1331	add	w6,w6,w11
1332	ushr	v17.4s,v2.4s,#25
1333	add	w7,w7,w12
1334	ushr	v21.4s,v3.4s,#25
1335	add	w8,w8,w9
1336	ushr	v25.4s,v4.4s,#25
1337	eor	w21,w21,w5
1338	ushr	v29.4s,v5.4s,#25
1339	eor	w17,w17,w6
1340	sli	v9.4s,v0.4s,#7
1341	eor	w19,w19,w7
1342	sli	v13.4s,v1.4s,#7
1343	eor	w20,w20,w8
1344	sli	v17.4s,v2.4s,#7
1345	ror	w21,w21,#24
1346	sli	v21.4s,v3.4s,#7
1347	ror	w17,w17,#24
1348	sli	v25.4s,v4.4s,#7
1349	ror	w19,w19,#24
1350	sli	v29.4s,v5.4s,#7
1351	ror	w20,w20,#24
1352	ext	v10.16b,v10.16b,v10.16b,#8
1353	add	w15,w15,w21
1354	ext	v14.16b,v14.16b,v14.16b,#8
1355	add	w16,w16,w17
1356	ext	v18.16b,v18.16b,v18.16b,#8
1357	add	w13,w13,w19
1358	ext	v22.16b,v22.16b,v22.16b,#8
1359	add	w14,w14,w20
1360	ext	v26.16b,v26.16b,v26.16b,#8
1361	eor	w10,w10,w15
1362	ext	v30.16b,v30.16b,v30.16b,#8
1363	eor	w11,w11,w16
1364	ext	v11.16b,v11.16b,v11.16b,#4
1365	eor	w12,w12,w13
1366	ext	v15.16b,v15.16b,v15.16b,#4
1367	eor	w9,w9,w14
1368	ext	v19.16b,v19.16b,v19.16b,#4
1369	ror	w10,w10,#25
1370	ext	v23.16b,v23.16b,v23.16b,#4
1371	ror	w11,w11,#25
1372	ext	v27.16b,v27.16b,v27.16b,#4
1373	ror	w12,w12,#25
1374	ext	v31.16b,v31.16b,v31.16b,#4
1375	ror	w9,w9,#25
1376	ext	v9.16b,v9.16b,v9.16b,#12
1377	ext	v13.16b,v13.16b,v13.16b,#12
1378	ext	v17.16b,v17.16b,v17.16b,#12
1379	ext	v21.16b,v21.16b,v21.16b,#12
1380	ext	v25.16b,v25.16b,v25.16b,#12
1381	ext	v29.16b,v29.16b,v29.16b,#12
1382	cbnz	x4,Loop_upper_neon
1383
// Loop_upper_neon has run x4 down to zero. Finish the scalar block held in
// w5-w17,w19,w20,w21: add the input state back in, XOR with 64 bytes read
// from x1, write 64 bytes to x0, then re-load the scalar registers for the
// next scalar block and set up the iteration count for Loop_lower_neon.
//
// x22-x28 and x30 each supply two 32-bit state words: the low word through
// the w-view and the high word through lsr#32 (the same split that the
// "unpack key block" sequence further down uses).
// NOTE(review): these registers are loaded before this point in the function
// (not visible here); HEAD shows the same layout in _ChaCha20_ctr32 - confirm.
1384	add	w5,w5,w22		// accumulate key block
1385	add	x6,x6,x22,lsr#32	// 64-bit add: any carry above bit 31 is dropped by the lsl#32 pack below
1386	add	w7,w7,w23
1387	add	x8,x8,x23,lsr#32
1388	add	w9,w9,w24
1389	add	x10,x10,x24,lsr#32
1390	add	w11,w11,w25
1391	add	x12,x12,x25,lsr#32
1392	add	w13,w13,w26
1393	add	x14,x14,x26,lsr#32
1394	add	w15,w15,w27
1395	add	x16,x16,x27,lsr#32
1396	add	w17,w17,w28
1397	add	x19,x19,x28,lsr#32
1398	add	w20,w20,w30
1399	add	x21,x21,x30,lsr#32
1400
// Pack each word pair into one 64-bit register (even word | odd word << 32).
// The odd-word registers are free after each pack, so they are immediately
// reused as destinations for the 64 input bytes loaded from x1.
1401	add	x5,x5,x6,lsl#32	// pack
1402	add	x7,x7,x8,lsl#32
1403	ldp	x6,x8,[x1,#0]		// load input
1404	add	x9,x9,x10,lsl#32
1405	add	x11,x11,x12,lsl#32
1406	ldp	x10,x12,[x1,#16]
1407	add	x13,x13,x14,lsl#32
1408	add	x15,x15,x16,lsl#32
1409	ldp	x14,x16,[x1,#32]
1410	add	x17,x17,x19,lsl#32
1411	add	x20,x20,x21,lsl#32
1412	ldp	x19,x21,[x1,#48]
1413	add	x1,x1,#64		// input pointer now past this 64-byte block
1414#ifdef	__AARCH64EB__
// Big-endian build only: byte-reverse every packed 64-bit value before the XOR.
1415	rev	x5,x5
1416	rev	x7,x7
1417	rev	x9,x9
1418	rev	x11,x11
1419	rev	x13,x13
1420	rev	x15,x15
1421	rev	x17,x17
1422	rev	x20,x20
1423#endif
1424	eor	x5,x5,x6		// packed block ^ input
1425	eor	x7,x7,x8
1426	eor	x9,x9,x10
1427	eor	x11,x11,x12
1428	eor	x13,x13,x14
1429	eor	x15,x15,x16
1430	eor	x17,x17,x19
1431	eor	x20,x20,x21
1432
// Store the 64 result bytes to x0. The stores are interleaved with the
// re-unpack of x22-x30 into w5-w21, which is the starting state of the next
// scalar block. x28 is incremented before it is unpacked, so that block sees
// a counter one higher. The increment is a 64-bit add on the packed pair, so
// a carry out of the low word would reach the high word.
1433	stp	x5,x7,[x0,#0]		// store output
1434	add	x28,x28,#1			// increment counter
1435	mov	w5,w22			// unpack key block
1436	lsr	x6,x22,#32
1437	stp	x9,x11,[x0,#16]
1438	mov	w7,w23
1439	lsr	x8,x23,#32
1440	stp	x13,x15,[x0,#32]
1441	mov	w9,w24
1442	lsr	x10,x24,#32
1443	stp	x17,x20,[x0,#48]
1444	add	x0,x0,#64		// output pointer now past this 64-byte block
1445	mov	w11,w25
1446	lsr	x12,x25,#32
1447	mov	w13,w26
1448	lsr	x14,x26,#32
1449	mov	w15,w27
1450	lsr	x16,x27,#32
1451	mov	w17,w28
1452	lsr	x19,x28,#32
1453	mov	w20,w30
1454	lsr	x21,x30,#32
1455
1456	mov	x4,#5		// iteration count for Loop_lower_neon
// Loop_lower_neon: runs five times (x4 is set to 5 immediately before this
// label, decremented at the top and tested by the cbnz at the bottom).
// Vector and scalar instructions alternate line by line; the two streams use
// disjoint registers and do not depend on each other.
//
// Vector stream: six 4-register groups (v8-v11, v12-v15, v16-v19, v20-v23,
//   v24-v27, v28-v31), each treated as rows a,b,c,d. One iteration performs
//   two quarter-round passes on every group. The EXT lane rotations after the
//   first pass (c by 8, d by 12, b by 4 bytes) are undone by those after the
//   second (c by 8, d by 4, b by 12 bytes). v0-v5 are scratch for rotates.
//   NOTE(review): v6 is only read here, as the TBL index; it presumably holds
//   the Lrot24 byte-shuffle table loaded earlier in the function - confirm.
// Scalar stream: one 16-word state in w5-w17,w19,w20,w21 (a = w5-w8,
//   b = w9-w12, c = w13-w16, d = w17,w19,w20,w21). One iteration performs four
//   quarter-round passes (column, diagonal, column, diagonal), i.e. twice as
//   many passes per iteration as the vector stream.
// Left-rotations are by 16, 12, 8 and 7 bits: the scalar code uses
// ror #16/#20/#24/#25; the vector code uses rev32 (16), ushr+sli (12 and 7)
// and tbl with v6 (the 8-bit step, assuming v6 = Lrot24).
1457Loop_lower_neon:
1458	sub	x4,x4,#1		// one fewer iteration left; tested by cbnz at the bottom
1459	add	v8.4s,v8.4s,v9.4s	// vector pass 1: a += b (six groups)
1460	add	w5,w5,w9	// scalar column pass: a += b
1461	add	v12.4s,v12.4s,v13.4s
1462	add	w6,w6,w10
1463	add	v16.4s,v16.4s,v17.4s
1464	add	w7,w7,w11
1465	add	v20.4s,v20.4s,v21.4s
1466	add	w8,w8,w12
1467	add	v24.4s,v24.4s,v25.4s
1468	eor	w17,w17,w5	// scalar: d ^= a
1469	add	v28.4s,v28.4s,v29.4s
1470	eor	w19,w19,w6
1471	eor	v11.16b,v11.16b,v8.16b	// vector: d ^= a
1472	eor	w20,w20,w7
1473	eor	v15.16b,v15.16b,v12.16b
1474	eor	w21,w21,w8
1475	eor	v19.16b,v19.16b,v16.16b
1476	ror	w17,w17,#16	// scalar: d = rotl(d,16)
1477	eor	v23.16b,v23.16b,v20.16b
1478	ror	w19,w19,#16
1479	eor	v27.16b,v27.16b,v24.16b
1480	ror	w20,w20,#16
1481	eor	v31.16b,v31.16b,v28.16b
1482	ror	w21,w21,#16
1483	rev32	v11.8h,v11.8h	// vector: swap 16-bit halves of each word = rotate by 16
1484	add	w13,w13,w17	// scalar: c += d
1485	rev32	v15.8h,v15.8h
1486	add	w14,w14,w19
1487	rev32	v19.8h,v19.8h
1488	add	w15,w15,w20
1489	rev32	v23.8h,v23.8h
1490	add	w16,w16,w21
1491	rev32	v27.8h,v27.8h
1492	eor	w9,w9,w13	// scalar: b ^= c
1493	rev32	v31.8h,v31.8h
1494	eor	w10,w10,w14
1495	add	v10.4s,v10.4s,v11.4s	// vector: c += d
1496	eor	w11,w11,w15
1497	add	v14.4s,v14.4s,v15.4s
1498	eor	w12,w12,w16
1499	add	v18.4s,v18.4s,v19.4s
1500	ror	w9,w9,#20	// scalar: rotate right 20 = rotl(b,12)
1501	add	v22.4s,v22.4s,v23.4s
1502	ror	w10,w10,#20
1503	add	v26.4s,v26.4s,v27.4s
1504	ror	w11,w11,#20
1505	add	v30.4s,v30.4s,v31.4s
1506	ror	w12,w12,#20
1507	eor	v0.16b,v9.16b,v10.16b	// vector: t = b ^ c, kept in scratch v0-v5
1508	add	w5,w5,w9
1509	eor	v1.16b,v13.16b,v14.16b
1510	add	w6,w6,w10
1511	eor	v2.16b,v17.16b,v18.16b
1512	add	w7,w7,w11
1513	eor	v3.16b,v21.16b,v22.16b
1514	add	w8,w8,w12
1515	eor	v4.16b,v25.16b,v26.16b
1516	eor	w17,w17,w5
1517	eor	v5.16b,v29.16b,v30.16b
1518	eor	w19,w19,w6
1519	ushr	v9.4s,v0.4s,#20	// vector: b = t >> 20 ...
1520	eor	w20,w20,w7
1521	ushr	v13.4s,v1.4s,#20
1522	eor	w21,w21,w8
1523	ushr	v17.4s,v2.4s,#20
1524	ror	w17,w17,#24	// scalar: rotate right 24 = rotl(d,8)
1525	ushr	v21.4s,v3.4s,#20
1526	ror	w19,w19,#24
1527	ushr	v25.4s,v4.4s,#20
1528	ror	w20,w20,#24
1529	ushr	v29.4s,v5.4s,#20
1530	ror	w21,w21,#24
1531	sli	v9.4s,v0.4s,#12	// ... with t << 12 inserted: b = rotl(t,12)
1532	add	w13,w13,w17
1533	sli	v13.4s,v1.4s,#12
1534	add	w14,w14,w19
1535	sli	v17.4s,v2.4s,#12
1536	add	w15,w15,w20
1537	sli	v21.4s,v3.4s,#12
1538	add	w16,w16,w21
1539	sli	v25.4s,v4.4s,#12
1540	eor	w9,w9,w13
1541	sli	v29.4s,v5.4s,#12
1542	eor	w10,w10,w14
1543	add	v8.4s,v8.4s,v9.4s
1544	eor	w11,w11,w15
1545	add	v12.4s,v12.4s,v13.4s
1546	eor	w12,w12,w16
1547	add	v16.4s,v16.4s,v17.4s
1548	ror	w9,w9,#25	// scalar: rotate right 25 = rotl(b,7); column pass done
1549	add	v20.4s,v20.4s,v21.4s
1550	ror	w10,w10,#25
1551	add	v24.4s,v24.4s,v25.4s
1552	ror	w11,w11,#25
1553	add	v28.4s,v28.4s,v29.4s
1554	ror	w12,w12,#25
1555	eor	v11.16b,v11.16b,v8.16b
1556	add	w5,w5,w10	// scalar diagonal pass: b and d indices shifted against a
1557	eor	v15.16b,v15.16b,v12.16b
1558	add	w6,w6,w11
1559	eor	v19.16b,v19.16b,v16.16b
1560	add	w7,w7,w12
1561	eor	v23.16b,v23.16b,v20.16b
1562	add	w8,w8,w9
1563	eor	v27.16b,v27.16b,v24.16b
1564	eor	w21,w21,w5
1565	eor	v31.16b,v31.16b,v28.16b
1566	eor	w17,w17,w6
1567	tbl	v11.16b,{v11.16b},v6.16b	// vector: permute bytes of d through the table in v6
1568	eor	w19,w19,w7
1569	tbl	v15.16b,{v15.16b},v6.16b
1570	eor	w20,w20,w8
1571	tbl	v19.16b,{v19.16b},v6.16b
1572	ror	w21,w21,#16
1573	tbl	v23.16b,{v23.16b},v6.16b
1574	ror	w17,w17,#16
1575	tbl	v27.16b,{v27.16b},v6.16b
1576	ror	w19,w19,#16
1577	tbl	v31.16b,{v31.16b},v6.16b
1578	ror	w20,w20,#16
1579	add	v10.4s,v10.4s,v11.4s
1580	add	w15,w15,w21
1581	add	v14.4s,v14.4s,v15.4s
1582	add	w16,w16,w17
1583	add	v18.4s,v18.4s,v19.4s
1584	add	w13,w13,w19
1585	add	v22.4s,v22.4s,v23.4s
1586	add	w14,w14,w20
1587	add	v26.4s,v26.4s,v27.4s
1588	eor	w10,w10,w15
1589	add	v30.4s,v30.4s,v31.4s
1590	eor	w11,w11,w16
1591	eor	v0.16b,v9.16b,v10.16b
1592	eor	w12,w12,w13
1593	eor	v1.16b,v13.16b,v14.16b
1594	eor	w9,w9,w14
1595	eor	v2.16b,v17.16b,v18.16b
1596	ror	w10,w10,#20
1597	eor	v3.16b,v21.16b,v22.16b
1598	ror	w11,w11,#20
1599	eor	v4.16b,v25.16b,v26.16b
1600	ror	w12,w12,#20
1601	eor	v5.16b,v29.16b,v30.16b
1602	ror	w9,w9,#20
1603	ushr	v9.4s,v0.4s,#25	// vector: with the sli #7 below, b = rotl(t,7)
1604	add	w5,w5,w10
1605	ushr	v13.4s,v1.4s,#25
1606	add	w6,w6,w11
1607	ushr	v17.4s,v2.4s,#25
1608	add	w7,w7,w12
1609	ushr	v21.4s,v3.4s,#25
1610	add	w8,w8,w9
1611	ushr	v25.4s,v4.4s,#25
1612	eor	w21,w21,w5
1613	ushr	v29.4s,v5.4s,#25
1614	eor	w17,w17,w6
1615	sli	v9.4s,v0.4s,#7
1616	eor	w19,w19,w7
1617	sli	v13.4s,v1.4s,#7
1618	eor	w20,w20,w8
1619	sli	v17.4s,v2.4s,#7
1620	ror	w21,w21,#24
1621	sli	v21.4s,v3.4s,#7
1622	ror	w17,w17,#24
1623	sli	v25.4s,v4.4s,#7
1624	ror	w19,w19,#24
1625	sli	v29.4s,v5.4s,#7
1626	ror	w20,w20,#24
1627	ext	v10.16b,v10.16b,v10.16b,#8	// vector: rotate row c by 8 bytes (two lanes)
1628	add	w15,w15,w21
1629	ext	v14.16b,v14.16b,v14.16b,#8
1630	add	w16,w16,w17
1631	ext	v18.16b,v18.16b,v18.16b,#8
1632	add	w13,w13,w19
1633	ext	v22.16b,v22.16b,v22.16b,#8
1634	add	w14,w14,w20
1635	ext	v26.16b,v26.16b,v26.16b,#8
1636	eor	w10,w10,w15
1637	ext	v30.16b,v30.16b,v30.16b,#8
1638	eor	w11,w11,w16
1639	ext	v11.16b,v11.16b,v11.16b,#12	// vector: rotate row d by 12 bytes
1640	eor	w12,w12,w13
1641	ext	v15.16b,v15.16b,v15.16b,#12
1642	eor	w9,w9,w14
1643	ext	v19.16b,v19.16b,v19.16b,#12
1644	ror	w10,w10,#25
1645	ext	v23.16b,v23.16b,v23.16b,#12
1646	ror	w11,w11,#25
1647	ext	v27.16b,v27.16b,v27.16b,#12
1648	ror	w12,w12,#25
1649	ext	v31.16b,v31.16b,v31.16b,#12
1650	ror	w9,w9,#25
1651	ext	v9.16b,v9.16b,v9.16b,#4	// vector: rotate row b by 4 bytes
1652	ext	v13.16b,v13.16b,v13.16b,#4
1653	ext	v17.16b,v17.16b,v17.16b,#4
1654	ext	v21.16b,v21.16b,v21.16b,#4
1655	ext	v25.16b,v25.16b,v25.16b,#4
1656	ext	v29.16b,v29.16b,v29.16b,#4
// Second half of the iteration: vector pass 2 on the lane-rotated rows,
// interleaved with the scalar stream's second column + diagonal pair.
1657	add	v8.4s,v8.4s,v9.4s
1658	add	w5,w5,w9
1659	add	v12.4s,v12.4s,v13.4s
1660	add	w6,w6,w10
1661	add	v16.4s,v16.4s,v17.4s
1662	add	w7,w7,w11
1663	add	v20.4s,v20.4s,v21.4s
1664	add	w8,w8,w12
1665	add	v24.4s,v24.4s,v25.4s
1666	eor	w17,w17,w5
1667	add	v28.4s,v28.4s,v29.4s
1668	eor	w19,w19,w6
1669	eor	v11.16b,v11.16b,v8.16b
1670	eor	w20,w20,w7
1671	eor	v15.16b,v15.16b,v12.16b
1672	eor	w21,w21,w8
1673	eor	v19.16b,v19.16b,v16.16b
1674	ror	w17,w17,#16
1675	eor	v23.16b,v23.16b,v20.16b
1676	ror	w19,w19,#16
1677	eor	v27.16b,v27.16b,v24.16b
1678	ror	w20,w20,#16
1679	eor	v31.16b,v31.16b,v28.16b
1680	ror	w21,w21,#16
1681	rev32	v11.8h,v11.8h
1682	add	w13,w13,w17
1683	rev32	v15.8h,v15.8h
1684	add	w14,w14,w19
1685	rev32	v19.8h,v19.8h
1686	add	w15,w15,w20
1687	rev32	v23.8h,v23.8h
1688	add	w16,w16,w21
1689	rev32	v27.8h,v27.8h
1690	eor	w9,w9,w13
1691	rev32	v31.8h,v31.8h
1692	eor	w10,w10,w14
1693	add	v10.4s,v10.4s,v11.4s
1694	eor	w11,w11,w15
1695	add	v14.4s,v14.4s,v15.4s
1696	eor	w12,w12,w16
1697	add	v18.4s,v18.4s,v19.4s
1698	ror	w9,w9,#20
1699	add	v22.4s,v22.4s,v23.4s
1700	ror	w10,w10,#20
1701	add	v26.4s,v26.4s,v27.4s
1702	ror	w11,w11,#20
1703	add	v30.4s,v30.4s,v31.4s
1704	ror	w12,w12,#20
1705	eor	v0.16b,v9.16b,v10.16b
1706	add	w5,w5,w9
1707	eor	v1.16b,v13.16b,v14.16b
1708	add	w6,w6,w10
1709	eor	v2.16b,v17.16b,v18.16b
1710	add	w7,w7,w11
1711	eor	v3.16b,v21.16b,v22.16b
1712	add	w8,w8,w12
1713	eor	v4.16b,v25.16b,v26.16b
1714	eor	w17,w17,w5
1715	eor	v5.16b,v29.16b,v30.16b
1716	eor	w19,w19,w6
1717	ushr	v9.4s,v0.4s,#20
1718	eor	w20,w20,w7
1719	ushr	v13.4s,v1.4s,#20
1720	eor	w21,w21,w8
1721	ushr	v17.4s,v2.4s,#20
1722	ror	w17,w17,#24
1723	ushr	v21.4s,v3.4s,#20
1724	ror	w19,w19,#24
1725	ushr	v25.4s,v4.4s,#20
1726	ror	w20,w20,#24
1727	ushr	v29.4s,v5.4s,#20
1728	ror	w21,w21,#24
1729	sli	v9.4s,v0.4s,#12
1730	add	w13,w13,w17
1731	sli	v13.4s,v1.4s,#12
1732	add	w14,w14,w19
1733	sli	v17.4s,v2.4s,#12
1734	add	w15,w15,w20
1735	sli	v21.4s,v3.4s,#12
1736	add	w16,w16,w21
1737	sli	v25.4s,v4.4s,#12
1738	eor	w9,w9,w13
1739	sli	v29.4s,v5.4s,#12
1740	eor	w10,w10,w14
1741	add	v8.4s,v8.4s,v9.4s
1742	eor	w11,w11,w15
1743	add	v12.4s,v12.4s,v13.4s
1744	eor	w12,w12,w16
1745	add	v16.4s,v16.4s,v17.4s
1746	ror	w9,w9,#25
1747	add	v20.4s,v20.4s,v21.4s
1748	ror	w10,w10,#25
1749	add	v24.4s,v24.4s,v25.4s
1750	ror	w11,w11,#25
1751	add	v28.4s,v28.4s,v29.4s
1752	ror	w12,w12,#25
1753	eor	v11.16b,v11.16b,v8.16b
1754	add	w5,w5,w10
1755	eor	v15.16b,v15.16b,v12.16b
1756	add	w6,w6,w11
1757	eor	v19.16b,v19.16b,v16.16b
1758	add	w7,w7,w12
1759	eor	v23.16b,v23.16b,v20.16b
1760	add	w8,w8,w9
1761	eor	v27.16b,v27.16b,v24.16b
1762	eor	w21,w21,w5
1763	eor	v31.16b,v31.16b,v28.16b
1764	eor	w17,w17,w6
1765	tbl	v11.16b,{v11.16b},v6.16b
1766	eor	w19,w19,w7
1767	tbl	v15.16b,{v15.16b},v6.16b
1768	eor	w20,w20,w8
1769	tbl	v19.16b,{v19.16b},v6.16b
1770	ror	w21,w21,#16
1771	tbl	v23.16b,{v23.16b},v6.16b
1772	ror	w17,w17,#16
1773	tbl	v27.16b,{v27.16b},v6.16b
1774	ror	w19,w19,#16
1775	tbl	v31.16b,{v31.16b},v6.16b
1776	ror	w20,w20,#16
1777	add	v10.4s,v10.4s,v11.4s
1778	add	w15,w15,w21
1779	add	v14.4s,v14.4s,v15.4s
1780	add	w16,w16,w17
1781	add	v18.4s,v18.4s,v19.4s
1782	add	w13,w13,w19
1783	add	v22.4s,v22.4s,v23.4s
1784	add	w14,w14,w20
1785	add	v26.4s,v26.4s,v27.4s
1786	eor	w10,w10,w15
1787	add	v30.4s,v30.4s,v31.4s
1788	eor	w11,w11,w16
1789	eor	v0.16b,v9.16b,v10.16b
1790	eor	w12,w12,w13
1791	eor	v1.16b,v13.16b,v14.16b
1792	eor	w9,w9,w14
1793	eor	v2.16b,v17.16b,v18.16b
1794	ror	w10,w10,#20
1795	eor	v3.16b,v21.16b,v22.16b
1796	ror	w11,w11,#20
1797	eor	v4.16b,v25.16b,v26.16b
1798	ror	w12,w12,#20
1799	eor	v5.16b,v29.16b,v30.16b
1800	ror	w9,w9,#20
1801	ushr	v9.4s,v0.4s,#25
1802	add	w5,w5,w10
1803	ushr	v13.4s,v1.4s,#25
1804	add	w6,w6,w11
1805	ushr	v17.4s,v2.4s,#25
1806	add	w7,w7,w12
1807	ushr	v21.4s,v3.4s,#25
1808	add	w8,w8,w9
1809	ushr	v25.4s,v4.4s,#25
1810	eor	w21,w21,w5
1811	ushr	v29.4s,v5.4s,#25
1812	eor	w17,w17,w6
1813	sli	v9.4s,v0.4s,#7
1814	eor	w19,w19,w7
1815	sli	v13.4s,v1.4s,#7
1816	eor	w20,w20,w8
1817	sli	v17.4s,v2.4s,#7
1818	ror	w21,w21,#24
1819	sli	v21.4s,v3.4s,#7
1820	ror	w17,w17,#24
1821	sli	v25.4s,v4.4s,#7
1822	ror	w19,w19,#24
1823	sli	v29.4s,v5.4s,#7
1824	ror	w20,w20,#24
1825	ext	v10.16b,v10.16b,v10.16b,#8	// vector: c rotated by 8 bytes again (16 in total = back in place)
1826	add	w15,w15,w21
1827	ext	v14.16b,v14.16b,v14.16b,#8
1828	add	w16,w16,w17
1829	ext	v18.16b,v18.16b,v18.16b,#8
1830	add	w13,w13,w19
1831	ext	v22.16b,v22.16b,v22.16b,#8
1832	add	w14,w14,w20
1833	ext	v26.16b,v26.16b,v26.16b,#8
1834	eor	w10,w10,w15
1835	ext	v30.16b,v30.16b,v30.16b,#8
1836	eor	w11,w11,w16
1837	ext	v11.16b,v11.16b,v11.16b,#4	// vector: d by 4 bytes, undoing the earlier 12
1838	eor	w12,w12,w13
1839	ext	v15.16b,v15.16b,v15.16b,#4
1840	eor	w9,w9,w14
1841	ext	v19.16b,v19.16b,v19.16b,#4
1842	ror	w10,w10,#25
1843	ext	v23.16b,v23.16b,v23.16b,#4
1844	ror	w11,w11,#25
1845	ext	v27.16b,v27.16b,v27.16b,#4
1846	ror	w12,w12,#25
1847	ext	v31.16b,v31.16b,v31.16b,#4
1848	ror	w9,w9,#25
1849	ext	v9.16b,v9.16b,v9.16b,#12	// vector: b by 12 bytes, undoing the earlier 4
1850	ext	v13.16b,v13.16b,v13.16b,#12
1851	ext	v17.16b,v17.16b,v17.16b,#12
1852	ext	v21.16b,v21.16b,v21.16b,#12
1853	ext	v25.16b,v25.16b,v25.16b,#12
1854	ext	v29.16b,v29.16b,v29.16b,#12
1855	cbnz	x4,Loop_lower_neon	// repeat until the count set before the loop reaches zero
1856
// Loop_lower_neon has finished. Finalise the second scalar block
// (w5-w21) and the six vector blocks (v8-v31): add the saved input state,
// XOR with input read from x1, store to x0. This sequence consumes and
// produces 64 + 6*64 = 448 bytes; together with the 64-byte scalar block
// emitted after Loop_upper_neon that is 512 bytes per outer-loop pass.
// Afterwards the counters are advanced and the code either loops or hands the
// remaining length in x2 to one of the smaller code paths.
//
// v0-v6 are reloaded from the stack off-load area at sp+0 .. sp+111.
// Scalar and vector instructions are interleaved as in the round loops.
1857	add	w5,w5,w22		// accumulate key block
1858	ldp	q0,q1,[sp,#0]	// reload saved vectors from the stack off-load area
1859	add	x6,x6,x22,lsr#32
1860	ldp	q2,q3,[sp,#32]
1861	add	w7,w7,w23
1862	ldp	q4,q5,[sp,#64]
1863	add	x8,x8,x23,lsr#32
1864	ldr	q6,[sp,#96]
1865	add	v8.4s,v8.4s,v0.4s	// row a of every vector block += v0
1866	add	w9,w9,w24
1867	add	v12.4s,v12.4s,v0.4s
1868	add	x10,x10,x24,lsr#32
1869	add	v16.4s,v16.4s,v0.4s
1870	add	w11,w11,w25
1871	add	v20.4s,v20.4s,v0.4s
1872	add	x12,x12,x25,lsr#32
1873	add	v24.4s,v24.4s,v0.4s
1874	add	w13,w13,w26
1875	add	v28.4s,v28.4s,v0.4s
1876	add	x14,x14,x26,lsr#32
1877	add	v10.4s,v10.4s,v2.4s	// row c of every vector block += v2
1878	add	w15,w15,w27
1879	add	v14.4s,v14.4s,v2.4s
1880	add	x16,x16,x27,lsr#32
1881	add	v18.4s,v18.4s,v2.4s
1882	add	w17,w17,w28
1883	add	v22.4s,v22.4s,v2.4s
1884	add	x19,x19,x28,lsr#32
1885	add	v26.4s,v26.4s,v2.4s
1886	add	w20,w20,w30
1887	add	v30.4s,v30.4s,v2.4s
1888	add	x21,x21,x30,lsr#32
1889	add	v27.4s,v27.4s,v7.4s			// +4
1890	add	x5,x5,x6,lsl#32	// pack
1891	add	v31.4s,v31.4s,v7.4s			// +4
1892	add	x7,x7,x8,lsl#32
1893	add	v11.4s,v11.4s,v3.4s	// row d: blocks 0-3 add v3..v6; blocks 4 and 5 reuse v3/v4 on top of the v7 bump above
1894	ldp	x6,x8,[x1,#0]		// load input
1895	add	v15.4s,v15.4s,v4.4s
1896	add	x9,x9,x10,lsl#32
1897	add	v19.4s,v19.4s,v5.4s
1898	add	x11,x11,x12,lsl#32
1899	add	v23.4s,v23.4s,v6.4s
1900	ldp	x10,x12,[x1,#16]
1901	add	v27.4s,v27.4s,v3.4s
1902	add	x13,x13,x14,lsl#32
1903	add	v31.4s,v31.4s,v4.4s
1904	add	x15,x15,x16,lsl#32
1905	add	v9.4s,v9.4s,v1.4s	// row b of every vector block += v1
1906	ldp	x14,x16,[x1,#32]
1907	add	v13.4s,v13.4s,v1.4s
1908	add	x17,x17,x19,lsl#32
1909	add	v17.4s,v17.4s,v1.4s
1910	add	x20,x20,x21,lsl#32
1911	add	v21.4s,v21.4s,v1.4s
1912	ldp	x19,x21,[x1,#48]
1913	add	v25.4s,v25.4s,v1.4s
1914	add	x1,x1,#64
1915	add	v29.4s,v29.4s,v1.4s
1916
1917#ifdef	__AARCH64EB__
// Big-endian build only: byte-reverse the packed scalar block before the XOR.
1918	rev	x5,x5
1919	rev	x7,x7
1920	rev	x9,x9
1921	rev	x11,x11
1922	rev	x13,x13
1923	rev	x15,x15
1924	rev	x17,x17
1925	rev	x20,x20
1926#endif
// From here v0-v3 (and later v8-v23, once their block has been stored) are
// reused as input buffers: each ld1 fetches the next 64 input bytes and
// post-increments x1; each st1 writes one finished block and post-increments x0.
1927	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1928	eor	x5,x5,x6
1929	eor	x7,x7,x8
1930	eor	x9,x9,x10
1931	eor	x11,x11,x12
1932	eor	x13,x13,x14
1933	eor	v8.16b,v8.16b,v0.16b
1934	eor	x15,x15,x16
1935	eor	v9.16b,v9.16b,v1.16b
1936	eor	x17,x17,x19
1937	eor	v10.16b,v10.16b,v2.16b
1938	eor	x20,x20,x21
1939	eor	v11.16b,v11.16b,v3.16b
1940	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
1941
// x28 was already incremented by 1 after the first scalar block of this pass;
// the +7 here brings the total to 8, one per 64-byte block produced.
1942	stp	x5,x7,[x0,#0]		// store output
1943	add	x28,x28,#7			// increment counter
1944	stp	x9,x11,[x0,#16]
1945	stp	x13,x15,[x0,#32]
1946	stp	x17,x20,[x0,#48]
1947	add	x0,x0,#64
1948	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
1949
1950	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
1951	eor	v12.16b,v12.16b,v0.16b
1952	eor	v13.16b,v13.16b,v1.16b
1953	eor	v14.16b,v14.16b,v2.16b
1954	eor	v15.16b,v15.16b,v3.16b
1955	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
1956
1957	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
1958	eor	v16.16b,v16.16b,v8.16b
1959	ldp	q0,q1,[sp,#0]	// v0-v3 are no longer needed as input buffers: restore them from the stack
1960	eor	v17.16b,v17.16b,v9.16b
1961	ldp	q2,q3,[sp,#32]
1962	eor	v18.16b,v18.16b,v10.16b
1963	eor	v19.16b,v19.16b,v11.16b
1964	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
1965
1966	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
1967	eor	v20.16b,v20.16b,v12.16b
1968	eor	v21.16b,v21.16b,v13.16b
1969	eor	v22.16b,v22.16b,v14.16b
1970	eor	v23.16b,v23.16b,v15.16b
1971	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
1972
1973	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1974	eor	v24.16b,v24.16b,v16.16b
1975	eor	v25.16b,v25.16b,v17.16b
1976	eor	v26.16b,v26.16b,v18.16b
1977	eor	v27.16b,v27.16b,v19.16b
1978	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
1979
1980	shl	v8.4s,v7.4s,#1			// 4 -> 8
1981	eor	v28.16b,v28.16b,v20.16b
1982	eor	v29.16b,v29.16b,v21.16b
1983	eor	v30.16b,v30.16b,v22.16b
1984	eor	v31.16b,v31.16b,v23.16b
1985	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64
1986
// Advance the four per-block "d" rows by v8 (= v7 doubled) for the next pass.
1987	add	v3.4s,v3.4s,v8.4s			// += 8
1988	add	v4.4s,v4.4s,v8.4s
1989	add	v5.4s,v5.4s,v8.4s
1990	add	v6.4s,v6.4s,v8.4s
1991
// Nothing since the end of Loop_lower_neon sets the condition flags, so this
// branch consumes flags produced earlier in the outer loop.
// NOTE(review): presumably the length subtraction at the top of
// Loop_outer_512_neon (not visible here) - confirm.
1992	b.hs	Loop_outer_512_neon
1993
// Fewer than 512 bytes remain. The adds sets Z when x2 becomes exactly zero;
// that flag is consumed by the b.eq below (the ushr/ldp/stp in between do
// not touch the flags).
1994	adds	x2,x2,#512
1995	ushr	v7.4s,v7.4s,#1			// 4 -> 2
1996
// Restore d10-d15 from the save area above the 128-byte off-load area.
// d8/d9 are restored separately on each exit path, because v8/v9 are reloaded
// with data just below when the code continues in Loop_outer_neon.
1997	ldp	d10,d11,[sp,#128+16]		// meet ABI requirements
1998	ldp	d12,d13,[sp,#128+32]
1999	ldp	d14,d15,[sp,#128+48]
2000
// Three 32-byte stores of v0 overwrite bytes 0..95 of the off-load area;
// bytes 96..127 are not rewritten here.
2001	stp	q0,q0,[sp,#0]		// wipe off-load area
2002	stp	q0,q0,[sp,#32]
2003	stp	q0,q0,[sp,#64]
2004
2005	b.eq	Ldone_512_neon		// x2 == 0 after the adds: nothing left to process
2006
// Some input remains. With at least 192 bytes left continue in
// Loop_outer_neon, otherwise fall back to Loop_outer (both defined elsewhere
// in the file). NOTE(review): assumes earlier code left x3 pointing 16 bytes
// past Lone, so the ld1 below fetches Lone and the 16 bytes following it
// (Lrot24 in the constant table) into v8/v9 - confirm.
2007	sub	x3,x3,#16			// Lone
2008	cmp	x2,#192
2009	add	sp,sp,#128		// release the off-load area; the d8-d15 save area is now at sp+0
2010	sub	v3.4s,v3.4s,v7.4s		// -= 2
2011	ld1	{v8.4s,v9.4s},[x3]
2012	b.hs	Loop_outer_neon		// uses the flags from the cmp above
2013
// Fewer than 192 bytes left: restore d8/d9, clear v1-v6 (presumably so that
// no saved state stays in vector registers) and continue in Loop_outer.
2014	ldp	d8,d9,[sp,#0]			// meet ABI requirements
2015	eor	v1.16b,v1.16b,v1.16b
2016	eor	v2.16b,v2.16b,v2.16b
2017	eor	v3.16b,v3.16b,v3.16b
2018	eor	v4.16b,v4.16b,v4.16b
2019	eor	v5.16b,v5.16b,v5.16b
2020	eor	v6.16b,v6.16b,v6.16b
2021	b	Loop_outer
2022
// Ldone_512_neon: exit path taken when no input remains after a 512-byte
// pass. At this point sp still points at the 128-byte off-load area, with the
// d8-d15 save area (64 bytes) directly above it; d10-d15 have already been
// restored by the code that branches here.
// NOTE(review): x29 is assumed to be the frame pointer set up by the function
// prologue (not visible here), with x19-x28 saved at x29+16 .. x29+95 - confirm.
2023Ldone_512_neon:
2024	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements
2025	ldp	x19,x20,[x29,#16]
2026	add	sp,sp,#128+64		// drop the off-load area and the d8-d15 save area
2027	ldp	x21,x22,[x29,#32]
2028	ldp	x23,x24,[x29,#48]
2029	ldp	x25,x26,[x29,#64]
2030	ldp	x27,x28,[x29,#80]
2031	ldp	x29,x30,[sp],#96		// pop x29/x30 and release the 96-byte register frame
2032.long	0xd50323bf			// autiasp
2033	ret
2034
2035