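// NOTE: the lines originally here were navigation links captured from a web
// source viewer (Home / Line# / Scopes# / Navigate# / Raw / Download).
// They are not part of the assembly source.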
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7

.text
.align	5
// Constant table read by the key-expansion code through `adr x3,Lrcon`:
//   row 0 - starting round constant (0x01 in every 32-bit lane); loaded
//           into v1 and doubled with shl after each expansion step
//   row 1 - byte-index mask for tbl; loaded into v2
//   row 2 - 0x1b in every lane; the 128-bit path reloads v1 from here
//           after its 8-iteration loop
Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b
11
.globl	_aes_v8_set_encrypt_key

.align	5
// _aes_v8_set_encrypt_key - expand a user key into an AES encryption
// key schedule.
//
// In:   x0 = user key bytes
//       w1 = key length in bits
//       x2 = output key schedule
// Out:  x0 =  0 on success
//            -1 if x0 or x2 is NULL
//            -2 if w1 is below 128, above 256, or not a multiple of 64
//
// Round keys are written upward from [x2]; the round count (10, 12 or 14)
// is stored as a 32-bit word 240 bytes above the original x2.
//
// Lenc_key is a second label on the same entry point; it is the target of
// `bl Lenc_key` in _aes_v8_set_decrypt_key. On a successful return x2 has
// been advanced to the round-count word and w12 holds the round count.
//
// Clobbers: x0-x3, w12, v0-v6, flags.
_aes_v8_set_encrypt_key:
Lenc_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x3,#-1			// provisional return: NULL argument
	cmp	x0,#0
	b.eq	Lenc_key_abort
	cmp	x2,#0
	b.eq	Lenc_key_abort
	mov	x3,#-2			// provisional return: bad key length
	cmp	w1,#128
	b.lt	Lenc_key_abort
	cmp	w1,#256
	b.gt	Lenc_key_abort
	tst	w1,#0x3f
	b.ne	Lenc_key_abort

	adr	x3,Lrcon
	cmp	w1,#192			// flags consumed by the three branches below

	eor	v0.16b,v0.16b,v0.16b	// v0 = 0 for the whole routine
	ld1	{v3.16b},[x0],#16	// first 16 bytes of the user key
	mov	w1,#8		// reuse w1
	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = rcon row, v2 = tbl mask; x3 -> 0x1b row

	// None of the four instructions above writes the flags, so these
	// branches still test `cmp w1,#192`.
	b.lt	Loop128
	b.eq	L192
	b	L256

.align	4
// 128-bit key: 8 looped expansion steps, then two unrolled ones.
// Each step stores the current round key (v3) and derives the next one.
Loop128:
	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1	// double the round constant
	eor	v3.16b,v3.16b,v6.16b
	b.ne	Loop128

	ld1	{v1.4s},[x3]		// switch round constant to the 0x1b row

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2]		// last round key, no post-increment
	add	x2,x2,#0x50		// x2 was at +160; move it to +240

	mov	w12,#10
	b	Ldone

.align	4
// 192-bit key: 8 iterations, each storing 8 + 16 bytes of schedule.
L192:
	ld1	{v4.8b},[x0],#8		// remaining 8 bytes of the user key
	movi	v6.16b,#8			// borrow v6.16b
	st1	{v3.4s},[x2],#16
	sub	v2.16b,v2.16b,v6.16b	// adjust the mask

Loop192:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
#ifdef __ARMEB__
	st1	{v4.4s},[x2],#16
	sub	x2,x2,#8
#else
	st1	{v4.8b},[x2],#8
#endif
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b

	dup	v5.4s,v3.s[3]
	eor	v5.16b,v5.16b,v4.16b
	eor	v6.16b,v6.16b,v1.16b
	ext	v4.16b,v0.16b,v4.16b,#12
	shl	v1.16b,v1.16b,#1
	eor	v4.16b,v4.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	eor	v4.16b,v4.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.ne	Loop192

	mov	w12,#12
	add	x2,x2,#0x20		// x2 was at +208; move it to +240
	b	Ldone

.align	4
// 256-bit key: 7 passes; the loop is left from its middle (b.eq Ldone)
// right after the final store, with x2 already at +240.
L256:
	ld1	{v4.16b},[x0]		// second 16 bytes of the user key
	mov	w1,#7
	mov	w12,#14
	st1	{v3.4s},[x2],#16

Loop256:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.eq	Ldone

	dup	v6.4s,v3.s[3]		// just splat
	ext	v5.16b,v0.16b,v4.16b,#12
	aese	v6.16b,v0.16b

	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b

	eor	v4.16b,v4.16b,v6.16b
	b	Loop256

Ldone:
	str	w12,[x2]		// round count, 240 bytes above the original x2
	mov	x3,#0

Lenc_key_abort:
	mov	x0,x3			// return value
	// Only x29 is reloaded: x30 is never written in this routine, so the
	// saved copy is simply discarded with the frame.
	ldr	x29,[sp],#16
	ret
182
183
.globl	_aes_v8_set_decrypt_key

.align	5
// _aes_v8_set_decrypt_key - build an AES decryption key schedule.
//
// Takes the same registers as _aes_v8_set_encrypt_key, because it calls
// that code through Lenc_key. It returns that call's non-zero status
// unchanged on failure and 0 on success.
//
// On success the freshly written encryption schedule is converted in
// place: the round keys are reversed end-for-end, and every key except
// the first and the last is passed through aesimc.
//
// Depends on Lenc_key leaving x2 240 bytes above the schedule start and
// the round count in x12.
//
// Clobbers: everything Lenc_key clobbers, plus x4 and v0-v1.
_aes_v8_set_decrypt_key:
.long	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	bl	Lenc_key

	cmp	x0,#0
	b.ne	Ldec_key_abort

	sub	x2,x2,#240		// restore original x2
	mov	x4,#-16			// post-index step for the descending pointer
	add	x0,x2,x12,lsl#4	// end of key schedule

	// Swap the first and last round keys as they are (no aesimc).
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16

	// Swap the remaining pairs from the outside in, applying aesimc to
	// both; x2 climbs, x0 descends, until they meet.
Loop_imc:
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16
	cmp	x0,x2
	b.hi	Loop_imc

	// Middle round key: transformed in place.
	ld1	{v0.4s},[x2]
	aesimc	v0.16b,v0.16b
	st1	{v0.4s},[x0]

	eor	x0,x0,x0		// return value
Ldec_key_abort:
	ldp	x29,x30,[sp],#16
.long	0xd50323bf		// autiasp
	ret
224
.globl	_aes_v8_encrypt

.align	5
// _aes_v8_encrypt - encrypt a single 16-byte block.
//
// In:   x0 = input block
//       x1 = output block
//       x2 = key schedule; the round count is read from [x2,#240]
// Out:  16 bytes stored at [x1]; no register result is set.
//
// v0 and v1 alternately hold the next round key while v2 carries the state.
// Clobbers: x2, w3, v0-v2, flags.
_aes_v8_encrypt:
	ldr	w3,[x2,#240]
	ld1	{v0.4s},[x2],#16
	ld1	{v2.16b},[x0]
	sub	w3,w3,#2		// the last two rounds are done after the loop
	ld1	{v1.4s},[x2],#16

	// Two rounds per iteration.
Loop_enc:
	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aese	v2.16b,v1.16b
	aesmc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	Loop_enc

	// Final two rounds: the last one has no aesmc and ends with an eor
	// against the final round key.
	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]
	aese	v2.16b,v1.16b
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret
253
.globl	_aes_v8_decrypt

.align	5
// _aes_v8_decrypt - decrypt a single 16-byte block.
//
// In:   x0 = input block
//       x1 = output block
//       x2 = key schedule; the round count is read from [x2,#240]
// Out:  16 bytes stored at [x1]; no register result is set.
//
// Same structure as _aes_v8_encrypt, using aesd/aesimc in place of
// aese/aesmc.
// Clobbers: x2, w3, v0-v2, flags.
_aes_v8_decrypt:
	ldr	w3,[x2,#240]
	ld1	{v0.4s},[x2],#16
	ld1	{v2.16b},[x0]
	sub	w3,w3,#2		// the last two rounds are done after the loop
	ld1	{v1.4s},[x2],#16

	// Two rounds per iteration.
Loop_dec:
	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aesd	v2.16b,v1.16b
	aesimc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	Loop_dec

	// Final two rounds: the last one has no aesimc and ends with an eor
	// against the final round key.
	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]
	aesd	v2.16b,v1.16b
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret
282
.globl	_aes_v8_ecb_encrypt

.align	5
// _aes_v8_ecb_encrypt - AES-ECB encrypt or decrypt a buffer.
//
// In:   x0 = input
//       x1 = output
//       x2 = length in bytes
//       x3 = key schedule; the round count is read from [x3,#240]
//       w4 = 0 to decrypt (aesd paths), non-zero to encrypt (aese paths)
// Out:  processed blocks stored at [x1]; no register result is set.
//
// Paths:
//   length == 16  single-block path, no stack frame, exits through
//                 Lecb_Final_abort
//   length <  16  returns from Lecb_big_size without storing anything
//   otherwise     5-block and 3-block interleaved loops, then a tail
//                 that finishes the last one or two blocks
//
// The large path pushes x29/x30 and Lecb_done pops that frame, reloading
// only x29 (x30 is never written here).
//
// Clobbers: x0-x8, v0-v7, v16-v31, flags.
_aes_v8_ecb_encrypt:
	subs	x2,x2,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	Lecb_big_size
	ld1	{v0.16b},[x0]
	cmp	w4,#0					// en- or decrypting?
	ldr	w5,[x3,#240]
	ld1	{v5.4s,v6.4s},[x3],#32			// load key schedule...

	b.eq	Lecb_small_dec
	aese	v0.16b,v5.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
	aese	v0.16b,v6.16b
	aesmc	v0.16b,v0.16b
	subs	w5,w5,#10			// if rounds==10, jump to aes-128-ecb processing
	b.eq	Lecb_128_enc
	// More than 10 rounds: consume the extra rounds two at a time.
Lecb_round_loop:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x3],#16				// load key schedule...
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3],#16				// load key schedule...
	subs	w5,w5,#2			// bias
	b.gt	Lecb_round_loop
	// Remaining eight rounds, fully unrolled; v7 is the final round key.
Lecb_128_enc:
	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v7.4s},[x3]
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b
	eor	v0.16b,v0.16b,v7.16b
	st1	{v0.16b},[x1]
	b	Lecb_Final_abort
	// Single-block decrypt: mirror of the path above with aesd/aesimc.
Lecb_small_dec:
	aesd	v0.16b,v5.16b
	aesimc	v0.16b,v0.16b
	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
	aesd	v0.16b,v6.16b
	aesimc	v0.16b,v0.16b
	subs	w5,w5,#10			// bias
	b.eq	Lecb_128_dec
Lecb_dec_round_loop:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	ld1	{v16.4s},[x3],#16				// load key schedule...
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	ld1	{v17.4s},[x3],#16				// load key schedule...
	subs	w5,w5,#2			// bias
	b.gt	Lecb_dec_round_loop
Lecb_128_dec:
	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
	aesd	v0.16b,v18.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v19.16b
	aesimc	v0.16b,v0.16b
	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	ld1	{v7.4s},[x3]
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v0.16b,v23.16b
	eor	v0.16b,v0.16b,v7.16b
	st1	{v0.16b},[x1]
	b	Lecb_Final_abort
	// Large path (length != 16). The flags are still those of the
	// `subs x2,x2,#16` at entry: stp/add/mov below do not write them.
Lecb_big_size:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x8,#16
	b.lo	Lecb_done		// length < 16: nothing to process
	// eq cannot hold here (this path was entered on ne), so x8 stays 16.
	csel	x8,xzr,x8,eq

	cmp	w4,#0					// en- or decrypting?
	ldr	w5,[x3,#240]
	and	x2,x2,#-16
	ld1	{v0.16b},[x0],x8

	ld1	{v16.4s,v17.4s},[x3]				// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4				// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s,v19.4s},[x7],#32
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]

	add	x7,x3,#32
	mov	w6,w5
	// Still testing `cmp w4,#0`: nothing since then has written the flags.
	b.eq	Lecb_dec

	ld1	{v1.16b},[x0],#16
	subs	x2,x2,#32				// bias
	add	w6,w5,#2
	orr	v3.16b,v1.16b,v1.16b
	orr	v24.16b,v1.16b,v1.16b
	orr	v1.16b,v0.16b,v0.16b
	b.lo	Lecb_enc_tail

	orr	v1.16b,v3.16b,v3.16b
	ld1	{v24.16b},[x0],#16
	cmp	x2,#32
	b.lo	Loop3x_ecb_enc

	ld1	{v25.16b},[x0],#16
	ld1	{v26.16b},[x0],#16
	sub	x2,x2,#32				// bias
	mov	w6,w5

	// Five blocks in flight: v0, v1, v24, v25, v26.
Loop5x_ecb_enc:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v16.16b
	aesmc	v26.16b,v26.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v17.16b
	aesmc	v26.16b,v26.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Loop5x_ecb_enc

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v16.16b
	aesmc	v26.16b,v26.16b
	// The flags set here are used by the csel below and by the
	// `b.hs Loop5x_ecb_enc` at the bottom of this pass.
	cmp	x2,#0x40					// because Lecb_enc_tail4x
	sub	x2,x2,#0x50

	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v17.16b
	aesmc	v26.16b,v26.16b
	csel	x6,xzr,x2,gt			// borrow x6, w6, "gt" is not typo
	mov	x7,x3

	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v18.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v18.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v18.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v18.16b
	aesmc	v26.16b,v26.16b
	add	x0,x0,x6				// x0 is adjusted in such way that
							// at exit from the loop v1.16b-v26.16b
							// are loaded with last "words"
	add	x6,x2,#0x60		    // because Lecb_enc_tail4x

	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v19.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v19.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v19.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v19.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v20.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v20.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v21.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v21.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v22.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v22.16b
	aesmc	v26.16b,v26.16b

	// Last round for the five blocks, interleaved with loading the
	// next five input blocks into v2, v3, v27, v28, v29.
	aese	v0.16b,v23.16b
	ld1	{v2.16b},[x0],#16
	aese	v1.16b,v23.16b
	ld1	{v3.16b},[x0],#16
	aese	v24.16b,v23.16b
	ld1	{v27.16b},[x0],#16
	aese	v25.16b,v23.16b
	ld1	{v28.16b},[x0],#16
	aese	v26.16b,v23.16b
	ld1	{v29.16b},[x0],#16
	cbz	x6,Lecb_enc_tail4x
	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
	eor	v4.16b,v7.16b,v0.16b
	orr	v0.16b,v2.16b,v2.16b
	eor	v5.16b,v7.16b,v1.16b
	orr	v1.16b,v3.16b,v3.16b
	eor	v17.16b,v7.16b,v24.16b
	orr	v24.16b,v27.16b,v27.16b
	eor	v30.16b,v7.16b,v25.16b
	orr	v25.16b,v28.16b,v28.16b
	eor	v31.16b,v7.16b,v26.16b
	st1	{v4.16b},[x1],#16
	orr	v26.16b,v29.16b,v29.16b
	st1	{v5.16b},[x1],#16
	mov	w6,w5
	st1	{v17.16b},[x1],#16
	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16
	b.hs	Loop5x_ecb_enc

	add	x2,x2,#0x50
	cbz	x2,Lecb_done

	add	w6,w5,#2
	subs	x2,x2,#0x30
	orr	v0.16b,v27.16b,v27.16b
	orr	v1.16b,v28.16b,v28.16b
	orr	v24.16b,v29.16b,v29.16b
	b.lo	Lecb_enc_tail

	b	Loop3x_ecb_enc

.align	4
	// Entered from the 5x pass when x6 == 0: finish and store the four
	// results held in v1, v24, v25, v26.
Lecb_enc_tail4x:
	eor	v5.16b,v7.16b,v1.16b
	eor	v17.16b,v7.16b,v24.16b
	eor	v30.16b,v7.16b,v25.16b
	eor	v31.16b,v7.16b,v26.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16

	b	Lecb_done
.align	4
	// Three blocks in flight: v0, v1, v24.
Loop3x_ecb_enc:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Loop3x_ecb_enc

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	// The flags set here are used by the csel below and by the
	// `b.hs Loop3x_ecb_enc` at the bottom of this pass.
	subs	x2,x2,#0x30
	csel	x6,x2,x6,lo				// x6, w6, is zero at this point
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	add	x0,x0,x6			// x0 is adjusted in such way that
						// at exit from the loop v1.16b-v24.16b
						// are loaded with last "words"
	mov	x7,x3
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	ld1	{v2.16b},[x0],#16
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	ld1	{v3.16b},[x0],#16
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	ld1	{v27.16b},[x0],#16
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b
	aese	v24.16b,v23.16b
	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	add	w6,w5,#2
	eor	v4.16b,v7.16b,v0.16b
	eor	v5.16b,v7.16b,v1.16b
	eor	v24.16b,v24.16b,v7.16b
	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	orr	v0.16b,v2.16b,v2.16b
	st1	{v5.16b},[x1],#16
	orr	v1.16b,v3.16b,v3.16b
	st1	{v24.16b},[x1],#16
	orr	v24.16b,v27.16b,v27.16b
	b.hs	Loop3x_ecb_enc

	cmn	x2,#0x30		// x2 == -0x30: nothing left
	b.eq	Lecb_done
	nop

	// Tail: two lanes (v1, v24) are processed. The `cmn x2,#0x20`
	// inside picks between storing both results and storing only
	// v24's (Lecb_enc_one).
Lecb_enc_tail:
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Lecb_enc_tail

	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	cmn	x2,#0x20
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v23.16b
	aese	v24.16b,v23.16b
	b.eq	Lecb_enc_one
	eor	v5.16b,v7.16b,v1.16b
	eor	v17.16b,v7.16b,v24.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	b	Lecb_done

Lecb_enc_one:
	eor	v5.16b,v7.16b,v24.16b
	st1	{v5.16b},[x1],#16
	b	Lecb_done
.align	5
	// Decrypt side of the large path: same layout as the encrypt side
	// above, with aesd/aesimc in place of aese/aesmc.
Lecb_dec:
	ld1	{v1.16b},[x0],#16
	subs	x2,x2,#32			// bias
	add	w6,w5,#2
	orr	v3.16b,v1.16b,v1.16b
	orr	v24.16b,v1.16b,v1.16b
	orr	v1.16b,v0.16b,v0.16b
	b.lo	Lecb_dec_tail

	orr	v1.16b,v3.16b,v3.16b
	ld1	{v24.16b},[x0],#16
	cmp	x2,#32
	b.lo	Loop3x_ecb_dec

	ld1	{v25.16b},[x0],#16
	ld1	{v26.16b},[x0],#16
	sub	x2,x2,#32				// bias
	mov	w6,w5

	// Five blocks in flight: v0, v1, v24, v25, v26.
Loop5x_ecb_dec:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v16.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v16.16b
	aesimc	v26.16b,v26.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v17.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v17.16b
	aesimc	v26.16b,v26.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Loop5x_ecb_dec

	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v16.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v16.16b
	aesimc	v26.16b,v26.16b
	// The flags set here are used by the csel below and by the
	// `b.hs Loop5x_ecb_dec` at the bottom of this pass.
	cmp	x2,#0x40				// because Lecb_tail4x
	sub	x2,x2,#0x50

	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v17.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v17.16b
	aesimc	v26.16b,v26.16b
	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
	mov	x7,x3

	aesd	v0.16b,v18.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v18.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v18.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v18.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v18.16b
	aesimc	v26.16b,v26.16b
	add	x0,x0,x6				// x0 is adjusted in such way that
							// at exit from the loop v1.16b-v26.16b
							// are loaded with last "words"
	add	x6,x2,#0x60			// because Lecb_tail4x

	aesd	v0.16b,v19.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v19.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v19.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v19.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v19.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v20.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v20.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v21.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v21.16b
	aesimc	v26.16b,v26.16b

	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	aesd	v25.16b,v22.16b
	aesimc	v25.16b,v25.16b
	aesd	v26.16b,v22.16b
	aesimc	v26.16b,v26.16b

	// Last round for the five blocks, interleaved with loading the
	// next five input blocks into v2, v3, v27, v28, v29.
	aesd	v0.16b,v23.16b
	ld1	{v2.16b},[x0],#16
	aesd	v1.16b,v23.16b
	ld1	{v3.16b},[x0],#16
	aesd	v24.16b,v23.16b
	ld1	{v27.16b},[x0],#16
	aesd	v25.16b,v23.16b
	ld1	{v28.16b},[x0],#16
	aesd	v26.16b,v23.16b
	ld1	{v29.16b},[x0],#16
	cbz	x6,Lecb_tail4x
	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
	eor	v4.16b,v7.16b,v0.16b
	orr	v0.16b,v2.16b,v2.16b
	eor	v5.16b,v7.16b,v1.16b
	orr	v1.16b,v3.16b,v3.16b
	eor	v17.16b,v7.16b,v24.16b
	orr	v24.16b,v27.16b,v27.16b
	eor	v30.16b,v7.16b,v25.16b
	orr	v25.16b,v28.16b,v28.16b
	eor	v31.16b,v7.16b,v26.16b
	st1	{v4.16b},[x1],#16
	orr	v26.16b,v29.16b,v29.16b
	st1	{v5.16b},[x1],#16
	mov	w6,w5
	st1	{v17.16b},[x1],#16
	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16
	b.hs	Loop5x_ecb_dec

	add	x2,x2,#0x50
	cbz	x2,Lecb_done

	add	w6,w5,#2
	subs	x2,x2,#0x30
	orr	v0.16b,v27.16b,v27.16b
	orr	v1.16b,v28.16b,v28.16b
	orr	v24.16b,v29.16b,v29.16b
	b.lo	Lecb_dec_tail

	b	Loop3x_ecb_dec

.align	4
	// Entered from the 5x pass when x6 == 0: finish and store the four
	// results held in v1, v24, v25, v26.
Lecb_tail4x:
	eor	v5.16b,v7.16b,v1.16b
	eor	v17.16b,v7.16b,v24.16b
	eor	v30.16b,v7.16b,v25.16b
	eor	v31.16b,v7.16b,v26.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16

	b	Lecb_done
.align	4
	// Three blocks in flight: v0, v1, v24.
Loop3x_ecb_dec:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Loop3x_ecb_dec

	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	// The flags set here are used by the csel below and by the
	// `b.hs Loop3x_ecb_dec` at the bottom of this pass.
	subs	x2,x2,#0x30
	csel	x6,x2,x6,lo				// x6, w6, is zero at this point
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	add	x0,x0,x6 			// x0 is adjusted in such way that
						// at exit from the loop v1.16b-v24.16b
						// are loaded with last "words"
	mov	x7,x3
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	ld1	{v2.16b},[x0],#16
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	ld1	{v3.16b},[x0],#16
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	ld1	{v27.16b},[x0],#16
	aesd	v0.16b,v23.16b
	aesd	v1.16b,v23.16b
	aesd	v24.16b,v23.16b
	ld1	{v16.4s},[x7],#16			// re-pre-load rndkey[0]
	add	w6,w5,#2
	eor	v4.16b,v7.16b,v0.16b
	eor	v5.16b,v7.16b,v1.16b
	eor	v24.16b,v24.16b,v7.16b
	ld1	{v17.4s},[x7],#16			// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	orr	v0.16b,v2.16b,v2.16b
	st1	{v5.16b},[x1],#16
	orr	v1.16b,v3.16b,v3.16b
	st1	{v24.16b},[x1],#16
	orr	v24.16b,v27.16b,v27.16b
	b.hs	Loop3x_ecb_dec

	cmn	x2,#0x30		// x2 == -0x30: nothing left
	b.eq	Lecb_done
	nop

	// Tail: two lanes (v1, v24) are processed. The `cmn x2,#0x20`
	// inside picks between storing both results and storing only
	// v24's (Lecb_dec_one).
Lecb_dec_tail:
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Lecb_dec_tail

	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	cmn	x2,#0x20
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v23.16b
	aesd	v24.16b,v23.16b
	b.eq	Lecb_dec_one
	eor	v5.16b,v7.16b,v1.16b
	eor	v17.16b,v7.16b,v24.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	b	Lecb_done

Lecb_dec_one:
	eor	v5.16b,v7.16b,v24.16b
	st1	{v5.16b},[x1],#16

	// Large-path exit: pop the frame pushed at Lecb_big_size. The
	// single-block paths skip the pop and enter at Lecb_Final_abort.
Lecb_done:
	ldr	x29,[sp],#16
Lecb_Final_abort:
	ret
1028
1029.globl	_aes_v8_cbc_encrypt
1030
1031.align	5
1032_aes_v8_cbc_encrypt:
1033	stp	x29,x30,[sp,#-16]!
1034	add	x29,sp,#0
1035	subs	x2,x2,#16
1036	mov	x8,#16
1037	b.lo	Lcbc_abort
1038	csel	x8,xzr,x8,eq
1039
1040	cmp	w5,#0			// en- or decrypting?
1041	ldr	w5,[x3,#240]
1042	and	x2,x2,#-16
1043	ld1	{v6.16b},[x4]
1044	ld1	{v0.16b},[x0],x8
1045
1046	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
1047	sub	w5,w5,#6
1048	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
1049	sub	w5,w5,#2
1050	ld1	{v18.4s,v19.4s},[x7],#32
1051	ld1	{v20.4s,v21.4s},[x7],#32
1052	ld1	{v22.4s,v23.4s},[x7],#32
1053	ld1	{v7.4s},[x7]
1054
1055	add	x7,x3,#32
1056	mov	w6,w5
1057	b.eq	Lcbc_dec
1058
1059	cmp	w5,#2
1060	eor	v0.16b,v0.16b,v6.16b
1061	eor	v5.16b,v16.16b,v7.16b
1062	b.eq	Lcbc_enc128
1063
1064	ld1	{v2.4s,v3.4s},[x7]
1065	add	x7,x3,#16
1066	add	x6,x3,#16*4
1067	add	x12,x3,#16*5
1068	aese	v0.16b,v16.16b
1069	aesmc	v0.16b,v0.16b
1070	add	x14,x3,#16*6
1071	add	x3,x3,#16*7
1072	b	Lenter_cbc_enc
1073
1074.align	4
1075Loop_cbc_enc:
1076	aese	v0.16b,v16.16b
1077	aesmc	v0.16b,v0.16b
1078	st1	{v6.16b},[x1],#16
1079Lenter_cbc_enc:
1080	aese	v0.16b,v17.16b
1081	aesmc	v0.16b,v0.16b
1082	aese	v0.16b,v2.16b
1083	aesmc	v0.16b,v0.16b
1084	ld1	{v16.4s},[x6]
1085	cmp	w5,#4
1086	aese	v0.16b,v3.16b
1087	aesmc	v0.16b,v0.16b
1088	ld1	{v17.4s},[x12]
1089	b.eq	Lcbc_enc192
1090
1091	aese	v0.16b,v16.16b
1092	aesmc	v0.16b,v0.16b
1093	ld1	{v16.4s},[x14]
1094	aese	v0.16b,v17.16b
1095	aesmc	v0.16b,v0.16b
1096	ld1	{v17.4s},[x3]
1097	nop
1098
1099Lcbc_enc192:
1100	aese	v0.16b,v16.16b
1101	aesmc	v0.16b,v0.16b
1102	subs	x2,x2,#16
1103	aese	v0.16b,v17.16b
1104	aesmc	v0.16b,v0.16b
1105	csel	x8,xzr,x8,eq
1106	aese	v0.16b,v18.16b
1107	aesmc	v0.16b,v0.16b
1108	aese	v0.16b,v19.16b
1109	aesmc	v0.16b,v0.16b
1110	ld1	{v16.16b},[x0],x8
1111	aese	v0.16b,v20.16b
1112	aesmc	v0.16b,v0.16b
1113	eor	v16.16b,v16.16b,v5.16b
1114	aese	v0.16b,v21.16b
1115	aesmc	v0.16b,v0.16b
1116	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
1117	aese	v0.16b,v22.16b
1118	aesmc	v0.16b,v0.16b
1119	aese	v0.16b,v23.16b
1120	eor	v6.16b,v0.16b,v7.16b
1121	b.hs	Loop_cbc_enc
1122
1123	st1	{v6.16b},[x1],#16
1124	b	Lcbc_done
1125
1126.align	5
1127Lcbc_enc128:
1128	ld1	{v2.4s,v3.4s},[x7]
1129	aese	v0.16b,v16.16b
1130	aesmc	v0.16b,v0.16b
1131	b	Lenter_cbc_enc128
1132Loop_cbc_enc128:
1133	aese	v0.16b,v16.16b
1134	aesmc	v0.16b,v0.16b
1135	st1	{v6.16b},[x1],#16
1136Lenter_cbc_enc128:
1137	aese	v0.16b,v17.16b
1138	aesmc	v0.16b,v0.16b
1139	subs	x2,x2,#16
1140	aese	v0.16b,v2.16b
1141	aesmc	v0.16b,v0.16b
1142	csel	x8,xzr,x8,eq
1143	aese	v0.16b,v3.16b
1144	aesmc	v0.16b,v0.16b
1145	aese	v0.16b,v18.16b
1146	aesmc	v0.16b,v0.16b
1147	aese	v0.16b,v19.16b
1148	aesmc	v0.16b,v0.16b
1149	ld1	{v16.16b},[x0],x8
1150	aese	v0.16b,v20.16b
1151	aesmc	v0.16b,v0.16b
1152	aese	v0.16b,v21.16b
1153	aesmc	v0.16b,v0.16b
1154	aese	v0.16b,v22.16b
1155	aesmc	v0.16b,v0.16b
1156	eor	v16.16b,v16.16b,v5.16b
1157	aese	v0.16b,v23.16b
1158	eor	v6.16b,v0.16b,v7.16b
1159	b.hs	Loop_cbc_enc128
1160
1161	st1	{v6.16b},[x1],#16
1162	b	Lcbc_done
1163.align	5
1164Lcbc_dec:
1165	ld1	{v24.16b},[x0],#16
1166	subs	x2,x2,#32		// bias
1167	add	w6,w5,#2
1168	orr	v3.16b,v0.16b,v0.16b
1169	orr	v1.16b,v0.16b,v0.16b
1170	orr	v27.16b,v24.16b,v24.16b
1171	b.lo	Lcbc_dec_tail
1172
1173	orr	v1.16b,v24.16b,v24.16b
1174	ld1	{v24.16b},[x0],#16
1175	orr	v2.16b,v0.16b,v0.16b
1176	orr	v3.16b,v1.16b,v1.16b
1177	orr	v27.16b,v24.16b,v24.16b
1178	cmp	x2,#32
1179	b.lo	Loop3x_cbc_dec
1180
1181	ld1	{v25.16b},[x0],#16
1182	ld1	{v26.16b},[x0],#16
1183	sub	x2,x2,#32		// bias
1184	mov	w6,w5
1185	orr	v28.16b,v25.16b,v25.16b
1186	orr	v29.16b,v26.16b,v26.16b
1187
1188Loop5x_cbc_dec:
1189	aesd	v0.16b,v16.16b
1190	aesimc	v0.16b,v0.16b
1191	aesd	v1.16b,v16.16b
1192	aesimc	v1.16b,v1.16b
1193	aesd	v24.16b,v16.16b
1194	aesimc	v24.16b,v24.16b
1195	aesd	v25.16b,v16.16b
1196	aesimc	v25.16b,v25.16b
1197	aesd	v26.16b,v16.16b
1198	aesimc	v26.16b,v26.16b
1199	ld1	{v16.4s},[x7],#16
1200	subs	w6,w6,#2
1201	aesd	v0.16b,v17.16b
1202	aesimc	v0.16b,v0.16b
1203	aesd	v1.16b,v17.16b
1204	aesimc	v1.16b,v1.16b
1205	aesd	v24.16b,v17.16b
1206	aesimc	v24.16b,v24.16b
1207	aesd	v25.16b,v17.16b
1208	aesimc	v25.16b,v25.16b
1209	aesd	v26.16b,v17.16b
1210	aesimc	v26.16b,v26.16b
1211	ld1	{v17.4s},[x7],#16
1212	b.gt	Loop5x_cbc_dec
1213
1214	aesd	v0.16b,v16.16b
1215	aesimc	v0.16b,v0.16b
1216	aesd	v1.16b,v16.16b
1217	aesimc	v1.16b,v1.16b
1218	aesd	v24.16b,v16.16b
1219	aesimc	v24.16b,v24.16b
1220	aesd	v25.16b,v16.16b
1221	aesimc	v25.16b,v25.16b
1222	aesd	v26.16b,v16.16b
1223	aesimc	v26.16b,v26.16b
1224	cmp	x2,#0x40		// because Lcbc_tail4x
1225	sub	x2,x2,#0x50
1226
1227	aesd	v0.16b,v17.16b
1228	aesimc	v0.16b,v0.16b
1229	aesd	v1.16b,v17.16b
1230	aesimc	v1.16b,v1.16b
1231	aesd	v24.16b,v17.16b
1232	aesimc	v24.16b,v24.16b
1233	aesd	v25.16b,v17.16b
1234	aesimc	v25.16b,v25.16b
1235	aesd	v26.16b,v17.16b
1236	aesimc	v26.16b,v26.16b
1237	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
1238	mov	x7,x3
1239
1240	aesd	v0.16b,v18.16b
1241	aesimc	v0.16b,v0.16b
1242	aesd	v1.16b,v18.16b
1243	aesimc	v1.16b,v1.16b
1244	aesd	v24.16b,v18.16b
1245	aesimc	v24.16b,v24.16b
1246	aesd	v25.16b,v18.16b
1247	aesimc	v25.16b,v25.16b
1248	aesd	v26.16b,v18.16b
1249	aesimc	v26.16b,v26.16b
1250	add	x0,x0,x6		// x0 is adjusted in such way that
1251					// at exit from the loop v1.16b-v26.16b
1252					// are loaded with last "words"
1253	add	x6,x2,#0x60		// because Lcbc_tail4x
1254
1255	aesd	v0.16b,v19.16b
1256	aesimc	v0.16b,v0.16b
1257	aesd	v1.16b,v19.16b
1258	aesimc	v1.16b,v1.16b
1259	aesd	v24.16b,v19.16b
1260	aesimc	v24.16b,v24.16b
1261	aesd	v25.16b,v19.16b
1262	aesimc	v25.16b,v25.16b
1263	aesd	v26.16b,v19.16b
1264	aesimc	v26.16b,v26.16b
1265
1266	aesd	v0.16b,v20.16b
1267	aesimc	v0.16b,v0.16b
1268	aesd	v1.16b,v20.16b
1269	aesimc	v1.16b,v1.16b
1270	aesd	v24.16b,v20.16b
1271	aesimc	v24.16b,v24.16b
1272	aesd	v25.16b,v20.16b
1273	aesimc	v25.16b,v25.16b
1274	aesd	v26.16b,v20.16b
1275	aesimc	v26.16b,v26.16b
1276
1277	aesd	v0.16b,v21.16b
1278	aesimc	v0.16b,v0.16b
1279	aesd	v1.16b,v21.16b
1280	aesimc	v1.16b,v1.16b
1281	aesd	v24.16b,v21.16b
1282	aesimc	v24.16b,v24.16b
1283	aesd	v25.16b,v21.16b
1284	aesimc	v25.16b,v25.16b
1285	aesd	v26.16b,v21.16b
1286	aesimc	v26.16b,v26.16b
1287
1288	aesd	v0.16b,v22.16b
1289	aesimc	v0.16b,v0.16b
1290	aesd	v1.16b,v22.16b
1291	aesimc	v1.16b,v1.16b
1292	aesd	v24.16b,v22.16b
1293	aesimc	v24.16b,v24.16b
1294	aesd	v25.16b,v22.16b
1295	aesimc	v25.16b,v25.16b
1296	aesd	v26.16b,v22.16b
1297	aesimc	v26.16b,v26.16b
1298
1299	eor	v4.16b,v6.16b,v7.16b
1300	aesd	v0.16b,v23.16b
1301	eor	v5.16b,v2.16b,v7.16b
1302	ld1	{v2.16b},[x0],#16
1303	aesd	v1.16b,v23.16b
1304	eor	v17.16b,v3.16b,v7.16b
1305	ld1	{v3.16b},[x0],#16
1306	aesd	v24.16b,v23.16b
1307	eor	v30.16b,v27.16b,v7.16b
1308	ld1	{v27.16b},[x0],#16
1309	aesd	v25.16b,v23.16b
1310	eor	v31.16b,v28.16b,v7.16b
1311	ld1	{v28.16b},[x0],#16
1312	aesd	v26.16b,v23.16b
1313	orr	v6.16b,v29.16b,v29.16b
1314	ld1	{v29.16b},[x0],#16
1315	cbz	x6,Lcbc_tail4x
1316	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1317	eor	v4.16b,v4.16b,v0.16b
1318	orr	v0.16b,v2.16b,v2.16b
1319	eor	v5.16b,v5.16b,v1.16b
1320	orr	v1.16b,v3.16b,v3.16b
1321	eor	v17.16b,v17.16b,v24.16b
1322	orr	v24.16b,v27.16b,v27.16b
1323	eor	v30.16b,v30.16b,v25.16b
1324	orr	v25.16b,v28.16b,v28.16b
1325	eor	v31.16b,v31.16b,v26.16b
1326	st1	{v4.16b},[x1],#16
1327	orr	v26.16b,v29.16b,v29.16b
1328	st1	{v5.16b},[x1],#16
1329	mov	w6,w5
1330	st1	{v17.16b},[x1],#16
1331	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1332	st1	{v30.16b},[x1],#16
1333	st1	{v31.16b},[x1],#16
1334	b.hs	Loop5x_cbc_dec
1335
1336	add	x2,x2,#0x50
1337	cbz	x2,Lcbc_done
1338
1339	add	w6,w5,#2
1340	subs	x2,x2,#0x30
1341	orr	v0.16b,v27.16b,v27.16b
1342	orr	v2.16b,v27.16b,v27.16b
1343	orr	v1.16b,v28.16b,v28.16b
1344	orr	v3.16b,v28.16b,v28.16b
1345	orr	v24.16b,v29.16b,v29.16b
1346	orr	v27.16b,v29.16b,v29.16b
1347	b.lo	Lcbc_dec_tail
1348
1349	b	Loop3x_cbc_dec
1350
1351.align	4
1352Lcbc_tail4x:
1353	eor	v5.16b,v4.16b,v1.16b
1354	eor	v17.16b,v17.16b,v24.16b
1355	eor	v30.16b,v30.16b,v25.16b
1356	eor	v31.16b,v31.16b,v26.16b
1357	st1	{v5.16b},[x1],#16
1358	st1	{v17.16b},[x1],#16
1359	st1	{v30.16b},[x1],#16
1360	st1	{v31.16b},[x1],#16
1361
1362	b	Lcbc_done
1363.align	4
1364Loop3x_cbc_dec:
1365	aesd	v0.16b,v16.16b
1366	aesimc	v0.16b,v0.16b
1367	aesd	v1.16b,v16.16b
1368	aesimc	v1.16b,v1.16b
1369	aesd	v24.16b,v16.16b
1370	aesimc	v24.16b,v24.16b
1371	ld1	{v16.4s},[x7],#16
1372	subs	w6,w6,#2
1373	aesd	v0.16b,v17.16b
1374	aesimc	v0.16b,v0.16b
1375	aesd	v1.16b,v17.16b
1376	aesimc	v1.16b,v1.16b
1377	aesd	v24.16b,v17.16b
1378	aesimc	v24.16b,v24.16b
1379	ld1	{v17.4s},[x7],#16
1380	b.gt	Loop3x_cbc_dec
1381
1382	aesd	v0.16b,v16.16b
1383	aesimc	v0.16b,v0.16b
1384	aesd	v1.16b,v16.16b
1385	aesimc	v1.16b,v1.16b
1386	aesd	v24.16b,v16.16b
1387	aesimc	v24.16b,v24.16b
1388	eor	v4.16b,v6.16b,v7.16b
1389	subs	x2,x2,#0x30
1390	eor	v5.16b,v2.16b,v7.16b
1391	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
1392	aesd	v0.16b,v17.16b
1393	aesimc	v0.16b,v0.16b
1394	aesd	v1.16b,v17.16b
1395	aesimc	v1.16b,v1.16b
1396	aesd	v24.16b,v17.16b
1397	aesimc	v24.16b,v24.16b
1398	eor	v17.16b,v3.16b,v7.16b
1399	add	x0,x0,x6		// x0 is adjusted in such way that
1400					// at exit from the loop v1.16b-v24.16b
1401					// are loaded with last "words"
1402	orr	v6.16b,v27.16b,v27.16b
1403	mov	x7,x3
1404	aesd	v0.16b,v20.16b
1405	aesimc	v0.16b,v0.16b
1406	aesd	v1.16b,v20.16b
1407	aesimc	v1.16b,v1.16b
1408	aesd	v24.16b,v20.16b
1409	aesimc	v24.16b,v24.16b
1410	ld1	{v2.16b},[x0],#16
1411	aesd	v0.16b,v21.16b
1412	aesimc	v0.16b,v0.16b
1413	aesd	v1.16b,v21.16b
1414	aesimc	v1.16b,v1.16b
1415	aesd	v24.16b,v21.16b
1416	aesimc	v24.16b,v24.16b
1417	ld1	{v3.16b},[x0],#16
1418	aesd	v0.16b,v22.16b
1419	aesimc	v0.16b,v0.16b
1420	aesd	v1.16b,v22.16b
1421	aesimc	v1.16b,v1.16b
1422	aesd	v24.16b,v22.16b
1423	aesimc	v24.16b,v24.16b
1424	ld1	{v27.16b},[x0],#16
1425	aesd	v0.16b,v23.16b
1426	aesd	v1.16b,v23.16b
1427	aesd	v24.16b,v23.16b
1428	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1429	add	w6,w5,#2
1430	eor	v4.16b,v4.16b,v0.16b
1431	eor	v5.16b,v5.16b,v1.16b
1432	eor	v24.16b,v24.16b,v17.16b
1433	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1434	st1	{v4.16b},[x1],#16
1435	orr	v0.16b,v2.16b,v2.16b
1436	st1	{v5.16b},[x1],#16
1437	orr	v1.16b,v3.16b,v3.16b
1438	st1	{v24.16b},[x1],#16
1439	orr	v24.16b,v27.16b,v27.16b
1440	b.hs	Loop3x_cbc_dec
1441
1442	cmn	x2,#0x30
1443	b.eq	Lcbc_done
1444	nop
1445
1446Lcbc_dec_tail:
1447	aesd	v1.16b,v16.16b
1448	aesimc	v1.16b,v1.16b
1449	aesd	v24.16b,v16.16b
1450	aesimc	v24.16b,v24.16b
1451	ld1	{v16.4s},[x7],#16
1452	subs	w6,w6,#2
1453	aesd	v1.16b,v17.16b
1454	aesimc	v1.16b,v1.16b
1455	aesd	v24.16b,v17.16b
1456	aesimc	v24.16b,v24.16b
1457	ld1	{v17.4s},[x7],#16
1458	b.gt	Lcbc_dec_tail
1459
1460	aesd	v1.16b,v16.16b
1461	aesimc	v1.16b,v1.16b
1462	aesd	v24.16b,v16.16b
1463	aesimc	v24.16b,v24.16b
1464	aesd	v1.16b,v17.16b
1465	aesimc	v1.16b,v1.16b
1466	aesd	v24.16b,v17.16b
1467	aesimc	v24.16b,v24.16b
1468	aesd	v1.16b,v20.16b
1469	aesimc	v1.16b,v1.16b
1470	aesd	v24.16b,v20.16b
1471	aesimc	v24.16b,v24.16b
1472	cmn	x2,#0x20
1473	aesd	v1.16b,v21.16b
1474	aesimc	v1.16b,v1.16b
1475	aesd	v24.16b,v21.16b
1476	aesimc	v24.16b,v24.16b
1477	eor	v5.16b,v6.16b,v7.16b
1478	aesd	v1.16b,v22.16b
1479	aesimc	v1.16b,v1.16b
1480	aesd	v24.16b,v22.16b
1481	aesimc	v24.16b,v24.16b
1482	eor	v17.16b,v3.16b,v7.16b
1483	aesd	v1.16b,v23.16b
1484	aesd	v24.16b,v23.16b
1485	b.eq	Lcbc_dec_one
1486	eor	v5.16b,v5.16b,v1.16b
1487	eor	v17.16b,v17.16b,v24.16b
1488	orr	v6.16b,v27.16b,v27.16b
1489	st1	{v5.16b},[x1],#16
1490	st1	{v17.16b},[x1],#16
1491	b	Lcbc_done
1492
1493Lcbc_dec_one:
1494	eor	v5.16b,v5.16b,v24.16b
1495	orr	v6.16b,v27.16b,v27.16b
1496	st1	{v5.16b},[x1],#16
1497
1498Lcbc_done:
1499	st1	{v6.16b},[x4]
1500Lcbc_abort:
1501	ldr	x29,[sp],#16
1502	ret
1503
// ---------------------------------------------------------------------
// _aes_v8_ctr32_encrypt_blocks
//
// AES counter-mode keystream generation and XOR using the ARMv8 AES
// instructions. Only the last 32-bit word of the counter block is
// incremented (it is handled as a big-endian integer); the other twelve
// bytes are copied unchanged into every counter block.
//
// In (as used by the code below):
//   x0 = source pointer, 16 bytes are read per block
//   x1 = destination pointer, 16 bytes are written per block
//   x2 = number of 16-byte blocks
//   x3 = key schedule; the round count is read from byte offset 240
//   x4 = 16-byte initial counter block (read only, never written back)
//
// Register roles:
//   v16,v17     = round keys streamed through x7 inside the round loops
//   v20-v23,v7  = last five round keys, kept resident (v7 = final key)
//   v6          = untouched copy of the counter block, used as template
//   w8          = running counter value in host byte order
//   w5 / w6     = rounds-6 / working copy used as round-loop counter
//
// Dispatch: up to 2 blocks -> Lctr32_tail, otherwise Loop3x_ctr32, and
// Loop5x_ctr32 first when at least 35 blocks were requested.
//
// Clobbers: x0-x2, x5-x10, x12-x14, v0-v7, v16-v27, flags.
// NOTE(review): x2 == 0 also reaches Lctr32_tail, which still loads one
// block and stores two; callers presumably never pass 0 -- confirm.
// ---------------------------------------------------------------------
1504.globl	_aes_v8_ctr32_encrypt_blocks
1505
1506.align	5
1507_aes_v8_ctr32_encrypt_blocks:
1508	stp	x29,x30,[sp,#-16]!	// frame record; x30 is never modified here
1509	add	x29,sp,#0
1510	ldr	w5,[x3,#240]	// w5 = number of rounds
1511
1512	ldr	w8, [x4, #12]	// w8 = last 32-bit word of the counter block
1513#ifdef __ARMEB__
1514	ld1	{v0.16b},[x4]
1515#else
1516	ld1	{v0.4s},[x4]
1517#endif
1518	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
1519	sub	w5,w5,#4
1520	mov	x12,#16
1521	cmp	x2,#2	// flags are consumed by the csel and b.ls further down
1522	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
1523	sub	w5,w5,#2	// w5 = rounds-6, iteration budget of the round loops
1524	ld1	{v20.4s,v21.4s},[x7],#32
1525	ld1	{v22.4s,v23.4s},[x7],#32
1526	ld1	{v7.4s},[x7]	// v7 = final round key
1527	add	x7,x3,#32	// x7 -> third round key (first two are in v16/v17)
1528	mov	w6,w5
1529	csel	x12,xzr,x12,lo	// x12 = 0 when fewer than 2 blocks, else 16
1530#ifndef __ARMEB__
1531	rev	w8, w8
1532#endif
1533	orr	v1.16b,v0.16b,v0.16b
1534	add	w10, w8, #1
1535	orr	v18.16b,v0.16b,v0.16b
1536	add	w8, w8, #2
1537	orr	v6.16b,v0.16b,v0.16b	// v6 = template for later counter blocks
1538	rev	w10, w10
1539	mov	v1.s[3],w10	// v1 = counter+1
1540	b.ls	Lctr32_tail	// at most 2 blocks requested
1541	rev	w12, w8
1542	sub	x2,x2,#3		// bias
1543	mov	v18.s[3],w12	// v18 = counter+2
1544	cmp	x2,#32
1545	b.lo	Loop3x_ctr32	// fewer than 35 blocks: skip the 5-block loop
1546
1547	add	w13,w8,#1
1548	add	w14,w8,#2
1549	orr	v24.16b,v0.16b,v0.16b
1550	rev	w13,w13
1551	orr	v25.16b,v0.16b,v0.16b
1552	rev	w14,w14
1553	mov	v24.s[3],w13	// v24 = counter+3
1554	sub	x2,x2,#2		// bias
1555	mov	v25.s[3],w14	// v25 = counter+4
1556	add	w8,w8,#2	// w8 = counter value held in the last lane (v25)
1557	b	Loop5x_ctr32
1558
1559.align	4
1560Loop5x_ctr32:	// two rounds per pass on five counter blocks
1561	aese	v0.16b,v16.16b
1562	aesmc	v0.16b,v0.16b
1563	aese	v1.16b,v16.16b
1564	aesmc	v1.16b,v1.16b
1565	aese	v18.16b,v16.16b
1566	aesmc	v18.16b,v18.16b
1567	aese	v24.16b,v16.16b
1568	aesmc	v24.16b,v24.16b
1569	aese	v25.16b,v16.16b
1570	aesmc	v25.16b,v25.16b
1571	ld1	{v16.4s},[x7],#16
1572	subs	w6,w6,#2
1573	aese	v0.16b,v17.16b
1574	aesmc	v0.16b,v0.16b
1575	aese	v1.16b,v17.16b
1576	aesmc	v1.16b,v1.16b
1577	aese	v18.16b,v17.16b
1578	aesmc	v18.16b,v18.16b
1579	aese	v24.16b,v17.16b
1580	aesmc	v24.16b,v24.16b
1581	aese	v25.16b,v17.16b
1582	aesmc	v25.16b,v25.16b
1583	ld1	{v17.4s},[x7],#16
1584	b.gt	Loop5x_ctr32
1585
1586	mov	x7,x3	// rewind the key pointer for the next batch
1587	aese	v0.16b,v16.16b
1588	aesmc	v0.16b,v0.16b
1589	aese	v1.16b,v16.16b
1590	aesmc	v1.16b,v1.16b
1591	aese	v18.16b,v16.16b
1592	aesmc	v18.16b,v18.16b
1593	aese	v24.16b,v16.16b
1594	aesmc	v24.16b,v24.16b
1595	aese	v25.16b,v16.16b
1596	aesmc	v25.16b,v25.16b
1597	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1598
1599	aese	v0.16b,v17.16b
1600	aesmc	v0.16b,v0.16b
1601	aese	v1.16b,v17.16b
1602	aesmc	v1.16b,v1.16b
1603	aese	v18.16b,v17.16b
1604	aesmc	v18.16b,v18.16b
1605	aese	v24.16b,v17.16b
1606	aesmc	v24.16b,v24.16b
1607	aese	v25.16b,v17.16b
1608	aesmc	v25.16b,v25.16b
1609	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1610
1611	aese	v0.16b,v20.16b
1612	aesmc	v0.16b,v0.16b
1613	add	w9,w8,#1	// w9,w10,w12,w13,w14 = next five counter values
1614	add	w10,w8,#2
1615	aese	v1.16b,v20.16b
1616	aesmc	v1.16b,v1.16b
1617	add	w12,w8,#3
1618	add	w13,w8,#4
1619	aese	v18.16b,v20.16b
1620	aesmc	v18.16b,v18.16b
1621	add	w14,w8,#5
1622	rev	w9,w9
1623	aese	v24.16b,v20.16b
1624	aesmc	v24.16b,v24.16b
1625	rev	w10,w10
1626	rev	w12,w12
1627	aese	v25.16b,v20.16b
1628	aesmc	v25.16b,v25.16b
1629	rev	w13,w13
1630	rev	w14,w14
1631
1632	aese	v0.16b,v21.16b
1633	aesmc	v0.16b,v0.16b
1634	aese	v1.16b,v21.16b
1635	aesmc	v1.16b,v1.16b
1636	aese	v18.16b,v21.16b
1637	aesmc	v18.16b,v18.16b
1638	aese	v24.16b,v21.16b
1639	aesmc	v24.16b,v24.16b
1640	aese	v25.16b,v21.16b
1641	aesmc	v25.16b,v25.16b
1642
1643	aese	v0.16b,v22.16b
1644	aesmc	v0.16b,v0.16b
1645	ld1	{v2.16b},[x0],#16	// v2,v3,v19,v26,v27 = five input blocks
1646	aese	v1.16b,v22.16b
1647	aesmc	v1.16b,v1.16b
1648	ld1	{v3.16b},[x0],#16
1649	aese	v18.16b,v22.16b
1650	aesmc	v18.16b,v18.16b
1651	ld1	{v19.16b},[x0],#16
1652	aese	v24.16b,v22.16b
1653	aesmc	v24.16b,v24.16b
1654	ld1	{v26.16b},[x0],#16
1655	aese	v25.16b,v22.16b
1656	aesmc	v25.16b,v25.16b
1657	ld1	{v27.16b},[x0],#16
1658
1659	aese	v0.16b,v23.16b	// last aese; the final key v7 is folded into the input instead
1660	eor	v2.16b,v2.16b,v7.16b
1661	aese	v1.16b,v23.16b
1662	eor	v3.16b,v3.16b,v7.16b
1663	aese	v18.16b,v23.16b
1664	eor	v19.16b,v19.16b,v7.16b
1665	aese	v24.16b,v23.16b
1666	eor	v26.16b,v26.16b,v7.16b
1667	aese	v25.16b,v23.16b
1668	eor	v27.16b,v27.16b,v7.16b
1669
1670	eor	v2.16b,v2.16b,v0.16b	// output = input ^ keystream
1671	orr	v0.16b,v6.16b,v6.16b	// reset each lane to the counter template
1672	eor	v3.16b,v3.16b,v1.16b
1673	orr	v1.16b,v6.16b,v6.16b
1674	eor	v19.16b,v19.16b,v18.16b
1675	orr	v18.16b,v6.16b,v6.16b
1676	eor	v26.16b,v26.16b,v24.16b
1677	orr	v24.16b,v6.16b,v6.16b
1678	eor	v27.16b,v27.16b,v25.16b
1679	orr	v25.16b,v6.16b,v6.16b
1680
1681	st1	{v2.16b},[x1],#16
1682	mov	v0.s[3],w9	// insert the byte-swapped counters for the next batch
1683	st1	{v3.16b},[x1],#16
1684	mov	v1.s[3],w10
1685	st1	{v19.16b},[x1],#16
1686	mov	v18.s[3],w12
1687	st1	{v26.16b},[x1],#16
1688	mov	v24.s[3],w13
1689	st1	{v27.16b},[x1],#16
1690	mov	v25.s[3],w14
1691
1692	mov	w6,w5	// reload the round-loop counter
1693	cbz	x2,Lctr32_done	// biased count hit zero: nothing left
1694
1695	add	w8,w8,#5
1696	subs	x2,x2,#5
1697	b.hs	Loop5x_ctr32	// at least five more blocks
1698
1699	add	x2,x2,#5	// undo the last subtraction: x2 = blocks remaining
1700	sub	w8,w8,#5	// ... and the matching counter advance
1701
1702	cmp	x2,#2
1703	mov	x12,#16
1704	csel	x12,xzr,x12,lo	// x12 = 0 for a single remaining block
1705	b.ls	Lctr32_tail	// 1 or 2 blocks remain
1706
1707	sub	x2,x2,#3		// bias
1708	add	w8,w8,#3	// w8 = counter value already sitting in v18
1709	b	Loop3x_ctr32
1710
1711.align	4
1712Loop3x_ctr32:	// two rounds per pass on three counter blocks
1713	aese	v0.16b,v16.16b
1714	aesmc	v0.16b,v0.16b
1715	aese	v1.16b,v16.16b
1716	aesmc	v1.16b,v1.16b
1717	aese	v18.16b,v16.16b
1718	aesmc	v18.16b,v18.16b
1719	ld1	{v16.4s},[x7],#16
1720	subs	w6,w6,#2
1721	aese	v0.16b,v17.16b
1722	aesmc	v0.16b,v0.16b
1723	aese	v1.16b,v17.16b
1724	aesmc	v1.16b,v1.16b
1725	aese	v18.16b,v17.16b
1726	aesmc	v18.16b,v18.16b
1727	ld1	{v17.4s},[x7],#16
1728	b.gt	Loop3x_ctr32
1729
1730	aese	v0.16b,v16.16b
1731	aesmc	v4.16b,v0.16b	// state moves to v4/v5 so v0/v1 can take new counters
1732	aese	v1.16b,v16.16b
1733	aesmc	v5.16b,v1.16b
1734	ld1	{v2.16b},[x0],#16
1735	orr	v0.16b,v6.16b,v6.16b
1736	aese	v18.16b,v16.16b
1737	aesmc	v18.16b,v18.16b
1738	ld1	{v3.16b},[x0],#16
1739	orr	v1.16b,v6.16b,v6.16b
1740	aese	v4.16b,v17.16b
1741	aesmc	v4.16b,v4.16b
1742	aese	v5.16b,v17.16b
1743	aesmc	v5.16b,v5.16b
1744	ld1	{v19.16b},[x0],#16
1745	mov	x7,x3
1746	aese	v18.16b,v17.16b	// last use of v17 as a round key in this pass
1747	aesmc	v17.16b,v18.16b	// third state continues in v17
1748	orr	v18.16b,v6.16b,v6.16b
1749	add	w9,w8,#1
1750	aese	v4.16b,v20.16b
1751	aesmc	v4.16b,v4.16b
1752	aese	v5.16b,v20.16b
1753	aesmc	v5.16b,v5.16b
1754	eor	v2.16b,v2.16b,v7.16b	// fold the final round key into the input
1755	add	w10,w8,#2
1756	aese	v17.16b,v20.16b
1757	aesmc	v17.16b,v17.16b
1758	eor	v3.16b,v3.16b,v7.16b
1759	add	w8,w8,#3
1760	aese	v4.16b,v21.16b
1761	aesmc	v4.16b,v4.16b
1762	aese	v5.16b,v21.16b
1763	aesmc	v5.16b,v5.16b
1764	eor	v19.16b,v19.16b,v7.16b
1765	rev	w9,w9
1766	aese	v17.16b,v21.16b
1767	aesmc	v17.16b,v17.16b
1768	mov	v0.s[3], w9
1769	rev	w10,w10
1770	aese	v4.16b,v22.16b
1771	aesmc	v4.16b,v4.16b
1772	aese	v5.16b,v22.16b
1773	aesmc	v5.16b,v5.16b
1774	mov	v1.s[3], w10
1775	rev	w12,w8
1776	aese	v17.16b,v22.16b
1777	aesmc	v17.16b,v17.16b
1778	mov	v18.s[3], w12
1779	subs	x2,x2,#3	// flags feed the b.hs at the bottom of this pass
1780	aese	v4.16b,v23.16b
1781	aese	v5.16b,v23.16b
1782	aese	v17.16b,v23.16b
1783
1784	eor	v2.16b,v2.16b,v4.16b
1785	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
1786	st1	{v2.16b},[x1],#16
1787	eor	v3.16b,v3.16b,v5.16b
1788	mov	w6,w5
1789	st1	{v3.16b},[x1],#16
1790	eor	v19.16b,v19.16b,v17.16b
1791	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
1792	st1	{v19.16b},[x1],#16
1793	b.hs	Loop3x_ctr32
1794
1795	adds	x2,x2,#3	// remove the bias: x2 = blocks remaining (0..2)
1796	b.eq	Lctr32_done
1797	cmp	x2,#1
1798	mov	x12,#16
1799	csel	x12,xzr,x12,eq	// x12 = 0 for a single remaining block
1800
1801Lctr32_tail:	// last one or two blocks; v0 and v1 hold their counters
1802	aese	v0.16b,v16.16b
1803	aesmc	v0.16b,v0.16b
1804	aese	v1.16b,v16.16b
1805	aesmc	v1.16b,v1.16b
1806	ld1	{v16.4s},[x7],#16
1807	subs	w6,w6,#2
1808	aese	v0.16b,v17.16b
1809	aesmc	v0.16b,v0.16b
1810	aese	v1.16b,v17.16b
1811	aesmc	v1.16b,v1.16b
1812	ld1	{v17.4s},[x7],#16
1813	b.gt	Lctr32_tail
1814
1815	aese	v0.16b,v16.16b
1816	aesmc	v0.16b,v0.16b
1817	aese	v1.16b,v16.16b
1818	aesmc	v1.16b,v1.16b
1819	aese	v0.16b,v17.16b
1820	aesmc	v0.16b,v0.16b
1821	aese	v1.16b,v17.16b
1822	aesmc	v1.16b,v1.16b
1823	ld1	{v2.16b},[x0],x12	// x12 = 0 keeps x0 on the same block
1824	aese	v0.16b,v20.16b
1825	aesmc	v0.16b,v0.16b
1826	aese	v1.16b,v20.16b
1827	aesmc	v1.16b,v1.16b
1828	ld1	{v3.16b},[x0]	// second block, or the first one again when x12 = 0
1829	aese	v0.16b,v21.16b
1830	aesmc	v0.16b,v0.16b
1831	aese	v1.16b,v21.16b
1832	aesmc	v1.16b,v1.16b
1833	eor	v2.16b,v2.16b,v7.16b
1834	aese	v0.16b,v22.16b
1835	aesmc	v0.16b,v0.16b
1836	aese	v1.16b,v22.16b
1837	aesmc	v1.16b,v1.16b
1838	eor	v3.16b,v3.16b,v7.16b
1839	aese	v0.16b,v23.16b
1840	aese	v1.16b,v23.16b
1841
1842	cmp	x2,#1
1843	eor	v2.16b,v2.16b,v0.16b
1844	eor	v3.16b,v3.16b,v1.16b
1845	st1	{v2.16b},[x1],#16
1846	b.eq	Lctr32_done	// exactly one block: skip the second store
1847	st1	{v3.16b},[x1]
1848
1849Lctr32_done:
1850	ldr	x29,[sp],#16	// pop the frame record, reloading x29 only
1851	ret
1852
// ---------------------------------------------------------------------
// _aes_v8_xts_encrypt
//
// AES-XTS encryption using the ARMv8 AES instructions.
//
// In (as used by the code below):
//   x0 = input pointer
//   x1 = output pointer
//   x2 = length in bytes
//   x3 = key schedule used on the data (key1), rounds at byte offset 240
//   x4 = key schedule used on the IV (key2), rounds at byte offset 240
//   x5 = pointer to the 16-byte IV
//
// The IV is first encrypted with key2 to form the initial tweak (v6).
// Later tweaks are obtained with the recurring four-instruction idiom
//   extr x22,x10,x10,#32 / extr x10,x10,x9,#63 /
//   and  w11,w19,w22,asr#31 / eor x9,x11,x9,lsl#1
// which shifts the 128-bit value x10:x9 left by one bit and XORs 0x87
// (kept in w19) into the low word when the bit shifted out was set.
//
// Length handling:
//   == 16          single block, handled without touching the stack
//   <  16          returns without writing any output
//   otherwise      whole blocks go through the 5-block loop and the
//                  3-block / 1-2 block tails; a trailing length%16
//                  bytes are finished by swapping bytes with the last
//                  ciphertext block and encrypting it once more
//                  (see Lxts_enc_done).
//
// Register roles on the general path:
//   v6,v8,v9,v10,v11 = tweaks; x10:x9 = most recently computed tweak
//   v16,v17          = round keys streamed through x7
//   v18-v23,v7       = last seven round keys (v7 = final key)
//   x21              = length % 16, x2 = biased count of remaining bytes
//
// Clobbers: x0-x11, x13-x15, v0-v7, v16-v31, flags. x19-x22 and d8-d11
// are saved and restored on the general path. x3 and x4 are advanced.
// ---------------------------------------------------------------------
1853.globl	_aes_v8_xts_encrypt
1854
1855.align	5
1856_aes_v8_xts_encrypt:
1857	cmp	x2,#16
1858	// Any length other than exactly 16 bytes takes the general path.
1859	b.ne	Lxts_enc_big_size
1860	// Encrypt the iv with key2, as the first XEX iv.
1861	ldr	w6,[x4,#240]	// w6 = rounds of key2
1862	ld1	{v0.4s},[x4],#16
1863	ld1	{v6.16b},[x5]	// v6 = IV
1864	sub	w6,w6,#2
1865	ld1	{v1.4s},[x4],#16
1866
1867Loop_enc_iv_enc:
1868	aese	v6.16b,v0.16b
1869	aesmc	v6.16b,v6.16b
1870	ld1	{v0.4s},[x4],#16
1871	subs	w6,w6,#2
1872	aese	v6.16b,v1.16b
1873	aesmc	v6.16b,v6.16b
1874	ld1	{v1.4s},[x4],#16
1875	b.gt	Loop_enc_iv_enc
1876
1877	aese	v6.16b,v0.16b
1878	aesmc	v6.16b,v6.16b
1879	ld1	{v0.4s},[x4]
1880	aese	v6.16b,v1.16b
1881	eor	v6.16b,v6.16b,v0.16b	// v6 = tweak for the only block
1882
1883	ld1	{v0.16b},[x0]
1884	eor	v0.16b,v6.16b,v0.16b	// input ^ tweak
1885
1886	ldr	w6,[x3,#240]	// w6 = rounds of key1
1887	ld1	{v28.4s,v29.4s},[x3],#32		// load key schedule...
1888
1889	aese	v0.16b,v28.16b
1890	aesmc	v0.16b,v0.16b
1891	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
1892	aese	v0.16b,v29.16b
1893	aesmc	v0.16b,v0.16b
1894	subs	w6,w6,#10		// if rounds==10, jump to aes-128-xts processing
1895	b.eq	Lxts_128_enc
1896Lxts_enc_round_loop:	// extra round pairs beyond the ten done below
1897	aese	v0.16b,v16.16b
1898	aesmc	v0.16b,v0.16b
1899	ld1	{v16.4s},[x3],#16		// load key schedule...
1900	aese	v0.16b,v17.16b
1901	aesmc	v0.16b,v0.16b
1902	ld1	{v17.4s},[x3],#16		// load key schedule...
1903	subs	w6,w6,#2		// two rounds consumed
1904	b.gt	Lxts_enc_round_loop
1905Lxts_128_enc:
1906	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
1907	aese	v0.16b,v16.16b
1908	aesmc	v0.16b,v0.16b
1909	aese	v0.16b,v17.16b
1910	aesmc	v0.16b,v0.16b
1911	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
1912	aese	v0.16b,v18.16b
1913	aesmc	v0.16b,v0.16b
1914	aese	v0.16b,v19.16b
1915	aesmc	v0.16b,v0.16b
1916	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
1917	aese	v0.16b,v20.16b
1918	aesmc	v0.16b,v0.16b
1919	aese	v0.16b,v21.16b
1920	aesmc	v0.16b,v0.16b
1921	ld1	{v7.4s},[x3]	// v7 = final round key
1922	aese	v0.16b,v22.16b
1923	aesmc	v0.16b,v0.16b
1924	aese	v0.16b,v23.16b
1925	eor	v0.16b,v0.16b,v7.16b
1926	eor	v0.16b,v0.16b,v6.16b	// ^ tweak again
1927	st1	{v0.16b},[x1]
1928	b	Lxts_enc_final_abort	// nothing was pushed on this path
1929
1930.align	4
1931Lxts_enc_big_size:
1932	stp	x19,x20,[sp,#-64]!	// 64-byte frame: x19-x22 and d8-d11
1933	stp	x21,x22,[sp,#48]
1934	stp	d8,d9,[sp,#32]
1935	stp	d10,d11,[sp,#16]
1936
1937	// tailcnt store the tail value of length%16.
1938	and	x21,x2,#0xf
1939	and	x2,x2,#-16
1940	subs	x2,x2,#16	// x2 = whole-block bytes minus 16
1941	mov	x8,#16
1942	b.lo	Lxts_abort	// length was below 16: nothing to do
1943	csel	x8,xzr,x8,eq	// exactly one whole block: first load must not advance x0
1944
1945	// Firstly, encrypt the iv with key2, as the first iv of XEX.
1946	ldr	w6,[x4,#240]
1947	ld1	{v0.4s},[x4],#16
1948	ld1	{v6.16b},[x5]
1949	sub	w6,w6,#2
1950	ld1	{v1.4s},[x4],#16
1951
1952Loop_iv_enc:
1953	aese	v6.16b,v0.16b
1954	aesmc	v6.16b,v6.16b
1955	ld1	{v0.4s},[x4],#16
1956	subs	w6,w6,#2
1957	aese	v6.16b,v1.16b
1958	aesmc	v6.16b,v6.16b
1959	ld1	{v1.4s},[x4],#16
1960	b.gt	Loop_iv_enc
1961
1962	aese	v6.16b,v0.16b
1963	aesmc	v6.16b,v6.16b
1964	ld1	{v0.4s},[x4]
1965	aese	v6.16b,v1.16b
1966	eor	v6.16b,v6.16b,v0.16b	// v6 = first tweak
1967
1968	// The iv for second block
1969	// x9- iv(low), x10 - iv(high)
1970	// the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
1971	fmov	x9,d6
1972	fmov	x10,v6.d[1]
1973	mov	w19,#0x87	// constant XORed in when the tweak overflows
1974	extr	x22,x10,x10,#32	// w22 = upper 32 bits of x10
1975	extr	x10,x10,x9,#63	// x10 = x10<<1 | x9>>63
1976	and	w11,w19,w22,asr#31	// 0x87 if the old top bit was set, else 0
1977	eor	x9,x11,x9,lsl#1
1978	fmov	d8,x9
1979	fmov	v8.d[1],x10
1980
1981	ldr	w5,[x3,#240]		// w5 = rounds of key1
1982	ld1	{v0.16b},[x0],x8	// first block; x0 stays put when x8 = 0
1983
1984	ld1	{v16.4s,v17.4s},[x3]			// load key schedule...
1985	sub	w5,w5,#6
1986	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
1987	sub	w5,w5,#2	// w5 = rounds-8, round-loop budget of the 5-block loop
1988	ld1	{v18.4s,v19.4s},[x7],#32
1989	ld1	{v20.4s,v21.4s},[x7],#32
1990	ld1	{v22.4s,v23.4s},[x7],#32
1991	ld1	{v7.4s},[x7]	// v7 = final round key
1992
1993	add	x7,x3,#32	// x7 -> third round key
1994	mov	w6,w5
1995
1996	// Encryption
1997Lxts_enc:
1998	ld1	{v24.16b},[x0],#16
1999	subs	x2,x2,#32			// bias
2000	add	w6,w5,#2	// the tails run two more looped rounds (v18/v19 unused there)
2001	orr	v3.16b,v0.16b,v0.16b
2002	orr	v1.16b,v0.16b,v0.16b
2003	orr	v28.16b,v0.16b,v0.16b
2004	orr	v27.16b,v24.16b,v24.16b
2005	orr	v29.16b,v24.16b,v24.16b
2006	b.lo	Lxts_inner_enc_tail	// fewer than three whole blocks
2007	eor	v0.16b,v0.16b,v6.16b			// before encryption, xor with iv
2008	eor	v24.16b,v24.16b,v8.16b
2009
2010	// The iv for third block
2011	extr	x22,x10,x10,#32
2012	extr	x10,x10,x9,#63
2013	and	w11,w19,w22,asr#31
2014	eor	x9,x11,x9,lsl#1
2015	fmov	d9,x9
2016	fmov	v9.d[1],x10
2017
2018
2019	orr	v1.16b,v24.16b,v24.16b
2020	ld1	{v24.16b},[x0],#16
2021	orr	v2.16b,v0.16b,v0.16b
2022	orr	v3.16b,v1.16b,v1.16b
2023	eor	v27.16b,v24.16b,v9.16b 		// the third block
2024	eor	v24.16b,v24.16b,v9.16b
2025	cmp	x2,#32
2026	b.lo	Lxts_outer_enc_tail	// fewer than five whole blocks
2027
2028	// The iv for fourth block
2029	extr	x22,x10,x10,#32
2030	extr	x10,x10,x9,#63
2031	and	w11,w19,w22,asr#31
2032	eor	x9,x11,x9,lsl#1
2033	fmov	d10,x9
2034	fmov	v10.d[1],x10
2035
2036	ld1	{v25.16b},[x0],#16
2037	// The iv for fifth block
2038	extr	x22,x10,x10,#32
2039	extr	x10,x10,x9,#63
2040	and	w11,w19,w22,asr#31
2041	eor	x9,x11,x9,lsl#1
2042	fmov	d11,x9
2043	fmov	v11.d[1],x10
2044
2045	ld1	{v26.16b},[x0],#16
2046	eor	v25.16b,v25.16b,v10.16b		// the fourth block
2047	eor	v26.16b,v26.16b,v11.16b
2048	sub	x2,x2,#32			// bias
2049	mov	w6,w5
2050	b	Loop5x_xts_enc
2051
2052.align	4
2053Loop5x_xts_enc:	// two rounds per pass on five tweaked blocks
2054	aese	v0.16b,v16.16b
2055	aesmc	v0.16b,v0.16b
2056	aese	v1.16b,v16.16b
2057	aesmc	v1.16b,v1.16b
2058	aese	v24.16b,v16.16b
2059	aesmc	v24.16b,v24.16b
2060	aese	v25.16b,v16.16b
2061	aesmc	v25.16b,v25.16b
2062	aese	v26.16b,v16.16b
2063	aesmc	v26.16b,v26.16b
2064	ld1	{v16.4s},[x7],#16
2065	subs	w6,w6,#2
2066	aese	v0.16b,v17.16b
2067	aesmc	v0.16b,v0.16b
2068	aese	v1.16b,v17.16b
2069	aesmc	v1.16b,v1.16b
2070	aese	v24.16b,v17.16b
2071	aesmc	v24.16b,v24.16b
2072	aese	v25.16b,v17.16b
2073	aesmc	v25.16b,v25.16b
2074	aese	v26.16b,v17.16b
2075	aesmc	v26.16b,v26.16b
2076	ld1	{v17.4s},[x7],#16
2077	b.gt	Loop5x_xts_enc
2078
2079	aese	v0.16b,v16.16b
2080	aesmc	v0.16b,v0.16b
2081	aese	v1.16b,v16.16b
2082	aesmc	v1.16b,v1.16b
2083	aese	v24.16b,v16.16b
2084	aesmc	v24.16b,v24.16b
2085	aese	v25.16b,v16.16b
2086	aesmc	v25.16b,v25.16b
2087	aese	v26.16b,v16.16b
2088	aesmc	v26.16b,v26.16b
2089	subs	x2,x2,#0x50			// because Lxts_enc_tail4x
2090
2091	aese	v0.16b,v17.16b
2092	aesmc	v0.16b,v0.16b
2093	aese	v1.16b,v17.16b
2094	aesmc	v1.16b,v1.16b
2095	aese	v24.16b,v17.16b
2096	aesmc	v24.16b,v24.16b
2097	aese	v25.16b,v17.16b
2098	aesmc	v25.16b,v25.16b
2099	aese	v26.16b,v17.16b
2100	aesmc	v26.16b,v26.16b
2101	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
2102	mov	x7,x3
2103
2104	aese	v0.16b,v18.16b
2105	aesmc	v0.16b,v0.16b
2106	aese	v1.16b,v18.16b
2107	aesmc	v1.16b,v1.16b
2108	aese	v24.16b,v18.16b
2109	aesmc	v24.16b,v24.16b
2110	aese	v25.16b,v18.16b
2111	aesmc	v25.16b,v25.16b
2112	aese	v26.16b,v18.16b
2113	aesmc	v26.16b,v26.16b
2114	add	x0,x0,x6		// x0 is adjusted in such way that
2115						// at exit from the loop v1.16b-v26.16b
2116						// are loaded with last "words"
2117	add	x6,x2,#0x60		// because Lxts_enc_tail4x
2118
2119	aese	v0.16b,v19.16b
2120	aesmc	v0.16b,v0.16b
2121	aese	v1.16b,v19.16b
2122	aesmc	v1.16b,v1.16b
2123	aese	v24.16b,v19.16b
2124	aesmc	v24.16b,v24.16b
2125	aese	v25.16b,v19.16b
2126	aesmc	v25.16b,v25.16b
2127	aese	v26.16b,v19.16b
2128	aesmc	v26.16b,v26.16b
2129
2130	aese	v0.16b,v20.16b
2131	aesmc	v0.16b,v0.16b
2132	aese	v1.16b,v20.16b
2133	aesmc	v1.16b,v1.16b
2134	aese	v24.16b,v20.16b
2135	aesmc	v24.16b,v24.16b
2136	aese	v25.16b,v20.16b
2137	aesmc	v25.16b,v25.16b
2138	aese	v26.16b,v20.16b
2139	aesmc	v26.16b,v26.16b
2140
2141	aese	v0.16b,v21.16b
2142	aesmc	v0.16b,v0.16b
2143	aese	v1.16b,v21.16b
2144	aesmc	v1.16b,v1.16b
2145	aese	v24.16b,v21.16b
2146	aesmc	v24.16b,v24.16b
2147	aese	v25.16b,v21.16b
2148	aesmc	v25.16b,v25.16b
2149	aese	v26.16b,v21.16b
2150	aesmc	v26.16b,v26.16b
2151
2152	aese	v0.16b,v22.16b
2153	aesmc	v0.16b,v0.16b
2154	aese	v1.16b,v22.16b
2155	aesmc	v1.16b,v1.16b
2156	aese	v24.16b,v22.16b
2157	aesmc	v24.16b,v24.16b
2158	aese	v25.16b,v22.16b
2159	aesmc	v25.16b,v25.16b
2160	aese	v26.16b,v22.16b
2161	aesmc	v26.16b,v26.16b
2162
2163	eor	v4.16b,v7.16b,v6.16b	// v4,v5,v17,v30,v31 = final key ^ this batch's tweaks
2164	aese	v0.16b,v23.16b
2165	// The iv for first block of one iteration
2166	extr	x22,x10,x10,#32
2167	extr	x10,x10,x9,#63
2168	and	w11,w19,w22,asr#31
2169	eor	x9,x11,x9,lsl#1
2170	fmov	d6,x9
2171	fmov	v6.d[1],x10
2172	eor	v5.16b,v7.16b,v8.16b
2173	ld1	{v2.16b},[x0],#16
2174	aese	v1.16b,v23.16b
2175	// The iv for second block
2176	extr	x22,x10,x10,#32
2177	extr	x10,x10,x9,#63
2178	and	w11,w19,w22,asr#31
2179	eor	x9,x11,x9,lsl#1
2180	fmov	d8,x9
2181	fmov	v8.d[1],x10
2182	eor	v17.16b,v7.16b,v9.16b
2183	ld1	{v3.16b},[x0],#16
2184	aese	v24.16b,v23.16b
2185	// The iv for third block
2186	extr	x22,x10,x10,#32
2187	extr	x10,x10,x9,#63
2188	and	w11,w19,w22,asr#31
2189	eor	x9,x11,x9,lsl#1
2190	fmov	d9,x9
2191	fmov	v9.d[1],x10
2192	eor	v30.16b,v7.16b,v10.16b
2193	ld1	{v27.16b},[x0],#16
2194	aese	v25.16b,v23.16b
2195	// The iv for fourth block
2196	extr	x22,x10,x10,#32
2197	extr	x10,x10,x9,#63
2198	and	w11,w19,w22,asr#31
2199	eor	x9,x11,x9,lsl#1
2200	fmov	d10,x9
2201	fmov	v10.d[1],x10
2202	eor	v31.16b,v7.16b,v11.16b
2203	ld1	{v28.16b},[x0],#16
2204	aese	v26.16b,v23.16b
2205
2206	// The iv for fifth block
2207	extr	x22,x10,x10,#32
2208	extr	x10,x10,x9,#63
2209	and	w11,w19,w22,asr #31
2210	eor	x9,x11,x9,lsl #1
2211	fmov	d11,x9
2212	fmov	v11.d[1],x10
2213
2214	ld1	{v29.16b},[x0],#16
2215	cbz	x6,Lxts_enc_tail4x	// x2 == -0x60: this was the borrowed 4-block pass
2216	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
2217	eor	v4.16b,v4.16b,v0.16b	// finish the five ciphertext blocks ...
2218	eor	v0.16b,v2.16b,v6.16b	// ... while tweaking the next five inputs
2219	eor	v5.16b,v5.16b,v1.16b
2220	eor	v1.16b,v3.16b,v8.16b
2221	eor	v17.16b,v17.16b,v24.16b
2222	eor	v24.16b,v27.16b,v9.16b
2223	eor	v30.16b,v30.16b,v25.16b
2224	eor	v25.16b,v28.16b,v10.16b
2225	eor	v31.16b,v31.16b,v26.16b
2226	st1	{v4.16b},[x1],#16
2227	eor	v26.16b,v29.16b,v11.16b
2228	st1	{v5.16b},[x1],#16
2229	mov	w6,w5
2230	st1	{v17.16b},[x1],#16
2231	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
2232	st1	{v30.16b},[x1],#16
2233	st1	{v31.16b},[x1],#16
2234	b.hs	Loop5x_xts_enc	// flags still from the subs x2,#0x50 above
2235
2236
2237	// If left 4 blocks, borrow the five block's processing.
2238	cmn	x2,#0x10
2239	b.ne	Loop5x_enc_after
2240	orr	v11.16b,v10.16b,v10.16b	// shift every tweak down by one slot
2241	orr	v10.16b,v9.16b,v9.16b
2242	orr	v9.16b,v8.16b,v8.16b
2243	orr	v8.16b,v6.16b,v6.16b
2244	fmov	x9,d11	// x10:x9 restart from the new last tweak
2245	fmov	x10,v11.d[1]
2246	eor	v0.16b,v6.16b,v2.16b	// NOTE(review): lane 0 looks like a dummy here; Lxts_enc_tail4x stores only four results
2247	eor	v1.16b,v8.16b,v3.16b
2248	eor	v24.16b,v27.16b,v9.16b
2249	eor	v25.16b,v28.16b,v10.16b
2250	eor	v26.16b,v29.16b,v11.16b
2251	b.eq	Loop5x_xts_enc	// always taken: flags unchanged since the cmn
2252
2253Loop5x_enc_after:
2254	add	x2,x2,#0x50	// undo the bias: x2 = bytes of whole blocks left
2255	cbz	x2,Lxts_enc_done
2256
2257	add	w6,w5,#2
2258	subs	x2,x2,#0x30
2259	b.lo	Lxts_inner_enc_tail	// fewer than three blocks left
2260
2261	eor	v0.16b,v6.16b,v27.16b
2262	eor	v1.16b,v8.16b,v28.16b
2263	eor	v24.16b,v29.16b,v9.16b
2264	b	Lxts_outer_enc_tail
2265
2266.align	4
2267Lxts_enc_tail4x:	// store the four real results of the borrowed pass
2268	add	x0,x0,#16
2269	eor	v5.16b,v1.16b,v5.16b
2270	st1	{v5.16b},[x1],#16
2271	eor	v17.16b,v24.16b,v17.16b
2272	st1	{v17.16b},[x1],#16
2273	eor	v30.16b,v25.16b,v30.16b
2274	eor	v31.16b,v26.16b,v31.16b
2275	st1	{v30.16b,v31.16b},[x1],#32
2276
2277	b	Lxts_enc_done
2278.align	4
2279Lxts_outer_enc_tail:	// three blocks in v0, v1, v24
2280	aese	v0.16b,v16.16b
2281	aesmc	v0.16b,v0.16b
2282	aese	v1.16b,v16.16b
2283	aesmc	v1.16b,v1.16b
2284	aese	v24.16b,v16.16b
2285	aesmc	v24.16b,v24.16b
2286	ld1	{v16.4s},[x7],#16
2287	subs	w6,w6,#2
2288	aese	v0.16b,v17.16b
2289	aesmc	v0.16b,v0.16b
2290	aese	v1.16b,v17.16b
2291	aesmc	v1.16b,v1.16b
2292	aese	v24.16b,v17.16b
2293	aesmc	v24.16b,v24.16b
2294	ld1	{v17.4s},[x7],#16
2295	b.gt	Lxts_outer_enc_tail
2296
2297	aese	v0.16b,v16.16b
2298	aesmc	v0.16b,v0.16b
2299	aese	v1.16b,v16.16b
2300	aesmc	v1.16b,v1.16b
2301	aese	v24.16b,v16.16b
2302	aesmc	v24.16b,v24.16b
2303	eor	v4.16b,v6.16b,v7.16b
2304	subs	x2,x2,#0x30
2305	// The iv for first block
2306	fmov	x9,d9
2307	fmov	x10,v9.d[1]
2308	//mov	w19,#0x87
2309	extr	x22,x10,x10,#32
2310	extr	x10,x10,x9,#63
2311	and	w11,w19,w22,asr#31
2312	eor	x9,x11,x9,lsl#1
2313	fmov	d6,x9	// v6 = tweak following v9
2314	fmov	v6.d[1],x10
2315	eor	v5.16b,v8.16b,v7.16b
2316	csel	x6,x2,x6,lo       // x6, w6, is zero at this point
2317	aese	v0.16b,v17.16b
2318	aesmc	v0.16b,v0.16b
2319	aese	v1.16b,v17.16b
2320	aesmc	v1.16b,v1.16b
2321	aese	v24.16b,v17.16b
2322	aesmc	v24.16b,v24.16b
2323	eor	v17.16b,v9.16b,v7.16b
2324
2325	add	x6,x6,#0x20
2326	add	x0,x0,x6	// reposition x0 for the v27 load below
2327	mov	x7,x3
2328
2329	aese	v0.16b,v20.16b
2330	aesmc	v0.16b,v0.16b
2331	aese	v1.16b,v20.16b
2332	aesmc	v1.16b,v1.16b
2333	aese	v24.16b,v20.16b
2334	aesmc	v24.16b,v24.16b
2335	aese	v0.16b,v21.16b
2336	aesmc	v0.16b,v0.16b
2337	aese	v1.16b,v21.16b
2338	aesmc	v1.16b,v1.16b
2339	aese	v24.16b,v21.16b
2340	aesmc	v24.16b,v24.16b
2341	aese	v0.16b,v22.16b
2342	aesmc	v0.16b,v0.16b
2343	aese	v1.16b,v22.16b
2344	aesmc	v1.16b,v1.16b
2345	aese	v24.16b,v22.16b
2346	aesmc	v24.16b,v24.16b
2347	aese	v0.16b,v23.16b
2348	aese	v1.16b,v23.16b
2349	aese	v24.16b,v23.16b
2350	ld1	{v27.16b},[x0],#16
2351	add	w6,w5,#2
2352	ld1	{v16.4s},[x7],#16                // re-pre-load rndkey[0]
2353	eor	v4.16b,v4.16b,v0.16b
2354	eor	v5.16b,v5.16b,v1.16b
2355	eor	v24.16b,v24.16b,v17.16b
2356	ld1	{v17.4s},[x7],#16                // re-pre-load rndkey[1]
2357	st1	{v4.16b},[x1],#16
2358	st1	{v5.16b},[x1],#16
2359	st1	{v24.16b},[x1],#16
2360	cmn	x2,#0x30
2361	b.eq	Lxts_enc_done	// x2 == -0x30: no whole block left after these three
2362Lxts_encxor_one:
2363	orr	v28.16b,v3.16b,v3.16b
2364	orr	v29.16b,v27.16b,v27.16b
2365	nop
2366
2367Lxts_inner_enc_tail:	// one or two blocks, taken from v28/v29
2368	cmn	x2,#0x10
2369	eor	v1.16b,v28.16b,v6.16b
2370	eor	v24.16b,v29.16b,v8.16b
2371	b.eq	Lxts_enc_tail_loop	// x2 == -0x10: two blocks, tweaks v6 and v8
2372	eor	v24.16b,v29.16b,v6.16b	// otherwise a single block in v24 with tweak v6
2373Lxts_enc_tail_loop:
2374	aese	v1.16b,v16.16b
2375	aesmc	v1.16b,v1.16b
2376	aese	v24.16b,v16.16b
2377	aesmc	v24.16b,v24.16b
2378	ld1	{v16.4s},[x7],#16
2379	subs	w6,w6,#2
2380	aese	v1.16b,v17.16b
2381	aesmc	v1.16b,v1.16b
2382	aese	v24.16b,v17.16b
2383	aesmc	v24.16b,v24.16b
2384	ld1	{v17.4s},[x7],#16
2385	b.gt	Lxts_enc_tail_loop
2386
2387	aese	v1.16b,v16.16b
2388	aesmc	v1.16b,v1.16b
2389	aese	v24.16b,v16.16b
2390	aesmc	v24.16b,v24.16b
2391	aese	v1.16b,v17.16b
2392	aesmc	v1.16b,v1.16b
2393	aese	v24.16b,v17.16b
2394	aesmc	v24.16b,v24.16b
2395	aese	v1.16b,v20.16b
2396	aesmc	v1.16b,v1.16b
2397	aese	v24.16b,v20.16b
2398	aesmc	v24.16b,v24.16b
2399	cmn	x2,#0x20	// flags feed the b.eq Lxts_enc_one below
2400	aese	v1.16b,v21.16b
2401	aesmc	v1.16b,v1.16b
2402	aese	v24.16b,v21.16b
2403	aesmc	v24.16b,v24.16b
2404	eor	v5.16b,v6.16b,v7.16b
2405	aese	v1.16b,v22.16b
2406	aesmc	v1.16b,v1.16b
2407	aese	v24.16b,v22.16b
2408	aesmc	v24.16b,v24.16b
2409	eor	v17.16b,v8.16b,v7.16b
2410	aese	v1.16b,v23.16b
2411	aese	v24.16b,v23.16b
2412	b.eq	Lxts_enc_one	// x2 == -0x20: only one block was real
2413	eor	v5.16b,v5.16b,v1.16b
2414	st1	{v5.16b},[x1],#16
2415	eor	v17.16b,v17.16b,v24.16b
2416	orr	v6.16b,v8.16b,v8.16b
2417	st1	{v17.16b},[x1],#16
2418	fmov	x9,d8	// advance once more: v6 = tweak following v8
2419	fmov	x10,v8.d[1]
2420	mov	w19,#0x87
2421	extr	x22,x10,x10,#32
2422	extr	x10,x10,x9,#63
2423	and	w11,w19,w22,asr #31
2424	eor	x9,x11,x9,lsl #1
2425	fmov	d6,x9
2426	fmov	v6.d[1],x10
2427	b	Lxts_enc_done
2428
2429Lxts_enc_one:
2430	eor	v5.16b,v5.16b,v24.16b
2431	orr	v6.16b,v6.16b,v6.16b	// copies v6 onto itself (no change)
2432	st1	{v5.16b},[x1],#16
2433	fmov	x9,d6	// advance once more: v6 = tweak following the one just used
2434	fmov	x10,v6.d[1]
2435	mov	w19,#0x87
2436	extr	x22,x10,x10,#32
2437	extr	x10,x10,x9,#63
2438	and	w11,w19,w22,asr #31
2439	eor	x9,x11,x9,lsl #1
2440	fmov	d6,x9
2441	fmov	v6.d[1],x10
2442	b	Lxts_enc_done
2443.align	5
2444Lxts_enc_done:
2445	// Process the tail block with cipher stealing.
2446	tst	x21,#0xf
2447	b.eq	Lxts_abort	// length was a multiple of 16: finished
2448
2449	mov	x20,x0	// x20 -> the trailing partial input bytes
2450	mov	x13,x1	// x13 -> where the partial output goes
2451	sub	x1,x1,#16	// x1  -> the last full block already written
2452.composite_enc_loop:	// runs for index x21-1 down to 0
2453	subs	x21,x21,#1
2454	ldrb	w15,[x1,x21]	// byte of the last ciphertext block ...
2455	ldrb	w14,[x20,x21]	// ... and the matching leftover input byte
2456	strb	w15,[x13,x21]	// ciphertext byte becomes partial output
2457	strb	w14,[x1,x21]	// input byte takes its place in the block
2458	b.gt	.composite_enc_loop
2459Lxts_enc_load_done:
2460	ld1	{v26.16b},[x1]	// composite block
2461	eor	v26.16b,v26.16b,v6.16b	// ^ tweak held in v6
2462
2463	// Encrypt the composite block to get the last second encrypted text block
2464	ldr	w6,[x3,#240]		// w6 = rounds of key1
2465	ld1	{v0.4s},[x3],#16
2466	sub	w6,w6,#2
2467	ld1	{v1.4s},[x3],#16		// load key schedule...
2468Loop_final_enc:
2469	aese	v26.16b,v0.16b
2470	aesmc	v26.16b,v26.16b
2471	ld1	{v0.4s},[x3],#16
2472	subs	w6,w6,#2
2473	aese	v26.16b,v1.16b
2474	aesmc	v26.16b,v26.16b
2475	ld1	{v1.4s},[x3],#16
2476	b.gt	Loop_final_enc
2477
2478	aese	v26.16b,v0.16b
2479	aesmc	v26.16b,v26.16b
2480	ld1	{v0.4s},[x3]
2481	aese	v26.16b,v1.16b
2482	eor	v26.16b,v26.16b,v0.16b
2483	eor	v26.16b,v26.16b,v6.16b
2484	st1	{v26.16b},[x1]	// overwrite the last full output block
2485
2486Lxts_abort:	// restore what Lxts_enc_big_size saved
2487	ldp	x21,x22,[sp,#48]
2488	ldp	d8,d9,[sp,#32]
2489	ldp	d10,d11,[sp,#16]
2490	ldp	x19,x20,[sp],#64
2491Lxts_enc_final_abort:
2492	ret
2493
2494.globl	_aes_v8_xts_decrypt
2495
2496.align	5
2497_aes_v8_xts_decrypt:
2498	cmp	x2,#16
2499	// Original input data size bigger than 16, jump to big size processing.
2500	b.ne	Lxts_dec_big_size
2501	// Encrypt the iv with key2, as the first XEX iv.
2502	ldr	w6,[x4,#240]
2503	ld1	{v0.4s},[x4],#16
2504	ld1	{v6.16b},[x5]
2505	sub	w6,w6,#2
2506	ld1	{v1.4s},[x4],#16
2507
2508Loop_dec_small_iv_enc:
2509	aese	v6.16b,v0.16b
2510	aesmc	v6.16b,v6.16b
2511	ld1	{v0.4s},[x4],#16
2512	subs	w6,w6,#2
2513	aese	v6.16b,v1.16b
2514	aesmc	v6.16b,v6.16b
2515	ld1	{v1.4s},[x4],#16
2516	b.gt	Loop_dec_small_iv_enc
2517
2518	aese	v6.16b,v0.16b
2519	aesmc	v6.16b,v6.16b
2520	ld1	{v0.4s},[x4]
2521	aese	v6.16b,v1.16b
2522	eor	v6.16b,v6.16b,v0.16b
2523
2524	ld1	{v0.16b},[x0]
2525	eor	v0.16b,v6.16b,v0.16b
2526
2527	ldr	w6,[x3,#240]
2528	ld1	{v28.4s,v29.4s},[x3],#32			// load key schedule...
2529
2530	aesd	v0.16b,v28.16b
2531	aesimc	v0.16b,v0.16b
2532	ld1	{v16.4s,v17.4s},[x3],#32			// load key schedule...
2533	aesd	v0.16b,v29.16b
2534	aesimc	v0.16b,v0.16b
2535	subs	w6,w6,#10			// bias
2536	b.eq	Lxts_128_dec
2537Lxts_dec_round_loop:
2538	aesd	v0.16b,v16.16b
2539	aesimc	v0.16b,v0.16b
2540	ld1	{v16.4s},[x3],#16			// load key schedule...
2541	aesd	v0.16b,v17.16b
2542	aesimc	v0.16b,v0.16b
2543	ld1	{v17.4s},[x3],#16			// load key schedule...
2544	subs	w6,w6,#2			// bias
2545	b.gt	Lxts_dec_round_loop
2546Lxts_128_dec:
2547	ld1	{v18.4s,v19.4s},[x3],#32			// load key schedule...
2548	aesd	v0.16b,v16.16b
2549	aesimc	v0.16b,v0.16b
2550	aesd	v0.16b,v17.16b
2551	aesimc	v0.16b,v0.16b
2552	ld1	{v20.4s,v21.4s},[x3],#32			// load key schedule...
2553	aesd	v0.16b,v18.16b
2554	aesimc	v0.16b,v0.16b
2555	aesd	v0.16b,v19.16b
2556	aesimc	v0.16b,v0.16b
2557	ld1	{v22.4s,v23.4s},[x3],#32			// load key schedule...
2558	aesd	v0.16b,v20.16b
2559	aesimc	v0.16b,v0.16b
2560	aesd	v0.16b,v21.16b
2561	aesimc	v0.16b,v0.16b
2562	ld1	{v7.4s},[x3]
2563	aesd	v0.16b,v22.16b
2564	aesimc	v0.16b,v0.16b
2565	aesd	v0.16b,v23.16b
2566	eor	v0.16b,v0.16b,v7.16b
2567	eor	v0.16b,v6.16b,v0.16b
2568	st1	{v0.16b},[x1]
2569	b	Lxts_dec_final_abort
2570Lxts_dec_big_size:
	// Multi-block setup: save callee-saved registers, split the byte count
	// in x2 into whole blocks plus a tail, encrypt the iv at [x5] with the
	// key schedule at [x4] to get the first tweak, derive the next three
	// tweaks, and preload round keys from the key schedule at [x3].
2571	stp	x19,x20,[sp,#-64]!		// allocate 64 bytes of stack; x19/x20 at [sp]
2572	stp	x21,x22,[sp,#48]
2573	stp	d8,d9,[sp,#32]
2574	stp	d10,d11,[sp,#16]
2575
2576	and	x21,x2,#0xf			// x21 = byte count mod 16 (size of the partial tail)
2577	and	x2,x2,#-16			// x2 = byte count rounded down to whole blocks
2578	subs	x2,x2,#16			// bias by one block; borrows when there is no whole block
2579	mov	x8,#16				// x8 = post-increment used by the first input load
2580	b.lo	Lxts_dec_abort			// fewer than 16 bytes: restore registers and return
2581
2582	// Encrypt the iv with key2, as the first XEX iv
2583	ldr	w6,[x4,#240]			// w6 = rounds value stored at offset 240 of the key2 schedule
2584	ld1	{v0.4s},[x4],#16		// v0 = first round key
2585	ld1	{v6.16b},[x5]			// v6 = iv
2586	sub	w6,w6,#2
2587	ld1	{v1.4s},[x4],#16		// v1 = second round key
2588
2589Loop_dec_iv_enc:
	// Two encryption rounds per iteration; round keys stream through v0/v1.
2590	aese	v6.16b,v0.16b
2591	aesmc	v6.16b,v6.16b
2592	ld1	{v0.4s},[x4],#16
2593	subs	w6,w6,#2
2594	aese	v6.16b,v1.16b
2595	aesmc	v6.16b,v6.16b
2596	ld1	{v1.4s},[x4],#16
2597	b.gt	Loop_dec_iv_enc
2598
	// Last two rounds: the final one has no aesmc and is followed by an
	// XOR with the last round key.
2599	aese	v6.16b,v0.16b
2600	aesmc	v6.16b,v6.16b
2601	ld1	{v0.4s},[x4]
2602	aese	v6.16b,v1.16b
2603	eor	v6.16b,v6.16b,v0.16b		// v6 = encrypted iv = tweak for the first block
2604
2605	// The iv for second block
2606	// x9 - iv(low), x10 - iv(high)
2607	// the five ivs are kept in v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
	// Every tweak update below shifts the 128-bit value x10:x9 left by one
	// bit and XORs 0x87 into x9 when the bit shifted out of x10 was set.
2608	fmov	x9,d6
2609	fmov	x10,v6.d[1]
2610	mov	w19,#0x87
2611	extr	x22,x10,x10,#32			// rotate by 32: bit 63 of x10 lands in bit 31 of w22
2612	extr	x10,x10,x9,#63			// x10 = (x10 << 1) | (x9 >> 63)
2613	and	w11,w19,w22,asr #31		// w11 = 0x87 if the old top bit was set, else 0
2614	eor	x9,x11,x9,lsl #1		// x9 = (x9 << 1) ^ x11
2615	fmov	d8,x9
2616	fmov	v8.d[1],x10			// v8 = tweak for the second block
2617
2618	ldr	w5,[x3,#240]		// load rounds number
2619
2620	// The iv for third block
2621	extr	x22,x10,x10,#32
2622	extr	x10,x10,x9,#63
2623	and	w11,w19,w22,asr #31
2624	eor	x9,x11,x9,lsl #1
2625	fmov	d9,x9
2626	fmov	v9.d[1],x10
2627
2628	ld1	{v16.4s,v17.4s},[x3]			// load key schedule: first two round keys
2629	sub	w5,w5,#6
2630	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
2631	sub	w5,w5,#2			// w5 = rounds - 8, base counter for the round loops
2632	ld1	{v18.4s,v19.4s},[x7],#32		// load key schedule: last 7 round keys in v18-v23, v7
2633	ld1	{v20.4s,v21.4s},[x7],#32
2634	ld1	{v22.4s,v23.4s},[x7],#32
2635	ld1	{v7.4s},[x7]			// v7 = last round key
2636
2637	// The iv for fourth block
2638	extr	x22,x10,x10,#32
2639	extr	x10,x10,x9,#63
2640	and	w11,w19,w22,asr #31
2641	eor	x9,x11,x9,lsl #1
2642	fmov	d10,x9
2643	fmov	v10.d[1],x10
2644
2645	add	x7,x3,#32			// x7 -> third round key, the next one streamed into v16/v17
2646	mov	w6,w5
2647	b	Lxts_dec
2648
2649	// Decryption
2650.align	5
2651Lxts_dec:
	// When the input has a partial tail (x21 != 0), hold back one whole
	// block from the bulk path; Lxts_done consumes it together with the tail.
2652	tst	x21,#0xf
2653	b.eq	Lxts_dec_begin			// no partial tail: nothing to hold back
2654	subs	x2,x2,#16			// take the held-back block out of the count
2655	csel	x8,xzr,x8,eq			// x2 == 0: x8 = 0 so the first bulk load does not advance x0
2656	ld1	{v0.16b},[x0],#16		// v0 = first block, used by Lxts_done if the branch below is taken
2657	b.lo	Lxts_done			// borrow: only the held-back block and the tail remain
2658	sub	x0,x0,#16			// undo the post-increment; Lxts_dec_begin reloads the block
2659Lxts_dec_begin:
	// Load the leading blocks and pick a path from the remaining count in
	// x2: Lxts_inner_dec_tail, Lxts_outer_dec_tail (three blocks at a time)
	// or Loop5x_xts_dec (five blocks at a time).
2660	ld1	{v0.16b},[x0],x8		// x8 is 16, or 0 so that the next load re-reads this block
2661	subs	x2,x2,#32			// bias
2662	add	w6,w5,#2			// round-loop counter for the tail paths
2663	orr	v3.16b,v0.16b,v0.16b
2664	orr	v1.16b,v0.16b,v0.16b
2665	orr	v28.16b,v0.16b,v0.16b		// v28 = copy of the first block, read by Lxts_inner_dec_tail
2666	ld1	{v24.16b},[x0],#16
2667	orr	v27.16b,v24.16b,v24.16b
2668	orr	v29.16b,v24.16b,v24.16b		// v29 = copy of the second load, read by Lxts_inner_dec_tail
2669	b.lo	Lxts_inner_dec_tail		// the bias borrowed: take the short tail path
2670	eor	v0.16b,v0.16b,v6.16b			// before decrypt, xor with iv
2671	eor	v24.16b,v24.16b,v8.16b
2672
2673	orr	v1.16b,v24.16b,v24.16b		// v1 = second block ^ second tweak
2674	ld1	{v24.16b},[x0],#16		// third block
2675	orr	v2.16b,v0.16b,v0.16b
2676	orr	v3.16b,v1.16b,v1.16b
2677	eor	v27.16b,v24.16b,v9.16b			// third block xor with third iv
2678	eor	v24.16b,v24.16b,v9.16b
2679	cmp	x2,#32
2680	b.lo	Lxts_outer_dec_tail		// not enough input for five blocks: three-block path
2681
2682	ld1	{v25.16b},[x0],#16
2683
2684	// The iv for fifth block
2685	extr	x22,x10,x10,#32
2686	extr	x10,x10,x9,#63
2687	and	w11,w19,w22,asr #31
2688	eor	x9,x11,x9,lsl #1
2689	fmov	d11,x9
2690	fmov	v11.d[1],x10
2691
2692	ld1	{v26.16b},[x0],#16
2693	eor	v25.16b,v25.16b,v10.16b		// the fourth block
2694	eor	v26.16b,v26.16b,v11.16b		// the fifth block
2695	sub	x2,x2,#32			// bias
2696	mov	w6,w5				// round-loop counter for the 5x loop
2697	b	Loop5x_xts_dec
2698
2699.align	4
2700Loop5x_xts_dec:
	// Decrypt five blocks (v0, v1, v24, v25, v26) in parallel. On entry each
	// block is already XORed with its tweak. The leading round keys stream
	// through v16/v17 from [x7]; the last seven are resident in v18-v23, v7.
2701	aesd	v0.16b,v16.16b
2702	aesimc	v0.16b,v0.16b
2703	aesd	v1.16b,v16.16b
2704	aesimc	v1.16b,v1.16b
2705	aesd	v24.16b,v16.16b
2706	aesimc	v24.16b,v24.16b
2707	aesd	v25.16b,v16.16b
2708	aesimc	v25.16b,v25.16b
2709	aesd	v26.16b,v16.16b
2710	aesimc	v26.16b,v26.16b
2711	ld1	{v16.4s},[x7],#16		// load key schedule...
2712	subs	w6,w6,#2
2713	aesd	v0.16b,v17.16b
2714	aesimc	v0.16b,v0.16b
2715	aesd	v1.16b,v17.16b
2716	aesimc	v1.16b,v1.16b
2717	aesd	v24.16b,v17.16b
2718	aesimc	v24.16b,v24.16b
2719	aesd	v25.16b,v17.16b
2720	aesimc	v25.16b,v25.16b
2721	aesd	v26.16b,v17.16b
2722	aesimc	v26.16b,v26.16b
2723	ld1	{v17.4s},[x7],#16		// load key schedule...
2724	b.gt	Loop5x_xts_dec
2725
2726	aesd	v0.16b,v16.16b
2727	aesimc	v0.16b,v0.16b
2728	aesd	v1.16b,v16.16b
2729	aesimc	v1.16b,v1.16b
2730	aesd	v24.16b,v16.16b
2731	aesimc	v24.16b,v24.16b
2732	aesd	v25.16b,v16.16b
2733	aesimc	v25.16b,v25.16b
2734	aesd	v26.16b,v16.16b
2735	aesimc	v26.16b,v26.16b
	// The flags set here are consumed by the csel below and by the b.hs at
	// the bottom of the loop; nothing in between modifies them.
2736	subs	x2,x2,#0x50			// because Lxts_dec_tail4x
2737
2738	aesd	v0.16b,v17.16b
2739	aesimc	v0.16b,v0.16b
2740	aesd	v1.16b,v17.16b
2741	aesimc	v1.16b,v1.16b
2742	aesd	v24.16b,v17.16b
2743	aesimc	v24.16b,v24.16b
2744	aesd	v25.16b,v17.16b
2745	aesimc	v25.16b,v25.16b
2746	aesd	v26.16b,v17.16b
2747	aesimc	v26.16b,v26.16b
2748	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
2749	mov	x7,x3				// rewind the round-key pointer for the next pass
2750
2751	aesd	v0.16b,v18.16b
2752	aesimc	v0.16b,v0.16b
2753	aesd	v1.16b,v18.16b
2754	aesimc	v1.16b,v1.16b
2755	aesd	v24.16b,v18.16b
2756	aesimc	v24.16b,v24.16b
2757	aesd	v25.16b,v18.16b
2758	aesimc	v25.16b,v25.16b
2759	aesd	v26.16b,v18.16b
2760	aesimc	v26.16b,v26.16b
2761	add	x0,x0,x6		// x0 is adjusted in such way that
2762						// at exit from the loop v1.16b-v26.16b
2763						// are loaded with last "words"
2764	add	x6,x2,#0x60		// because Lxts_dec_tail4x; zero when x2 == -0x60
2765
2766	aesd	v0.16b,v19.16b
2767	aesimc	v0.16b,v0.16b
2768	aesd	v1.16b,v19.16b
2769	aesimc	v1.16b,v1.16b
2770	aesd	v24.16b,v19.16b
2771	aesimc	v24.16b,v24.16b
2772	aesd	v25.16b,v19.16b
2773	aesimc	v25.16b,v25.16b
2774	aesd	v26.16b,v19.16b
2775	aesimc	v26.16b,v26.16b
2776
2777	aesd	v0.16b,v20.16b
2778	aesimc	v0.16b,v0.16b
2779	aesd	v1.16b,v20.16b
2780	aesimc	v1.16b,v1.16b
2781	aesd	v24.16b,v20.16b
2782	aesimc	v24.16b,v24.16b
2783	aesd	v25.16b,v20.16b
2784	aesimc	v25.16b,v25.16b
2785	aesd	v26.16b,v20.16b
2786	aesimc	v26.16b,v26.16b
2787
2788	aesd	v0.16b,v21.16b
2789	aesimc	v0.16b,v0.16b
2790	aesd	v1.16b,v21.16b
2791	aesimc	v1.16b,v1.16b
2792	aesd	v24.16b,v21.16b
2793	aesimc	v24.16b,v24.16b
2794	aesd	v25.16b,v21.16b
2795	aesimc	v25.16b,v25.16b
2796	aesd	v26.16b,v21.16b
2797	aesimc	v26.16b,v26.16b
2798
2799	aesd	v0.16b,v22.16b
2800	aesimc	v0.16b,v0.16b
2801	aesd	v1.16b,v22.16b
2802	aesimc	v1.16b,v1.16b
2803	aesd	v24.16b,v22.16b
2804	aesimc	v24.16b,v24.16b
2805	aesd	v25.16b,v22.16b
2806	aesimc	v25.16b,v25.16b
2807	aesd	v26.16b,v22.16b
2808	aesimc	v26.16b,v26.16b
2809
	// Final round, interleaved with three jobs: fold the last round key v7
	// into each block's current tweak (v4, v5, v17, v30, v31), compute the
	// next five tweaks, and load the next five input blocks.
2810	eor	v4.16b,v7.16b,v6.16b
2811	aesd	v0.16b,v23.16b
2812	// The iv for first block of next iteration.
2813	extr	x22,x10,x10,#32
2814	extr	x10,x10,x9,#63
2815	and	w11,w19,w22,asr #31
2816	eor	x9,x11,x9,lsl #1
2817	fmov	d6,x9
2818	fmov	v6.d[1],x10
2819	eor	v5.16b,v7.16b,v8.16b
2820	ld1	{v2.16b},[x0],#16
2821	aesd	v1.16b,v23.16b
2822	// The iv for second block
2823	extr	x22,x10,x10,#32
2824	extr	x10,x10,x9,#63
2825	and	w11,w19,w22,asr #31
2826	eor	x9,x11,x9,lsl #1
2827	fmov	d8,x9
2828	fmov	v8.d[1],x10
2829	eor	v17.16b,v7.16b,v9.16b
2830	ld1	{v3.16b},[x0],#16
2831	aesd	v24.16b,v23.16b
2832	// The iv for third block
2833	extr	x22,x10,x10,#32
2834	extr	x10,x10,x9,#63
2835	and	w11,w19,w22,asr #31
2836	eor	x9,x11,x9,lsl #1
2837	fmov	d9,x9
2838	fmov	v9.d[1],x10
2839	eor	v30.16b,v7.16b,v10.16b
2840	ld1	{v27.16b},[x0],#16
2841	aesd	v25.16b,v23.16b
2842	// The iv for fourth block
2843	extr	x22,x10,x10,#32
2844	extr	x10,x10,x9,#63
2845	and	w11,w19,w22,asr #31
2846	eor	x9,x11,x9,lsl #1
2847	fmov	d10,x9
2848	fmov	v10.d[1],x10
2849	eor	v31.16b,v7.16b,v11.16b
2850	ld1	{v28.16b},[x0],#16
2851	aesd	v26.16b,v23.16b
2852
2853	// The iv for fifth block
2854	extr	x22,x10,x10,#32
2855	extr	x10,x10,x9,#63
2856	and	w11,w19,w22,asr #31
2857	eor	x9,x11,x9,lsl #1
2858	fmov	d11,x9
2859	fmov	v11.d[1],x10
2860
2861	ld1	{v29.16b},[x0],#16
2862	cbz	x6,Lxts_dec_tail4x		// x2 == -0x60: this was the four-block pass
2863	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	// Finish the five outputs (state ^ last round key ^ tweak) and, in the
	// same sequence, XOR the next five inputs with their new tweaks.
2864	eor	v4.16b,v4.16b,v0.16b
2865	eor	v0.16b,v2.16b,v6.16b
2866	eor	v5.16b,v5.16b,v1.16b
2867	eor	v1.16b,v3.16b,v8.16b
2868	eor	v17.16b,v17.16b,v24.16b
2869	eor	v24.16b,v27.16b,v9.16b
2870	eor	v30.16b,v30.16b,v25.16b
2871	eor	v25.16b,v28.16b,v10.16b
2872	eor	v31.16b,v31.16b,v26.16b
2873	st1	{v4.16b},[x1],#16
2874	eor	v26.16b,v29.16b,v11.16b
2875	st1	{v5.16b},[x1],#16
2876	mov	w6,w5
2877	st1	{v17.16b},[x1],#16
2878	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
2879	st1	{v30.16b},[x1],#16
2880	st1	{v31.16b},[x1],#16
2881	b.hs	Loop5x_xts_dec			// no borrow from "subs x2,x2,#0x50": run another pass
2882
2883	cmn	x2,#0x10
2884	b.ne	Loop5x_dec_after
2885	// If x2 is equal to -0x10, there are 4 blocks left.
2886	// After special processing, the five-block code is used once more.
2887	// It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b.
2888	orr	v11.16b,v10.16b,v10.16b		// shift the tweaks up one slot: v11 <- v10
2889	orr	v10.16b,v9.16b,v9.16b
2890	orr	v9.16b,v8.16b,v8.16b
2891	orr	v8.16b,v6.16b,v6.16b		// v8 now equals v6
2892	fmov	x9,d11				// restart the running tweak in x10:x9 from v11
2893	fmov	x10,v11.d[1]
2894	eor	v0.16b,v6.16b,v2.16b
2895	eor	v1.16b,v8.16b,v3.16b
2896	eor	v24.16b,v27.16b,v9.16b
2897	eor	v25.16b,v28.16b,v10.16b
2898	eor	v26.16b,v29.16b,v11.16b
2899	b.eq	Loop5x_xts_dec			// always taken: flags still hold the cmn result (eq)
2900
2901Loop5x_dec_after:
	// After the last five-block pass: remove the 0x50 bias and dispatch
	// on the number of bytes still to be processed.
2902	add	x2,x2,#0x50
2903	cbz	x2,Lxts_done			// nothing left for the bulk paths
2904
2905	add	w6,w5,#2			// round-loop counter for the tail paths
2906	subs	x2,x2,#0x30
2907	b.lo	Lxts_inner_dec_tail		// fewer than three blocks left
2908
	// Three or more blocks left: XOR the preloaded inputs v27, v28, v29
	// with the tweaks v6, v8, v9 and use the three-block path.
2909	eor	v0.16b,v6.16b,v27.16b
2910	eor	v1.16b,v8.16b,v28.16b
2911	eor	v24.16b,v29.16b,v9.16b
2912	b	Lxts_outer_dec_tail
2913
2914.align	4
2915Lxts_dec_tail4x:
	// Exit from the four-block pass of Loop5x_xts_dec (reached when x6 == 0).
	// Only v1, v24, v25 and v26 are completed and stored; v0 is not stored.
2916	add	x0,x0,#16
2917	tst	x21,#0xf			// flags consumed by the b.eq below
2918	eor	v5.16b,v1.16b,v4.16b		// v4, v17, v30, v31 hold (last round key ^ tweak)
2919	st1	{v5.16b},[x1],#16
2920	eor	v17.16b,v24.16b,v17.16b
2921	st1	{v17.16b},[x1],#16
2922	eor	v30.16b,v25.16b,v30.16b
2923	eor	v31.16b,v26.16b,v31.16b
2924	st1	{v30.16b,v31.16b},[x1],#32
2925
2926	b.eq	Lxts_dec_abort			// no partial tail: finished
2927	ld1	{v0.16b},[x0],#16		// v0 = input block that Lxts_done decrypts first
2928	b	Lxts_done
2929.align	4
2930Lxts_outer_dec_tail:
	// Decrypt three blocks (v0, v1, v24) in parallel. Callers set
	// w6 = w5 + 2, so this round loop runs one iteration more than the
	// five-block loop; accordingly v18/v19 are not applied after it.
2931	aesd	v0.16b,v16.16b
2932	aesimc	v0.16b,v0.16b
2933	aesd	v1.16b,v16.16b
2934	aesimc	v1.16b,v1.16b
2935	aesd	v24.16b,v16.16b
2936	aesimc	v24.16b,v24.16b
2937	ld1	{v16.4s},[x7],#16
2938	subs	w6,w6,#2
2939	aesd	v0.16b,v17.16b
2940	aesimc	v0.16b,v0.16b
2941	aesd	v1.16b,v17.16b
2942	aesimc	v1.16b,v1.16b
2943	aesd	v24.16b,v17.16b
2944	aesimc	v24.16b,v24.16b
2945	ld1	{v17.4s},[x7],#16
2946	b.gt	Lxts_outer_dec_tail
2947
2948	aesd	v0.16b,v16.16b
2949	aesimc	v0.16b,v0.16b
2950	aesd	v1.16b,v16.16b
2951	aesimc	v1.16b,v1.16b
2952	aesd	v24.16b,v16.16b
2953	aesimc	v24.16b,v24.16b
2954	eor	v4.16b,v6.16b,v7.16b		// v4 = first tweak ^ last round key
2955	subs	x2,x2,#0x30			// flags consumed by the csel below
2956	// The iv for first block
2957	fmov	x9,d9				// restart the running tweak from v9 (third tweak)
2958	fmov	x10,v9.d[1]
2959	mov	w19,#0x87
2960	extr	x22,x10,x10,#32
2961	extr	x10,x10,x9,#63
2962	and	w11,w19,w22,asr #31
2963	eor	x9,x11,x9,lsl #1
2964	fmov	d6,x9
2965	fmov	v6.d[1],x10
2966	eor	v5.16b,v8.16b,v7.16b		// v5 = second tweak ^ last round key
2967	csel	x6,x2,x6,lo	// x6, w6, is zero at this point
2968	aesd	v0.16b,v17.16b
2969	aesimc	v0.16b,v0.16b
2970	aesd	v1.16b,v17.16b
2971	aesimc	v1.16b,v1.16b
2972	aesd	v24.16b,v17.16b
2973	aesimc	v24.16b,v24.16b
2974	eor	v17.16b,v9.16b,v7.16b		// v17 = third tweak ^ last round key (v9 is updated below)
2975	// The iv for second block
2976	extr	x22,x10,x10,#32
2977	extr	x10,x10,x9,#63
2978	and	w11,w19,w22,asr #31
2979	eor	x9,x11,x9,lsl #1
2980	fmov	d8,x9
2981	fmov	v8.d[1],x10
2982
2983	add	x6,x6,#0x20
2984	add	x0,x0,x6		// x0 is adjusted to the last data
2985
2986	mov	x7,x3				// rewind the round-key pointer
2987
2988	// The iv for third block
2989	extr	x22,x10,x10,#32
2990	extr	x10,x10,x9,#63
2991	and	w11,w19,w22,asr #31
2992	eor	x9,x11,x9,lsl #1
2993	fmov	d9,x9
2994	fmov	v9.d[1],x10
2995
2996	aesd	v0.16b,v20.16b
2997	aesimc	v0.16b,v0.16b
2998	aesd	v1.16b,v20.16b
2999	aesimc	v1.16b,v1.16b
3000	aesd	v24.16b,v20.16b
3001	aesimc	v24.16b,v24.16b
3002	aesd	v0.16b,v21.16b
3003	aesimc	v0.16b,v0.16b
3004	aesd	v1.16b,v21.16b
3005	aesimc	v1.16b,v1.16b
3006	aesd	v24.16b,v21.16b
3007	aesimc	v24.16b,v24.16b
3008	aesd	v0.16b,v22.16b
3009	aesimc	v0.16b,v0.16b
3010	aesd	v1.16b,v22.16b
3011	aesimc	v1.16b,v1.16b
3012	aesd	v24.16b,v22.16b
3013	aesimc	v24.16b,v24.16b
3014	ld1	{v27.16b},[x0],#16		// preload an input block for Lxts_inner_dec_tail
3015	aesd	v0.16b,v23.16b
3016	aesd	v1.16b,v23.16b
3017	aesd	v24.16b,v23.16b
3018	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
3019	add	w6,w5,#2
3020	eor	v4.16b,v4.16b,v0.16b
3021	eor	v5.16b,v5.16b,v1.16b
3022	eor	v24.16b,v24.16b,v17.16b
3023	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
3024	st1	{v4.16b},[x1],#16
3025	st1	{v5.16b},[x1],#16
3026	st1	{v24.16b},[x1],#16
3027
3028	cmn	x2,#0x30			// eq when x2 == -0x30
3029	add	x2,x2,#0x30			// plain add: the cmn flags survive
3030	b.eq	Lxts_done			// x2 is now 0: bulk processing finished
3031	sub	x2,x2,#0x30			// otherwise restore the biased count
3032	orr	v28.16b,v3.16b,v3.16b		// set up v28/v29 as inputs for Lxts_inner_dec_tail
3033	orr	v29.16b,v27.16b,v27.16b
3034	nop
3035
3036Lxts_inner_dec_tail:
	// Handle one or two remaining blocks. Two lanes (v1, v24) are always
	// decrypted; in the one-block case only the v24 lane is stored.
3037	// x2 == -0x10 means two blocks left.
3038	cmn	x2,#0x10
3039	eor	v1.16b,v28.16b,v6.16b
3040	eor	v24.16b,v29.16b,v8.16b
3041	b.eq	Lxts_dec_tail_loop
3042	eor	v24.16b,v29.16b,v6.16b		// one block: v29 is the block and takes the first tweak
3043Lxts_dec_tail_loop:
3044	aesd	v1.16b,v16.16b
3045	aesimc	v1.16b,v1.16b
3046	aesd	v24.16b,v16.16b
3047	aesimc	v24.16b,v24.16b
3048	ld1	{v16.4s},[x7],#16
3049	subs	w6,w6,#2
3050	aesd	v1.16b,v17.16b
3051	aesimc	v1.16b,v1.16b
3052	aesd	v24.16b,v17.16b
3053	aesimc	v24.16b,v24.16b
3054	ld1	{v17.4s},[x7],#16
3055	b.gt	Lxts_dec_tail_loop
3056
3057	aesd	v1.16b,v16.16b
3058	aesimc	v1.16b,v1.16b
3059	aesd	v24.16b,v16.16b
3060	aesimc	v24.16b,v24.16b
3061	aesd	v1.16b,v17.16b
3062	aesimc	v1.16b,v1.16b
3063	aesd	v24.16b,v17.16b
3064	aesimc	v24.16b,v24.16b
3065	aesd	v1.16b,v20.16b
3066	aesimc	v1.16b,v1.16b
3067	aesd	v24.16b,v20.16b
3068	aesimc	v24.16b,v24.16b
3069	cmn	x2,#0x20			// eq when x2 == -0x20; consumed by the b.eq below
3070	aesd	v1.16b,v21.16b
3071	aesimc	v1.16b,v1.16b
3072	aesd	v24.16b,v21.16b
3073	aesimc	v24.16b,v24.16b
3074	eor	v5.16b,v6.16b,v7.16b		// v5 = first tweak ^ last round key
3075	aesd	v1.16b,v22.16b
3076	aesimc	v1.16b,v1.16b
3077	aesd	v24.16b,v22.16b
3078	aesimc	v24.16b,v24.16b
3079	eor	v17.16b,v8.16b,v7.16b		// v17 = second tweak ^ last round key
3080	aesd	v1.16b,v23.16b
3081	aesd	v24.16b,v23.16b
3082	b.eq	Lxts_dec_one			// x2 == -0x20: only one block to store
3083	eor	v5.16b,v5.16b,v1.16b
3084	eor	v17.16b,v17.16b,v24.16b
3085	orr	v6.16b,v9.16b,v9.16b		// advance the tweaks by two for Lxts_done
3086	orr	v8.16b,v10.16b,v10.16b
3087	st1	{v5.16b},[x1],#16
3088	st1	{v17.16b},[x1],#16
3089	add	x2,x2,#16			// remove the bias
3090	b	Lxts_done
3091
3092Lxts_dec_one:
	// One-block case of Lxts_inner_dec_tail: the result is in the v24 lane.
3093	eor	v5.16b,v5.16b,v24.16b		// v5 held (first tweak ^ last round key)
3094	orr	v6.16b,v8.16b,v8.16b		// advance the tweaks by one for Lxts_done
3095	orr	v8.16b,v9.16b,v9.16b
3096	st1	{v5.16b},[x1],#16
3097	add	x2,x2,#32			// remove the bias
3098
3099Lxts_done:
	// All whole blocks handled by the bulk paths are written. If the input
	// had a partial tail (x21 != 0), continue with ciphertext stealing.
3100	tst	x21,#0xf
3101	b.eq	Lxts_dec_abort			// no partial tail: restore registers and return
3102	// Processing the last two blocks with cipher stealing.
3103	mov	x7,x3				// keep the key-schedule base; x3 is advanced below
3104	cbnz	x2,Lxts_dec_1st_done		// x2 != 0: v0 was already loaded by the path that came here
3105	ld1	{v0.16b},[x0],#16		// v0 = last whole input block; x0 -> partial tail
3106
3107	// Decrypt the second-to-last block to get the last plaintext block
3108Lxts_dec_1st_done:
	// Decrypt the block in v0 with the tweak in v8 and store the result at
	// [x1] without advancing x1.
3109	eor	v26.16b,v0.16b,v8.16b
3110	ldr	w6,[x3,#240]			// rounds value of the key schedule at x3
3111	ld1	{v0.4s},[x3],#16
3112	sub	w6,w6,#2
3113	ld1	{v1.4s},[x3],#16
3114Loop_final_2nd_dec:
3115	aesd	v26.16b,v0.16b
3116	aesimc	v26.16b,v26.16b
3117	ld1	{v0.4s},[x3],#16		// load key schedule...
3118	subs	w6,w6,#2
3119	aesd	v26.16b,v1.16b
3120	aesimc	v26.16b,v26.16b
3121	ld1	{v1.4s},[x3],#16		// load key schedule...
3122	b.gt	Loop_final_2nd_dec
3123
3124	aesd	v26.16b,v0.16b
3125	aesimc	v26.16b,v26.16b
3126	ld1	{v0.4s},[x3]			// last round key
3127	aesd	v26.16b,v1.16b
3128	eor	v26.16b,v26.16b,v0.16b
3129	eor	v26.16b,v26.16b,v8.16b		// remove the tweak
3130	st1	{v26.16b},[x1]
3131
3132	mov	x20,x0				// x20 -> the partial tail in the input
3133	add	x13,x1,#16			// x13 -> output position of the partial block
3134
3135	// Move the first x21 bytes of the block just written at [x1] to the partial
3136	// output block and replace them with the x21 trailing input bytes, forming the composite block.
3137.composite_dec_loop:
	// Byte loop from index x21-1 down to 0. x21 is non-zero on entry
	// because Lxts_done branches away when it is zero.
	// NOTE(review): this label lacks the "L" prefix used by every other
	// label here - confirm it is meant to be a non-local symbol.
3138	subs	x21,x21,#1
3139	ldrb	w15,[x1,x21]			// byte of the block just decrypted
3140	ldrb	w14,[x20,x21]			// byte of the trailing partial input
3141	strb	w15,[x13,x21]			// becomes a byte of the partial output block
3142	strb	w14,[x1,x21]			// input byte takes its place in the composite block
3143	b.gt	.composite_dec_loop
3144Lxts_dec_load_done:
	// Decrypt the composite block at [x1] with the tweak in v6 and write
	// the result back to [x1]. x7 is the key-schedule base saved at Lxts_done.
3145	ld1	{v26.16b},[x1]
3146	eor	v26.16b,v26.16b,v6.16b
3147
3148	// Decrypt the composite block to get the second-to-last plaintext block
3149	ldr	w6,[x7,#240]
3150	ld1	{v0.4s},[x7],#16
3151	sub	w6,w6,#2
3152	ld1	{v1.4s},[x7],#16
3153Loop_final_dec:
3154	aesd	v26.16b,v0.16b
3155	aesimc	v26.16b,v26.16b
3156	ld1	{v0.4s},[x7],#16		// load key schedule...
3157	subs	w6,w6,#2
3158	aesd	v26.16b,v1.16b
3159	aesimc	v26.16b,v26.16b
3160	ld1	{v1.4s},[x7],#16		// load key schedule...
3161	b.gt	Loop_final_dec
3162
3163	aesd	v26.16b,v0.16b
3164	aesimc	v26.16b,v26.16b
3165	ld1	{v0.4s},[x7]			// last round key
3166	aesd	v26.16b,v1.16b
3167	eor	v26.16b,v26.16b,v0.16b
3168	eor	v26.16b,v26.16b,v6.16b		// remove the tweak
3169	st1	{v26.16b},[x1]
3170
3171Lxts_dec_abort:
	// Common exit for the multi-block path: restore the registers saved at
	// Lxts_dec_big_size and release the 64-byte stack area.
3172	ldp	x21,x22,[sp,#48]
3173	ldp	d8,d9,[sp,#32]
3174	ldp	d10,d11,[sp,#16]
3175	ldp	x19,x20,[sp],#64
3176
3177Lxts_dec_final_abort:
	// Exit used by the code above Lxts_dec_big_size, which branches here
	// without going through the register restore.
3178	ret
3179
3180#endif
3181