// AES routines for AArch64 built on the ARMv8 AES instructions (aese/aesd/aesmc/aesimc).
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
#if !defined(__clang__)
.arch	armv8-a+crypto
#endif
// Key-schedule constants, addressed by aes_v8_set_encrypt_key through
// `adr x3,.Lrcon`.  Each row is one 128-bit vector:
//   row 0  round constant 0x01 in every 32-bit lane; the key-setup loops
//          double it with `shl` after each use
//   row 1  byte-index mask for `tbl`: selects source bytes 13,14,15,12 in
//          every lane, i.e. rotates the top word and splats it
//   row 2  round constant 0x1b in every lane, loaded by the 128-bit path
//          once its eight-pass loop has finished
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b
13
//----------------------------------------------------------------------
// aes_v8_set_encrypt_key
//   Expand a 128-, 192- or 256-bit key into an AES encryption key
//   schedule.
//
// In:    x0 = key bytes to expand
//        w1 = key length in bits
//        x2 = destination key schedule
// Out:   x0 =  0 on success
//             -1 if x0 or x2 is NULL
//             -2 if w1 is not 128, 192 or 256
//        On success 11/13/15 round keys are stored from [x2] and the
//        round count (10/12/14) is stored as a word at offset 240.
// Internal contract: on success w12 = round count and x2 = schedule+240;
//        aes_v8_set_decrypt_key enters at .Lenc_key and relies on both.
// Clobbers: x0-x3, x12, v0-v6, flags.
//
// Register roles during expansion:
//        v0 = all zero         v1 = current round constant
//        v2 = tbl rotate mask  v3 (and v4 for 192/256) = newest key words
//        v5/v6 = scratch
//----------------------------------------------------------------------
.globl	aes_v8_set_encrypt_key
.type	aes_v8_set_encrypt_key,%function
.align	5
aes_v8_set_encrypt_key:
.Lenc_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	// x3 carries the return value to .Lenc_key_abort.
	mov	x3,#-1
	cmp	x0,#0
	b.eq	.Lenc_key_abort
	cmp	x2,#0
	b.eq	.Lenc_key_abort
	mov	x3,#-2
	cmp	w1,#128
	b.lt	.Lenc_key_abort
	cmp	w1,#256
	b.gt	.Lenc_key_abort
	tst	w1,#0x3f		// must be a multiple of 64
	b.ne	.Lenc_key_abort

	adr	x3,.Lrcon
	cmp	w1,#192			// flags stay live until the b.lt/b.eq below

	eor	v0.16b,v0.16b,v0.16b
	ld1	{v3.16b},[x0],#16
	mov	w1,#8		// reuse w1
	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = rcon, v2 = rotate mask

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

// 128-bit key: eight passes with rcon 0x01..0x80, then two unrolled
// passes that start from the 0x1b row of .Lrcon.
.align	4
.Loop128:
	tbl	v6.16b,{v3.16b},v2.16b	// rotate last word, splat to all lanes
	ext	v5.16b,v0.16b,v3.16b,#12	// v3 moved up one word, zero filled
	st1	{v3.4s},[x2],#16	// emit the current round key
	aese	v6.16b,v0.16b		// zero round key: only the S-box substitution matters
	subs	w1,w1,#1		// flags consumed by b.ne at the loop end

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b	// fold in the round constant
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1	// next round constant
	eor	v3.16b,v3.16b,v6.16b
	b.ne	.Loop128

	ld1	{v1.4s},[x3]		// x3 now addresses the 0x1b row

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2]		// last round key, no post-increment
	add	x2,x2,#0x50		// x2 = schedule + 240

	mov	w12,#10
	b	.Ldone

// 192-bit key: v3 holds key words 0-3, the low half of v4 words 4-5.
// Every pass stores 8 + 16 bytes.
.align	4
.L192:
	ld1	{v4.8b},[x0],#8
	movi	v6.16b,#8			// borrow v6.16b
	st1	{v3.4s},[x2],#16
	sub	v2.16b,v2.16b,v6.16b	// adjust the mask

.Loop192:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.8b},[x2],#8
	aese	v6.16b,v0.16b
	subs	w1,w1,#1		// flags consumed by b.ne at the loop end

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b

	dup	v5.4s,v3.s[3]
	eor	v5.16b,v5.16b,v4.16b
	eor	v6.16b,v6.16b,v1.16b
	ext	v4.16b,v0.16b,v4.16b,#12
	shl	v1.16b,v1.16b,#1
	eor	v4.16b,v4.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	eor	v4.16b,v4.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.ne	.Loop192

	mov	w12,#12
	add	x2,x2,#0x20		// x2 = schedule + 240
	b	.Ldone

// 256-bit key: v3/v4 hold the two key halves.  Seven passes, each
// storing 32 bytes, so x2 already equals schedule + 240 at the exit.
.align	4
.L256:
	ld1	{v4.16b},[x0]
	mov	w1,#7
	mov	w12,#14
	st1	{v3.4s},[x2],#16

.Loop256:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1		// flags consumed by b.eq .Ldone below

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.eq	.Ldone

	// Second half of the pass: substitution without rotation or rcon.
	dup	v6.4s,v3.s[3]		// just splat
	ext	v5.16b,v0.16b,v4.16b,#12
	aese	v6.16b,v0.16b

	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b

	eor	v4.16b,v4.16b,v6.16b
	b	.Loop256

.Ldone:
	str	w12,[x2]		// round count at schedule + 240
	mov	x3,#0

.Lenc_key_abort:
	mov	x0,x3			// return value
	// x30 is never modified in this routine, so only x29 is reloaded.
	ldr	x29,[sp],#16
	ret
.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
180
//----------------------------------------------------------------------
// aes_v8_set_decrypt_key
//   Build a decryption key schedule: run the encryption key setup,
//   then reverse the order of the round keys in place and apply aesimc
//   to every round key except the first and the last.
//
// In:    x0 = key bytes, w1 = key length in bits, x2 = destination key
//        schedule (same arguments as aes_v8_set_encrypt_key)
// Out:   x0 = 0 on success, otherwise the non-zero code returned by the
//        encryption key setup
// Clobbers: everything aes_v8_set_encrypt_key clobbers, plus x4.
//----------------------------------------------------------------------
.globl	aes_v8_set_decrypt_key
.type	aes_v8_set_decrypt_key,%function
.align	5
aes_v8_set_decrypt_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	bl	.Lenc_key		// returns status in x0, x2 = schedule+240, w12 = rounds

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	x2,x2,#240		// restore original x2
	mov	x4,#-16			// post-index step for the downward pointer
	add	x0,x2,x12,lsl#4	// end of key schedule

	// Swap the first and last round keys unchanged.
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16

	// Walk inwards from both ends, swapping pairs through aesimc, until
	// the two pointers meet.
.Loop_imc:
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16
	cmp	x0,x2
	b.hi	.Loop_imc

	// Middle round key: transformed in place.
	ld1	{v0.4s},[x2]
	aesimc	v0.16b,v0.16b
	st1	{v0.4s},[x0]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
	ldp	x29,x30,[sp],#16	// x30 was overwritten by the bl above
	ret
.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
//----------------------------------------------------------------------
// aes_v8_encrypt
//   Encrypt one 16-byte block.
//
// In:    x0 = input block
//        x1 = output block
//        x2 = encryption key schedule; round count read from offset 240
// Out:   16 bytes stored at [x1]
// Clobbers: x2, x3, v0-v2, flags.  Leaf routine, no stack use.
//
// Each loop pass retires two rounds, so the round count is expected to
// be even (10/12/14, as written by aes_v8_set_encrypt_key).
//----------------------------------------------------------------------
.globl	aes_v8_encrypt
.type	aes_v8_encrypt,%function
.align	5
aes_v8_encrypt:
	ldr	w3,[x2,#240]
	ld1	{v0.4s},[x2],#16	// v0/v1 alternate as the next two round keys
	ld1	{v2.16b},[x0]		// v2 = state
	sub	w3,w3,#2		// the final two rounds run after the loop
	ld1	{v1.4s},[x2],#16

.Loop_enc:
	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aese	v2.16b,v1.16b
	aesmc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	.Loop_enc

	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// last round key
	aese	v2.16b,v1.16b		// final round: no aesmc
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret
.size	aes_v8_encrypt,.-aes_v8_encrypt
//----------------------------------------------------------------------
// aes_v8_decrypt
//   Decrypt one 16-byte block.
//
// In:    x0 = input block
//        x1 = output block
//        x2 = decryption key schedule (as produced by
//             aes_v8_set_decrypt_key); round count read from offset 240
// Out:   16 bytes stored at [x1]
// Clobbers: x2, x3, v0-v2, flags.  Leaf routine, no stack use.
//
// Each loop pass retires two rounds, so the round count is expected to
// be even (10/12/14).
//----------------------------------------------------------------------
.globl	aes_v8_decrypt
.type	aes_v8_decrypt,%function
.align	5
aes_v8_decrypt:
	ldr	w3,[x2,#240]
	ld1	{v0.4s},[x2],#16	// v0/v1 alternate as the next two round keys
	ld1	{v2.16b},[x0]		// v2 = state
	sub	w3,w3,#2		// the final two rounds run after the loop
	ld1	{v1.4s},[x2],#16

.Loop_dec:
	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aesd	v2.16b,v1.16b
	aesimc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	.Loop_dec

	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// last round key
	aesd	v2.16b,v1.16b		// final round: no aesimc
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret
.size	aes_v8_decrypt,.-aes_v8_decrypt
//----------------------------------------------------------------------
// aes_v8_cbc_encrypt
//   CBC-mode encryption or decryption of whole 16-byte blocks.
//
// In:    x0 = input
//        x1 = output
//        x2 = length in bytes; a length below 16 returns immediately
//             without touching output or IV, and bytes beyond the last
//             full block are ignored
//        x3 = key schedule; round count read from offset 240
//        x4 = 16-byte IV, read on entry and overwritten with the last
//             ciphertext block on exit
//        w5 = non-zero to encrypt, zero to decrypt
// Clobbers: x0-x3, x5-x8, x12, x14, v0-v7, v16-v23, flags.
//
// Key registers (r = round count):
//        v16/v17 = rk[0]/rk[1], reused as a rolling pair
//        v18-v23 = rk[r-6]..rk[r-1]  (decrypt reuses v18/v19 for data)
//        v7      = rk[r]
// Encrypt: v0 = state, v6 = ciphertext waiting to be stored,
//          v5 = rk[0]^rk[r].  XORing the next plaintext with v5 lets the
//          first aese of the next block absorb the final AddRoundKey,
//          the CBC chaining XOR and the initial AddRoundKey together.
// Decrypt: three blocks in flight in v0/v1/v18, their ciphertext copies
//          in v2/v3/v19, and v6 = previous ciphertext (IV).
//----------------------------------------------------------------------
.globl	aes_v8_cbc_encrypt
.type	aes_v8_cbc_encrypt,%function
.align	5
aes_v8_cbc_encrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	subs	x2,x2,#16
	mov	x8,#16
	b.lo	.Lcbc_abort
	csel	x8,xzr,x8,eq		// single block: do not step x0 past it

	// The flags from this cmp stay live until `b.eq .Lcbc_dec`; nothing
	// in between sets flags.
	cmp	w5,#0			// en- or decrypting?
	ldr	w5,[x3,#240]
	and	x2,x2,#-16
	ld1	{v6.16b},[x4]
	ld1	{v0.16b},[x0],x8

	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s,v19.4s},[x7],#32
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]

	add	x7,x3,#32
	mov	w6,w5
	b.eq	.Lcbc_dec

	cmp	w5,#2			// w5 = rounds-8: 2 means a 128-bit key
	eor	v0.16b,v0.16b,v6.16b
	eor	v5.16b,v16.16b,v7.16b
	b.eq	.Lcbc_enc128

	// 192/256-bit keys: rk[2]/rk[3] stay in v2/v3; rk[4]..rk[7] are
	// re-read every block through the fixed pointers set up here.
	ld1	{v2.4s,v3.4s},[x7]
	add	x7,x3,#16
	add	x6,x3,#16*4
	add	x12,x3,#16*5
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	add	x14,x3,#16*6
	add	x3,x3,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	v0.16b,v16.16b		// v16 = next plaintext ^ rk[0] ^ rk[r]
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16
.Lenter_cbc_enc:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x6]
	cmp	w5,#4			// rounds-8 == 4: 192-bit key
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x12]
	b.eq	.Lcbc_enc192

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x14]
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3]
	nop

.Lcbc_enc192:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16		// flags live until b.hs at the end of the block
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq		// last block next: stop advancing x0
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b	// v6 = finished ciphertext block
	b.hs	.Loop_cbc_enc

	st1	{v6.16b},[x1],#16
	b	.Lcbc_done

	// 128-bit keys: every round key stays resident in registers.
.align	5
.Lcbc_enc128:
	ld1	{v2.4s,v3.4s},[x7]
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	v0.16b,v16.16b		// v16 = next plaintext ^ rk[0] ^ rk[r]
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16
.Lenter_cbc_enc128:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16		// flags live until b.hs at the end of the block
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq		// last block next: stop advancing x0
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b	// v6 = finished ciphertext block
	b.hs	.Loop_cbc_enc128

	st1	{v6.16b},[x1],#16
	b	.Lcbc_done
.align	5
.Lcbc_dec:
	ld1	{v18.16b},[x0],#16
	subs	x2,x2,#32		// bias
	add	w6,w5,#2		// round-loop counter = rounds-6
	orr	v3.16b,v0.16b,v0.16b
	orr	v1.16b,v0.16b,v0.16b
	orr	v19.16b,v18.16b,v18.16b
	b.lo	.Lcbc_dec_tail		// fewer than three blocks in total

	orr	v1.16b,v18.16b,v18.16b
	ld1	{v18.16b},[x0],#16
	orr	v2.16b,v0.16b,v0.16b
	orr	v3.16b,v1.16b,v1.16b
	orr	v19.16b,v18.16b,v18.16b

.Loop3x_cbc_dec:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_cbc_dec

	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	// v4/v5/v17 = (previous ciphertext) ^ rk[r]: one XOR per block then
	// applies both the last AddRoundKey and the CBC chaining value.
	eor	v4.16b,v6.16b,v7.16b
	subs	x2,x2,#0x30		// flags live until b.hs .Loop3x_cbc_dec
	eor	v5.16b,v2.16b,v7.16b
	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	eor	v17.16b,v3.16b,v7.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v18.16b
					// are loaded with last "words"
	orr	v6.16b,v19.16b,v19.16b	// chaining value for the next batch
	mov	x7,x3
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v20.16b
	aesimc	v18.16b,v18.16b
	ld1	{v2.16b},[x0],#16
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v21.16b
	aesimc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v22.16b
	aesimc	v18.16b,v18.16b
	ld1	{v19.16b},[x0],#16
	aesd	v0.16b,v23.16b
	aesd	v1.16b,v23.16b
	aesd	v18.16b,v23.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	add	w6,w5,#2
	eor	v4.16b,v4.16b,v0.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v18.16b,v18.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	orr	v0.16b,v2.16b,v2.16b
	st1	{v5.16b},[x1],#16
	orr	v1.16b,v3.16b,v3.16b
	st1	{v18.16b},[x1],#16
	orr	v18.16b,v19.16b,v19.16b
	b.hs	.Loop3x_cbc_dec

	cmn	x2,#0x30		// x2 == -48: nothing left
	b.eq	.Lcbc_done
	nop

	// One or two blocks remain, held in v1 and v18.
.Lcbc_dec_tail:
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lcbc_dec_tail

	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v20.16b
	aesimc	v18.16b,v18.16b
	cmn	x2,#0x20		// x2 == -32: one block; flags live until b.eq below
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v21.16b
	aesimc	v18.16b,v18.16b
	eor	v5.16b,v6.16b,v7.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v22.16b
	aesimc	v18.16b,v18.16b
	eor	v17.16b,v3.16b,v7.16b
	aesd	v1.16b,v23.16b
	aesd	v18.16b,v23.16b
	b.eq	.Lcbc_dec_one
	eor	v5.16b,v5.16b,v1.16b
	eor	v17.16b,v17.16b,v18.16b
	orr	v6.16b,v19.16b,v19.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	eor	v5.16b,v5.16b,v18.16b	// the single remaining block is in v18
	orr	v6.16b,v19.16b,v19.16b
	st1	{v5.16b},[x1],#16

.Lcbc_done:
	st1	{v6.16b},[x4]		// write the updated IV back
.Lcbc_abort:
	// x30 is never modified in this routine, so only x29 is reloaded.
	ldr	x29,[sp],#16
	ret
.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
//----------------------------------------------------------------------
// aes_v8_ctr32_encrypt_blocks
//   CTR-mode keystream XOR over whole 16-byte blocks.  Only the last
//   32-bit word of the counter block is incremented; it is kept
//   big-endian in memory.
//
// In:    x0 = input
//        x1 = output
//        x2 = number of 16-byte blocks; zero returns immediately
//        x3 = encryption key schedule; round count read from offset 240
//        x4 = 16-byte counter block (read only, never written back)
// Clobbers: x0-x2, x5-x10, x12, v0-v7, v16-v23, flags.
//
// Key registers (r = round count):
//        v16/v17 = rolling pair starting at rk[0]/rk[1]
//        v20-v23 = rk[r-4]..rk[r-1],  v7 = rk[r]
// Counter state: w8 = host-order counter, v6 = counter block template,
//        v0/v1/v18 = counter blocks being encrypted.
//----------------------------------------------------------------------
.globl	aes_v8_ctr32_encrypt_blocks
.type	aes_v8_ctr32_encrypt_blocks,%function
.align	5
aes_v8_ctr32_encrypt_blocks:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	// Without this guard a zero block count falls into .Lctr32_tail,
	// which reads 16 bytes from x0 and stores 32 bytes to x1.
	cbz	x2,.Lctr32_done
	ldr	w5,[x3,#240]

	ldr	w8, [x4, #12]
	ld1	{v0.4s},[x4]

	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#4
	mov	x12,#16
	// The flags from this cmp stay live until `b.ls .Lctr32_tail`;
	// nothing in between sets flags.
	cmp	x2,#2
	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
	sub	w5,w5,#2
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]
	add	x7,x3,#32
	mov	w6,w5
	csel	x12,xzr,x12,lo		// one block only: the tail must not step x0
#ifndef __ARMEB__
	rev	w8, w8
#endif
	orr	v1.16b,v0.16b,v0.16b
	add	w10, w8, #1
	orr	v18.16b,v0.16b,v0.16b
	add	w8, w8, #2
	orr	v6.16b,v0.16b,v0.16b	// v6 = template for later counter blocks
	rev	w10, w10
	mov	v1.s[3],w10
	b.ls	.Lctr32_tail		// one or two blocks: straight to the tail
	rev	w12, w8
	sub	x2,x2,#3		// bias
	mov	v18.s[3],w12
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_ctr32

	// The three states move to v4/v5/v17 for the remaining rounds while
	// v0/v1/v18 are rebuilt from v6 with the next three counter values.
	aese	v0.16b,v16.16b
	aesmc	v4.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v5.16b,v1.16b
	ld1	{v2.16b},[x0],#16
	orr	v0.16b,v6.16b,v6.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	orr	v1.16b,v6.16b,v6.16b
	aese	v4.16b,v17.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v17.16b
	aesmc	v5.16b,v5.16b
	ld1	{v19.16b},[x0],#16
	mov	x7,x3
	aese	v18.16b,v17.16b
	aesmc	v17.16b,v18.16b
	orr	v18.16b,v6.16b,v6.16b
	add	w9,w8,#1
	aese	v4.16b,v20.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v20.16b
	aesmc	v5.16b,v5.16b
	eor	v2.16b,v2.16b,v7.16b	// fold the last round key into the input
	add	w10,w8,#2
	aese	v17.16b,v20.16b
	aesmc	v17.16b,v17.16b
	eor	v3.16b,v3.16b,v7.16b
	add	w8,w8,#3
	aese	v4.16b,v21.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v21.16b
	aesmc	v5.16b,v5.16b
	eor	v19.16b,v19.16b,v7.16b
	rev	w9,w9
	aese	v17.16b,v21.16b
	aesmc	v17.16b,v17.16b
	mov	v0.s[3], w9
	rev	w10,w10
	aese	v4.16b,v22.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v22.16b
	aesmc	v5.16b,v5.16b
	mov	v1.s[3], w10
	rev	w12,w8
	aese	v17.16b,v22.16b
	aesmc	v17.16b,v17.16b
	mov	v18.s[3], w12
	subs	x2,x2,#3		// flags live until b.hs .Loop3x_ctr32
	aese	v4.16b,v23.16b
	aese	v5.16b,v23.16b
	aese	v17.16b,v23.16b

	eor	v2.16b,v2.16b,v4.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	st1	{v2.16b},[x1],#16
	eor	v3.16b,v3.16b,v5.16b
	mov	w6,w5
	st1	{v3.16b},[x1],#16
	eor	v19.16b,v19.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v19.16b},[x1],#16
	b.hs	.Loop3x_ctr32

	adds	x2,x2,#3		// undo the bias: x2 = blocks still to do
	b.eq	.Lctr32_done
	cmp	x2,#1
	mov	x12,#16
	csel	x12,xzr,x12,eq		// one block left: the tail must not step x0

	// One or two blocks remain; their counter blocks are in v0 and v1.
.Lctr32_tail:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lctr32_tail

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v2.16b},[x0],x12	// x12 = 0 re-reads the same block below
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	ld1	{v3.16b},[x0]
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	eor	v2.16b,v2.16b,v7.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	eor	v3.16b,v3.16b,v7.16b
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b

	cmp	x2,#1
	eor	v2.16b,v2.16b,v0.16b
	eor	v3.16b,v3.16b,v1.16b
	st1	{v2.16b},[x1],#16
	b.eq	.Lctr32_done		// one block only: skip the second store
	st1	{v3.16b},[x1]

.Lctr32_done:
	// x30 is never modified in this routine, so only x29 is reloaded.
	ldr	x29,[sp],#16
	ret
.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
#endif
750