// AArch64 AES routines built on the ARMv8 AES instructions (aese/aesd/aesmc/aesimc).
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text

// Constants for the key-schedule code; reached through "adr x3,Lrcon".
//   row 0: starting round constant 0x01, one copy per 32-bit lane. The
//          expansion loops double it with "shl ...,#1" on every pass.
//   row 1: tbl byte-index mask. In little-endian memory order each lane
//          holds the indices 13,14,15,12, so tbl yields the top word of
//          its source rotated by one byte and copied to every lane.
//   row 2: round constant 0x1b, reloaded by the 128-bit path once its
//          eight-pass loop has finished.
.align	5
Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

// _aes_hw_set_encrypt_key: expand a user key into an encryption schedule.
//   In:   x0 = user key bytes, w1 = key length in bits, x2 = schedule out
//   Out:  x0 = 0 on success,
//              -1 if x0 or x2 is null,
//              -2 if w1 is below 128, above 256 or not a multiple of 64
//   On success the round keys are written upward from x2 and the round
//   count (10, 12 or 14) is stored as a 32-bit word at offset 240.
//   Lenc_key is a second name for the entry point; _aes_hw_set_decrypt_key
//   branches to it and relies on two things left behind on success:
//   x2 = schedule base + 240 and w12 = round count.
//   Registers written: x0-x3, x12, v0-v6, flags.
.globl	_aes_hw_set_encrypt_key
.private_extern	_aes_hw_set_encrypt_key

.align	5
_aes_hw_set_encrypt_key:
Lenc_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x3,#-1			// status if a pointer is null
	cmp	x0,#0
	b.eq	Lenc_key_abort
	cmp	x2,#0
	b.eq	Lenc_key_abort
	mov	x3,#-2			// status if the bit count is rejected
	cmp	w1,#128
	b.lt	Lenc_key_abort
	cmp	w1,#256
	b.gt	Lenc_key_abort
	tst	w1,#0x3f		// only multiples of 64 pass
	b.ne	Lenc_key_abort

	adr	x3,Lrcon
	cmp	w1,#192			// flags stay live until the three branches below

	eor	v0.16b,v0.16b,v0.16b	// v0 = 0 for the whole routine
	ld1	{v3.16b},[x0],#16	// first 16 key bytes
	mov	w1,#8		// reuse w1 as the pass counter
	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = round constant, v2 = tbl mask

	b.lt	Loop128
	b.eq	L192
	b	L256

// 128-bit key: eight passes through the loop, then two unrolled passes.
// Each pass stores the current round key (v3) and derives the next one:
// tbl + aese with a zero key transform the rotated top word, and the
// ext/eor chain accumulates the running xor of the four words.
.align	4
Loop128:
	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12	// v3 moved up one word, zero filled
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1		// consumed by b.ne at the bottom

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b	// fold in the round constant
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1	// next round constant
	eor	v3.16b,v3.16b,v6.16b
	b.ne	Loop128

	ld1	{v1.4s},[x3]		// x3 now points at the third Lrcon row (0x1b)

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b

	tbl	v6.16b,{v3.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v3.4s},[x2],#16
	aese	v6.16b,v0.16b

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2]		// last round key, at base + 160
	add	x2,x2,#0x50		// x2 = base + 240

	mov	w12,#10
	b	Ldone

// 192-bit key: v3 holds key bytes 0..15, the low half of v4 bytes 16..23.
// Eight passes, each storing 8 + 16 bytes.
.align	4
L192:
	ld1	{v4.8b},[x0],#8
	movi	v6.16b,#8			// borrow v6.16b
	st1	{v3.4s},[x2],#16
	sub	v2.16b,v2.16b,v6.16b	// adjust the mask: indices become 5,6,7,4

Loop192:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.8b},[x2],#8
	aese	v6.16b,v0.16b
	subs	w1,w1,#1		// consumed by b.ne at the bottom

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b

	dup	v5.4s,v3.s[3]
	eor	v5.16b,v5.16b,v4.16b
	eor	v6.16b,v6.16b,v1.16b
	ext	v4.16b,v0.16b,v4.16b,#12
	shl	v1.16b,v1.16b,#1
	eor	v4.16b,v4.16b,v5.16b
	eor	v3.16b,v3.16b,v6.16b
	eor	v4.16b,v4.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.ne	Loop192

	mov	w12,#12
	add	x2,x2,#0x20		// 208 bytes written; x2 = base + 240
	b	Ldone

// 256-bit key: v3 and v4 hold the two key halves. Seven passes; the loop
// exits from the middle (b.eq Ldone) with x2 already at base + 240.
.align	4
L256:
	ld1	{v4.16b},[x0]
	mov	w1,#7
	mov	w12,#14
	st1	{v3.4s},[x2],#16

Loop256:
	tbl	v6.16b,{v4.16b},v2.16b
	ext	v5.16b,v0.16b,v3.16b,#12
	st1	{v4.4s},[x2],#16
	aese	v6.16b,v0.16b
	subs	w1,w1,#1		// consumed by b.eq Ldone below

	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v3.16b,v3.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v6.16b,v6.16b,v1.16b
	eor	v3.16b,v3.16b,v5.16b
	shl	v1.16b,v1.16b,#1
	eor	v3.16b,v3.16b,v6.16b
	st1	{v3.4s},[x2],#16
	b.eq	Ldone

	dup	v6.4s,v3.s[3]		// just splat
	ext	v5.16b,v0.16b,v4.16b,#12
	aese	v6.16b,v0.16b

	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b
	ext	v5.16b,v0.16b,v5.16b,#12
	eor	v4.16b,v4.16b,v5.16b

	eor	v4.16b,v4.16b,v6.16b
	b	Loop256

Ldone:
	str	w12,[x2]		// round count at base + 240
	mov	x3,#0

Lenc_key_abort:
	mov	x0,x3			// return value
	ldr	x29,[sp],#16		// x30 is never modified here, so only x29 is reloaded
	ret


// _aes_hw_set_decrypt_key: build a decryption schedule.
//   In:   x0 = user key bytes, w1 = key length in bits, x2 = schedule out
//   Out:  x0 = 0 on success, otherwise the nonzero status from Lenc_key
//   Runs the encryption key expansion through Lenc_key, then reverses the
//   order of the round keys in place. The outermost pair is only swapped;
//   every other key also goes through aesimc.
//   Registers written beyond those of Lenc_key: x4, v0, v1.
.globl	_aes_hw_set_decrypt_key
.private_extern	_aes_hw_set_decrypt_key

.align	5
_aes_hw_set_decrypt_key:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	bl	Lenc_key		// on success: x2 = base + 240, w12 = rounds

	cbnz	x0,Ldec_key_abort	// hand a failure status straight back

	sub	x2,x2,#240		// restore original x2
	mov	x4,#-16			// post-index step for the downward pointer
	add	x0,x2,x12,lsl#4	// end of key schedule

	ld1	{v0.4s},[x2]		// swap first and last key unchanged
	ld1	{v1.4s},[x0]
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16

Loop_imc:				// x2 walks up, x0 walks down
	ld1	{v0.4s},[x2]
	ld1	{v1.4s},[x0]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	st1	{v0.4s},[x0],x4
	st1	{v1.4s},[x2],#16
	cmp	x0,x2
	b.hi	Loop_imc

	ld1	{v0.4s},[x2]		// pointers have met: middle key, in place
	aesimc	v0.16b,v0.16b
	st1	{v0.4s},[x0]

	mov	x0,xzr			// return value
Ldec_key_abort:
	ldp	x29,x30,[sp],#16	// x30 was overwritten by bl, so reload both
	ret

// _aes_hw_encrypt: encrypt one 16-byte block.
//   In:  x0 = input block, x1 = output block,
//        x2 = key schedule with the round count as a 32-bit word at
//             offset 240
//   The input is loaded before the output is stored.
//   Leaf routine without stack use. Registers written: x2, w3, v0-v2,
//   flags.
.globl	_aes_hw_encrypt
.private_extern	_aes_hw_encrypt

.align	5
_aes_hw_encrypt:
	ldr	w3,[x2,#240]		// round count
	ld1	{v0.4s},[x2],#16	// round key 0
	ld1	{v2.16b},[x0]		// v2 = state
	sub	w3,w3,#2		// the last two rounds are peeled off below
	ld1	{v1.4s},[x2],#16	// round key 1

Loop_enc:				// two rounds per pass, keys alternate v0/v1
	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aese	v2.16b,v1.16b
	aesmc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	Loop_enc

	aese	v2.16b,v0.16b
	aesmc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// final round key
	aese	v2.16b,v1.16b		// last round: no aesmc
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret

// _aes_hw_decrypt: decrypt one 16-byte block.
//   In:  x0 = input block, x1 = output block,
//        x2 = key schedule with the round count as a 32-bit word at
//             offset 240; the round keys are consumed in ascending address
//             order with aesd/aesimc (presumably a schedule produced by
//             _aes_hw_set_decrypt_key - confirm against callers)
//   The input is loaded before the output is stored.
//   Leaf routine without stack use. Registers written: x2, w3, v0-v2,
//   flags.
.globl	_aes_hw_decrypt
.private_extern	_aes_hw_decrypt

.align	5
_aes_hw_decrypt:
	ldr	w3,[x2,#240]		// round count
	ld1	{v0.4s},[x2],#16	// first round key
	ld1	{v2.16b},[x0]		// v2 = state
	sub	w3,w3,#2		// the last two rounds are peeled off below
	ld1	{v1.4s},[x2],#16	// second round key

Loop_dec:				// two rounds per pass, keys alternate v0/v1
	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2],#16
	subs	w3,w3,#2
	aesd	v2.16b,v1.16b
	aesimc	v2.16b,v2.16b
	ld1	{v1.4s},[x2],#16
	b.gt	Loop_dec

	aesd	v2.16b,v0.16b
	aesimc	v2.16b,v2.16b
	ld1	{v0.4s},[x2]		// final round key
	aesd	v2.16b,v1.16b		// last round: no aesimc
	eor	v2.16b,v2.16b,v0.16b

	st1	{v2.16b},[x1]
	ret

// _aes_hw_cbc_encrypt: CBC-mode encryption or decryption.
//   In:  x0 = input, x1 = output, x2 = length in bytes,
//        x3 = key schedule (round count as a 32-bit word at offset 240),
//        x4 = 16-byte IV buffer, w5 = nonzero to encrypt, zero to decrypt
//   A length below 16 returns at once without touching output or IV;
//   any other length is rounded down to a multiple of 16.
//   On exit the IV buffer holds the last ciphertext block: the last block
//   produced when encrypting, the last block consumed when decrypting.
//   Register use: v16/v17 rotate through the early round keys, v18-v23
//   hold the six keys before the last one, v7 holds the last key and v6
//   carries the chaining value.
//   Registers written: x0-x3, x5-x8, x12, x14, v0-v7, v16-v23, flags.
.globl	_aes_hw_cbc_encrypt
.private_extern	_aes_hw_cbc_encrypt

.align	5
_aes_hw_cbc_encrypt:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	subs	x2,x2,#16
	mov	x8,#16			// input step for the lookahead loads
	b.lo	Lcbc_abort		// fewer than 16 bytes
	csel	x8,xzr,x8,eq		// exactly one block: step 0, reloads stay in bounds

	cmp	w5,#0			// en- or decrypting?
	ldr	w5,[x3,#240]
	and	x2,x2,#-16
	ld1	{v6.16b},[x4]
	ld1	{v0.16b},[x0],x8

	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s,v19.4s},[x7],#32
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]

	add	x7,x3,#32
	mov	w6,w5
	b.eq	Lcbc_dec		// still the flags of "cmp w5,#0" above

	cmp	w5,#2			// w5 = rounds - 8: 2 means a 10-round schedule
	eor	v0.16b,v0.16b,v6.16b	// first plaintext block ^ IV
	eor	v5.16b,v16.16b,v7.16b	// v5 = first key ^ last key
	b.eq	Lcbc_enc128

	// 12- or 14-round schedule: keys 2,3 stay in v2,v3; keys 4..7 are
	// re-fetched on every block through x6, x12, x14 and x3.
	ld1	{v2.4s,v3.4s},[x7]
	add	x7,x3,#16
	add	x6,x3,#16*4
	add	x12,x3,#16*5
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	add	x14,x3,#16*6
	add	x3,x3,#16*7
	b	Lenter_cbc_enc

.align	4
Loop_cbc_enc:
	// v16 = next plaintext ^ first key ^ last key and v0 is the previous
	// state before its last-key xor, so this aese also applies the CBC
	// chaining xor.
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16
Lenter_cbc_enc:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x6]
	cmp	w5,#4			// w5 = rounds - 8: 4 means a 12-round schedule
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x12]
	b.eq	Lcbc_enc192

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x14]
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3]
	nop

Lcbc_enc192:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16		// consumed by b.hs at the bottom
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq		// last block coming up: stop advancing x0
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8	// lookahead: next plaintext block
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b	// v6 = ciphertext block
	b.hs	Loop_cbc_enc

	st1	{v6.16b},[x1],#16
	b	Lcbc_done

// 10-round schedule: every round key stays in registers
// (v16,v17,v2,v3,v18-v23,v7).
.align	5
Lcbc_enc128:
	ld1	{v2.4s,v3.4s},[x7]
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	b	Lenter_cbc_enc128
Loop_cbc_enc128:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	st1	{v6.16b},[x1],#16
Lenter_cbc_enc128:
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	subs	x2,x2,#16		// consumed by b.hs at the bottom
	aese	v0.16b,v2.16b
	aesmc	v0.16b,v0.16b
	csel	x8,xzr,x8,eq
	aese	v0.16b,v3.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.16b},[x0],x8
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	eor	v16.16b,v16.16b,v5.16b
	aese	v0.16b,v23.16b
	eor	v6.16b,v0.16b,v7.16b
	b.hs	Loop_cbc_enc128

	st1	{v6.16b},[x1],#16
	b	Lcbc_done

// Decryption. Three blocks are processed per pass in v0, v1 and v18, with
// untouched copies of the ciphertext kept in v2, v3 and v19 for chaining.
// One or two leftover blocks go through Lcbc_dec_tail.
.align	5
Lcbc_dec:
	ld1	{v18.16b},[x0],#16
	subs	x2,x2,#32		// bias
	add	w6,w5,#2
	orr	v3.16b,v0.16b,v0.16b
	orr	v1.16b,v0.16b,v0.16b
	orr	v19.16b,v18.16b,v18.16b
	b.lo	Lcbc_dec_tail		// fewer than three blocks in total

	orr	v1.16b,v18.16b,v18.16b
	ld1	{v18.16b},[x0],#16
	orr	v2.16b,v0.16b,v0.16b
	orr	v3.16b,v1.16b,v1.16b
	orr	v19.16b,v18.16b,v18.16b

Loop3x_cbc_dec:
	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Loop3x_cbc_dec

	aesd	v0.16b,v16.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	eor	v4.16b,v6.16b,v7.16b	// chaining value ^ last key, block 0
	subs	x2,x2,#0x30		// consumed by csel below and b.hs at the bottom
	eor	v5.16b,v2.16b,v7.16b	// same for block 1
	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
	aesd	v0.16b,v17.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	eor	v17.16b,v3.16b,v7.16b	// same for block 2
	add	x0,x0,x6		// x0 is adjusted in such way that
					// at exit from the loop v1.16b-v18.16b
					// are loaded with last "words"
	orr	v6.16b,v19.16b,v19.16b	// next chaining value = last ciphertext
	mov	x7,x3
	aesd	v0.16b,v20.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v20.16b
	aesimc	v18.16b,v18.16b
	ld1	{v2.16b},[x0],#16
	aesd	v0.16b,v21.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v21.16b
	aesimc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	aesd	v0.16b,v22.16b
	aesimc	v0.16b,v0.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v22.16b
	aesimc	v18.16b,v18.16b
	ld1	{v19.16b},[x0],#16
	aesd	v0.16b,v23.16b
	aesd	v1.16b,v23.16b
	aesd	v18.16b,v23.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	add	w6,w5,#2
	eor	v4.16b,v4.16b,v0.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v18.16b,v18.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	orr	v0.16b,v2.16b,v2.16b
	st1	{v5.16b},[x1],#16
	orr	v1.16b,v3.16b,v3.16b
	st1	{v18.16b},[x1],#16
	orr	v18.16b,v19.16b,v19.16b
	b.hs	Loop3x_cbc_dec

	cmn	x2,#0x30		// nothing left over?
	b.eq	Lcbc_done
	nop

// One or two blocks remain in v1/v18. The "cmn x2,#0x20" below decides
// whether one result or two results get stored.
Lcbc_dec_tail:
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Lcbc_dec_tail

	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v16.16b
	aesimc	v18.16b,v18.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v17.16b
	aesimc	v18.16b,v18.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v20.16b
	aesimc	v18.16b,v18.16b
	cmn	x2,#0x20		// consumed by b.eq Lcbc_dec_one below
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v21.16b
	aesimc	v18.16b,v18.16b
	eor	v5.16b,v6.16b,v7.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v18.16b,v22.16b
	aesimc	v18.16b,v18.16b
	eor	v17.16b,v3.16b,v7.16b
	aesd	v1.16b,v23.16b
	aesd	v18.16b,v23.16b
	b.eq	Lcbc_dec_one
	eor	v5.16b,v5.16b,v1.16b
	eor	v17.16b,v17.16b,v18.16b
	orr	v6.16b,v19.16b,v19.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	b	Lcbc_done

Lcbc_dec_one:
	eor	v5.16b,v5.16b,v18.16b
	orr	v6.16b,v19.16b,v19.16b
	st1	{v5.16b},[x1],#16

Lcbc_done:
	st1	{v6.16b},[x4]		// write the chaining value back to the IV buffer
Lcbc_abort:
	ldr	x29,[sp],#16		// x30 is never modified here, so only x29 is reloaded
	ret

// _aes_hw_ctr32_encrypt_blocks: CTR mode with a 32-bit counter.
//   In:  x0 = input, x1 = output, x2 = number of 16-byte blocks,
//        x3 = key schedule (round count as a 32-bit word at offset 240),
//        x4 = 16-byte counter block
//   A block count of zero returns at once without reading or writing any
//   data. Without that check the count would fall into the one-or-two
//   block tail, which stores two blocks whenever the count is not 1.
//   Only the last 32-bit word of the counter block changes from block to
//   block: it is byte-swapped (unless __ARMEB__ is defined), incremented
//   by one per block and swapped back. The buffer at x4 is read only.
//   Register use: v6 keeps the loaded counter block, v20-v23 hold the four
//   keys before the last one, v7 holds the last key.
//   Registers written: x0-x2, x5-x10, x12, v0-v7, v16-v23, flags.
.globl	_aes_hw_ctr32_encrypt_blocks
.private_extern	_aes_hw_ctr32_encrypt_blocks

.align	5
_aes_hw_ctr32_encrypt_blocks:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	cbz	x2,Lctr32_done		// no blocks: do not touch input or output
	ldr	w5,[x3,#240]

	ldr	w8, [x4, #12]		// counter word as stored in memory
	ld1	{v0.4s},[x4]

	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#4
	mov	x12,#16
	cmp	x2,#2			// consumed by csel and b.ls below
	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
	sub	w5,w5,#2
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]
	add	x7,x3,#32
	mov	w6,w5
	csel	x12,xzr,x12,lo		// single block: the tail reloads it, step 0
#ifndef __ARMEB__
	rev	w8, w8
#endif
	orr	v1.16b,v0.16b,v0.16b
	add	w10, w8, #1
	orr	v18.16b,v0.16b,v0.16b
	add	w8, w8, #2
	orr	v6.16b,v0.16b,v0.16b	// v6 = copy of the counter block as loaded
	rev	w10, w10
	mov	v1.s[3],w10		// v1 = counter + 1
	b.ls	Lctr32_tail		// one or two blocks only
	rev	w12, w8
	sub	x2,x2,#3		// bias
	mov	v18.s[3],w12		// v18 = counter + 2
	b	Loop3x_ctr32

// Three counter blocks (v0, v1, v18) are encrypted per pass.
.align	4
Loop3x_ctr32:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Loop3x_ctr32

	// Remaining rounds continue in v4, v5 and v17 while v0, v1 and v18
	// are rebuilt from v6 with the next three counter values.
	aese	v0.16b,v16.16b
	aesmc	v4.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v5.16b,v1.16b
	ld1	{v2.16b},[x0],#16
	orr	v0.16b,v6.16b,v6.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	orr	v1.16b,v6.16b,v6.16b
	aese	v4.16b,v17.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v17.16b
	aesmc	v5.16b,v5.16b
	ld1	{v19.16b},[x0],#16
	mov	x7,x3
	aese	v18.16b,v17.16b
	aesmc	v17.16b,v18.16b
	orr	v18.16b,v6.16b,v6.16b
	add	w9,w8,#1
	aese	v4.16b,v20.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v20.16b
	aesmc	v5.16b,v5.16b
	eor	v2.16b,v2.16b,v7.16b	// input ^ last key
	add	w10,w8,#2
	aese	v17.16b,v20.16b
	aesmc	v17.16b,v17.16b
	eor	v3.16b,v3.16b,v7.16b
	add	w8,w8,#3
	aese	v4.16b,v21.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v21.16b
	aesmc	v5.16b,v5.16b
	eor	v19.16b,v19.16b,v7.16b
	rev	w9,w9
	aese	v17.16b,v21.16b
	aesmc	v17.16b,v17.16b
	mov	v0.s[3], w9
	rev	w10,w10
	aese	v4.16b,v22.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v22.16b
	aesmc	v5.16b,v5.16b
	mov	v1.s[3], w10
	rev	w12,w8
	aese	v17.16b,v22.16b
	aesmc	v17.16b,v17.16b
	mov	v18.s[3], w12
	subs	x2,x2,#3		// consumed by b.hs at the bottom
	aese	v4.16b,v23.16b
	aese	v5.16b,v23.16b
	aese	v17.16b,v23.16b

	eor	v2.16b,v2.16b,v4.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	st1	{v2.16b},[x1],#16
	eor	v3.16b,v3.16b,v5.16b
	mov	w6,w5
	st1	{v3.16b},[x1],#16
	eor	v19.16b,v19.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v19.16b},[x1],#16
	b.hs	Loop3x_ctr32

	adds	x2,x2,#3		// undo the bias: 0, 1 or 2 blocks left
	b.eq	Lctr32_done
	cmp	x2,#1
	mov	x12,#16
	csel	x12,xzr,x12,eq		// one block left: reload it, step 0

// One or two blocks remain; v0 and v1 hold their counter blocks.
Lctr32_tail:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v17.4s},[x7],#16
	b.gt	Lctr32_tail

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v2.16b},[x0],x12	// x12 is 0 when a single block is left
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	ld1	{v3.16b},[x0]
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	eor	v2.16b,v2.16b,v7.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	eor	v3.16b,v3.16b,v7.16b
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b

	cmp	x2,#1
	eor	v2.16b,v2.16b,v0.16b
	eor	v3.16b,v3.16b,v1.16b
	st1	{v2.16b},[x1],#16
	b.eq	Lctr32_done		// single block: skip the second store
	st1	{v3.16b},[x1]

Lctr32_done:
	ldr	x29,[sp],#16		// x30 is never modified here, so only x29 is reloaded
	ret

#endif
