• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#include <openssl/arm_arch.h>
2
3#if __ARM_MAX_ARCH__>=7
4.text
5#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH)
6
7#endif
// Lrcon: constant table for the key-expansion code in _aes_hw_set_encrypt_key,
// which addresses it with "adr x3,Lrcon". Three rows of four 32-bit words each.
8.align	5
9Lrcon:
10.long	0x01,0x01,0x01,0x01	// row 0: loaded into v1, XORed into every expansion step and doubled (shl by 1) after each use
11.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat: loaded into v2 and used as the tbl index vector
12.long	0x1b,0x1b,0x1b,0x1b	// row 2: reloaded into v1 by the 128-bit path after its 8-iteration loop
13
// _aes_hw_set_encrypt_key: expands a user key into a round-key schedule.
//   x0 (in)  = user key
//   w1 (in)  = key length in bits. Only 128, 192 and 256 pass the checks below.
//   x2 (in)  = key schedule buffer. Round keys are written from offset 0 and the
//              round count (10, 12 or 14) is stored as a 32-bit word at offset 240.
//   x0 (out) = 0 on success, -1 if x0 or x2 was zero, -2 if the bit length was rejected.
// Lenc_key is a second entry label that _aes_hw_set_decrypt_key reaches with bl.
// On the success path x2 is left at schedule+240 and w12 holds the round count.
// Overwrites x3, w1, w12 and v0-v6.
14.globl	_aes_hw_set_encrypt_key
15.private_extern	_aes_hw_set_encrypt_key
16
17.align	5
18_aes_hw_set_encrypt_key:
19Lenc_key:
20	stp	x29,x30,[sp,#-16]!
21	add	x29,sp,#0
22	mov	x3,#-1			// provisional return value for a zero pointer
23	cmp	x0,#0
24	b.eq	Lenc_key_abort
25	cmp	x2,#0
26	b.eq	Lenc_key_abort
27	mov	x3,#-2			// provisional return value for a bad bit length
28	cmp	w1,#128
29	b.lt	Lenc_key_abort
30	cmp	w1,#256
31	b.gt	Lenc_key_abort
32	tst	w1,#0x3f		// any of the low six bits set means not a multiple of 64
33	b.ne	Lenc_key_abort
34
35	adr	x3,Lrcon		// x3 is reused as the constant table pointer from here on
36	cmp	w1,#192			// flags stay live until the three-way branch below (nothing in between sets flags)
37
38	eor	v0.16b,v0.16b,v0.16b	// v0 = 0, kept zero for the whole expansion
39	ld1	{v3.16b},[x0],#16	// v3 = first 16 key bytes
40	mov	w1,#8		// reuse w1
41	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = Lrcon row 0, v2 = Lrcon row 1 (tbl mask), x3 now points at row 2
42
43	b.lt	Loop128
44	b.eq	L192
45	b	L256
46
// 128-bit key: 8 loop iterations followed by two unrolled steps.
47.align	4
48Loop128:
49	tbl	v6.16b,{v3.16b},v2.16b	// v6 = bytes of v3 picked by the mask in v2
50	ext	v5.16b,v0.16b,v3.16b,#12	// v5 = v3 moved up by one 32-bit word, low word filled from zero v0
51	st1	{v3.4s},[x2],#16	// emit the current round key
52	aese	v6.16b,v0.16b		// aese with the all-zero key in v0
53	subs	w1,w1,#1		// loop counter, tested by b.ne at the bottom (only vector ops in between)
54
55	eor	v3.16b,v3.16b,v5.16b
56	ext	v5.16b,v0.16b,v5.16b,#12
57	eor	v3.16b,v3.16b,v5.16b
58	ext	v5.16b,v0.16b,v5.16b,#12
59	eor	v6.16b,v6.16b,v1.16b	// mix in the round constant
60	eor	v3.16b,v3.16b,v5.16b
61	shl	v1.16b,v1.16b,#1	// round constant for the next step = current one doubled
62	eor	v3.16b,v3.16b,v6.16b	// v3 = next round key
63	b.ne	Loop128
64
65	ld1	{v1.4s},[x3]		// v1 = Lrcon row 2 (0x1b)
66
67	tbl	v6.16b,{v3.16b},v2.16b
68	ext	v5.16b,v0.16b,v3.16b,#12
69	st1	{v3.4s},[x2],#16
70	aese	v6.16b,v0.16b
71
72	eor	v3.16b,v3.16b,v5.16b
73	ext	v5.16b,v0.16b,v5.16b,#12
74	eor	v3.16b,v3.16b,v5.16b
75	ext	v5.16b,v0.16b,v5.16b,#12
76	eor	v6.16b,v6.16b,v1.16b
77	eor	v3.16b,v3.16b,v5.16b
78	shl	v1.16b,v1.16b,#1
79	eor	v3.16b,v3.16b,v6.16b
80
81	tbl	v6.16b,{v3.16b},v2.16b
82	ext	v5.16b,v0.16b,v3.16b,#12
83	st1	{v3.4s},[x2],#16
84	aese	v6.16b,v0.16b
85
86	eor	v3.16b,v3.16b,v5.16b
87	ext	v5.16b,v0.16b,v5.16b,#12
88	eor	v3.16b,v3.16b,v5.16b
89	ext	v5.16b,v0.16b,v5.16b,#12
90	eor	v6.16b,v6.16b,v1.16b
91	eor	v3.16b,v3.16b,v5.16b
92	eor	v3.16b,v3.16b,v6.16b
93	st1	{v3.4s},[x2]		// last round key, stored without post-increment
94	add	x2,x2,#0x50		// ten post-incrementing stores moved x2 by 160 bytes, so this lands on offset 240
95
96	mov	w12,#10			// round count for a 128-bit key
97	b	Ldone
98
// 192-bit key: 8 iterations, each emitting 8 bytes from v4 and 16 bytes from v3.
99.align	4
100L192:
101	ld1	{v4.8b},[x0],#8	// remaining 8 key bytes into the low half of v4
102	movi	v6.16b,#8			// borrow v6.16b
103	st1	{v3.4s},[x2],#16
104	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
105
106Loop192:
107	tbl	v6.16b,{v4.16b},v2.16b
108	ext	v5.16b,v0.16b,v3.16b,#12
109	st1	{v4.8b},[x2],#8
110	aese	v6.16b,v0.16b
111	subs	w1,w1,#1		// loop counter, tested by b.ne at the bottom
112
113	eor	v3.16b,v3.16b,v5.16b
114	ext	v5.16b,v0.16b,v5.16b,#12
115	eor	v3.16b,v3.16b,v5.16b
116	ext	v5.16b,v0.16b,v5.16b,#12
117	eor	v3.16b,v3.16b,v5.16b
118
119	dup	v5.4s,v3.s[3]
120	eor	v5.16b,v5.16b,v4.16b
121	eor	v6.16b,v6.16b,v1.16b
122	ext	v4.16b,v0.16b,v4.16b,#12
123	shl	v1.16b,v1.16b,#1
124	eor	v4.16b,v4.16b,v5.16b
125	eor	v3.16b,v3.16b,v6.16b
126	eor	v4.16b,v4.16b,v6.16b
127	st1	{v3.4s},[x2],#16
128	b.ne	Loop192
129
130	mov	w12,#12			// round count for a 192-bit key
131	add	x2,x2,#0x20		// 16 + 8*24 = 208 bytes were written, so this lands on offset 240
132	b	Ldone
133
// 256-bit key: 7 iterations of 32 bytes each after the first 16-byte store,
// which leaves x2 at offset 240 without any further adjustment.
134.align	4
135L256:
136	ld1	{v4.16b},[x0]		// v4 = second 16 key bytes
137	mov	w1,#7
138	mov	w12,#14			// round count for a 256-bit key
139	st1	{v3.4s},[x2],#16
140
141Loop256:
142	tbl	v6.16b,{v4.16b},v2.16b
143	ext	v5.16b,v0.16b,v3.16b,#12
144	st1	{v4.4s},[x2],#16
145	aese	v6.16b,v0.16b
146	subs	w1,w1,#1		// flags are consumed by b.eq Ldone below
147
148	eor	v3.16b,v3.16b,v5.16b
149	ext	v5.16b,v0.16b,v5.16b,#12
150	eor	v3.16b,v3.16b,v5.16b
151	ext	v5.16b,v0.16b,v5.16b,#12
152	eor	v6.16b,v6.16b,v1.16b
153	eor	v3.16b,v3.16b,v5.16b
154	shl	v1.16b,v1.16b,#1
155	eor	v3.16b,v3.16b,v6.16b
156	st1	{v3.4s},[x2],#16
157	b.eq	Ldone
158
159	dup	v6.4s,v3.s[3]		// just splat
160	ext	v5.16b,v0.16b,v4.16b,#12
161	aese	v6.16b,v0.16b
162
163	eor	v4.16b,v4.16b,v5.16b
164	ext	v5.16b,v0.16b,v5.16b,#12
165	eor	v4.16b,v4.16b,v5.16b
166	ext	v5.16b,v0.16b,v5.16b,#12
167	eor	v4.16b,v4.16b,v5.16b
168
169	eor	v4.16b,v4.16b,v6.16b
170	b	Loop256
171
172Ldone:
173	str	w12,[x2]		// x2 is at schedule+240 on every path, store the round count there
174	mov	x3,#0			// success
175
176Lenc_key_abort:
177	mov	x0,x3			// return value
178	ldr	x29,[sp],#16		// pop the frame, only x29 is reloaded (this routine never overwrites x30)
179	ret
180
181
// _aes_hw_set_decrypt_key: builds the schedule with the shared Lenc_key code,
// then reverses the order of the round keys in place. Every round key except
// the first and the last is also passed through aesimc.
// Arguments are those of _aes_hw_set_encrypt_key (x0 = user key, w1 = bits,
// x2 = key schedule). x0 returns 0 on success, otherwise the non-zero status
// produced by Lenc_key.
182.globl	_aes_hw_set_decrypt_key
183.private_extern	_aes_hw_set_decrypt_key
184
185.align	5
186_aes_hw_set_decrypt_key:
187	stp	x29,x30,[sp,#-16]!
188	add	x29,sp,#0
189	bl	Lenc_key		// on success: x0 = 0, x2 = schedule+240, w12 = round count
190
191	cmp	x0,#0
192	b.ne	Ldec_key_abort		// pass the error code from Lenc_key through unchanged
193
194	sub	x2,x2,#240		// restore original x2
195	mov	x4,#-16			// post-increment step for walking x0 downwards
196	add	x0,x2,x12,lsl#4	// end of key schedule
197
	// Swap the first and last round keys as they are, without aesimc.
198	ld1	{v0.4s},[x2]
199	ld1	{v1.4s},[x0]
200	st1	{v0.4s},[x0],x4
201	st1	{v1.4s},[x2],#16
202
	// Swap the remaining pairs from both ends, applying aesimc to each key,
	// until the descending pointer x0 is no longer above the ascending pointer x2.
203Loop_imc:
204	ld1	{v0.4s},[x2]
205	ld1	{v1.4s},[x0]
206	aesimc	v0.16b,v0.16b
207	aesimc	v1.16b,v1.16b
208	st1	{v0.4s},[x0],x4
209	st1	{v1.4s},[x2],#16
210	cmp	x0,x2
211	b.hi	Loop_imc
212
	// Middle round key: transformed and written back.
213	ld1	{v0.4s},[x2]
214	aesimc	v0.16b,v0.16b
215	st1	{v0.4s},[x0]
216
217	eor	x0,x0,x0		// return value
218Ldec_key_abort:
219	ldp	x29,x30,[sp],#16	// x30 must be reloaded here because bl overwrote it
220	ret
221
// _aes_hw_encrypt: encrypts a single 16-byte block.
//   x0 = input block, x1 = output block, x2 = key schedule (the round count is
//   read from offset 240). Overwrites w3, x2 and v0-v2. No stack frame is used.
222.globl	_aes_hw_encrypt
223.private_extern	_aes_hw_encrypt
224
225.align	5
226_aes_hw_encrypt:
227	ldr	w3,[x2,#240]		// w3 = round count
228	ld1	{v0.4s},[x2],#16	// v0 = first round key
229	ld1	{v2.16b},[x0]		// v2 = input block
230	sub	w3,w3,#2
231	ld1	{v1.4s},[x2],#16	// v1 = second round key
232
// Two rounds per iteration. The next two round keys are loaded while the
// current ones are in use.
233Loop_enc:
234	aese	v2.16b,v0.16b
235	aesmc	v2.16b,v2.16b
236	ld1	{v0.4s},[x2],#16
237	subs	w3,w3,#2		// flags are consumed by b.gt below
238	aese	v2.16b,v1.16b
239	aesmc	v2.16b,v2.16b
240	ld1	{v1.4s},[x2],#16
241	b.gt	Loop_enc
242
// Final two rounds: the last aese has no aesmc after it, and the last round
// key is applied with a plain eor.
243	aese	v2.16b,v0.16b
244	aesmc	v2.16b,v2.16b
245	ld1	{v0.4s},[x2]		// last round key
246	aese	v2.16b,v1.16b
247	eor	v2.16b,v2.16b,v0.16b
248
249	st1	{v2.16b},[x1]
250	ret
251
// _aes_hw_decrypt: decrypts a single 16-byte block.
//   x0 = input block, x1 = output block, x2 = key schedule (the round count is
//   read from offset 240). Same structure as _aes_hw_encrypt, with aesd and
//   aesimc in place of aese and aesmc. Overwrites w3, x2 and v0-v2.
252.globl	_aes_hw_decrypt
253.private_extern	_aes_hw_decrypt
254
255.align	5
256_aes_hw_decrypt:
257	ldr	w3,[x2,#240]		// w3 = round count
258	ld1	{v0.4s},[x2],#16	// v0 = first round key
259	ld1	{v2.16b},[x0]		// v2 = input block
260	sub	w3,w3,#2
261	ld1	{v1.4s},[x2],#16	// v1 = second round key
262
// Two rounds per iteration, with the next two round keys loaded in between.
263Loop_dec:
264	aesd	v2.16b,v0.16b
265	aesimc	v2.16b,v2.16b
266	ld1	{v0.4s},[x2],#16
267	subs	w3,w3,#2		// flags are consumed by b.gt below
268	aesd	v2.16b,v1.16b
269	aesimc	v2.16b,v2.16b
270	ld1	{v1.4s},[x2],#16
271	b.gt	Loop_dec
272
// Final two rounds: the last aesd has no aesimc after it, and the last round
// key is applied with a plain eor.
273	aesd	v2.16b,v0.16b
274	aesimc	v2.16b,v2.16b
275	ld1	{v0.4s},[x2]		// last round key
276	aesd	v2.16b,v1.16b
277	eor	v2.16b,v2.16b,v0.16b
278
279	st1	{v2.16b},[x1]
280	ret
281
// _aes_hw_cbc_encrypt: CBC-mode encryption or decryption of whole 16-byte blocks.
//   x0 = input, x1 = output, x2 = length in bytes, x3 = key schedule (round
//   count read from offset 240), x4 = 16-byte chaining value (read at entry and
//   written back at Lcbc_done), w5 = direction: non-zero encrypts, zero decrypts.
// A length below 16 returns immediately without touching output or the chaining
// value. The length is rounded down to a multiple of 16.
// Encryption handles one block per iteration, with a dedicated path for a round
// count of 10. Decryption handles three blocks per iteration in Loop3x_cbc_dec
// and finishes one or two leftover blocks in Lcbc_dec_tail.
282.globl	_aes_hw_cbc_encrypt
283.private_extern	_aes_hw_cbc_encrypt
284
285.align	5
286_aes_hw_cbc_encrypt:
287	stp	x29,x30,[sp,#-16]!
288	add	x29,sp,#0
289	subs	x2,x2,#16		// x2 = length - 16, a borrow means fewer than 16 bytes
290	mov	x8,#16			// x8 = input post-increment step
291	b.lo	Lcbc_abort
292	csel	x8,xzr,x8,eq		// exactly 16 bytes: use a step of 0 so x0 is not advanced past the input
293
294	cmp	w5,#0			// en- or decrypting?
295	ldr	w5,[x3,#240]		// w5 = round count (flags from the cmp stay live until b.eq Lcbc_dec)
296	and	x2,x2,#-16		// round the remaining length down to a multiple of 16
297	ld1	{v6.16b},[x4]		// v6 = chaining value
298	ld1	{v0.16b},[x0],x8	// v0 = first input block
299
300	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
301	sub	w5,w5,#6
302	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
303	sub	w5,w5,#2		// w5 = round count - 8
304	ld1	{v18.4s,v19.4s},[x7],#32
305	ld1	{v20.4s,v21.4s},[x7],#32
306	ld1	{v22.4s,v23.4s},[x7],#32
307	ld1	{v7.4s},[x7]		// v7 = last round key
308
309	add	x7,x3,#32		// x7 -> third round key
310	mov	w6,w5
311	b.eq	Lcbc_dec		// taken when the direction flag was zero
312
// ---- encryption ----
313	cmp	w5,#2			// round count - 8 == 2, i.e. 10 rounds
314	eor	v0.16b,v0.16b,v6.16b	// first block ^= chaining value
315	eor	v5.16b,v16.16b,v7.16b	// v5 = first round key ^ last round key, folded into each following input block
316	b.eq	Lcbc_enc128
317
318	ld1	{v2.4s,v3.4s},[x7]	// v2, v3 = third and fourth round keys
319	add	x7,x3,#16		// x7 -> second round key
320	add	x6,x3,#16*4
321	add	x12,x3,#16*5
322	aese	v0.16b,v16.16b
323	aesmc	v0.16b,v0.16b
324	add	x14,x3,#16*6
325	add	x3,x3,#16*7
326	b	Lenter_cbc_enc
327
328.align	4
329Loop_cbc_enc:
330	aese	v0.16b,v16.16b		// v16 = next input block ^ v5
331	aesmc	v0.16b,v0.16b
332	st1	{v6.16b},[x1],#16	// store the output block produced by the previous iteration
333Lenter_cbc_enc:
334	aese	v0.16b,v17.16b
335	aesmc	v0.16b,v0.16b
336	aese	v0.16b,v2.16b
337	aesmc	v0.16b,v0.16b
338	ld1	{v16.4s},[x6]
339	cmp	w5,#4			// round count - 8 == 4, i.e. 12 rounds, skips the two extra rounds below
340	aese	v0.16b,v3.16b
341	aesmc	v0.16b,v0.16b
342	ld1	{v17.4s},[x12]
343	b.eq	Lcbc_enc192
344
345	aese	v0.16b,v16.16b
346	aesmc	v0.16b,v0.16b
347	ld1	{v16.4s},[x14]
348	aese	v0.16b,v17.16b
349	aesmc	v0.16b,v0.16b
350	ld1	{v17.4s},[x3]
351	nop
352
353Lcbc_enc192:
354	aese	v0.16b,v16.16b
355	aesmc	v0.16b,v0.16b
356	subs	x2,x2,#16		// flags feed both the csel below and b.hs at the end of the loop
357	aese	v0.16b,v17.16b
358	aesmc	v0.16b,v0.16b
359	csel	x8,xzr,x8,eq		// last block coming up: stop advancing x0
360	aese	v0.16b,v18.16b
361	aesmc	v0.16b,v0.16b
362	aese	v0.16b,v19.16b
363	aesmc	v0.16b,v0.16b
364	ld1	{v16.16b},[x0],x8	// next input block
365	aese	v0.16b,v20.16b
366	aesmc	v0.16b,v0.16b
367	eor	v16.16b,v16.16b,v5.16b
368	aese	v0.16b,v21.16b
369	aesmc	v0.16b,v0.16b
370	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
371	aese	v0.16b,v22.16b
372	aesmc	v0.16b,v0.16b
373	aese	v0.16b,v23.16b
374	eor	v6.16b,v0.16b,v7.16b	// v6 = output block, which is also the next chaining value
375	b.hs	Loop_cbc_enc
376
377	st1	{v6.16b},[x1],#16
378	b	Lcbc_done
379
// Encryption path for a round count of 10. All round keys stay in registers.
380.align	5
381Lcbc_enc128:
382	ld1	{v2.4s,v3.4s},[x7]	// v2, v3 = third and fourth round keys
383	aese	v0.16b,v16.16b
384	aesmc	v0.16b,v0.16b
385	b	Lenter_cbc_enc128
386Loop_cbc_enc128:
387	aese	v0.16b,v16.16b
388	aesmc	v0.16b,v0.16b
389	st1	{v6.16b},[x1],#16	// store the output block produced by the previous iteration
390Lenter_cbc_enc128:
391	aese	v0.16b,v17.16b
392	aesmc	v0.16b,v0.16b
393	subs	x2,x2,#16		// flags feed both the csel below and b.hs at the end of the loop
394	aese	v0.16b,v2.16b
395	aesmc	v0.16b,v0.16b
396	csel	x8,xzr,x8,eq
397	aese	v0.16b,v3.16b
398	aesmc	v0.16b,v0.16b
399	aese	v0.16b,v18.16b
400	aesmc	v0.16b,v0.16b
401	aese	v0.16b,v19.16b
402	aesmc	v0.16b,v0.16b
403	ld1	{v16.16b},[x0],x8	// next input block
404	aese	v0.16b,v20.16b
405	aesmc	v0.16b,v0.16b
406	aese	v0.16b,v21.16b
407	aesmc	v0.16b,v0.16b
408	aese	v0.16b,v22.16b
409	aesmc	v0.16b,v0.16b
410	eor	v16.16b,v16.16b,v5.16b
411	aese	v0.16b,v23.16b
412	eor	v6.16b,v0.16b,v7.16b	// v6 = output block
413	b.hs	Loop_cbc_enc128
414
415	st1	{v6.16b},[x1],#16
416	b	Lcbc_done
// ---- decryption ----
417.align	5
418Lcbc_dec:
419	ld1	{v18.16b},[x0],#16
420	subs	x2,x2,#32		// bias
421	add	w6,w5,#2
422	orr	v3.16b,v0.16b,v0.16b
423	orr	v1.16b,v0.16b,v0.16b
424	orr	v19.16b,v18.16b,v18.16b
425	b.lo	Lcbc_dec_tail		// fewer than three blocks in total: go straight to the tail
426
427	orr	v1.16b,v18.16b,v18.16b
428	ld1	{v18.16b},[x0],#16
429	orr	v2.16b,v0.16b,v0.16b
430	orr	v3.16b,v1.16b,v1.16b
431	orr	v19.16b,v18.16b,v18.16b
432
// Three blocks (v0, v1, v18) are decrypted in parallel. v2, v3 and v19 keep
// copies of the input blocks for the chaining XOR.
433Loop3x_cbc_dec:
434	aesd	v0.16b,v16.16b
435	aesimc	v0.16b,v0.16b
436	aesd	v1.16b,v16.16b
437	aesimc	v1.16b,v1.16b
438	aesd	v18.16b,v16.16b
439	aesimc	v18.16b,v18.16b
440	ld1	{v16.4s},[x7],#16
441	subs	w6,w6,#2
442	aesd	v0.16b,v17.16b
443	aesimc	v0.16b,v0.16b
444	aesd	v1.16b,v17.16b
445	aesimc	v1.16b,v1.16b
446	aesd	v18.16b,v17.16b
447	aesimc	v18.16b,v18.16b
448	ld1	{v17.4s},[x7],#16
449	b.gt	Loop3x_cbc_dec
450
451	aesd	v0.16b,v16.16b
452	aesimc	v0.16b,v0.16b
453	aesd	v1.16b,v16.16b
454	aesimc	v1.16b,v1.16b
455	aesd	v18.16b,v16.16b
456	aesimc	v18.16b,v18.16b
457	eor	v4.16b,v6.16b,v7.16b
458	subs	x2,x2,#0x30		// flags feed the csel below and b.hs at the end of the loop
459	eor	v5.16b,v2.16b,v7.16b
460	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
461	aesd	v0.16b,v17.16b
462	aesimc	v0.16b,v0.16b
463	aesd	v1.16b,v17.16b
464	aesimc	v1.16b,v1.16b
465	aesd	v18.16b,v17.16b
466	aesimc	v18.16b,v18.16b
467	eor	v17.16b,v3.16b,v7.16b
468	add	x0,x0,x6		// x0 is adjusted in such way that
469					// at exit from the loop v1.16b-v18.16b
470					// are loaded with last "words"
471	orr	v6.16b,v19.16b,v19.16b	// last input block of this group becomes the chaining value
472	mov	x7,x3
473	aesd	v0.16b,v20.16b
474	aesimc	v0.16b,v0.16b
475	aesd	v1.16b,v20.16b
476	aesimc	v1.16b,v1.16b
477	aesd	v18.16b,v20.16b
478	aesimc	v18.16b,v18.16b
479	ld1	{v2.16b},[x0],#16
480	aesd	v0.16b,v21.16b
481	aesimc	v0.16b,v0.16b
482	aesd	v1.16b,v21.16b
483	aesimc	v1.16b,v1.16b
484	aesd	v18.16b,v21.16b
485	aesimc	v18.16b,v18.16b
486	ld1	{v3.16b},[x0],#16
487	aesd	v0.16b,v22.16b
488	aesimc	v0.16b,v0.16b
489	aesd	v1.16b,v22.16b
490	aesimc	v1.16b,v1.16b
491	aesd	v18.16b,v22.16b
492	aesimc	v18.16b,v18.16b
493	ld1	{v19.16b},[x0],#16
494	aesd	v0.16b,v23.16b
495	aesd	v1.16b,v23.16b
496	aesd	v18.16b,v23.16b
497	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
498	add	w6,w5,#2
499	eor	v4.16b,v4.16b,v0.16b
500	eor	v5.16b,v5.16b,v1.16b
501	eor	v18.16b,v18.16b,v17.16b
502	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
503	st1	{v4.16b},[x1],#16
504	orr	v0.16b,v2.16b,v2.16b
505	st1	{v5.16b},[x1],#16
506	orr	v1.16b,v3.16b,v3.16b
507	st1	{v18.16b},[x1],#16
508	orr	v18.16b,v19.16b,v19.16b
509	b.hs	Loop3x_cbc_dec
510
511	cmn	x2,#0x30		// x2 == -0x30 means no blocks are left
512	b.eq	Lcbc_done
513	nop
514
// One or two leftover blocks, processed in v1 and v18.
515Lcbc_dec_tail:
516	aesd	v1.16b,v16.16b
517	aesimc	v1.16b,v1.16b
518	aesd	v18.16b,v16.16b
519	aesimc	v18.16b,v18.16b
520	ld1	{v16.4s},[x7],#16
521	subs	w6,w6,#2
522	aesd	v1.16b,v17.16b
523	aesimc	v1.16b,v1.16b
524	aesd	v18.16b,v17.16b
525	aesimc	v18.16b,v18.16b
526	ld1	{v17.4s},[x7],#16
527	b.gt	Lcbc_dec_tail
528
529	aesd	v1.16b,v16.16b
530	aesimc	v1.16b,v1.16b
531	aesd	v18.16b,v16.16b
532	aesimc	v18.16b,v18.16b
533	aesd	v1.16b,v17.16b
534	aesimc	v1.16b,v1.16b
535	aesd	v18.16b,v17.16b
536	aesimc	v18.16b,v18.16b
537	aesd	v1.16b,v20.16b
538	aesimc	v1.16b,v1.16b
539	aesd	v18.16b,v20.16b
540	aesimc	v18.16b,v18.16b
541	cmn	x2,#0x20		// flags are consumed by b.eq Lcbc_dec_one below (only vector ops in between)
542	aesd	v1.16b,v21.16b
543	aesimc	v1.16b,v1.16b
544	aesd	v18.16b,v21.16b
545	aesimc	v18.16b,v18.16b
546	eor	v5.16b,v6.16b,v7.16b
547	aesd	v1.16b,v22.16b
548	aesimc	v1.16b,v1.16b
549	aesd	v18.16b,v22.16b
550	aesimc	v18.16b,v18.16b
551	eor	v17.16b,v3.16b,v7.16b
552	aesd	v1.16b,v23.16b
553	aesd	v18.16b,v23.16b
554	b.eq	Lcbc_dec_one
555	eor	v5.16b,v5.16b,v1.16b
556	eor	v17.16b,v17.16b,v18.16b
557	orr	v6.16b,v19.16b,v19.16b
558	st1	{v5.16b},[x1],#16
559	st1	{v17.16b},[x1],#16
560	b	Lcbc_done
561
// Single leftover block: only the v18 result is stored.
562Lcbc_dec_one:
563	eor	v5.16b,v5.16b,v18.16b
564	orr	v6.16b,v19.16b,v19.16b
565	st1	{v5.16b},[x1],#16
566
567Lcbc_done:
568	st1	{v6.16b},[x4]		// write the updated chaining value back
569Lcbc_abort:
570	ldr	x29,[sp],#16		// pop the frame, only x29 is reloaded
571	ret
572
// _aes_hw_ctr32_encrypt_blocks: CTR-mode processing with a 32-bit counter.
//   x0 = input, x1 = output, x2 = number of 16-byte blocks, x3 = key schedule
//   (round count read from offset 240), x4 = 16-byte counter block.
// Only the last 32-bit word of the counter block is incremented. It is
// byte-swapped with rev around each increment unless __ARMEB__ is defined.
// The counter block at x4 is only read here and is never written back.
// Three blocks are handled per iteration in Loop3x_ctr32. Lctr32_tail finishes
// one or two blocks.
// NOTE(review): a block count of 0 is not special-cased. It takes the
// Lctr32_tail path like counts 1 and 2. Confirm callers never pass 0.
573.globl	_aes_hw_ctr32_encrypt_blocks
574.private_extern	_aes_hw_ctr32_encrypt_blocks
575
576.align	5
577_aes_hw_ctr32_encrypt_blocks:
578	stp	x29,x30,[sp,#-16]!
579	add	x29,sp,#0
580	ldr	w5,[x3,#240]		// w5 = round count
581
582	ldr	w8, [x4, #12]		// w8 = last 32-bit word of the counter block
583	ld1	{v0.4s},[x4]		// v0 = counter block
584
585	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
586	sub	w5,w5,#4
587	mov	x12,#16			// x12 = input step used by the tail
588	cmp	x2,#2			// flags are consumed by the csel and by b.ls Lctr32_tail below
589	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
590	sub	w5,w5,#2		// w5 = round count - 6
591	ld1	{v20.4s,v21.4s},[x7],#32
592	ld1	{v22.4s,v23.4s},[x7],#32
593	ld1	{v7.4s},[x7]		// v7 = last round key
594	add	x7,x3,#32		// x7 -> third round key
595	mov	w6,w5
596	csel	x12,xzr,x12,lo		// fewer than two blocks: the tail must not step x0 forward
597#ifndef __ARMEB__
598	rev	w8, w8
599#endif
600	orr	v1.16b,v0.16b,v0.16b
601	add	w10, w8, #1
602	orr	v18.16b,v0.16b,v0.16b
603	add	w8, w8, #2
604	orr	v6.16b,v0.16b,v0.16b	// v6 keeps a copy of the counter block as a template
605	rev	w10, w10
606	mov	v1.s[3],w10		// v1 = counter block with counter + 1
607	b.ls	Lctr32_tail		// two blocks or fewer
608	rev	w12, w8
609	sub	x2,x2,#3		// bias
610	mov	v18.s[3],w12		// v18 = counter block with counter + 2
611	b	Loop3x_ctr32
612
// Three counter blocks (v0, v1, v18) are encrypted in parallel.
613.align	4
614Loop3x_ctr32:
615	aese	v0.16b,v16.16b
616	aesmc	v0.16b,v0.16b
617	aese	v1.16b,v16.16b
618	aesmc	v1.16b,v1.16b
619	aese	v18.16b,v16.16b
620	aesmc	v18.16b,v18.16b
621	ld1	{v16.4s},[x7],#16
622	subs	w6,w6,#2
623	aese	v0.16b,v17.16b
624	aesmc	v0.16b,v0.16b
625	aese	v1.16b,v17.16b
626	aesmc	v1.16b,v1.16b
627	aese	v18.16b,v17.16b
628	aesmc	v18.16b,v18.16b
629	ld1	{v17.4s},[x7],#16
630	b.gt	Loop3x_ctr32
631
// Remaining rounds. The cipher state moves to v4, v5 and v17 so that v0, v1
// and v18 can be rebuilt from the template in v6 for the next three counters.
632	aese	v0.16b,v16.16b
633	aesmc	v4.16b,v0.16b
634	aese	v1.16b,v16.16b
635	aesmc	v5.16b,v1.16b
636	ld1	{v2.16b},[x0],#16
637	orr	v0.16b,v6.16b,v6.16b
638	aese	v18.16b,v16.16b
639	aesmc	v18.16b,v18.16b
640	ld1	{v3.16b},[x0],#16
641	orr	v1.16b,v6.16b,v6.16b
642	aese	v4.16b,v17.16b
643	aesmc	v4.16b,v4.16b
644	aese	v5.16b,v17.16b
645	aesmc	v5.16b,v5.16b
646	ld1	{v19.16b},[x0],#16
647	mov	x7,x3
648	aese	v18.16b,v17.16b
649	aesmc	v17.16b,v18.16b
650	orr	v18.16b,v6.16b,v6.16b
651	add	w9,w8,#1
652	aese	v4.16b,v20.16b
653	aesmc	v4.16b,v4.16b
654	aese	v5.16b,v20.16b
655	aesmc	v5.16b,v5.16b
656	eor	v2.16b,v2.16b,v7.16b	// fold the last round key into the input block
657	add	w10,w8,#2
658	aese	v17.16b,v20.16b
659	aesmc	v17.16b,v17.16b
660	eor	v3.16b,v3.16b,v7.16b
661	add	w8,w8,#3
662	aese	v4.16b,v21.16b
663	aesmc	v4.16b,v4.16b
664	aese	v5.16b,v21.16b
665	aesmc	v5.16b,v5.16b
666	eor	v19.16b,v19.16b,v7.16b
667	rev	w9,w9
668	aese	v17.16b,v21.16b
669	aesmc	v17.16b,v17.16b
670	mov	v0.s[3], w9
671	rev	w10,w10
672	aese	v4.16b,v22.16b
673	aesmc	v4.16b,v4.16b
674	aese	v5.16b,v22.16b
675	aesmc	v5.16b,v5.16b
676	mov	v1.s[3], w10
677	rev	w12,w8
678	aese	v17.16b,v22.16b
679	aesmc	v17.16b,v17.16b
680	mov	v18.s[3], w12
681	subs	x2,x2,#3		// flags are consumed by b.hs at the end of the loop
682	aese	v4.16b,v23.16b
683	aese	v5.16b,v23.16b
684	aese	v17.16b,v23.16b
685
686	eor	v2.16b,v2.16b,v4.16b
687	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
688	st1	{v2.16b},[x1],#16
689	eor	v3.16b,v3.16b,v5.16b
690	mov	w6,w5
691	st1	{v3.16b},[x1],#16
692	eor	v19.16b,v19.16b,v17.16b
693	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
694	st1	{v19.16b},[x1],#16
695	b.hs	Loop3x_ctr32
696
697	adds	x2,x2,#3		// undo the bias, zero means nothing is left
698	b.eq	Lctr32_done
699	cmp	x2,#1
700	mov	x12,#16
701	csel	x12,xzr,x12,eq		// one block left: the tail must not step x0 forward
702
// Tail: v0 and v1 are both encrypted. The second result is stored only when
// x2 is not equal to 1.
703Lctr32_tail:
704	aese	v0.16b,v16.16b
705	aesmc	v0.16b,v0.16b
706	aese	v1.16b,v16.16b
707	aesmc	v1.16b,v1.16b
708	ld1	{v16.4s},[x7],#16
709	subs	w6,w6,#2
710	aese	v0.16b,v17.16b
711	aesmc	v0.16b,v0.16b
712	aese	v1.16b,v17.16b
713	aesmc	v1.16b,v1.16b
714	ld1	{v17.4s},[x7],#16
715	b.gt	Lctr32_tail
716
717	aese	v0.16b,v16.16b
718	aesmc	v0.16b,v0.16b
719	aese	v1.16b,v16.16b
720	aesmc	v1.16b,v1.16b
721	aese	v0.16b,v17.16b
722	aesmc	v0.16b,v0.16b
723	aese	v1.16b,v17.16b
724	aesmc	v1.16b,v1.16b
725	ld1	{v2.16b},[x0],x12	// first input block, x0 advances by x12 (0 or 16)
726	aese	v0.16b,v20.16b
727	aesmc	v0.16b,v0.16b
728	aese	v1.16b,v20.16b
729	aesmc	v1.16b,v1.16b
730	ld1	{v3.16b},[x0]		// second input block, or the first one again when x12 is 0
731	aese	v0.16b,v21.16b
732	aesmc	v0.16b,v0.16b
733	aese	v1.16b,v21.16b
734	aesmc	v1.16b,v1.16b
735	eor	v2.16b,v2.16b,v7.16b
736	aese	v0.16b,v22.16b
737	aesmc	v0.16b,v0.16b
738	aese	v1.16b,v22.16b
739	aesmc	v1.16b,v1.16b
740	eor	v3.16b,v3.16b,v7.16b
741	aese	v0.16b,v23.16b
742	aese	v1.16b,v23.16b
743
744	cmp	x2,#1
745	eor	v2.16b,v2.16b,v0.16b
746	eor	v3.16b,v3.16b,v1.16b
747	st1	{v2.16b},[x1],#16
748	b.eq	Lctr32_done		// exactly one block: skip the second store
749	st1	{v3.16b},[x1]
750
751Lctr32_done:
752	ldr	x29,[sp],#16		// pop the frame, only x29 is reloaded
753	ret
754
755#endif
756