// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.section	__TEXT,__const


.align	7	// totally strategic alignment
_vpaes_consts:
Lk_mc_forward:	//	mc_forward
.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
.quad	0x080B0A0904070605, 0x000302010C0F0E0D
.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
.quad	0x000302010C0F0E0D, 0x080B0A0904070605
Lk_mc_backward:	//	mc_backward
.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
.quad	0x020100030E0D0C0F, 0x0A09080B06050407
.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
.quad	0x0A09080B06050407, 0x020100030E0D0C0F
Lk_sr:	//	sr
.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad	0x030E09040F0A0500, 0x0B06010C07020D08
.quad	0x0F060D040B020900, 0x070E050C030A0108
.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
Lk_inv:	//	inv, inva
.quad	0x0E05060F0D080180, 0x040703090A0B0C02
.quad	0x01040A060F0B0780, 0x030D0E0C02050809
Lk_ipt:	//	input transform (lo, hi)
.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
Lk_sbo:	//	sbou, sbot
.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
Lk_sb1:	//	sb1u, sb1t
.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
Lk_sb2:	//	sb2u, sb2t
.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
//  Decryption stuff
//
Lk_dipt:	//	decryption input transform
.quad	0x0F505B040B545F00, 0x154A411E114E451A
.quad	0x86E383E660056500, 0x12771772F491F194
Lk_dsbo:	//	decryption sbox final output
.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
Lk_dsb9:	//	decryption sbox output *9*u, *9*t
.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
Lk_dsbd:	//	decryption sbox output *D*u, *D*t
.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
Lk_dsbb:	//	decryption sbox output *B*u, *B*t
.quad	0xD022649296B44200, 0x602646F6B0F2D404
.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
Lk_dsbe:	//	decryption sbox output *E*u, *E*t
.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
//  Key schedule constants
//
Lk_dksd:	//	decryption key schedule: invskew x*D
.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
Lk_dksb:	//	decryption key schedule: invskew x*B
.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
Lk_dks9:	//	decryption key schedule: invskew x*9
.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

Lk_rcon:	//	rcon
.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

Lk_opt:	//	output transform
.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align	2

.align	6

.text
##
##  _aes_preheat
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##

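// In this AArch64 translation the constants land in NEON registers rather
// than %xmm9-%xmm15: v17 holds the 0x0F nibble mask, v18-v19 hold Lk_inv,
// v20-v23 hold Lk_ipt and Lk_sbo, and v24-v27 hold Lk_sb1 and Lk_sb2, as
// the loads below show.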
.align	4
_vpaes_encrypt_preheat:
	adrp	x10, Lk_inv@PAGE
	add	x10, x10, Lk_inv@PAGEOFF
	movi	v17.16b, #0x0f
	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// Lk_ipt, Lk_sbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// Lk_sb1, Lk_sb2
	ret


##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
##  Preserves %xmm6 - %xmm8 so you get some local vectors
##
##

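// In this port the block to encrypt is expected in v7, the scheduled-key
// pointer in x2, and the result is returned in v0; x9-x11 and w8 serve as
// scratch in place of the x86 registers named above.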
.align	4
_vpaes_encrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adrp	x11, Lk_mc_forward@PAGE+16
	add	x11, x11, Lk_mc_forward@PAGEOFF+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	Lenc_entry

.align	4
Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--

Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, Lenc_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	ret


.globl	_vpaes_encrypt
.private_extern	_vpaes_encrypt

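// Single-block encryption entry point.  The C-level prototype is presumably
// the usual BoringSSL one:
//   void vpaes_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
// i.e. x0 = in, x1 = out, x2 = expanded key, matching the loads below.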
.align	4
_vpaes_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


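// Two-block variant of _vpaes_encrypt_core, used by the CTR path below:
// inputs are expected in v14 and v15, outputs are returned in v0 and v1.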
.align	4
_vpaes_encrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adrp	x11, Lk_mc_forward@PAGE+16
	add	x11, x11, Lk_mc_forward@PAGEOFF+16
						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b,  v15.16b,  v17.16b
	ushr	v8.16b,  v15.16b,  #4
	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
	tbl	v9.16b,  {v20.16b}, v9.16b
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
	tbl	v10.16b, {v21.16b}, v8.16b
	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
	eor	v8.16b,  v9.16b,   v16.16b
	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b,  v8.16b,   v10.16b
	b	Lenc_2x_entry

.align	4
Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
	tbl	v12.16b, {v25.16b}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# Lk_mc_forward[]
	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
	tbl	v8.16b,  {v24.16b}, v11.16b
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
	tbl	v13.16b, {v27.16b}, v10.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b,  v8.16b,  v12.16b
	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
	tbl	v10.16b, {v26.16b}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# Lk_mc_backward[]
	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
	tbl	v8.16b,  {v8.16b}, v4.16b
	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	eor	v8.16b,  v8.16b,  v11.16b
	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
	eor	v8.16b,  v8.16b,  v12.16b
	sub	w8, w8, #1			// nr--

Lenc_2x_entry:
	// top of round
	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b,  v8.16b, v17.16b
	ushr	v8.16b,  v8.16b, #4
	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
	tbl	v13.16b, {v19.16b},v9.16b
	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b,  v9.16b,  v8.16b
	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
	cbnz	w8, Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# Lk_sr[]
	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
	tbl	v8.16b,  {v23.16b}, v11.16b
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
	eor	v8.16b,  v8.16b,  v12.16b
	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
	tbl	v1.16b,  {v8.16b},v1.16b
	ret



.align	4
_vpaes_decrypt_preheat:
	adrp	x10, Lk_inv@PAGE
	add	x10, x10, Lk_inv@PAGEOFF
	movi	v17.16b, #0x0f
	adrp	x11, Lk_dipt@PAGE
	add	x11, x11, Lk_dipt@PAGEOFF
	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// Lk_dipt, Lk_dsbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// Lk_dsb9, Lk_dsbd
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// Lk_dsbb, Lk_dsbe
	ret


##
##  Decryption core
##
##  Same API as encryption core.
##

.align	4
_vpaes_decrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor		$0x30,	%r11
	adrp	x10, Lk_sr@PAGE
	add	x10, x10, Lk_sr@PAGEOFF
	and	x11, x11, #0x30			// and		$0x30,	%r11
	add	x11, x11, x10
	adrp	x10, Lk_mc_forward@PAGE+48
	add	x10, x10, Lk_mc_forward@PAGEOFF+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
	b	Ldec_entry

.align	4
Ldec_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	sub	w8, w8, #1			// sub		$1,%rax			# nr--

Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, Ldec_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	ret


.globl	_vpaes_decrypt
.private_extern	_vpaes_decrypt

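// Single-block decryption entry point; presumably
//   void vpaes_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key);
// with x0 = in, x1 = out, x2 = expanded (decryption) key.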
.align	4
_vpaes_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// v14-v15 input, v0-v1 output

.align	4
_vpaes_decrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
	eor	x11, x11, #0x30			// xor		$0x30,	%r11
	adrp	x10, Lk_sr@PAGE
	add	x10, x10, Lk_sr@PAGEOFF
	and	x11, x11, #0x30			// and		$0x30,	%r11
	add	x11, x11, x10
	adrp	x10, Lk_mc_forward@PAGE+48
	add	x10, x10, Lk_mc_forward@PAGEOFF+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
	and	v9.16b,  v15.16b, v17.16b
	ushr	v8.16b,  v15.16b, #4
	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
	tbl	v10.16b, {v20.16b},v9.16b
	ld1	{v5.2d}, [x10]			// vmovdqa	Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	tbl	v8.16b,  {v21.16b},v8.16b
	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
	eor	v8.16b,  v8.16b,  v10.16b
	b	Ldec_2x_entry

.align	4
Ldec_2x_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
	tbl	v12.16b, {v24.16b}, v10.16b
	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
	tbl	v9.16b,  {v25.16b}, v11.16b
	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
	eor	v8.16b,  v12.16b, v16.16b
						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt

	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
	tbl	v12.16b, {v26.16b}, v10.16b
	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b,  {v8.16b},v5.16b
	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
	tbl	v9.16b,  {v27.16b}, v11.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b,  v8.16b,  v12.16b
						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b,  v8.16b,  v9.16b
						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt

	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
	tbl	v12.16b, {v28.16b}, v10.16b
	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b,  {v8.16b},v5.16b
	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
	tbl	v9.16b,  {v29.16b}, v11.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b,  v8.16b,  v12.16b
						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b,  v8.16b,  v9.16b
						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet

	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
	tbl	v12.16b, {v30.16b}, v10.16b
	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
	tbl	v8.16b,  {v8.16b},v5.16b
	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
	tbl	v9.16b,  {v31.16b}, v11.16b
	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	eor	v8.16b,  v8.16b,  v12.16b
	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
	eor	v8.16b,  v8.16b,  v9.16b
	sub	w8, w8, #1			// sub		$1,%rax			# nr--

Ldec_2x_entry:
	// top of round
	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
	and	v9.16b,  v8.16b,  v17.16b
	ushr	v8.16b,  v8.16b,  #4
	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
	tbl	v10.16b, {v19.16b},v9.16b
	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
	eor	v9.16b,	 v9.16b,  v8.16b
	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
	cbnz	w8, Ldec_2x_loop

	// middle of last round
						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
	tbl	v9.16b,  {v23.16b}, v11.16b
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# Lk_sr-Lk_dsbd=-0x160
	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
	eor	v8.16b,  v9.16b,  v12.16b
	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
	tbl	v1.16b,  {v8.16b},v2.16b
	ret

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################

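// The schedule core below is driven by _vpaes_set_encrypt_key and
// _vpaes_set_decrypt_key: x0 points at the user key, w1 is the key size in
// bits, x2 at the AES_KEY structure to fill, w3 selects encryption (0) or
// decryption (1) scheduling, and x8 carries the rotating Lk_sr offset used
// for the shiftrows-style output permutation.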
.align	4
_vpaes_key_preheat:
	adrp	x10, Lk_inv@PAGE
	add	x10, x10, Lk_inv@PAGEOFF
	movi	v16.16b, #0x5b			// Lk_s63
	adrp	x11, Lk_sb1@PAGE
	add	x11, x11, Lk_sb1@PAGEOFF
	movi	v17.16b, #0x0f			// Lk_s0F
	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// Lk_inv, Lk_ipt
	adrp	x10, Lk_dksd@PAGE
	add	x10, x10, Lk_dksd@PAGEOFF
	ld1	{v22.2d,v23.2d}, [x11]		// Lk_sb1
	adrp	x11, Lk_mc_forward@PAGE
	add	x11, x11, Lk_mc_forward@PAGEOFF
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// Lk_dksd, Lk_dksb
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// Lk_dkse, Lk_dks9
	ld1	{v8.2d}, [x10]			// Lk_rcon
	ld1	{v9.2d}, [x11]			// Lk_mc_forward[0]
	ret



.align	4
_vpaes_schedule_core:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp,#-16]!
	add	x29,sp,#0

	bl	_vpaes_key_preheat		// load the tables

	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7

	adrp	x10, Lk_sr@PAGE		// lea	Lk_sr(%rip),%r10
	add	x10, x10, Lk_sr@PAGEOFF

	add	x8, x8, x10
	cbnz	w3, Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
	b	Lschedule_go

Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
	eor	x8, x8, #0x30			// xor	$0x30, %r8

Lschedule_go:
	cmp	w1, #192			// cmp	$192,	%esi
	b.hi	Lschedule_256
	b.eq	Lschedule_192
	// 128: fall through
673
674##
675##  .schedule_128
676##
677##  128-bit specific part of key schedule.
678##
679##  This schedule is really simple, because all its parts
680##  are accomplished by the subroutines.
681##
682Lschedule_128:
683	mov	x0, #10			// mov	$10, %esi
684
685Loop_schedule_128:
686	sub	x0, x0, #1			// dec	%esi
687	bl	_vpaes_schedule_round
688	cbz	x0, Lschedule_mangle_last
689	bl	_vpaes_schedule_mangle		// write output
690	b	Loop_schedule_128
691
692##
693##  .aes_schedule_192
694##
695##  192-bit specific part of key schedule.
696##
697##  The main body of this schedule is the same as the 128-bit
698##  schedule, but with more smearing.  The long, high side is
699##  stored in %xmm7 as before, and the short, low side is in
700##  the high bits of %xmm6.
701##
702##  This schedule is somewhat nastier, however, because each
703##  round produces 192 bits of key material, or 1.5 round keys.
704##  Therefore, on each cycle we do 2 rounds and produce 3 round
705##  keys.
706##
707.align	4
708Lschedule_192:
709	sub	x0, x0, #8
710	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
711	bl	_vpaes_schedule_transform	// input transform
712	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
713	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
714	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
715	mov	x0, #4			// mov	$4,	%esi
716
717Loop_schedule_192:
718	sub	x0, x0, #1			// dec	%esi
719	bl	_vpaes_schedule_round
720	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
721	bl	_vpaes_schedule_mangle		// save key n
722	bl	_vpaes_schedule_192_smear
723	bl	_vpaes_schedule_mangle		// save key n+1
724	bl	_vpaes_schedule_round
725	cbz	x0, Lschedule_mangle_last
726	bl	_vpaes_schedule_mangle		// save key n+2
727	bl	_vpaes_schedule_192_smear
728	b	Loop_schedule_192
729
730##
731##  .aes_schedule_256
732##
733##  256-bit specific part of key schedule.
734##
735##  The structure here is very similar to the 128-bit
736##  schedule, but with an additional "low side" in
737##  %xmm6.  The low side's rounds are the same as the
738##  high side's, except no rcon and no rotation.
739##
740.align	4
741Lschedule_256:
742	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
743	bl	_vpaes_schedule_transform	// input transform
744	mov	x0, #7			// mov	$7, %esi
745
746Loop_schedule_256:
747	sub	x0, x0, #1			// dec	%esi
748	bl	_vpaes_schedule_mangle		// output low result
749	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
750
751	// high round
752	bl	_vpaes_schedule_round
753	cbz	x0, Lschedule_mangle_last
754	bl	_vpaes_schedule_mangle
755
756	// low round. swap xmm7 and xmm6
757	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
758	movi	v4.16b, #0
759	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
760	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
761	bl	_vpaes_schedule_low_round
762	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
763
764	b	Loop_schedule_256
765
766##
767##  .aes_schedule_mangle_last
768##
769##  Mangler for last round of key schedule
770##  Mangles %xmm0
771##    when encrypting, outputs out(%xmm0) ^ 63
772##    when decrypting, outputs unskew(%xmm0)
773##
774##  Always called right before return... jumps to cleanup and exits
775##
776.align	4
777Lschedule_mangle_last:
778	// schedule last round key from xmm0
779	adrp	x11, Lk_deskew@PAGE	// lea	Lk_deskew(%rip),%r11	# prepare to deskew
780	add	x11, x11, Lk_deskew@PAGEOFF
781
782	cbnz	w3, Lschedule_mangle_last_dec
783
784	// encrypting
785	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
786	adrp	x11, Lk_opt@PAGE		// lea	Lk_opt(%rip),	%r11		# prepare to output transform
787	add	x11, x11, Lk_opt@PAGEOFF
788	add	x2, x2, #32			// add	$32,	%rdx
789	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
790
791Lschedule_mangle_last_dec:
792	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
793	sub	x2, x2, #16			// add	$-16,	%rdx
794	eor	v0.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm0
795	bl	_vpaes_schedule_transform	// output transform
796	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
797
798	// cleanup
799	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
800	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
801	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
802	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
803	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
804	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
805	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
806	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
807	ldp	x29, x30, [sp],#16
808	AARCH64_VALIDATE_LINK_REGISTER
809	ret
810
811
812##
813##  .aes_schedule_192_smear
814##
815##  Smear the short, low side in the 192-bit key schedule.
816##
817##  Inputs:
818##    %xmm7: high side, b  a  x  y
819##    %xmm6:  low side, d  c  0  0
820##    %xmm13: 0
821##
822##  Outputs:
823##    %xmm6: b+c+d  b+c  0  0
824##    %xmm0: b+c+d  b+c  b  a
825##
826
827.align	4
828_vpaes_schedule_192_smear:
829	movi	v1.16b, #0
830	dup	v0.4s, v7.s[3]
831	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
832	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
833	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
834	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1
835	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
836	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
837	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
838	ret
839
840
841##
842##  .aes_schedule_round
843##
844##  Runs one main round of the key schedule on %xmm0, %xmm7
845##
846##  Specifically, runs subbytes on the high dword of %xmm0
847##  then rotates it by one byte and xors into the low dword of
848##  %xmm7.
849##
850##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
851##  next rcon.
852##
853##  Smears the dwords of %xmm7 by xoring the low into the
854##  second low, result into third, result into highest.
855##
856##  Returns results in %xmm7 = %xmm0.
857##  Clobbers %xmm1-%xmm4, %r11.
858##
859
860.align	4
861_vpaes_schedule_round:
862	// extract rcon from xmm8
863	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
864	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
865	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
866	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
867
868	// rotate
869	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
870	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
871
872	// fall through...
873
874	// low round: same as high round, but no rotation and no rcon.
875_vpaes_schedule_low_round:
876	// smear xmm7
877	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
878	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
879	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4
880
881	// subbytes
882	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
883	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
884	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
885	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
886	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
887	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
888	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
889	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
890	eor	v7.16b, v7.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm7,	%xmm7
891	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
892	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
893	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
894	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
895	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
896	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
897	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
898	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
899
900	// add in smeared stuff
901	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
902	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
903	ret
904
905
906##
907##  .aes_schedule_transform
908##
909##  Linear-transform %xmm0 according to tables at (%r11)
910##
911##  Requires that %xmm9 = 0x0F0F... as in preheat
912##  Output in %xmm0
913##  Clobbers %xmm1, %xmm2
914##
915
916.align	4
917_vpaes_schedule_transform:
918	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
919	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
920						// vmovdqa	(%r11),	%xmm2 	# lo
921	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
922						// vmovdqa	16(%r11),	%xmm1 # hi
923	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
924	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
925	ret
926
927
928##
929##  .aes_schedule_mangle
930##
931##  Mangle xmm0 from (basis-transformed) standard version
932##  to our version.
933##
934##  On encrypt,
935##    xor with 0x63
936##    multiply by circulant 0,1,1,1
937##    apply shiftrows transform
938##
939##  On decrypt,
940##    xor with 0x63
941##    multiply by "inverse mixcolumns" circulant E,B,D,9
942##    deskew
943##    apply shiftrows transform
944##
945##
946##  Writes out to (%rdx), and increments or decrements it
947##  Keeps track of round number mod 4 in %r8
948##  Preserves xmm0
949##  Clobbers xmm1-xmm5
950##
951
952.align	4
953_vpaes_schedule_mangle:
954	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
955						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
956	cbnz	w3, Lschedule_mangle_dec
957
958	// encrypting
959	eor	v4.16b, v0.16b, v16.16b		// vpxor	Lk_s63(%rip),	%xmm0,	%xmm4
960	add	x2, x2, #16			// add	$16,	%rdx
961	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
962	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
963	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
964	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
965	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
966	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
967
968	b	Lschedule_mangle_both
969.align	4
970Lschedule_mangle_dec:
971	// inverse mix columns
972						// lea	.Lk_dksd(%rip),%r11
973	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
974	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
975
976						// vmovdqa	0x00(%r11),	%xmm2
977	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
978						// vmovdqa	0x10(%r11),	%xmm3
979	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
980	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
981	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
982
983						// vmovdqa	0x20(%r11),	%xmm2
984	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
985	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
986						// vmovdqa	0x30(%r11),	%xmm3
987	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
988	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
989	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
990
991						// vmovdqa	0x40(%r11),	%xmm2
992	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
993	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
994						// vmovdqa	0x50(%r11),	%xmm3
995	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
996	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
997
998						// vmovdqa	0x60(%r11),	%xmm2
999	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
1000	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
1001						// vmovdqa	0x70(%r11),	%xmm4
1002	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
1003	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
1004	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
1005	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
1006
1007	sub	x2, x2, #16			// add	$-16,	%rdx
1008
1009Lschedule_mangle_both:
1010	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
1011	add	x8, x8, #48			// add	$-16,	%r8
1012	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
1013	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
1014	ret
1015
1016
1017.globl	_vpaes_set_encrypt_key
1018.private_extern	_vpaes_set_encrypt_key
1019
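// Key expansion for encryption; presumably
//   int vpaes_set_encrypt_key(const uint8_t *key, unsigned bits, AES_KEY *out);
// x0 = key bytes, w1 = key length in bits, x2 = AES_KEY; returns 0 in x0.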
.align	4
_vpaes_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5		// shr	$5,%eax
	add	w9, w9, #5		// $5,%eax
	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	mov	w3, #0		// mov	$0,%ecx
	mov	x8, #0x30		// mov	$0x30,%r8d
	bl	_vpaes_schedule_core
	eor	x0, x0, x0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret


.globl	_vpaes_set_decrypt_key
.private_extern	_vpaes_set_decrypt_key

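// Key expansion for decryption; presumably the same prototype as
// _vpaes_set_encrypt_key, but the schedule is written back to front, so x2
// is first advanced to the end of the key schedule (x2 + 16 + 16*rounds)
// before _vpaes_schedule_core runs with w3 = 1.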
.align	4
_vpaes_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5		// shr	$5,%eax
	add	w9, w9, #5		// $5,%eax
	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4		// shl	$4,%eax
	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
	add	x2, x2, x9

	mov	w3, #1		// mov	$1,%ecx
	lsr	w8, w1, #1		// shr	$1,%r8d
	and	x8, x8, #32		// and	$32,%r8d
	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.globl	_vpaes_cbc_encrypt
.private_extern	_vpaes_cbc_encrypt

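// CBC entry point; presumably
//   void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length,
//                          const AES_KEY *key, uint8_t *ivec, int enc);
// x2 is the byte length, x4 the IV, and w5 selects encryption (non-zero) or
// decryption (zero), in which case control branches to vpaes_cbc_decrypt.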
.align	4
_vpaes_cbc_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	cbz	x2, Lcbc_abort
	cmp	w5, #0			// check direction
	b.eq	vpaes_cbc_decrypt

	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x17, x2		// reassign
	mov	x2,  x3		// reassign

	ld1	{v0.16b}, [x4]	// load ivec
	bl	_vpaes_encrypt_preheat
	b	Lcbc_enc_loop

.align	4
Lcbc_enc_loop:
	ld1	{v7.16b}, [x0],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16	// save output
	subs	x17, x17, #16
	b.hi	Lcbc_enc_loop

	st1	{v0.16b}, [x4]	// write ivec

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
Lcbc_abort:
	ret



.align	4
vpaes_cbc_decrypt:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
	// only from vpaes_cbc_encrypt which has already signed the return address.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2		// reassign
	mov	x2,  x3		// reassign
	ld1	{v6.16b}, [x4]	// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	Lcbc_dec_loop2x

	ld1	{v7.16b}, [x0], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value
	st1	{v0.16b}, [x1], #16
	subs	x17, x17, #16
	b.ls	Lcbc_dec_done

.align	4
Lcbc_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b
	orr	v6.16b, v15.16b, v15.16b
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	Lcbc_dec_loop2x

Lcbc_dec_done:
	st1	{v6.16b}, [x4]

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.globl	_vpaes_ctr32_encrypt_blocks
.private_extern	_vpaes_ctr32_encrypt_blocks

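// CTR mode with a 32-bit big-endian counter; presumably
//   void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out,
//                                   size_t blocks, const AES_KEY *key,
//                                   const uint8_t ivec[16]);
// Note that x2 counts 16-byte blocks rather than bytes (see below).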
.align	4
_vpaes_ctr32_encrypt_blocks:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	cbz	x2, Lctr32_done

	// Note, unlike the other functions, x2 here is measured in blocks,
	// not bytes.
	mov	x17, x2
	mov	x2,  x3

	// Load the IV and counter portion.
	ldr	w6, [x4, #12]
	ld1	{v7.16b}, [x4]

	bl	_vpaes_encrypt_preheat
	tst	x17, #1
	rev	w6, w6		// The counter is big-endian.
	b.eq	Lctr32_prep_loop

	// Handle one block so the remaining block count is even for
	// _vpaes_encrypt_2x.
	ld1	{v6.16b}, [x0], #16	// Load input ahead of time
	bl	_vpaes_encrypt_core
	eor	v0.16b, v0.16b, v6.16b	// XOR input and result
	st1	{v0.16b}, [x1], #16
	subs	x17, x17, #1
	// Update the counter.
	add	w6, w6, #1
	rev	w7, w6
	mov	v7.s[3], w7
	b.ls	Lctr32_done

Lctr32_prep_loop:
	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
	// uses v14 and v15.
	mov	v15.16b, v7.16b
	mov	v14.16b, v7.16b
	add	w6, w6, #1
	rev	w7, w6
	mov	v15.s[3], w7

Lctr32_loop:
	ld1	{v6.16b,v7.16b}, [x0], #32	// Load input ahead of time
	bl	_vpaes_encrypt_2x
	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #2
	// Update the counter.
	add	w7, w6, #1
	add	w6, w6, #2
	rev	w7, w7
	mov	v14.s[3], w7
	rev	w7, w6
	mov	v15.s[3], w7
	b.hi	Lctr32_loop

Lctr32_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret

#endif  // !OPENSSL_NO_ASM
