// SPDX-License-Identifier: GPL-2.0
/*
 * ARM64 NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
 *
 * Copyright (c) 2018 Google, Inc
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

12	.text
13
14	// arguments
15	ROUND_KEYS	.req	x0	// const {u64,u32} *round_keys
16	NROUNDS		.req	w1	// int nrounds
17	NROUNDS_X	.req	x1
18	DST		.req	x2	// void *dst
19	SRC		.req	x3	// const void *src
20	NBYTES		.req	w4	// unsigned int nbytes
21	TWEAK		.req	x5	// void *tweak
22
23	// registers which hold the data being encrypted/decrypted
24	// (underscores avoid a naming collision with ARM64 registers x0-x3)
25	X_0		.req	v0
26	Y_0		.req	v1
27	X_1		.req	v2
28	Y_1		.req	v3
29	X_2		.req	v4
30	Y_2		.req	v5
31	X_3		.req	v6
32	Y_3		.req	v7
33
34	// the round key, duplicated in all lanes
35	ROUND_KEY	.req	v8
36
37	// index vector for tbl-based 8-bit rotates
38	ROTATE_TABLE	.req	v9
39	ROTATE_TABLE_Q	.req	q9
40
41	// temporary registers
42	TMP0		.req	v10
43	TMP1		.req	v11
44	TMP2		.req	v12
45	TMP3		.req	v13
46
47	// multiplication table for updating XTS tweaks
48	GFMUL_TABLE	.req	v14
49	GFMUL_TABLE_Q	.req	q14
50
51	// next XTS tweak value(s)
52	TWEAKV_NEXT	.req	v15
53
54	// XTS tweaks for the blocks currently being encrypted/decrypted
55	TWEAKV0		.req	v16
56	TWEAKV1		.req	v17
57	TWEAKV2		.req	v18
58	TWEAKV3		.req	v19
59	TWEAKV4		.req	v20
60	TWEAKV5		.req	v21
61	TWEAKV6		.req	v22
62	TWEAKV7		.req	v23
63
64	.align		4
65.Lror64_8_table:
66	.octa		0x080f0e0d0c0b0a090007060504030201
67.Lror32_8_table:
68	.octa		0x0c0f0e0d080b0a090407060500030201
69.Lrol64_8_table:
70	.octa		0x0e0d0c0b0a09080f0605040302010007
71.Lrol32_8_table:
72	.octa		0x0e0d0c0f0a09080b0605040702010003
73.Lgf128mul_table:
74	.octa		0x00000000000000870000000000000001
75.Lgf64mul_table:
76	.octa		0x0000000000000000000000002d361b00
77
/*
 * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
 *
 * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
 * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
 * of ROUND_KEY.  'n' is the lane size: 64 for Speck128, or 32 for Speck64.
 * 'lanes' is the lane specifier: "2d" for Speck128 or "4s" for Speck64.
 */
.macro _speck_round_128bytes	n, lanes

	// x = ror(x, 8); the 8-bit rotate is done as a byte shuffle via tbl
	tbl		X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
	tbl		X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
	tbl		X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
	tbl		X_3.16b, {X_3.16b}, ROTATE_TABLE.16b

	// x += y
	add		X_0.\lanes, X_0.\lanes, Y_0.\lanes
	add		X_1.\lanes, X_1.\lanes, Y_1.\lanes
	add		X_2.\lanes, X_2.\lanes, Y_2.\lanes
	add		X_3.\lanes, X_3.\lanes, Y_3.\lanes

	// x ^= k
	eor		X_0.16b, X_0.16b, ROUND_KEY.16b
	eor		X_1.16b, X_1.16b, ROUND_KEY.16b
	eor		X_2.16b, X_2.16b, ROUND_KEY.16b
	eor		X_3.16b, X_3.16b, ROUND_KEY.16b

	// y = rol(y, 3), built from shift-left + shift-right-insert
	shl		TMP0.\lanes, Y_0.\lanes, #3
	shl		TMP1.\lanes, Y_1.\lanes, #3
	shl		TMP2.\lanes, Y_2.\lanes, #3
	shl		TMP3.\lanes, Y_3.\lanes, #3
	sri		TMP0.\lanes, Y_0.\lanes, #(\n - 3)
	sri		TMP1.\lanes, Y_1.\lanes, #(\n - 3)
	sri		TMP2.\lanes, Y_2.\lanes, #(\n - 3)
	sri		TMP3.\lanes, Y_3.\lanes, #(\n - 3)

	// y ^= x
	eor		Y_0.16b, TMP0.16b, X_0.16b
	eor		Y_1.16b, TMP1.16b, X_1.16b
	eor		Y_2.16b, TMP2.16b, X_2.16b
	eor		Y_3.16b, TMP3.16b, X_3.16b
.endm
122
/*
 * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
 *
 * This is the inverse of _speck_round_128bytes(): each step of the encryption
 * round is undone in reverse order.  Note that when decrypting, ROTATE_TABLE
 * holds the rol table, so the final tbl performs rol(x, 8), not ror.
 */
.macro _speck_unround_128bytes	n, lanes

	// y ^= x
	eor		TMP0.16b, Y_0.16b, X_0.16b
	eor		TMP1.16b, Y_1.16b, X_1.16b
	eor		TMP2.16b, Y_2.16b, X_2.16b
	eor		TMP3.16b, Y_3.16b, X_3.16b

	// y = ror(y, 3), built from shift-right + shift-left-insert
	ushr		Y_0.\lanes, TMP0.\lanes, #3
	ushr		Y_1.\lanes, TMP1.\lanes, #3
	ushr		Y_2.\lanes, TMP2.\lanes, #3
	ushr		Y_3.\lanes, TMP3.\lanes, #3
	sli		Y_0.\lanes, TMP0.\lanes, #(\n - 3)
	sli		Y_1.\lanes, TMP1.\lanes, #(\n - 3)
	sli		Y_2.\lanes, TMP2.\lanes, #(\n - 3)
	sli		Y_3.\lanes, TMP3.\lanes, #(\n - 3)

	// x ^= k
	eor		X_0.16b, X_0.16b, ROUND_KEY.16b
	eor		X_1.16b, X_1.16b, ROUND_KEY.16b
	eor		X_2.16b, X_2.16b, ROUND_KEY.16b
	eor		X_3.16b, X_3.16b, ROUND_KEY.16b

	// x -= y
	sub		X_0.\lanes, X_0.\lanes, Y_0.\lanes
	sub		X_1.\lanes, X_1.\lanes, Y_1.\lanes
	sub		X_2.\lanes, X_2.\lanes, Y_2.\lanes
	sub		X_3.\lanes, X_3.\lanes, Y_3.\lanes

	// x = rol(x, 8)
	tbl		X_0.16b, {X_0.16b}, ROTATE_TABLE.16b
	tbl		X_1.16b, {X_1.16b}, ROTATE_TABLE.16b
	tbl		X_2.16b, {X_2.16b}, ROTATE_TABLE.16b
	tbl		X_3.16b, {X_3.16b}, ROTATE_TABLE.16b
.endm
164
/*
 * _next_xts_tweak() - compute the next XTS tweak value(s)
 *
 * \next <- \cur advanced by one 128-byte chunk's worth of blocks, using
 * \tmp as scratch.  For Speck128 (\n == 64) the vector holds one 128-bit
 * tweak; for Speck64 (\n == 32) it holds two packed 64-bit tweaks, which
 * are both advanced at once.
 */
.macro _next_xts_tweak	next, cur, tmp, n
.if \n == 64
	/*
	 * Calculate the next tweak by multiplying the current one by x,
	 * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
	 */
	sshr		\tmp\().2d, \cur\().2d, #63	// broadcast each lane's sign bit
	and		\tmp\().16b, \tmp\().16b, GFMUL_TABLE.16b
	shl		\next\().2d, \cur\().2d, #1
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8	// swap halves: carry crosses lanes
	eor		\next\().16b, \next\().16b, \tmp\().16b
.else
	/*
	 * Calculate the next two tweaks by multiplying the current ones by x^2,
	 * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
	 */
	ushr		\tmp\().2d, \cur\().2d, #62	// top two bits select the reduction constant
	shl		\next\().2d, \cur\().2d, #2
	tbl		\tmp\().16b, {GFMUL_TABLE.16b}, \tmp\().16b
	eor		\next\().16b, \next\().16b, \tmp\().16b
.endif
.endm
187
/*
 * _speck_xts_crypt() - Speck-XTS encryption/decryption
 *
 * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
 * using Speck-XTS, specifically the variant with a block size of '2n' and round
 * count given by NROUNDS.  The expanded round keys are given in ROUND_KEYS, and
 * the current XTS tweak value is given in TWEAK.  It's assumed that NBYTES is a
 * nonzero multiple of 128.
 *
 * Clobbers: x6 (round-key cursor), w7 (round counter), v0-v23.
 */
.macro _speck_xts_crypt	n, lanes, decrypting

	/*
	 * If decrypting, modify the ROUND_KEYS parameter to point to the last
	 * round key rather than the first, since for decryption the round keys
	 * are used in reverse order.
	 */
.if \decrypting
	mov		NROUNDS, NROUNDS	/* zero the high 32 bits */
.if \n == 64
	add		ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #3
	sub		ROUND_KEYS, ROUND_KEYS, #8
.else
	add		ROUND_KEYS, ROUND_KEYS, NROUNDS_X, lsl #2
	sub		ROUND_KEYS, ROUND_KEYS, #4
.endif
.endif

	// Load the index vector for tbl-based 8-bit rotates
.if \decrypting
	ldr		ROTATE_TABLE_Q, .Lrol\n\()_8_table
.else
	ldr		ROTATE_TABLE_Q, .Lror\n\()_8_table
.endif

	// One-time XTS preparation
.if \n == 64
	// Load first tweak
	ld1		{TWEAKV0.16b}, [TWEAK]

	// Load GF(2^128) multiplication table
	ldr		GFMUL_TABLE_Q, .Lgf128mul_table
.else
	// Load first tweak
	ld1		{TWEAKV0.8b}, [TWEAK]

	// Load GF(2^64) multiplication table
	ldr		GFMUL_TABLE_Q, .Lgf64mul_table

	// Calculate second tweak, packing it together with the first
	ushr		TMP0.2d, TWEAKV0.2d, #63
	shl		TMP1.2d, TWEAKV0.2d, #1
	tbl		TMP0.8b, {GFMUL_TABLE.16b}, TMP0.8b
	eor		TMP0.8b, TMP0.8b, TMP1.8b
	mov		TWEAKV0.d[1], TMP0.d[0]
.endif

.Lnext_128bytes_\@:

	// Calculate XTS tweaks for next 128 bytes
	_next_xts_tweak	TWEAKV1, TWEAKV0, TMP0, \n
	_next_xts_tweak	TWEAKV2, TWEAKV1, TMP0, \n
	_next_xts_tweak	TWEAKV3, TWEAKV2, TMP0, \n
	_next_xts_tweak	TWEAKV4, TWEAKV3, TMP0, \n
	_next_xts_tweak	TWEAKV5, TWEAKV4, TMP0, \n
	_next_xts_tweak	TWEAKV6, TWEAKV5, TMP0, \n
	_next_xts_tweak	TWEAKV7, TWEAKV6, TMP0, \n
	_next_xts_tweak	TWEAKV_NEXT, TWEAKV7, TMP0, \n

	// Load the next source blocks into {X,Y}[0-3]
	ld1		{X_0.16b-Y_1.16b}, [SRC], #64
	ld1		{X_2.16b-Y_3.16b}, [SRC], #64

	// XOR the source blocks with their XTS tweaks
	eor		TMP0.16b, X_0.16b, TWEAKV0.16b
	eor		Y_0.16b,  Y_0.16b, TWEAKV1.16b
	eor		TMP1.16b, X_1.16b, TWEAKV2.16b
	eor		Y_1.16b,  Y_1.16b, TWEAKV3.16b
	eor		TMP2.16b, X_2.16b, TWEAKV4.16b
	eor		Y_2.16b,  Y_2.16b, TWEAKV5.16b
	eor		TMP3.16b, X_3.16b, TWEAKV6.16b
	eor		Y_3.16b,  Y_3.16b, TWEAKV7.16b

	/*
	 * De-interleave the 'x' and 'y' elements of each block, i.e. make it so
	 * that the X[0-3] registers contain only the second halves of blocks,
	 * and the Y[0-3] registers contain only the first halves of blocks.
	 * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
	 */
	uzp2		X_0.\lanes, TMP0.\lanes, Y_0.\lanes
	uzp1		Y_0.\lanes, TMP0.\lanes, Y_0.\lanes
	uzp2		X_1.\lanes, TMP1.\lanes, Y_1.\lanes
	uzp1		Y_1.\lanes, TMP1.\lanes, Y_1.\lanes
	uzp2		X_2.\lanes, TMP2.\lanes, Y_2.\lanes
	uzp1		Y_2.\lanes, TMP2.\lanes, Y_2.\lanes
	uzp2		X_3.\lanes, TMP3.\lanes, Y_3.\lanes
	uzp1		Y_3.\lanes, TMP3.\lanes, Y_3.\lanes

	// Do the cipher rounds
	mov		x6, ROUND_KEYS
	mov		w7, NROUNDS
.Lnext_round_\@:
.if \decrypting
	ld1r		{ROUND_KEY.\lanes}, [x6]	// walk the key schedule backwards
	sub		x6, x6, #( \n / 8 )
	_speck_unround_128bytes	\n, \lanes
.else
	ld1r		{ROUND_KEY.\lanes}, [x6], #( \n / 8 )
	_speck_round_128bytes	\n, \lanes
.endif
	subs		w7, w7, #1
	bne		.Lnext_round_\@

	// Re-interleave the 'x' and 'y' elements of each block
	zip1		TMP0.\lanes, Y_0.\lanes, X_0.\lanes
	zip2		Y_0.\lanes,  Y_0.\lanes, X_0.\lanes
	zip1		TMP1.\lanes, Y_1.\lanes, X_1.\lanes
	zip2		Y_1.\lanes,  Y_1.\lanes, X_1.\lanes
	zip1		TMP2.\lanes, Y_2.\lanes, X_2.\lanes
	zip2		Y_2.\lanes,  Y_2.\lanes, X_2.\lanes
	zip1		TMP3.\lanes, Y_3.\lanes, X_3.\lanes
	zip2		Y_3.\lanes,  Y_3.\lanes, X_3.\lanes

	// XOR the encrypted/decrypted blocks with the tweaks calculated earlier
	eor		X_0.16b, TMP0.16b, TWEAKV0.16b
	eor		Y_0.16b, Y_0.16b,  TWEAKV1.16b
	eor		X_1.16b, TMP1.16b, TWEAKV2.16b
	eor		Y_1.16b, Y_1.16b,  TWEAKV3.16b
	eor		X_2.16b, TMP2.16b, TWEAKV4.16b
	eor		Y_2.16b, Y_2.16b,  TWEAKV5.16b
	eor		X_3.16b, TMP3.16b, TWEAKV6.16b
	eor		Y_3.16b, Y_3.16b,  TWEAKV7.16b
	mov		TWEAKV0.16b, TWEAKV_NEXT.16b

	// Store the ciphertext in the destination buffer
	st1		{X_0.16b-Y_1.16b}, [DST], #64
	st1		{X_2.16b-Y_3.16b}, [DST], #64

	// Continue if there are more 128-byte chunks remaining
	subs		NBYTES, NBYTES, #128
	bne		.Lnext_128bytes_\@

	// Store the next tweak and return
.if \n == 64
	st1		{TWEAKV_NEXT.16b}, [TWEAK]
.else
	st1		{TWEAKV_NEXT.8b}, [TWEAK]
.endif
	ret
.endm
337
/*
 * The four exported entry points.  Each simply instantiates _speck_xts_crypt
 * with the appropriate lane width (n=64 for Speck128, n=32 for Speck64) and
 * direction; the macro body ends in 'ret', so no code follows the expansion.
 */
ENTRY(speck128_xts_encrypt_neon)
	_speck_xts_crypt	n=64, lanes=2d, decrypting=0
ENDPROC(speck128_xts_encrypt_neon)

ENTRY(speck128_xts_decrypt_neon)
	_speck_xts_crypt	n=64, lanes=2d, decrypting=1
ENDPROC(speck128_xts_decrypt_neon)

ENTRY(speck64_xts_encrypt_neon)
	_speck_xts_crypt	n=32, lanes=4s, decrypting=0
ENDPROC(speck64_xts_encrypt_neon)

ENTRY(speck64_xts_decrypt_neon)
	_speck_xts_crypt	n=32, lanes=4s, decrypting=1
ENDPROC(speck64_xts_decrypt_neon)
353