// SPDX-License-Identifier: GPL-2.0
/*
 * NEON-accelerated implementation of Speck128-XTS and Speck64-XTS
 *
 * Copyright (c) 2018 Google, Inc
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>

	.text
	.fpu		neon

	// arguments
	ROUND_KEYS	.req	r0	// const {u64,u32} *round_keys
	NROUNDS		.req	r1	// int nrounds
	DST		.req	r2	// void *dst
	SRC		.req	r3	// const void *src
	NBYTES		.req	r4	// unsigned int nbytes
	TWEAK		.req	r5	// void *tweak

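	/*
	 * Taken together, these imply the calling convention used by the C
	 * glue code; a sketch of the prototype (round_keys is const u64 * for
	 * the Speck128 entry points and const u32 * for the Speck64 ones):
	 *
	 *	void speck128_xts_encrypt_neon(const u64 *round_keys,
	 *				       int nrounds, void *dst,
	 *				       const void *src,
	 *				       unsigned int nbytes,
	 *				       void *tweak);
	 */
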
	// registers which hold the data being encrypted/decrypted
	X0		.req	q0
	X0_L		.req	d0
	X0_H		.req	d1
	Y0		.req	q1
	Y0_H		.req	d3
	X1		.req	q2
	X1_L		.req	d4
	X1_H		.req	d5
	Y1		.req	q3
	Y1_H		.req	d7
	X2		.req	q4
	X2_L		.req	d8
	X2_H		.req	d9
	Y2		.req	q5
	Y2_H		.req	d11
	X3		.req	q6
	X3_L		.req	d12
	X3_H		.req	d13
	Y3		.req	q7
	Y3_H		.req	d15

	// the round key, duplicated in all lanes
	ROUND_KEY	.req	q8
	ROUND_KEY_L	.req	d16
	ROUND_KEY_H	.req	d17

	// index vector for vtbl-based 8-bit rotates
	ROTATE_TABLE	.req	d18

	// multiplication table for updating XTS tweaks
	GF128MUL_TABLE	.req	d19
	GF64MUL_TABLE	.req	d19

	// current XTS tweak value(s)
	TWEAKV		.req	q10
	TWEAKV_L	.req	d20
	TWEAKV_H	.req	d21

	TMP0		.req	q12
	TMP0_L		.req	d24
	TMP0_H		.req	d25
	TMP1		.req	q13
	TMP2		.req	q14
	TMP3		.req	q15

	.align		4
.Lror64_8_table:
	.byte		1, 2, 3, 4, 5, 6, 7, 0
.Lror32_8_table:
	.byte		1, 2, 3, 0, 5, 6, 7, 4
.Lrol64_8_table:
	.byte		7, 0, 1, 2, 3, 4, 5, 6
.Lrol32_8_table:
	.byte		3, 0, 1, 2, 7, 4, 5, 6
.Lgf128mul_table:
	.byte		0, 0x87
	.fill		14
.Lgf64mul_table:
	.byte		0, 0x1b, (0x1b << 1), (0x1b << 1) ^ 0x1b
	.fill		12

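/*
 * Usage of the tables above: each rotate table is an index vector for vtbl,
 * so result byte i is taken from source byte table[i].  For example,
 * .Lror64_8_table moves source byte 1 into byte 0, ..., and byte 0 into
 * byte 7, which rotates each little-endian 64-bit lane right by 8 bits.
 * The GF multiplication tables map the bit(s) shifted out of the top of a
 * tweak to the constant that must be XOR'ed back in for the modular
 * reduction: 0 or 0x87 for GF(2^128); 0, 0x1b, 0x1b << 1, or their XOR for
 * GF(2^64), according to which of the two shifted-out bits are set.
 */
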
/*
 * _speck_round_128bytes() - Speck encryption round on 128 bytes at a time
 *
 * Do one Speck encryption round on the 128 bytes (8 blocks for Speck128, 16 for
 * Speck64) stored in X0-X3 and Y0-Y3, using the round key stored in all lanes
 * of ROUND_KEY.  'n' is the lane size: 64 for Speck128, or 32 for Speck64.
 *
 * The 8-bit rotates are implemented using vtbl instead of vshr + vsli because
 * the vtbl approach is faster on some processors and the same speed on others.
 */
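/*
 * In scalar terms, the round computed below is (x, y being the two n-bit
 * halves of a block and k the round key):
 *
 *	x = ror(x, 8);  x += y;  x ^= k;
 *	y = rol(y, 3);  y ^= x;
 */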
.macro _speck_round_128bytes	n

	// x = ror(x, 8)
	vtbl.8		X0_L, {X0_L}, ROTATE_TABLE
	vtbl.8		X0_H, {X0_H}, ROTATE_TABLE
	vtbl.8		X1_L, {X1_L}, ROTATE_TABLE
	vtbl.8		X1_H, {X1_H}, ROTATE_TABLE
	vtbl.8		X2_L, {X2_L}, ROTATE_TABLE
	vtbl.8		X2_H, {X2_H}, ROTATE_TABLE
	vtbl.8		X3_L, {X3_L}, ROTATE_TABLE
	vtbl.8		X3_H, {X3_H}, ROTATE_TABLE

	// x += y
	vadd.u\n	X0, Y0
	vadd.u\n	X1, Y1
	vadd.u\n	X2, Y2
	vadd.u\n	X3, Y3

	// x ^= k
	veor		X0, ROUND_KEY
	veor		X1, ROUND_KEY
	veor		X2, ROUND_KEY
	veor		X3, ROUND_KEY

	// y = rol(y, 3)
	vshl.u\n	TMP0, Y0, #3
	vshl.u\n	TMP1, Y1, #3
	vshl.u\n	TMP2, Y2, #3
	vshl.u\n	TMP3, Y3, #3
	vsri.u\n	TMP0, Y0, #(\n - 3)
	vsri.u\n	TMP1, Y1, #(\n - 3)
	vsri.u\n	TMP2, Y2, #(\n - 3)
	vsri.u\n	TMP3, Y3, #(\n - 3)

	// y ^= x
	veor		Y0, TMP0, X0
	veor		Y1, TMP1, X1
	veor		Y2, TMP2, X2
	veor		Y3, TMP3, X3
.endm

/*
 * _speck_unround_128bytes() - Speck decryption round on 128 bytes at a time
 *
 * This is the inverse of _speck_round_128bytes().
 */
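/*
 * In scalar terms, the inverse round below undoes those steps in reverse:
 *
 *	y ^= x;  y = ror(y, 3);
 *	x ^= k;  x -= y;  x = rol(x, 8);
 */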
.macro _speck_unround_128bytes	n

	// y ^= x
	veor		TMP0, Y0, X0
	veor		TMP1, Y1, X1
	veor		TMP2, Y2, X2
	veor		TMP3, Y3, X3

	// y = ror(y, 3)
	vshr.u\n	Y0, TMP0, #3
	vshr.u\n	Y1, TMP1, #3
	vshr.u\n	Y2, TMP2, #3
	vshr.u\n	Y3, TMP3, #3
	vsli.u\n	Y0, TMP0, #(\n - 3)
	vsli.u\n	Y1, TMP1, #(\n - 3)
	vsli.u\n	Y2, TMP2, #(\n - 3)
	vsli.u\n	Y3, TMP3, #(\n - 3)

	// x ^= k
	veor		X0, ROUND_KEY
	veor		X1, ROUND_KEY
	veor		X2, ROUND_KEY
	veor		X3, ROUND_KEY

	// x -= y
	vsub.u\n	X0, Y0
	vsub.u\n	X1, Y1
	vsub.u\n	X2, Y2
	vsub.u\n	X3, Y3

	// x = rol(x, 8)
	vtbl.8		X0_L, {X0_L}, ROTATE_TABLE
	vtbl.8		X0_H, {X0_H}, ROTATE_TABLE
	vtbl.8		X1_L, {X1_L}, ROTATE_TABLE
	vtbl.8		X1_H, {X1_H}, ROTATE_TABLE
	vtbl.8		X2_L, {X2_L}, ROTATE_TABLE
	vtbl.8		X2_H, {X2_H}, ROTATE_TABLE
	vtbl.8		X3_L, {X3_L}, ROTATE_TABLE
	vtbl.8		X3_H, {X3_H}, ROTATE_TABLE
.endm

.macro _xts128_precrypt_one	dst_reg, tweak_buf, tmp

	// Load the next source block
	vld1.8		{\dst_reg}, [SRC]!

	// Save the current tweak in the tweak buffer
	vst1.8		{TWEAKV}, [\tweak_buf:128]!

	// XOR the next source block with the current tweak
	veor		\dst_reg, TWEAKV

	/*
	 * Calculate the next tweak by multiplying the current one by x,
	 * modulo p(x) = x^128 + x^7 + x^2 + x + 1.
	 */
	vshr.u64	\tmp, TWEAKV, #63
	vshl.u64	TWEAKV, #1
	veor		TWEAKV_H, \tmp\()_L
	vtbl.8		\tmp\()_H, {GF128MUL_TABLE}, \tmp\()_H
	veor		TWEAKV_L, \tmp\()_H
.endm
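
/*
 * Scalar equivalent of the tweak update above, with the tweak viewed as two
 * little-endian 64-bit halves t_lo and t_hi:
 *
 *	carry = t_hi >> 63;
 *	t_hi  = (t_hi << 1) | (t_lo >> 63);
 *	t_lo  = (t_lo << 1) ^ (carry ? 0x87 : 0);
 */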

.macro _xts64_precrypt_two	dst_reg, tweak_buf, tmp

	// Load the next two source blocks
	vld1.8		{\dst_reg}, [SRC]!

	// Save the current two tweaks in the tweak buffer
	vst1.8		{TWEAKV}, [\tweak_buf:128]!

	// XOR the next two source blocks with the current two tweaks
	veor		\dst_reg, TWEAKV

	/*
	 * Calculate the next two tweaks by multiplying the current ones by x^2,
	 * modulo p(x) = x^64 + x^4 + x^3 + x + 1.
	 */
	vshr.u64	\tmp, TWEAKV, #62
	vshl.u64	TWEAKV, #2
	vtbl.8		\tmp\()_L, {GF64MUL_TABLE}, \tmp\()_L
	vtbl.8		\tmp\()_H, {GF64MUL_TABLE}, \tmp\()_H
	veor		TWEAKV, \tmp
.endm
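
/*
 * Scalar equivalent of the update above, applied to each 64-bit tweak t
 * independently, with gf64mul_table[] being the four-entry table defined
 * earlier:
 *
 *	t = (t << 2) ^ gf64mul_table[t >> 62];
 */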

/*
 * _speck_xts_crypt() - Speck-XTS encryption/decryption
 *
 * Encrypt or decrypt NBYTES bytes of data from the SRC buffer to the DST buffer
 * using Speck-XTS, specifically the variant with a block size of '2n' and round
 * count given by NROUNDS.  The expanded round keys are given in ROUND_KEYS, and
 * the current XTS tweak value is given in TWEAK.  It's assumed that NBYTES is a
 * nonzero multiple of 128.
 */
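/*
 * Overall flow, in pseudocode (each iteration of the outer loop handles one
 * 128-byte chunk):
 *
 *	while (nbytes != 0):
 *		save the current tweaks, XOR the next 8 (Speck128) or
 *		16 (Speck64) blocks with them, and advance the tweaks
 *		de-interleave the block halves into X[0-3] / Y[0-3]
 *		for each round key: apply the (un)round to all blocks
 *		re-interleave, XOR with the saved tweaks, store to DST
 *		nbytes -= 128
 *	write the next tweak back through TWEAK
 */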
.macro _speck_xts_crypt	n, decrypting
	push		{r4-r7}
	mov		r7, sp

	/*
	 * The first four parameters were passed in registers r0-r3.  Load the
	 * additional parameters, which were passed on the stack.
	 */
	ldr		NBYTES, [sp, #16]
	ldr		TWEAK, [sp, #20]

	/*
	 * If decrypting, modify the ROUND_KEYS parameter to point to the last
	 * round key rather than the first, since for decryption the round keys
	 * are used in reverse order.
	 */
.if \decrypting
.if \n == 64
	add		ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #3
	sub		ROUND_KEYS, #8
.else
	add		ROUND_KEYS, ROUND_KEYS, NROUNDS, lsl #2
	sub		ROUND_KEYS, #4
.endif
.endif

	// Load the index vector for vtbl-based 8-bit rotates
.if \decrypting
	ldr		r12, =.Lrol\n\()_8_table
.else
	ldr		r12, =.Lror\n\()_8_table
.endif
	vld1.8		{ROTATE_TABLE}, [r12:64]

	// One-time XTS preparation

	/*
	 * Allocate stack space to store 128 bytes worth of tweaks.  For
	 * performance, this space is aligned to a 16-byte boundary so that we
	 * can use the load/store instructions that declare 16-byte alignment.
	 */
	sub		sp, #128
	bic		sp, #0xf

.if \n == 64
	// Load first tweak
	vld1.8		{TWEAKV}, [TWEAK]

	// Load GF(2^128) multiplication table
	ldr		r12, =.Lgf128mul_table
	vld1.8		{GF128MUL_TABLE}, [r12:64]
.else
	// Load first tweak
	vld1.8		{TWEAKV_L}, [TWEAK]

	// Load GF(2^64) multiplication table
	ldr		r12, =.Lgf64mul_table
	vld1.8		{GF64MUL_TABLE}, [r12:64]

	// Calculate second tweak, packing it together with the first
	vshr.u64	TMP0_L, TWEAKV_L, #63
	vtbl.u8		TMP0_L, {GF64MUL_TABLE}, TMP0_L
	vshl.u64	TWEAKV_H, TWEAKV_L, #1
	veor		TWEAKV_H, TMP0_L
.endif

.Lnext_128bytes_\@:

	/*
	 * Load the source blocks into {X,Y}[0-3], XOR them with their XTS tweak
	 * values, and save the tweaks on the stack for later.  Then
	 * de-interleave the 'x' and 'y' elements of each block, i.e. make it so
	 * that the X[0-3] registers contain only the second halves of blocks,
	 * and the Y[0-3] registers contain only the first halves of blocks.
	 * (Speck uses the order (y, x) rather than the more intuitive (x, y).)
	 */
	mov		r12, sp
.if \n == 64
	_xts128_precrypt_one	X0, r12, TMP0
	_xts128_precrypt_one	Y0, r12, TMP0
	_xts128_precrypt_one	X1, r12, TMP0
	_xts128_precrypt_one	Y1, r12, TMP0
	_xts128_precrypt_one	X2, r12, TMP0
	_xts128_precrypt_one	Y2, r12, TMP0
	_xts128_precrypt_one	X3, r12, TMP0
	_xts128_precrypt_one	Y3, r12, TMP0
	vswp		X0_L, Y0_H
	vswp		X1_L, Y1_H
	vswp		X2_L, Y2_H
	vswp		X3_L, Y3_H
.else
	_xts64_precrypt_two	X0, r12, TMP0
	_xts64_precrypt_two	Y0, r12, TMP0
	_xts64_precrypt_two	X1, r12, TMP0
	_xts64_precrypt_two	Y1, r12, TMP0
	_xts64_precrypt_two	X2, r12, TMP0
	_xts64_precrypt_two	Y2, r12, TMP0
	_xts64_precrypt_two	X3, r12, TMP0
	_xts64_precrypt_two	Y3, r12, TMP0
	vuzp.32		Y0, X0
	vuzp.32		Y1, X1
	vuzp.32		Y2, X2
	vuzp.32		Y3, X3
.endif

	// Do the cipher rounds

	mov		r12, ROUND_KEYS
	mov		r6, NROUNDS

.Lnext_round_\@:
.if \decrypting
.if \n == 64
	vld1.64		ROUND_KEY_L, [r12]
	sub		r12, #8
	vmov		ROUND_KEY_H, ROUND_KEY_L
.else
	vld1.32		{ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]
	sub		r12, #4
.endif
	_speck_unround_128bytes	\n
.else
.if \n == 64
	vld1.64		ROUND_KEY_L, [r12]!
	vmov		ROUND_KEY_H, ROUND_KEY_L
.else
	vld1.32		{ROUND_KEY_L[],ROUND_KEY_H[]}, [r12]!
.endif
	_speck_round_128bytes	\n
.endif
	subs		r6, r6, #1
	bne		.Lnext_round_\@

	// Re-interleave the 'x' and 'y' elements of each block
.if \n == 64
	vswp		X0_L, Y0_H
	vswp		X1_L, Y1_H
	vswp		X2_L, Y2_H
	vswp		X3_L, Y3_H
.else
	vzip.32		Y0, X0
	vzip.32		Y1, X1
	vzip.32		Y2, X2
	vzip.32		Y3, X3
.endif

	// XOR the encrypted/decrypted blocks with the tweaks we saved earlier
	mov		r12, sp
	vld1.8		{TMP0, TMP1}, [r12:128]!
	vld1.8		{TMP2, TMP3}, [r12:128]!
	veor		X0, TMP0
	veor		Y0, TMP1
	veor		X1, TMP2
	veor		Y1, TMP3
	vld1.8		{TMP0, TMP1}, [r12:128]!
	vld1.8		{TMP2, TMP3}, [r12:128]!
	veor		X2, TMP0
	veor		Y2, TMP1
	veor		X3, TMP2
	veor		Y3, TMP3

	// Store the ciphertext in the destination buffer
	vst1.8		{X0, Y0}, [DST]!
	vst1.8		{X1, Y1}, [DST]!
	vst1.8		{X2, Y2}, [DST]!
	vst1.8		{X3, Y3}, [DST]!

	// Continue if there are more 128-byte chunks remaining, else return
	subs		NBYTES, #128
	bne		.Lnext_128bytes_\@

	// Store the next tweak
.if \n == 64
	vst1.8		{TWEAKV}, [TWEAK]
.else
	vst1.8		{TWEAKV_L}, [TWEAK]
.endif

	mov		sp, r7
	pop		{r4-r7}
	bx		lr
.endm

ENTRY(speck128_xts_encrypt_neon)
	_speck_xts_crypt	n=64, decrypting=0
ENDPROC(speck128_xts_encrypt_neon)

ENTRY(speck128_xts_decrypt_neon)
	_speck_xts_crypt	n=64, decrypting=1
ENDPROC(speck128_xts_decrypt_neon)

ENTRY(speck64_xts_encrypt_neon)
	_speck_xts_crypt	n=32, decrypting=0
ENDPROC(speck64_xts_encrypt_neon)

ENTRY(speck64_xts_decrypt_neon)
	_speck_xts_crypt	n=32, decrypting=1
ENDPROC(speck64_xts_decrypt_neon)