/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
 *
 * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	dga		.req	q20
	dgav		.req	v20
	dgb		.req	q21
	dgbv		.req	v21

	t0		.req	v22
	t1		.req	v23

	dg0q		.req	q24
	dg0v		.req	v24
	dg1q		.req	q25
	dg1v		.req	v25
	dg2q		.req	q26
	dg2v		.req	v26

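	/*
	 * add_only/add_update alternate between t0 and t1: even invocations
	 * feed t0 into the SHA-256 round instructions while accumulating the
	 * next schedule word + round constant sum in t1, and odd invocations
	 * do the reverse.
	 */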
	.macro		add_only, ev, rc, s0
	mov		dg2v.16b, dg0v.16b
	.ifeq		\ev
	add		t1.4s, v\s0\().4s, \rc\().4s
	sha256h		dg0q, dg1q, t0.4s
	sha256h2	dg1q, dg2q, t0.4s
	.else
	.ifnb		\s0
	add		t0.4s, v\s0\().4s, \rc\().4s
	.endif
	sha256h		dg0q, dg1q, t1.4s
	sha256h2	dg1q, dg2q, t1.4s
	.endif
	.endm

	.macro		add_update, ev, rc, s0, s1, s2, s3
	sha256su0	v\s0\().4s, v\s1\().4s
	add_only	\ev, \rc, \s1
	sha256su1	v\s0\().4s, v\s2\().4s, v\s3\().4s
	.endm

	/*
	 * The SHA-256 round constants
	 */
	.section	".rodata", "a"
	.align		4
.Lsha2_rcon:
	.word		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.word		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.word		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.word		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.word		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.word		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.word		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.word		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.word		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.word		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.word		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.word		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.word		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.word		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.word		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.word		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2

	.macro load_round_constants	tmp
	adr_l		\tmp, .Lsha2_rcon
	ld1		{ v0.4s- v3.4s}, [\tmp], #64
	ld1		{ v4.4s- v7.4s}, [\tmp], #64
	ld1		{ v8.4s-v11.4s}, [\tmp], #64
	ld1		{v12.4s-v15.4s}, [\tmp]
	.endm
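	/*
	 * load_round_constants leaves the 64 SHA-256 round constants resident
	 * in v0-v15 (four 32-bit constants per register) for the duration of
	 * each transform.
	 */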

	/*
	 * int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
	 *			     int blocks)
	 */
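	/*
	 * Returns the number of input blocks that remain unprocessed: zero
	 * when all blocks (and the final padding block, if requested) have
	 * been consumed, or a non-zero count if cond_yield caused an early
	 * exit so that the caller can reschedule and call back in.
	 */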
	.text
SYM_FUNC_START(__sha256_ce_transform)

	load_round_constants	x8

	/* load state */
	ld1		{dgav.4s, dgbv.4s}, [x0]

	/* load sha256_ce_state::finalize */
	ldr_l		w4, sha256_ce_offsetof_finalize, x4
	ldr		w4, [x0, x4]
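	/*
	 * sha256_ce_offsetof_finalize holds the byte offset of the finalize
	 * flag within struct sha256_ce_state, so the C struct layout does not
	 * have to be hardcoded here.
	 */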

	/* load input */
0:	ld1		{v16.4s-v19.4s}, [x1], #64
	sub		w2, w2, #1

CPU_LE(	rev32		v16.16b, v16.16b	)
CPU_LE(	rev32		v17.16b, v17.16b	)
CPU_LE(	rev32		v18.16b, v18.16b	)
CPU_LE(	rev32		v19.16b, v19.16b	)

1:	add		t0.4s, v16.4s, v0.4s
	mov		dg0v.16b, dgav.16b
	mov		dg1v.16b, dgbv.16b

	add_update	0,  v1, 16, 17, 18, 19
	add_update	1,  v2, 17, 18, 19, 16
	add_update	0,  v3, 18, 19, 16, 17
	add_update	1,  v4, 19, 16, 17, 18

	add_update	0,  v5, 16, 17, 18, 19
	add_update	1,  v6, 17, 18, 19, 16
	add_update	0,  v7, 18, 19, 16, 17
	add_update	1,  v8, 19, 16, 17, 18

	add_update	0,  v9, 16, 17, 18, 19
	add_update	1, v10, 17, 18, 19, 16
	add_update	0, v11, 18, 19, 16, 17
	add_update	1, v12, 19, 16, 17, 18

	add_only	0, v13, 17
	add_only	1, v14, 18
	add_only	0, v15, 19
	add_only	1

	/* update state */
	add		dgav.4s, dgav.4s, dg0v.4s
	add		dgbv.4s, dgbv.4s, dg1v.4s

	/* handled all input blocks? */
	cbz		w2, 2f
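	/*
	 * If a reschedule is pending, bail out via 3f: the partial digest is
	 * stored and the number of remaining blocks is returned so that the
	 * caller can yield and call back in to resume.
	 */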
	cond_yield	3f, x5, x6
	b		0b

	/*
	 * Final block: add padding and total bit count.
	 * Skip if the input size was not a round multiple of the block size;
	 * in that case the padding is handled by the C code.
	 */
2:	cbz		x4, 3f
	ldr_l		w4, sha256_ce_offsetof_count, x4
	ldr		x4, [x0, x4]
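	/*
	 * Construct the final padding block directly in v16-v19: a 0x80 byte
	 * followed by zeroes, with the message length in bits in the last 64
	 * bits.  Each 32-bit word is produced in the (already byte-swapped)
	 * form expected at 1b, and x4 is cleared so this extra block is only
	 * processed once.
	 */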
	movi		v17.2d, #0
	mov		x8, #0x80000000
	movi		v18.2d, #0
	ror		x7, x4, #29		// ror(lsl(x4, 3), 32)
	fmov		d16, x8
	mov		x4, #0
	mov		v19.d[0], xzr
	mov		v19.d[1], x7
	b		1b

	/* store new state */
3:	st1		{dgav.4s, dgbv.4s}, [x0]
	mov		w0, w2
	ret
SYM_FUNC_END(__sha256_ce_transform)

	.unreq dga
	.unreq dgav
	.unreq dgb
	.unreq dgbv
	.unreq t0
	.unreq t1
	.unreq dg0q
	.unreq dg0v
	.unreq dg1q
	.unreq dg1v
	.unreq dg2q
	.unreq dg2v

	// parameters for __sha256_ce_finup2x()
	sctx		.req	x0
	data1		.req	x1
	data2		.req	x2
	len		.req	w3
	out1		.req	x4
	out2		.req	x5

	// other scalar variables
	count		.req	x6
	final_step	.req	w7

	// x8-x9 are used as temporaries.

	// v0-v15 are used to cache the SHA-256 round constants.
	// v16-v19 are used for the message schedule for the first message.
	// v20-v23 are used for the message schedule for the second message.
	// v24-v31 are used for the state and temporaries as given below.
	// *_a are for the first message and *_b for the second.
	state0_a_q	.req	q24
	state0_a	.req	v24
	state1_a_q	.req	q25
	state1_a	.req	v25
	state0_b_q	.req	q26
	state0_b	.req	v26
	state1_b_q	.req	q27
	state1_b	.req	v27
	t0_a		.req	v28
	t0_b		.req	v29
	t1_a_q		.req	q30
	t1_a		.req	v30
	t1_b_q		.req	q31
	t1_b		.req	v31

#define OFFSETOF_COUNT	32	// offsetof(struct sha256_state, count)
#define OFFSETOF_BUF	40	// offsetof(struct sha256_state, buf)
// offsetof(struct sha256_state, state) is assumed to be 0.
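// These offsets assume the conventional struct sha256_state layout, i.e.
// something equivalent to:
//
//	struct sha256_state {
//		u32 state[8];	// offset 0
//		u64 count;	// offset 32
//		u8 buf[64];	// offset 40
//	};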

	// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a
	// and m0_b contain the current 4 message schedule words for the first
	// and second message respectively.
	//
	// If not all the message schedule words have been computed yet, then
	// this also computes 4 more message schedule words for each message.
	// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
	// the first message, and likewise m1_b-m3_b for the second.  After
	// consuming the current value of m0_a, this macro computes the group
	// after m3_a and writes it to m0_a, and likewise for *_b.  This means
	// that the next (m0_a, m1_a, m2_a, m3_a) is the current (m1_a, m2_a,
	// m3_a, m0_a), and likewise for *_b, so the caller must cycle through
	// the registers accordingly.
	.macro	do_4rounds_2x	i, k,  m0_a, m1_a, m2_a, m3_a,  \
			       m0_b, m1_b, m2_b, m3_b
	add		t0_a\().4s, \m0_a\().4s, \k\().4s
	add		t0_b\().4s, \m0_b\().4s, \k\().4s
	.if \i < 48
	sha256su0	\m0_a\().4s, \m1_a\().4s
	sha256su0	\m0_b\().4s, \m1_b\().4s
	sha256su1	\m0_a\().4s, \m2_a\().4s, \m3_a\().4s
	sha256su1	\m0_b\().4s, \m2_b\().4s, \m3_b\().4s
	.endif
	mov		t1_a.16b, state0_a.16b
	mov		t1_b.16b, state0_b.16b
	sha256h		state0_a_q, state1_a_q, t0_a\().4s
	sha256h		state0_b_q, state1_b_q, t0_b\().4s
	sha256h2	state1_a_q, t1_a_q, t0_a\().4s
	sha256h2	state1_b_q, t1_b_q, t0_b\().4s
	.endm

	.macro	do_16rounds_2x	i, k0, k1, k2, k3
	do_4rounds_2x	\i + 0,  \k0,  v16, v17, v18, v19,  v20, v21, v22, v23
	do_4rounds_2x	\i + 4,  \k1,  v17, v18, v19, v16,  v21, v22, v23, v20
	do_4rounds_2x	\i + 8,  \k2,  v18, v19, v16, v17,  v22, v23, v20, v21
	do_4rounds_2x	\i + 12, \k3,  v19, v16, v17, v18,  v23, v20, v21, v22
	.endm

//
// void __sha256_ce_finup2x(const struct sha256_state *sctx,
//			    const u8 *data1, const u8 *data2, int len,
//			    u8 out1[SHA256_DIGEST_SIZE],
//			    u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial state
// |sctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
SYM_FUNC_START(__sha256_ce_finup2x)
	sub		sp, sp, #128
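	// The 128 bytes of stack are used as two 64-byte scratch blocks: for
	// merging buffered bytes with new data, for saving the pre-round
	// state of both messages, and for constructing the padding blocks.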
	mov		final_step, #0
	load_round_constants	x8

	// Load the initial state from sctx->state.
	ld1		{state0_a.4s-state1_a.4s}, [sctx]

	// Load sctx->count.  Take it mod 64 to get the number of bytes that
	// are currently buffered in sctx->buf, and also save the total
	// message length (sctx->count + len) in 'count'.
	ldr		x8, [sctx, #OFFSETOF_COUNT]
	add		count, x8, len, sxtw
	and		x8, x8, #63
	cbz		x8, .Lfinup2x_enter_loop	// No bytes buffered?

	// x8 bytes (1 to 63) are currently buffered in sctx->buf.  Load them
	// followed by the first 64 - x8 bytes of data.  Since len >= 64, we
	// just load 64 bytes from each of sctx->buf, data1, and data2
	// unconditionally and rearrange the data as needed.
	add		x9, sctx, #OFFSETOF_BUF
	ld1		{v16.16b-v19.16b}, [x9]
	st1		{v16.16b-v19.16b}, [sp]

	ld1		{v16.16b-v19.16b}, [data1], #64
	add		x9, sp, x8
	st1		{v16.16b-v19.16b}, [x9]
	ld1		{v16.4s-v19.4s}, [sp]

	ld1		{v20.16b-v23.16b}, [data2], #64
	st1		{v20.16b-v23.16b}, [x9]
	ld1		{v20.4s-v23.4s}, [sp]

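	// Only 64 - x8 new bytes were consumed from each message, but data1
	// and data2 were advanced by 64 above; rewind them by x8 and decrement
	// len by just the newly consumed bytes.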
	sub		len, len, #64
	sub		data1, data1, x8
	sub		data2, data2, x8
	add		len, len, w8
	mov		state0_b.16b, state0_a.16b
	mov		state1_b.16b, state1_a.16b
	b		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		len, len, #64
	mov		state0_b.16b, state0_a.16b
	mov		state1_b.16b, state1_a.16b
.Lfinup2x_loop:
	// Load the next two data blocks.
	ld1		{v16.4s-v19.4s}, [data1], #64
	ld1		{v20.4s-v23.4s}, [data2], #64
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
CPU_LE(	rev32		v16.16b, v16.16b	)
CPU_LE(	rev32		v17.16b, v17.16b	)
CPU_LE(	rev32		v18.16b, v18.16b	)
CPU_LE(	rev32		v19.16b, v19.16b	)
CPU_LE(	rev32		v20.16b, v20.16b	)
CPU_LE(	rev32		v21.16b, v21.16b	)
CPU_LE(	rev32		v22.16b, v22.16b	)
CPU_LE(	rev32		v23.16b, v23.16b	)
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	st1		{state0_a.4s-state1_b.4s}, [sp]

	// Do the SHA-256 rounds on each block.
	do_16rounds_2x	0,  v0, v1, v2, v3
	do_16rounds_2x	16, v4, v5, v6, v7
	do_16rounds_2x	32, v8, v9, v10, v11
	do_16rounds_2x	48, v12, v13, v14, v15

	// Add the original state for each block.
	ld1		{v16.4s-v19.4s}, [sp]
	add		state0_a.4s, state0_a.4s, v16.4s
	add		state1_a.4s, state1_a.4s, v17.4s
	add		state0_b.4s, state0_b.4s, v18.4s
	add		state1_b.4s, state1_b.4s, v19.4s

	// Update len and loop back if more blocks remain.
	sub		len, len, #64
	tbz		len, #31, .Lfinup2x_loop	// len >= 0?

	// Check if any final blocks need to be handled.
	// final_step = 2: all done
	// final_step = 1: need to do count-only padding block
	// final_step = 0: need to do the block with 0x80 padding byte
	tbnz		final_step, #1, .Lfinup2x_done
	tbnz		final_step, #0, .Lfinup2x_finalize_countonly
	add		len, len, #64
	cbz		len, .Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= len <= 63 data bytes remain.  Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - len] to get the needed padding block.  This
	// code relies on the data buffers being >= 64 bytes in length.
	sub		w8, len, #64		// w8 = len - 64
	add		data1, data1, w8, sxtw	// data1 += len - 64
	add		data2, data2, w8, sxtw	// data2 += len - 64
	mov		x9, 0x80
	fmov		d16, x9
	movi		v17.16b, #0
	stp		q16, q17, [sp, #64]
	stp		q17, q17, [sp, #96]
	sub		x9, sp, w8, sxtw	// x9 = &sp[64 - len]
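	// If 56 or more data bytes remain in the final block, the 8-byte bit
	// count does not fit after the 0x80 byte and must go into an extra
	// count-only block (final_step = 1).  Otherwise it is written at the
	// end of this block and no extra block is needed (final_step = 2).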
	cmp		len, #56
	b.ge		1f		// will count spill into its own block?
	lsl		count, count, #3
	rev		count, count
	str		count, [x9, #56]
	mov		final_step, #2	// won't need count-only block
	b		2f
1:
	mov		final_step, #1	// will need count-only block
2:
	ld1		{v16.16b-v19.16b}, [data1]
	st1		{v16.16b-v19.16b}, [sp]
	ld1		{v16.4s-v19.4s}, [x9]
	ld1		{v20.16b-v23.16b}, [data2]
	st1		{v20.16b-v23.16b}, [sp]
	ld1		{v20.4s-v23.4s}, [x9]
	b		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block-aligned message.
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	movi		v16.2d, #0
	b		1f
.Lfinup2x_finalize_blockaligned:
	mov		x8, #0x80000000
	fmov		d16, x8
1:
	movi		v17.2d, #0
	movi		v18.2d, #0
	ror		count, count, #29	// ror(lsl(count, 3), 32)
	mov		v19.d[0], xzr
	mov		v19.d[1], count
	mov		v20.16b, v16.16b
	movi		v21.2d, #0
	movi		v22.2d, #0
	mov		v23.16b, v19.16b
	mov		final_step, #2
	b		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
CPU_LE(	rev32		state0_a.16b, state0_a.16b	)
CPU_LE(	rev32		state1_a.16b, state1_a.16b	)
CPU_LE(	rev32		state0_b.16b, state0_b.16b	)
CPU_LE(	rev32		state1_b.16b, state1_b.16b	)
	st1		{state0_a.4s-state1_a.4s}, [out1]
	st1		{state0_b.4s-state1_b.4s}, [out2]
	add		sp, sp, #128
	ret
SYM_FUNC_END(__sha256_ce_finup2x)
