• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * Intel SHA Extensions optimized implementation of a SHA-256 update function
3 *
4 * This file is provided under a dual BSD/GPLv2 license.  When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2015 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 * General Public License for more details.
19 *
20 * Contact Information:
21 * 	Sean Gulley <sean.m.gulley@intel.com>
22 * 	Tim Chen <tim.c.chen@linux.intel.com>
23 *
24 * BSD LICENSE
25 *
26 * Copyright(c) 2015 Intel Corporation.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 *
32 * 	* Redistributions of source code must retain the above copyright
33 * 	  notice, this list of conditions and the following disclaimer.
34 * 	* Redistributions in binary form must reproduce the above copyright
35 * 	  notice, this list of conditions and the following disclaimer in
36 * 	  the documentation and/or other materials provided with the
37 * 	  distribution.
38 * 	* Neither the name of Intel Corporation nor the names of its
39 * 	  contributors may be used to endorse or promote products derived
40 * 	  from this software without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
43 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
44 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
45 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
46 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
49 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
50 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
52 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53 *
54 */
55
56#include <linux/linkage.h>
57#include <linux/cfi_types.h>
58
/* Function arguments (System V AMD64 ABI) */
#define DIGEST_PTR	%rdi	/* 1st arg */
#define DATA_PTR	%rsi	/* 2nd arg */
#define NUM_BLKS	%rdx	/* 3rd arg */

/* Pointer to the K256 round-constant table */
#define SHA256CONSTANTS	%rax

#define MSG		%xmm0	/* sha256rnds2 implicit operand */
#define STATE0		%xmm1	/* working state, ABEF words */
#define STATE1		%xmm2	/* working state, CDGH words */
#define MSGTMP0		%xmm3	/* message schedule, 4 words each */
#define MSGTMP1		%xmm4
#define MSGTMP2		%xmm5
#define MSGTMP3		%xmm6
#define MSGTMP4		%xmm7	/* scratch */

/* pshufb mask to byte-swap each 32-bit message word */
#define SHUF_MASK	%xmm8

/* State saved across the 64 rounds of each block */
#define ABEF_SAVE	%xmm9
#define CDGH_SAVE	%xmm10
78
/*
 * Intel SHA Extensions optimized implementation of a SHA-256 update function
 *
 * The function takes a pointer to the current hash values, a pointer to the
 * input data, and a number of 64 byte blocks to process.  Once all blocks have
 * been processed, the digest pointer is updated with the resulting hash value.
 * The function only processes complete blocks, there is no functionality to
 * store partial blocks.  All message padding and hash value initialization must
 * be done outside the update function.
 *
 * The indented lines in the loop are instructions related to rounds processing.
 * The non-indented lines are instructions related to the message schedule.
 *
 * void sha256_ni_transform(uint32_t *digest, const void *data,
 *		uint32_t numBlocks);
 * digest : pointer to digest
 * data: pointer to input data
 * numBlocks: Number of blocks to process
 *
 * ABI: System V AMD64.  Clobbers %rax, %rdx and %xmm0-%xmm10; uses no
 * stack space and no callee-saved registers.  Leaf function.
 */

.text
SYM_TYPED_FUNC_START(sha256_ni_transform)

	shl		$6, NUM_BLKS		/*  convert to bytes */
	jz		.Ldone_hash
	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */

	/*
	 * load initial hash values
	 * Need to reorder these appropriately
	 * DCBA, HGFE -> ABEF, CDGH
	 */
	movdqu		0*16(DIGEST_PTR), STATE0
	movdqu		1*16(DIGEST_PTR), STATE1

	pshufd		$0xB1, STATE0,  STATE0		/* CDAB */
	pshufd		$0x1B, STATE1,  STATE1		/* EFGH */
	movdqa		STATE0, MSGTMP4
	palignr		$8, STATE1,  STATE0		/* ABEF */
	pblendw		$0xF0, MSGTMP4, STATE1		/* CDGH */

	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
	lea		K256(%rip), SHA256CONSTANTS

.Lloop0:
	/* Save hash values for addition after rounds */
	movdqa		STATE0, ABEF_SAVE
	movdqa		STATE1, CDGH_SAVE

	/* Rounds 0-3 */
	movdqu		0*16(DATA_PTR), MSG
	pshufb		SHUF_MASK, MSG
	movdqa		MSG, MSGTMP0
		paddd		0*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0

	/* Rounds 4-7 */
	movdqu		1*16(DATA_PTR), MSG
	pshufb		SHUF_MASK, MSG
	movdqa		MSG, MSGTMP1
		paddd		1*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 8-11 */
	movdqu		2*16(DATA_PTR), MSG
	pshufb		SHUF_MASK, MSG
	movdqa		MSG, MSGTMP2
		paddd		2*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 12-15 */
	movdqu		3*16(DATA_PTR), MSG
	pshufb		SHUF_MASK, MSG
	movdqa		MSG, MSGTMP3
		paddd		3*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP3, MSGTMP4
	palignr		$4, MSGTMP2, MSGTMP4
	paddd		MSGTMP4, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 16-19 */
	movdqa		MSGTMP0, MSG
		paddd		4*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP0, MSGTMP4
	palignr		$4, MSGTMP3, MSGTMP4
	paddd		MSGTMP4, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 20-23 */
	movdqa		MSGTMP1, MSG
		paddd		5*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP1, MSGTMP4
	palignr		$4, MSGTMP0, MSGTMP4
	paddd		MSGTMP4, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 24-27 */
	movdqa		MSGTMP2, MSG
		paddd		6*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP2, MSGTMP4
	palignr		$4, MSGTMP1, MSGTMP4
	paddd		MSGTMP4, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 28-31 */
	movdqa		MSGTMP3, MSG
		paddd		7*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP3, MSGTMP4
	palignr		$4, MSGTMP2, MSGTMP4
	paddd		MSGTMP4, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 32-35 */
	movdqa		MSGTMP0, MSG
		paddd		8*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP0, MSGTMP4
	palignr		$4, MSGTMP3, MSGTMP4
	paddd		MSGTMP4, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 36-39 */
	movdqa		MSGTMP1, MSG
		paddd		9*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP1, MSGTMP4
	palignr		$4, MSGTMP0, MSGTMP4
	paddd		MSGTMP4, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP1, MSGTMP0

	/* Rounds 40-43 */
	movdqa		MSGTMP2, MSG
		paddd		10*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP2, MSGTMP4
	palignr		$4, MSGTMP1, MSGTMP4
	paddd		MSGTMP4, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP2, MSGTMP1

	/* Rounds 44-47 */
	movdqa		MSGTMP3, MSG
		paddd		11*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP3, MSGTMP4
	palignr		$4, MSGTMP2, MSGTMP4
	paddd		MSGTMP4, MSGTMP0
	sha256msg2	MSGTMP3, MSGTMP0
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP3, MSGTMP2

	/* Rounds 48-51 */
	movdqa		MSGTMP0, MSG
		paddd		12*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP0, MSGTMP4
	palignr		$4, MSGTMP3, MSGTMP4
	paddd		MSGTMP4, MSGTMP1
	sha256msg2	MSGTMP0, MSGTMP1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0
	sha256msg1	MSGTMP0, MSGTMP3

	/* Rounds 52-55 */
	movdqa		MSGTMP1, MSG
		paddd		13*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP1, MSGTMP4
	palignr		$4, MSGTMP0, MSGTMP4
	paddd		MSGTMP4, MSGTMP2
	sha256msg2	MSGTMP1, MSGTMP2
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0

	/* Rounds 56-59 */
	movdqa		MSGTMP2, MSG
		paddd		14*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
	movdqa		MSGTMP2, MSGTMP4
	palignr		$4, MSGTMP1, MSGTMP4
	paddd		MSGTMP4, MSGTMP3
	sha256msg2	MSGTMP2, MSGTMP3
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0

	/* Rounds 60-63 */
	movdqa		MSGTMP3, MSG
		paddd		15*16(SHA256CONSTANTS), MSG
		sha256rnds2	STATE0, STATE1
		pshufd 		$0x0E, MSG, MSG
		sha256rnds2	STATE1, STATE0

	/* Add current hash values with previously saved */
	paddd		ABEF_SAVE, STATE0
	paddd		CDGH_SAVE, STATE1

	/* Increment data pointer and loop if more to process */
	add		$64, DATA_PTR
	cmp		NUM_BLKS, DATA_PTR
	jne		.Lloop0

	/* Write hash values back in the correct order */
	pshufd		$0x1B, STATE0,  STATE0		/* FEBA */
	pshufd		$0xB1, STATE1,  STATE1		/* DCHG */
	movdqa		STATE0, MSGTMP4
	pblendw		$0xF0, STATE1,  STATE0		/* DCBA */
	palignr		$8, MSGTMP4, STATE1		/* HGFE */

	movdqu		STATE0, 0*16(DIGEST_PTR)
	movdqu		STATE1, 1*16(DIGEST_PTR)

.Ldone_hash:

	RET
SYM_FUNC_END(sha256_ni_transform)
331
/*
 * Drop the register aliases used by sha256_ni_transform() before the next
 * function defines its own.  These #undefs must mirror the #defines at the
 * top of the file: the message-schedule registers are named MSGTMP0-MSGTMP4
 * here (not MSG0-MSG3/TMP as in some other implementations), so undef those
 * names; otherwise they would silently remain defined for the rest of the
 * file (#undef of a never-defined macro is a no-op).
 */
#undef DIGEST_PTR
#undef DATA_PTR
#undef NUM_BLKS
#undef SHA256CONSTANTS
#undef MSG
#undef STATE0
#undef STATE1
#undef MSGTMP0
#undef MSGTMP1
#undef MSGTMP2
#undef MSGTMP3
#undef MSGTMP4
#undef SHUF_MASK
#undef ABEF_SAVE
#undef CDGH_SAVE
347
// parameters for __sha256_ni_finup2x()
#define SCTX		%rdi
#define DATA1		%rsi
#define DATA2		%rdx
#define LEN		%ecx	// signed; loop tests use jge/jz
#define LEN8		%cl
#define LEN64		%rcx
#define OUT1		%r8
#define OUT2		%r9

// other scalar variables
#define SHA256CONSTANTS	%rax	// points at K256 + 32*4 (see lea below)
#define COUNT		%r10	// total byte count = sctx->count + LEN
#define COUNT32		%r10d
#define FINAL_STEP	%r11d	// 0/1/2 state machine for the padding blocks

// rbx is used as a temporary.

#define MSG		%xmm0	// sha256rnds2 implicit operand
#define STATE0_A	%xmm1
#define STATE1_A	%xmm2
#define STATE0_B	%xmm3
#define STATE1_B	%xmm4
#define TMP_A		%xmm5
#define TMP_B		%xmm6
#define MSG0_A		%xmm7
#define MSG1_A		%xmm8
#define MSG2_A		%xmm9
#define MSG3_A		%xmm10
#define MSG0_B		%xmm11
#define MSG1_B		%xmm12
#define MSG2_B		%xmm13
#define MSG3_B		%xmm14
#define SHUF_MASK	%xmm15

// NOTE(review): these offsets must stay in sync with the C definition of
// struct sha256_state — confirm against the header if that struct changes.
#define OFFSETOF_STATE	0	// offsetof(struct sha256_state, state)
#define OFFSETOF_COUNT	32	// offsetof(struct sha256_state, count)
#define OFFSETOF_BUF	40	// offsetof(struct sha256_state, buf)
386
// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
// contain the current 4 message schedule words for the first and second message
// respectively.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words for each message.  m1_a-m3_a contain
// the next 3 groups of 4 message schedule words for the first message, and
// likewise m1_b-m3_b for the second.  After consuming the current value of
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
// likewise for *_b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
// cycle through the registers accordingly.
//
// \i is the round number of the first of the 4 rounds.  SHA256CONSTANTS is
// biased to K256 + 32*4, so (\i-32)*4(SHA256CONSTANTS) addresses K256[\i]
// while keeping the displacement small for all \i in 0..60.
// Clobbers MSG, TMP_A, and TMP_B; updates STATE{0,1}_{A,B} and (for \i < 48)
// the m0 operands.
.macro	do_4rounds_2x	i, m0_a, m1_a, m2_a, m3_a,  m0_b, m1_b, m2_b, m3_b
	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
	movdqa		TMP_A, TMP_B
	paddd		\m0_a, TMP_A
	paddd		\m0_b, TMP_B
.if \i < 48
	sha256msg1	\m1_a, \m0_a
	sha256msg1	\m1_b, \m0_b
.endif
	movdqa		TMP_A, MSG
	sha256rnds2	STATE0_A, STATE1_A
	movdqa		TMP_B, MSG
	sha256rnds2	STATE0_B, STATE1_B
	pshufd 		$0x0E, TMP_A, MSG
	sha256rnds2	STATE1_A, STATE0_A
	pshufd 		$0x0E, TMP_B, MSG
	sha256rnds2	STATE1_B, STATE0_B
.if \i < 48
	movdqa		\m3_a, TMP_A
	movdqa		\m3_b, TMP_B
	palignr		$4, \m2_a, TMP_A
	palignr		$4, \m2_b, TMP_B
	paddd		TMP_A, \m0_a
	paddd		TMP_B, \m0_b
	sha256msg2	\m3_a, \m0_a
	sha256msg2	\m3_b, \m0_b
.endif
.endm
427
//
// void __sha256_ni_finup2x(const struct sha256_state *sctx,
//			    const u8 *data1, const u8 *data2, int len,
//			    u8 out1[SHA256_DIGEST_SIZE],
//			    u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial state
// |sctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
// ABI: System V AMD64.  Saves/restores the callee-saved %rbx and %rbp;
// clobbers the argument registers, %rax, %r10, %r11, and %xmm0-%xmm15.
// Uses up to 128 bytes of 16-byte-aligned stack as a scratch data buffer.
//
SYM_FUNC_START(__sha256_ni_finup2x)
	// Allocate 128 bytes of stack space, 16-byte aligned.
	push		%rbx
	push		%rbp
	mov		%rsp, %rbp
	sub		$128, %rsp
	and		$~15, %rsp

	// Load the shuffle mask for swapping the endianness of 32-bit words.
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK

	// Set up pointer to the round constants, biased by 32 words so the
	// do_4rounds_2x displacement (\i-32)*4 stays small for all rounds.
	lea		K256+32*4(%rip), SHA256CONSTANTS

	// Initially we're not processing the final blocks.
	xor		FINAL_STEP, FINAL_STEP

	// Load the initial state from sctx->state.
	movdqu		OFFSETOF_STATE+0*16(SCTX), STATE0_A	// DCBA
	movdqu		OFFSETOF_STATE+1*16(SCTX), STATE1_A	// HGFE
	movdqa		STATE0_A, TMP_A
	punpcklqdq	STATE1_A, STATE0_A			// FEBA
	punpckhqdq	TMP_A, STATE1_A				// DCHG
	pshufd		$0x1B, STATE0_A, STATE0_A		// ABEF
	pshufd		$0xB1, STATE1_A, STATE1_A		// CDGH

	// Load sctx->count.  Take the mod 64 of it to get the number of bytes
	// that are buffered in sctx->buf.  Also save it in a register with LEN
	// added to it.
	mov		LEN, LEN	// zero-extend LEN into LEN64 (32-bit mov
					// clears the upper 32 bits)
	mov		OFFSETOF_COUNT(SCTX), %rbx
	lea		(%rbx, LEN64, 1), COUNT
	and		$63, %ebx
	jz		.Lfinup2x_enter_loop	// No bytes buffered?

	// %ebx bytes (1 to 63) are currently buffered in sctx->buf.  Load them
	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
	// just load 64 bytes from each of sctx->buf, DATA1, and DATA2
	// unconditionally and rearrange the data as needed.

	movdqu		OFFSETOF_BUF+0*16(SCTX), MSG0_A
	movdqu		OFFSETOF_BUF+1*16(SCTX), MSG1_A
	movdqu		OFFSETOF_BUF+2*16(SCTX), MSG2_A
	movdqu		OFFSETOF_BUF+3*16(SCTX), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)

	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqu		MSG0_A, 0*16(%rsp,%rbx)
	movdqu		MSG1_A, 1*16(%rsp,%rbx)
	movdqu		MSG2_A, 2*16(%rsp,%rbx)
	movdqu		MSG3_A, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_A
	movdqa		1*16(%rsp), MSG1_A
	movdqa		2*16(%rsp), MSG2_A
	movdqa		3*16(%rsp), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqu		MSG0_B, 0*16(%rsp,%rbx)
	movdqu		MSG1_B, 1*16(%rsp,%rbx)
	movdqu		MSG2_B, 2*16(%rsp,%rbx)
	movdqu		MSG3_B, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_B
	movdqa		1*16(%rsp), MSG1_B
	movdqa		2*16(%rsp), MSG2_B
	movdqa		3*16(%rsp), MSG3_B

	sub		$64, %rbx 	// rbx = buffered - 64
	sub		%rbx, DATA1	// DATA1 += 64 - buffered
	sub		%rbx, DATA2	// DATA2 += 64 - buffered
	add		%ebx, LEN	// LEN += buffered - 64
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
	jmp		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		$64, LEN
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
.Lfinup2x_loop:
	// Load the next two data blocks.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA1), MSG1_A
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA1), MSG2_A
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA1), MSG3_A
	movdqu		3*16(DATA2), MSG3_B
	add		$64, DATA1
	add		$64, DATA2
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
	pshufb		SHUF_MASK, MSG0_A
	pshufb		SHUF_MASK, MSG0_B
	pshufb		SHUF_MASK, MSG1_A
	pshufb		SHUF_MASK, MSG1_B
	pshufb		SHUF_MASK, MSG2_A
	pshufb		SHUF_MASK, MSG2_B
	pshufb		SHUF_MASK, MSG3_A
	pshufb		SHUF_MASK, MSG3_B
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	movdqa		STATE0_A, 0*16(%rsp)
	movdqa		STATE0_B, 1*16(%rsp)
	movdqa		STATE1_A, 2*16(%rsp)
	movdqa		STATE1_B, 3*16(%rsp)

	// Do the SHA-256 rounds on each block.
.irp i, 0, 16, 32, 48
	do_4rounds_2x	(\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
				   MSG0_B, MSG1_B, MSG2_B, MSG3_B
	do_4rounds_2x	(\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
				   MSG1_B, MSG2_B, MSG3_B, MSG0_B
	do_4rounds_2x	(\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
				   MSG2_B, MSG3_B, MSG0_B, MSG1_B
	do_4rounds_2x	(\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
				   MSG3_B, MSG0_B, MSG1_B, MSG2_B
.endr

	// Add the original state for each block.
	paddd		0*16(%rsp), STATE0_A
	paddd		1*16(%rsp), STATE0_B
	paddd		2*16(%rsp), STATE1_A
	paddd		3*16(%rsp), STATE1_B

	// Update LEN and loop back if more blocks remain.
	sub		$64, LEN
	jge		.Lfinup2x_loop

	// Check if any final blocks need to be handled.
	// FINAL_STEP = 2: all done
	// FINAL_STEP = 1: need to do count-only padding block
	// FINAL_STEP = 0: need to do the block with 0x80 padding byte
	cmp		$1, FINAL_STEP
	jg		.Lfinup2x_done
	je		.Lfinup2x_finalize_countonly
	add		$64, LEN
	jz		.Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - LEN] to get the needed padding block.  This
	// code relies on the data buffers being >= 64 bytes in length.
	mov		$64, %ebx
	sub		LEN, %ebx		// ebx = 64 - LEN
	sub		%rbx, DATA1		// DATA1 -= 64 - LEN
	sub		%rbx, DATA2		// DATA2 -= 64 - LEN
	mov		$0x80, FINAL_STEP   // using FINAL_STEP as a temporary
	movd		FINAL_STEP, MSG0_A
	pxor		MSG1_A, MSG1_A
	movdqa		MSG0_A, 4*16(%rsp)
	movdqa		MSG1_A, 5*16(%rsp)
	movdqa		MSG1_A, 6*16(%rsp)
	movdqa		MSG1_A, 7*16(%rsp)
	cmp		$56, LEN
	jge		1f	// will COUNT spill into its own block?
	shl		$3, COUNT	// byte count -> bit count
	bswap		COUNT		// store as __be64
	mov		COUNT, 56(%rsp,%rbx)
	mov		$2, FINAL_STEP	// won't need count-only block
	jmp		2f
1:
	mov		$1, FINAL_STEP	// will need count-only block
2:
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_A
	movdqu		1*16(%rsp,%rbx), MSG1_A
	movdqu		2*16(%rsp,%rbx), MSG2_A
	movdqu		3*16(%rsp,%rbx), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqa		MSG0_B, 0*16(%rsp)
	movdqa		MSG1_B, 1*16(%rsp)
	movdqa		MSG2_B, 2*16(%rsp)
	movdqa		MSG3_B, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_B
	movdqu		1*16(%rsp,%rbx), MSG1_B
	movdqu		2*16(%rsp,%rbx), MSG2_B
	movdqu		3*16(%rsp,%rbx), MSG3_B
	jmp		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block aligned message.
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	pxor		MSG0_A, MSG0_A
	jmp		1f

.Lfinup2x_finalize_blockaligned:
	mov		$0x80000000, %ebx
	movd		%ebx, MSG0_A
1:
	pxor		MSG1_A, MSG1_A
	pxor		MSG2_A, MSG2_A
	// ror $29 == rol $35 == (count << 3) with the 32-bit halves swapped:
	// the bit count, pre-arranged as the SHUF_MASK byte swap would leave
	// a __be64 (we jump past the pshufb step below).
	ror		$29, COUNT
	movq		COUNT, MSG3_A
	pslldq		$8, MSG3_A	// place count in words 14-15 of block
	movdqa		MSG0_A, MSG0_B
	pxor		MSG1_B, MSG1_B
	pxor		MSG2_B, MSG2_B
	movdqa		MSG3_A, MSG3_B
	mov		$2, FINAL_STEP
	jmp		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
	movdqa		STATE0_A, TMP_A
	movdqa		STATE0_B, TMP_B
	punpcklqdq	STATE1_A, STATE0_A		// GHEF
	punpcklqdq	STATE1_B, STATE0_B
	punpckhqdq	TMP_A, STATE1_A			// ABCD
	punpckhqdq	TMP_B, STATE1_B
	pshufd		$0xB1, STATE0_A, STATE0_A	// HGFE
	pshufd		$0xB1, STATE0_B, STATE0_B
	pshufd		$0x1B, STATE1_A, STATE1_A	// DCBA
	pshufd		$0x1B, STATE1_B, STATE1_B
	pshufb		SHUF_MASK, STATE0_A
	pshufb		SHUF_MASK, STATE0_B
	pshufb		SHUF_MASK, STATE1_A
	pshufb		SHUF_MASK, STATE1_B
	movdqu		STATE0_A, 1*16(OUT1)
	movdqu		STATE0_B, 1*16(OUT2)
	movdqu		STATE1_A, 0*16(OUT1)
	movdqu		STATE1_B, 0*16(OUT2)

	mov		%rbp, %rsp
	pop		%rbp
	pop		%rbx
	RET
SYM_FUNC_END(__sha256_ni_finup2x)
699
// SHA-256 round constants K[0..63], one 32-bit word per round, shared by
// both functions above via the SHA256CONSTANTS register.
.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

// pshufb mask that reverses the byte order within each 32-bit word, used to
// convert big-endian message words to the host's little-endian layout.
.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203
724