/*
 * Intel SHA Extensions optimized implementation of a SHA-256 update function
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * 	Sean Gulley <sean.m.gulley@intel.com>
 * 	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 	* Redistributions of source code must retain the above copyright
 * 	  notice, this list of conditions and the following disclaimer.
 * 	* Redistributions in binary form must reproduce the above copyright
 * 	  notice, this list of conditions and the following disclaimer in
 * 	  the documentation and/or other materials provided with the
 * 	  distribution.
 * 	* Neither the name of Intel Corporation nor the names of its
 * 	  contributors may be used to endorse or promote products derived
 * 	  from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

#define DIGEST_PTR	%rdi	/* 1st arg */
#define DATA_PTR	%rsi	/* 2nd arg */
#define NUM_BLKS	%rdx	/* 3rd arg */

#define SHA256CONSTANTS	%rax

#define MSG		%xmm0  /* sha256rnds2 implicit operand */
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSG0		%xmm3
#define MSG1		%xmm4
#define MSG2		%xmm5
#define MSG3		%xmm6
#define TMP		%xmm7

#define SHUF_MASK	%xmm8

#define ABEF_SAVE	%xmm9
#define CDGH_SAVE	%xmm10

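/*
 * Do 4 rounds of SHA-256 on the 4 message schedule words in \m0 (rounds \i
 * through \i+3).  For rounds that still need schedule words computed, this
 * also uses sha256msg1/sha256msg2 (plus palignr/paddd for the middle step) to
 * derive a further group of 4 schedule words; the registers rotate through
 * \m0-\m3 from call to call, in the same way the do_4rounds_2x macro below
 * documents for its *_a/*_b registers.
 */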
.macro do_4rounds	i, m0, m1, m2, m3
.if \i < 16
	movdqu		\i*4(DATA_PTR), \m0
	pshufb		SHUF_MASK, \m0
.endif
	movdqa		(\i-32)*4(SHA256CONSTANTS), MSG
	paddd		\m0, MSG
	sha256rnds2	STATE0, STATE1
.if \i >= 12 && \i < 60
	movdqa		\m0, TMP
	palignr		$4, \m3, TMP
	paddd		TMP, \m1
	sha256msg2	\m0, \m1
.endif
	punpckhqdq	MSG, MSG
	sha256rnds2	STATE1, STATE0
.if \i >= 4 && \i < 52
	sha256msg1	\m0, \m3
.endif
.endm

/*
 * Intel SHA Extensions optimized implementation of a SHA-256 update function
 *
 * The function takes a pointer to the current hash values, a pointer to the
 * input data, and the number of 64-byte blocks to process.  Once all blocks
 * have been processed, the digest is updated with the resulting hash value.
 * The function only processes complete blocks; there is no functionality to
 * store partial blocks.  All message padding and hash value initialization
 * must be done outside the update function.
 *
 * void sha256_ni_transform(uint32_t *digest, const void *data,
 *			    uint32_t numBlocks);
 * digest: pointer to digest
 * data: pointer to input data
 * numBlocks: number of blocks to process
 */
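
/*
 * A minimal caller sketch (C, illustrative only -- not code from this file).
 * 'state' starts as the standard SHA-256 initial hash value, and 'buf' must
 * hold nblocks complete, already-padded 64-byte blocks:
 *
 *	uint32_t state[8] = {
 *		0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
 *		0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
 *	};
 *	sha256_ni_transform(state, buf, nblocks);
 */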

.text
SYM_TYPED_FUNC_START(sha256_ni_transform)

	shl		$6, NUM_BLKS		/* convert to bytes */
	jz		.Ldone_hash
	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */

	/*
	 * load initial hash values
	 * Need to reorder these appropriately
	 * DCBA, HGFE -> ABEF, CDGH
	 */
	movdqu		0*16(DIGEST_PTR), STATE0	/* DCBA */
	movdqu		1*16(DIGEST_PTR), STATE1	/* HGFE */

	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0			/* FEBA */
	punpckhqdq	TMP, STATE1			/* DCHG */
	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
	pshufd		$0xB1, STATE1, STATE1		/* CDGH */
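
	/*
	 * In the letter comments above and below, each letter names one
	 * 32-bit word of the SHA-256 state (a-h), listed from the most
	 * significant word of the XMM register down to the least.  E.g.
	 * STATE0 = DCBA means word 0 holds 'a' and word 3 holds 'd'; the
	 * unpack/shuffle pairs convert between that order and the ABEF/CDGH
	 * layout operated on by sha256rnds2.
	 */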

	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
	lea		K256+32*4(%rip), SHA256CONSTANTS

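	/*
	 * Note: the lea above biases SHA256CONSTANTS 32*4 bytes into K256, so
	 * every (\i-32)*4 displacement used by do_4rounds fits in a signed
	 * 8-bit offset, keeping the instruction encodings short.
	 */
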
.Lloop0:
	/* Save hash values for addition after rounds */
	movdqa		STATE0, ABEF_SAVE
	movdqa		STATE1, CDGH_SAVE

.irp i, 0, 16, 32, 48
	do_4rounds	(\i + 0),  MSG0, MSG1, MSG2, MSG3
	do_4rounds	(\i + 4),  MSG1, MSG2, MSG3, MSG0
	do_4rounds	(\i + 8),  MSG2, MSG3, MSG0, MSG1
	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2
.endr

	/* Add current hash values with previously saved */
	paddd		ABEF_SAVE, STATE0
	paddd		CDGH_SAVE, STATE1

	/* Increment data pointer and loop if more to process */
	add		$64, DATA_PTR
	cmp		NUM_BLKS, DATA_PTR
	jne		.Lloop0

	/* Write hash values back in the correct order */
	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0			/* GHEF */
	punpckhqdq	TMP, STATE1			/* ABCD */
	pshufd		$0xB1, STATE0, STATE0		/* HGFE */
	pshufd		$0x1B, STATE1, STATE1		/* DCBA */

	movdqu		STATE1, 0*16(DIGEST_PTR)
	movdqu		STATE0, 1*16(DIGEST_PTR)

.Ldone_hash:

	RET
SYM_FUNC_END(sha256_ni_transform)

#undef DIGEST_PTR
#undef DATA_PTR
#undef NUM_BLKS
#undef SHA256CONSTANTS
#undef MSG
#undef STATE0
#undef STATE1
#undef MSG0
#undef MSG1
#undef MSG2
#undef MSG3
#undef TMP
#undef SHUF_MASK
#undef ABEF_SAVE
#undef CDGH_SAVE

// parameters for __sha256_ni_finup2x()
#define SCTX		%rdi
#define DATA1		%rsi
#define DATA2		%rdx
#define LEN		%ecx
#define LEN8		%cl
#define LEN64		%rcx
#define OUT1		%r8
#define OUT2		%r9

// other scalar variables
#define SHA256CONSTANTS	%rax
#define COUNT		%r10
#define COUNT32		%r10d
#define FINAL_STEP	%r11d

// rbx is used as a temporary.

#define MSG		%xmm0	// sha256rnds2 implicit operand
#define STATE0_A	%xmm1
#define STATE1_A	%xmm2
#define STATE0_B	%xmm3
#define STATE1_B	%xmm4
#define TMP_A		%xmm5
#define TMP_B		%xmm6
#define MSG0_A		%xmm7
#define MSG1_A		%xmm8
#define MSG2_A		%xmm9
#define MSG3_A		%xmm10
#define MSG0_B		%xmm11
#define MSG1_B		%xmm12
#define MSG2_B		%xmm13
#define MSG3_B		%xmm14
#define SHUF_MASK	%xmm15

#define OFFSETOF_STATE	0	// offsetof(struct sha256_state, state)
#define OFFSETOF_COUNT	32	// offsetof(struct sha256_state, count)
#define OFFSETOF_BUF	40	// offsetof(struct sha256_state, buf)

// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
// contain the current 4 message schedule words for the first and second message
// respectively.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words for each message.  m1_a-m3_a contain
// the next 3 groups of 4 message schedule words for the first message, and
// likewise m1_b-m3_b for the second.  After consuming the current value of
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
// likewise for *_b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
// cycle through the registers accordingly.
.macro	do_4rounds_2x	i, m0_a, m1_a, m2_a, m3_a,  m0_b, m1_b, m2_b, m3_b
	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
	movdqa		TMP_A, TMP_B
	paddd		\m0_a, TMP_A
	paddd		\m0_b, TMP_B
.if \i < 48
	sha256msg1	\m1_a, \m0_a
	sha256msg1	\m1_b, \m0_b
.endif
	movdqa		TMP_A, MSG
	sha256rnds2	STATE0_A, STATE1_A
	movdqa		TMP_B, MSG
	sha256rnds2	STATE0_B, STATE1_B
	pshufd		$0x0E, TMP_A, MSG
	sha256rnds2	STATE1_A, STATE0_A
	pshufd		$0x0E, TMP_B, MSG
	sha256rnds2	STATE1_B, STATE0_B
.if \i < 48
	movdqa		\m3_a, TMP_A
	movdqa		\m3_b, TMP_B
	palignr		$4, \m2_a, TMP_A
	palignr		$4, \m2_b, TMP_B
	paddd		TMP_A, \m0_a
	paddd		TMP_B, \m0_b
	sha256msg2	\m3_a, \m0_a
	sha256msg2	\m3_b, \m0_b
.endif
.endm
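
// Note: here the second sha256rnds2 of each pair takes its upper two round
// constants via "pshufd $0x0E" (which copies the high qword of TMP_* into the
// low qword of MSG) rather than the punpckhqdq used above, since a single
// non-destructive shuffle avoids an extra register-to-register move.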

//
// void __sha256_ni_finup2x(const struct sha256_state *sctx,
//			    const u8 *data1, const u8 *data2, int len,
//			    u8 out1[SHA256_DIGEST_SIZE],
//			    u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial state
// |sctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
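// A minimal caller sketch (illustrative only; how sctx is produced is up to
// the caller, e.g. by hashing a common prefix of both messages):
//
//	u8 out1[SHA256_DIGEST_SIZE], out2[SHA256_DIGEST_SIZE];
//
//	__sha256_ni_finup2x(&sctx, data1, data2, len, out1, out2);
//	// out1 = SHA-256(prefix || data1), out2 = SHA-256(prefix || data2)
//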
SYM_FUNC_START(__sha256_ni_finup2x)
	// Allocate 128 bytes of stack space, 16-byte aligned.
	push		%rbx
	push		%rbp
	mov		%rsp, %rbp
	sub		$128, %rsp
	and		$~15, %rsp
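	// (%rbp preserves the pre-alignment %rsp so the epilogue can restore
	// it; %rbx is saved because it is used as a temporary below.)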

	// Load the shuffle mask for swapping the endianness of 32-bit words.
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK

	// Set up pointer to the round constants.
	lea		K256+32*4(%rip), SHA256CONSTANTS

	// Initially we're not processing the final blocks.
	xor		FINAL_STEP, FINAL_STEP

	// Load the initial state from sctx->state.
	movdqu		OFFSETOF_STATE+0*16(SCTX), STATE0_A	// DCBA
	movdqu		OFFSETOF_STATE+1*16(SCTX), STATE1_A	// HGFE
	movdqa		STATE0_A, TMP_A
	punpcklqdq	STATE1_A, STATE0_A			// FEBA
	punpckhqdq	TMP_A, STATE1_A				// DCHG
	pshufd		$0x1B, STATE0_A, STATE0_A		// ABEF
	pshufd		$0xB1, STATE1_A, STATE1_A		// CDGH

	// Load sctx->count.  Take the mod 64 of it to get the number of bytes
	// that are buffered in sctx->buf.  Also save it in a register with LEN
	// added to it.
	mov		LEN, LEN	// zero-extend LEN to 64 bits
	mov		OFFSETOF_COUNT(SCTX), %rbx
	lea		(%rbx, LEN64, 1), COUNT
	and		$63, %ebx
	jz		.Lfinup2x_enter_loop	// No bytes buffered?

	// %ebx bytes (1 to 63) are currently buffered in sctx->buf.  Load them
	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
	// just load 64 bytes from each of sctx->buf, DATA1, and DATA2
	// unconditionally and rearrange the data as needed.

	movdqu		OFFSETOF_BUF+0*16(SCTX), MSG0_A
	movdqu		OFFSETOF_BUF+1*16(SCTX), MSG1_A
	movdqu		OFFSETOF_BUF+2*16(SCTX), MSG2_A
	movdqu		OFFSETOF_BUF+3*16(SCTX), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)

	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqu		MSG0_A, 0*16(%rsp,%rbx)
	movdqu		MSG1_A, 1*16(%rsp,%rbx)
	movdqu		MSG2_A, 2*16(%rsp,%rbx)
	movdqu		MSG3_A, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_A
	movdqa		1*16(%rsp), MSG1_A
	movdqa		2*16(%rsp), MSG2_A
	movdqa		3*16(%rsp), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqu		MSG0_B, 0*16(%rsp,%rbx)
	movdqu		MSG1_B, 1*16(%rsp,%rbx)
	movdqu		MSG2_B, 2*16(%rsp,%rbx)
	movdqu		MSG3_B, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_B
	movdqa		1*16(%rsp), MSG1_B
	movdqa		2*16(%rsp), MSG2_B
	movdqa		3*16(%rsp), MSG3_B

	sub		$64, %rbx	// rbx = buffered - 64
	sub		%rbx, DATA1	// DATA1 += 64 - buffered
	sub		%rbx, DATA2	// DATA2 += 64 - buffered
	add		%ebx, LEN	// LEN += buffered - 64
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
	jmp		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		$64, LEN
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
.Lfinup2x_loop:
	// Load the next two data blocks.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA1), MSG1_A
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA1), MSG2_A
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA1), MSG3_A
	movdqu		3*16(DATA2), MSG3_B
	add		$64, DATA1
	add		$64, DATA2
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
	pshufb		SHUF_MASK, MSG0_A
	pshufb		SHUF_MASK, MSG0_B
	pshufb		SHUF_MASK, MSG1_A
	pshufb		SHUF_MASK, MSG1_B
	pshufb		SHUF_MASK, MSG2_A
	pshufb		SHUF_MASK, MSG2_B
	pshufb		SHUF_MASK, MSG3_A
	pshufb		SHUF_MASK, MSG3_B
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	movdqa		STATE0_A, 0*16(%rsp)
	movdqa		STATE0_B, 1*16(%rsp)
	movdqa		STATE1_A, 2*16(%rsp)
	movdqa		STATE1_B, 3*16(%rsp)

	// Do the SHA-256 rounds on each block.
.irp i, 0, 16, 32, 48
	do_4rounds_2x	(\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
				   MSG0_B, MSG1_B, MSG2_B, MSG3_B
	do_4rounds_2x	(\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
				   MSG1_B, MSG2_B, MSG3_B, MSG0_B
	do_4rounds_2x	(\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
				   MSG2_B, MSG3_B, MSG0_B, MSG1_B
	do_4rounds_2x	(\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
				   MSG3_B, MSG0_B, MSG1_B, MSG2_B
.endr

	// Add the original state for each block.
	paddd		0*16(%rsp), STATE0_A
	paddd		1*16(%rsp), STATE0_B
	paddd		2*16(%rsp), STATE1_A
	paddd		3*16(%rsp), STATE1_B

	// Update LEN and loop back if more blocks remain.
	sub		$64, LEN
	jge		.Lfinup2x_loop

	// Check if any final blocks need to be handled.
	// FINAL_STEP = 2: all done
	// FINAL_STEP = 1: need to do count-only padding block
	// FINAL_STEP = 0: need to do the block with 0x80 padding byte
	cmp		$1, FINAL_STEP
	jg		.Lfinup2x_done
	je		.Lfinup2x_finalize_countonly
	add		$64, LEN
	jz		.Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - LEN] to get the needed padding block.  This
	// code relies on the data buffers being >= 64 bytes in length.
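	//
	// For example, with LEN = 10: the last 64 data bytes are copied to
	// sp[0..63] (so the final 10 data bytes sit at sp[54..63]), the 0x80
	// byte sits at sp[64], and the 64-byte load from &sp[54] then yields
	// those 10 data bytes followed by the padding.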
	mov		$64, %ebx
	sub		LEN, %ebx		// ebx = 64 - LEN
	sub		%rbx, DATA1		// DATA1 -= 64 - LEN
	sub		%rbx, DATA2		// DATA2 -= 64 - LEN
	mov		$0x80, FINAL_STEP	// using FINAL_STEP as a temporary
	movd		FINAL_STEP, MSG0_A
	pxor		MSG1_A, MSG1_A
	movdqa		MSG0_A, 4*16(%rsp)
	movdqa		MSG1_A, 5*16(%rsp)
	movdqa		MSG1_A, 6*16(%rsp)
	movdqa		MSG1_A, 7*16(%rsp)
	cmp		$56, LEN
	jge		1f	// will COUNT spill into its own block?
	shl		$3, COUNT	// convert byte count to bit count
	bswap		COUNT		// store the bit count as __be64
	mov		COUNT, 56(%rsp,%rbx)
	mov		$2, FINAL_STEP	// won't need count-only block
	jmp		2f
1:
	mov		$1, FINAL_STEP	// will need count-only block
2:
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_A
	movdqu		1*16(%rsp,%rbx), MSG1_A
	movdqu		2*16(%rsp,%rbx), MSG2_A
	movdqu		3*16(%rsp,%rbx), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqa		MSG0_B, 0*16(%rsp)
	movdqa		MSG1_B, 1*16(%rsp)
	movdqa		MSG2_B, 2*16(%rsp)
	movdqa		MSG3_B, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_B
	movdqu		1*16(%rsp,%rbx), MSG1_B
	movdqu		2*16(%rsp,%rbx), MSG2_B
	movdqu		3*16(%rsp,%rbx), MSG3_B
	jmp		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block-aligned message.
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	pxor		MSG0_A, MSG0_A
	jmp		1f

.Lfinup2x_finalize_blockaligned:
	mov		$0x80000000, %ebx
	movd		%ebx, MSG0_A
1:
	pxor		MSG1_A, MSG1_A
	pxor		MSG2_A, MSG2_A
	ror		$29, COUNT	// bit count (COUNT << 3) with 32-bit halves swapped
	movq		COUNT, MSG3_A
	pslldq		$8, MSG3_A
	movdqa		MSG0_A, MSG0_B
	pxor		MSG1_B, MSG1_B
	pxor		MSG2_B, MSG2_B
	movdqa		MSG3_A, MSG3_B
	mov		$2, FINAL_STEP
	jmp		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
	movdqa		STATE0_A, TMP_A
	movdqa		STATE0_B, TMP_B
	punpcklqdq	STATE1_A, STATE0_A		// GHEF
	punpcklqdq	STATE1_B, STATE0_B
	punpckhqdq	TMP_A, STATE1_A			// ABCD
	punpckhqdq	TMP_B, STATE1_B
	pshufd		$0xB1, STATE0_A, STATE0_A	// HGFE
	pshufd		$0xB1, STATE0_B, STATE0_B
	pshufd		$0x1B, STATE1_A, STATE1_A	// DCBA
	pshufd		$0x1B, STATE1_B, STATE1_B
	pshufb		SHUF_MASK, STATE0_A
	pshufb		SHUF_MASK, STATE0_B
	pshufb		SHUF_MASK, STATE1_A
	pshufb		SHUF_MASK, STATE1_B
	movdqu		STATE0_A, 1*16(OUT1)
	movdqu		STATE0_B, 1*16(OUT2)
	movdqu		STATE1_A, 0*16(OUT1)
	movdqu		STATE1_B, 0*16(OUT2)

	mov		%rbp, %rsp
	pop		%rbp
	pop		%rbx
	RET
SYM_FUNC_END(__sha256_ni_finup2x)

.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203