• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 *     http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16#include "hitls_build.h"
17#ifdef HITLS_CRYPTO_SM3
18
19.file	"sm3_x86_64.s"
20.text
21
# Working variables A..H of the compression function (32-bit views of r8..r15).
.set	A, %r8d
.set	B, %r9d
.set	C, %r10d
.set	D, %r11d
.set	E, %r12d
.set	F, %r13d
.set	G, %r14d
.set	H, %r15d

# Function arguments as they arrive in registers.
.set	STATE, %rdi
.set	DATA, %rsi
.set	NUM, %rdx

# Scratch registers. ADDR, BOOL_OUT and SS2 are all views of %rax, so at most
# one of those three names holds a live value at any point.
.set	ADDR, %rax
.set	BOOL_OUT, %eax
.set	SS1, %ebx
.set	SS2, %eax

# Vector registers for the SIMD message schedule. X0..X5 carry message words;
# X6/X7 are the same registers the ROTATE macro overwrites (%xmm6/%xmm7).
.set	X0, %xmm0
.set	X1, %xmm1
.set	X2, %xmm2
.set	X3, %xmm3
.set	X4, %xmm4
.set	X5, %xmm5
.set	X6, %xmm6
.set	X7, %xmm7
# Constant vpshufb control vectors, loaded once from MASKS.
.set	R16, %xmm13
.set	R24, %xmm14
.set	SHUFFLEMASK, %xmm15
51
.macro	FF0		X Y Z
	# Boolean function used by rounds 0..15: %eax <- X ^ Y ^ Z
	# Clobbers %eax and the flags; X, Y and Z are left unchanged.
	movl	\Z,%eax
	xorl	\Y,%eax
	xorl	\X,%eax
.endm
58
.macro	FF1		X Y Z
	# Majority function used by rounds 16..63:
	#   %eax <- (X & Y) | (X & Z) | (Y & Z)
	# evaluated in the cheaper form (X & (Y | Z)) | (Y & Z).
	# Clobbers %eax, %ebx and the flags.
	movl	\Y,%ebx
	movl	%ebx,%eax
	andl	\Z,%ebx			# %ebx = Y & Z
	orl		\Z,%eax			# %eax = Y | Z
	andl	\X,%eax			# %eax = X & (Y | Z)
	orl		%ebx,%eax
.endm
69
.macro	GG0		X Y Z
	# Boolean function GG for rounds 0..15; same three-way XOR as FF0:
	#   %eax <- X ^ Y ^ Z
	# Clobbers %eax and the flags.
	movl	\X,%eax
	xorl	\Y,%eax
	xorl	\Z,%eax
.endm
73
.macro	GG1		X Y Z
	# Choose function used by rounds 16..63:
	#   %eax <- (X & Y) | (~X & Z)
	# X must be a register (it is the inverted andn source operand).
	# Clobbers %eax, %ebx and the flags.
	andn	\Z,\X,%eax		# %eax = ~X & Z
	movl	\X,%ebx
	andl	\Y,%ebx			# %ebx = X & Y
	orl		%ebx,%eax
.endm
81
.macro	P0	X
	# Permutation P0, in place: X <- X ^ (X <<< 9) ^ (X <<< 17)
	# A left rotate by n is encoded as rorx by 32-n.
	# Clobbers %eax, %ebx and the flags.
	rorx	$23,\X,%ebx		# X <<< 9
	rorx	$15,\X,%eax		# X <<< 17
	xorl	%ebx,\X
	xorl	%eax,\X
.endm
88
.macro	P1	X
	# Permutation P1, in place: X <- X ^ (X <<< 15) ^ (X <<< 23)
	# A left rotate by n is encoded as rorx by 32-n.
	# Clobbers %eax, %ebx and the flags.
	rorx	$17,\X,%ebx		# X <<< 15
	rorx	$9,\X,%eax		# X <<< 23
	xorl	%ebx,\X
	xorl	%eax,\X
.endm
95
.macro	ROUND	FF GG Ar Br Cr Dr Er Fr Gr Hr TJ
	# One compression round on the eight working registers.
	#   FF, GG    names of the boolean-function macros for this round
	#   Ar..Hr    registers currently playing the roles A..H
	#   TJ        round constant, already rotated: Tj <<< (j mod 32)
	# On entry Dr and Hr already include the message terms added by the
	# ROUND_00_15 / ROUND_16_63 wrappers.
	# On exit:  Dr = TT1, Hr = P0(TT2), Br = B <<< 9, Fr = F <<< 19.
	# The callers rotate the register roles between rounds instead of moving
	# values. Clobbers %eax, %ebx and the flags.

	# SS1 <- ((A <<< 12) + E + TJ) <<< 7, with A <<< 12 parked in SS2
	rorx	$20,\Ar,SS2
	movl	SS2,SS1
	addl	\Er,SS1
	addl	$\TJ,SS1
	rorx	$25,SS1,SS1
	# SS2 <- SS1 ^ (A <<< 12)
	xorl	SS1,SS2
	# Fold SS1/SS2 in now: the FF/GG macros below overwrite %eax and %ebx.
	addl	SS1,\Hr
	addl	SS2,\Dr
	# TT1 (Dr) += FF(A,B,C)
	\FF		\Ar \Br \Cr
	addl	BOOL_OUT,\Dr
	# TT2 (Hr) += GG(E,F,G)
	\GG		\Er \Fr \Gr
	addl	BOOL_OUT,\Hr
	# F <- F <<< 19, B <- B <<< 9
	rorx	$13,\Fr,\Fr
	rorx	$23,\Br,\Br
	# Hr <- P0(TT2)
	P0		\Hr
.endm
124
.macro	ROUND_00_15	Ar Br Cr Dr Er Fr Gr Hr TJ WADDR WPADDR
	# Round j in 0..15: add the message terms, then run ROUND with FF0/GG0.
	#   WADDR   %rsp-relative byte offset of W(j)
	#   WPADDR  %rsp-relative byte offset of W'(j)
	addl	\WPADDR(%rsp),\Dr		# D += W'(j)
	addl	\WADDR(%rsp),\Hr		# H += W(j)
	ROUND	FF0 GG0 \Ar \Br \Cr \Dr \Er \Fr \Gr \Hr \TJ
.endm
132
.macro	ROUND_16_63	Ar Br Cr Dr Er Fr Gr Hr TJ WADDR WPADDR
	# Round j in 16..63: add the message terms, then run ROUND with FF1/GG1.
	#   WADDR   %rsp-relative byte offset of W(j)
	#   WPADDR  %rsp-relative byte offset of W'(j)
	addl	\WPADDR(%rsp),\Dr		# D += W'(j)
	addl	\WADDR(%rsp),\Hr		# H += W(j)
	ROUND	FF1 GG1 \Ar \Br \Cr \Dr \Er \Fr \Gr \Hr \TJ
.endm
140
.macro	ROTATE	IN OUT LEFT RIGHT
	# Per-lane 32-bit rotate: OUT <- (IN << LEFT) ^ (IN >> RIGHT).
	# This is a left rotate by LEFT when LEFT + RIGHT = 32, which every caller
	# in this file satisfies.
	# Clobbers X6 and X7. IN may be X7 (the left shift reads it before the
	# right shift overwrites it), so the shift order must stay as is.
	# IN must not be X6.
	vpslld		$\LEFT,\IN,X6
	vpsrld		$\RIGHT,\IN,X7
	vpxor		X7,X6,\OUT
.endm
146
.macro	WORD_SCHEDULER_00_11	I
	# Scalar W' for rounds 0..11, where both inputs are plain message words.
	#   I  %rsp-relative byte offset of W(i)
	# Stores W'(i) = W(i) ^ W(i+4) into the scratch slot at 284(%rsp).
	# Clobbers %ecx and the flags.
	movl	\I+16(%rsp),%ecx		# W(i+4)
	xorl	\I(%rsp),%ecx			# ^ W(i)
	movl	%ecx,284(%rsp)
.endm
154
.macro	WORD_SCHEDULER_12_63	I
	# Scalar message expansion, one word per invocation.
	#   I  %rsp-relative byte offset of W(j-16), where W(j) is the word
	#      being produced (so W(j) lands at I+64).
	# Computes
	#   W(j) = P1( W(j-16) ^ W(j-9) ^ (W(j-3) <<< 15) ) ^ (W(j-13) <<< 7) ^ W(j-6)
	# stores it, then stores W'(j-4) = W(j-4) ^ W(j) into the scratch slot at
	# 284(%rsp) for the round that runs next.
	# Clobbers %eax, %ebx, %ecx and the flags.
	rorx	$17,\I+52(%rsp),%ecx	# W(j-3) <<< 15
	xorl	\I+28(%rsp),%ecx		# ^ W(j-9)
	xorl	\I(%rsp),%ecx			# ^ W(j-16)
	P1		%ecx
	rorx	$25,\I+12(%rsp),%eax	# W(j-13) <<< 7
	xorl	\I+40(%rsp),%eax		# ^ W(j-6)
	xorl	%eax,%ecx
	movl	%ecx,\I+64(%rsp)		# store W(j)
	xorl	\I+48(%rsp),%ecx		# ^ W(j-4)
	movl	%ecx,284(%rsp)			# store W'(j-4)
.endm
170
.macro	LOAD_WORD_FOR_SCHEDULER		START
	# Load all six operand vectors for MESSAGE_SCHEDULER from the W array.
	#   START  %rsp-relative byte offset of W(j-16), j = first word to produce
	# Each register receives four consecutive 32-bit words:
	#   X0 = W(j-16..)   X1 = W(j-13..)   X2 = W(j-9..)
	#   X3 = W(j-6..)    X4 = W(j-4..)    X5 = W(j-3..)
	vmovdqu		\START+52(%rsp),X5
	vmovdqu		\START+48(%rsp),X4
	vmovdqu		\START+40(%rsp),X3
	vmovdqu		\START+28(%rsp),X2
	vmovdqu		\START+12(%rsp),X1
	vmovdqu		\START(%rsp),X0
.endm
179
.macro	LOAD_WORD_FOR_SCHEDULER_FAST	START W0 W1 W2 W3 W4 W5
	# Reduced loader: reads only the W1, W4 and W5 operands from memory.
	# W0, W2 and W3 are accepted for a uniform argument list but not touched
	# here; the caller is expected to name registers that already hold them.
	#   START  %rsp-relative byte offset of W(j-16)
	vmovdqu		\START+52(%rsp),\W5		# W(j-3..)
	vmovdqu		\START+48(%rsp),\W4		# W(j-4..)
	vmovdqu		\START+12(%rsp),\W1		# W(j-13..)
.endm
185
.macro	MESSAGE_SCHEDULER	START W0 W1 W2 W3 W4 W5
	# Vector message expansion: four lanes of W(j..) and W'(j-4..) at once.
	#   START  %rsp-relative byte offset of W(j-16)
	#   W0     in: W(j-16..)   out: W(j..)
	#   W1     in: W(j-13..)   (preserved)
	#   W2     in: W(j-9..)    (overwritten)
	#   W3     in: W(j-6..)    (preserved)
	#   W4     in: W(j-4..)    out: W'(j-4..)
	#   W5     in: W(j-3..)    (preserved)
	# Stores W0 to START+64(%rsp) and W4 to 284(%rsp). Clobbers X6 and X7.
	# Callers in this file advance START by 12 bytes per invocation, so only
	# the first three lanes of each result are consumed.

	# T = W(j-16) ^ W(j-9) ^ (W(j-3) <<< 15)
	vpxor		\W0,\W2,\W0
	ROTATE		\W5,\W2,15,17
	vpxor		\W0,\W2,\W0

	# P1(T) = T ^ (T <<< 15) ^ (T <<< 23).
	# Byte shuffles give (T <<< 16) ^ (T <<< 24); rotating that right by one
	# bit (left by 31) yields the two wanted rotations together.
	vpshufb		R24,\W0,X7
	vpshufb		R16,\W0,X6
	vpxor		X7,X6,X7
	ROTATE		X7,X7,31,1
	vpxor		\W0,X7,\W0
	# W(j) = P1(T) ^ (W(j-13) <<< 7) ^ W(j-6)
	ROTATE		\W1,\W2,7,25
	vpxor		\W0,\W2,\W0
	vpxor		\W0,\W3,\W0
	# W'(j-4) = W(j-4) ^ W(j)
	vpxor		\W4,\W0,\W4

	vmovdqu		\W0,\START+64(%rsp)
	vmovdqu		\W4,284(%rsp)
.endm
206
.macro	MESSAGE_SCHEDULER_FAST	START W0 W1 W2 W3 W4 W5
	# MESSAGE_SCHEDULER preceded by a partial reload: only W1, W4 and W5 are
	# fetched from the W array; W0, W2 and W3 must already be live in the
	# registers named by the caller.
	vmovdqu		\START+12(%rsp),\W1
	vmovdqu		\START+48(%rsp),\W4
	vmovdqu		\START+52(%rsp),\W5
	MESSAGE_SCHEDULER	\START \W0 \W1 \W2 \W3 \W4 \W5
.endm
211
212
##### SM3 #####
# void SM3_CompressSIMD(uint32_t state[8], const uint8_t *data, uint32_t blockCnt)
# ABI: System V AMD64, leaf function.
# state (in/out)	%rdi	8 x 32-bit chaining words, updated once per block
# data				%rsi	blockCnt consecutive 64-byte message blocks
# blockCnt			%edx	number of blocks; returns immediately when zero
#
# Uses AVX (v-prefixed SIMD), BMI1 (andn) and BMI2 (rorx) instructions.
# Frame (348 bytes below the return address, offsets from %rsp):
#   W(0..67)		at offsets up to 271, four bytes per word
#   W' scratch		at 284..299
#   saved registers	at 300..347 (rbx, rbp, r12, r13, r14, r15)
.globl	SM3_CompressSIMD
.type	SM3_CompressSIMD, @function
.align	64
SM3_CompressSIMD:
	# blockCnt is a 32-bit argument, so bits 63:32 of %rdx are unspecified
	# on entry. Zero-extend before the 64-bit test and loop counter use it.
	movl	%edx,%edx
	testq	NUM,NUM
	jz		.Lsm3_avx_ret

	# Store Registers
	subq	$348,%rsp
	movq	%rbx,300(%rsp)
	movq	%rbp,8+300(%rsp)
	movq	%r12,16+300(%rsp)
	movq	%r13,24+300(%rsp)
	movq	%r14,32+300(%rsp)
	movq	%r15,40+300(%rsp)

.Lsm3_avx_init:
	# Shuffle controls stay resident in xmm13..xmm15 for the whole call
	leaq		MASKS(%rip),ADDR
	vmovdqa		(ADDR),SHUFFLEMASK
	vmovdqa		16(ADDR),R16
	vmovdqa		32(ADDR),R24

.Lsm3_avx_update:
	# Load Data (Big Endian): byte-swap each 32-bit word into W(0..15)
	vmovdqu	(DATA),%xmm0
	vmovdqu	16(DATA),%xmm1
	vmovdqu	32(DATA),%xmm2
	vmovdqu	48(DATA),%xmm3
	vpshufb	SHUFFLEMASK,%xmm0,%xmm0
	vpshufb	SHUFFLEMASK,%xmm1,%xmm1
	vpshufb	SHUFFLEMASK,%xmm2,%xmm2
	vpshufb	SHUFFLEMASK,%xmm3,%xmm3
	vmovdqu	%xmm0,(%rsp)
	vmovdqu	%xmm1,16(%rsp)
	vmovdqu	%xmm2,32(%rsp)
	vmovdqu	%xmm3,48(%rsp)
	# W'(0..11) = W(i) ^ W(i+4), four lanes per register
	vpxor	%xmm1,%xmm0,%xmm0
	vpxor	%xmm2,%xmm1,%xmm1
	vpxor	%xmm3,%xmm2,%xmm2

	# Load State
	movl	(STATE),A
	movl	4(STATE),B
	movl	8(STATE),C
	movl	12(STATE),D
	movl	16(STATE),E
	movl	20(STATE),F
	movl	24(STATE),G
	movl	28(STATE),H

	# ROUND 0-11
	vmovdqu	%xmm0,284(%rsp)
	ROUND_00_15	A B C D E F G H	0x79CC4519 0 284
	ROUND_00_15	D A B C H E F G 0xF3988A32 4 288
	ROUND_00_15	C D A B G H E F 0xE7311465 8 292
	ROUND_00_15	B C D A F G H E 0xCE6228CB 12 296
	vmovdqu	%xmm1,284(%rsp)
	ROUND_00_15	A B C D E F G H	0x9CC45197 16 284
	ROUND_00_15	D A B C H E F G 0x3988A32F 20 288
	ROUND_00_15	C D A B G H E F 0x7311465E 24 292
	ROUND_00_15	B C D A F G H E 0xE6228CBC 28 296
	vmovdqu	%xmm2,284(%rsp)
	ROUND_00_15	A B C D E F G H	0xCC451979 32 284
	ROUND_00_15	D A B C H E F G 0x988A32F3 36 288
	ROUND_00_15	C D A B G H E F 0x311465E7 40 292
	ROUND_00_15	B C D A F G H E 0x6228CBCE 44 296
	# ROUND 12-15
	# From here on each scheduler call feeds the next three rounds.
	LOAD_WORD_FOR_SCHEDULER	0
	MESSAGE_SCHEDULER	0 X0 X1 X2 X3 X4 X5
	ROUND_00_15	A B C D E F G H	0xC451979C 48 284
	ROUND_00_15	D A B C H E F G 0x88A32F39 52 288
	ROUND_00_15	C D A B G H E F 0x11465E73 56 292
	MESSAGE_SCHEDULER_FAST	12 X1 X0 X3 X5 X4 X2
	ROUND_00_15	B C D A F G H E 0x228CBCE6 60 284
	# ROUND 16-63
	ROUND_16_63	A B C D E F G H	0x9D8A7A87 64 288
	ROUND_16_63	D A B C H E F G 0x3B14F50F 68 292
	MESSAGE_SCHEDULER_FAST	24 X0 X1 X5 X2 X4 X3
	ROUND_16_63	C D A B G H E F 0x7629EA1E 72 284
	ROUND_16_63	B C D A F G H E 0xEC53D43C 76 288
	ROUND_16_63	A B C D E F G H	0xD8A7A879 80 292
	MESSAGE_SCHEDULER_FAST	36 X1 X0 X2 X3 X4 X5
	ROUND_16_63	D A B C H E F G 0xB14F50F3 84 284
	ROUND_16_63	C D A B G H E F 0x629EA1E7 88 288
	ROUND_16_63	B C D A F G H E 0xC53D43CE 92 292
	MESSAGE_SCHEDULER_FAST	48 X0 X1 X3 X5 X4 X2
	ROUND_16_63	A B C D E F G H	0x8A7A879D 96 284
	ROUND_16_63	D A B C H E F G 0x14F50F3B 100 288
	ROUND_16_63	C D A B G H E F 0x29EA1E76 104 292
	MESSAGE_SCHEDULER_FAST	60 X1 X0 X5 X2 X4 X3
	ROUND_16_63	B C D A F G H E 0x53D43CEC 108 284
	ROUND_16_63	A B C D E F G H	0xA7A879D8 112 288
	ROUND_16_63	D A B C H E F G 0x4F50F3B1 116 292
	MESSAGE_SCHEDULER_FAST	72 X0 X1 X2 X3 X4 X5
	ROUND_16_63	C D A B G H E F 0x9EA1E762 120 284
	ROUND_16_63	B C D A F G H E 0x3D43CEC5 124 288
	ROUND_16_63	A B C D E F G H	0x7A879D8A 128 292
	MESSAGE_SCHEDULER_FAST	84 X1 X0 X3 X5 X4 X2
	ROUND_16_63	D A B C H E F G 0xF50F3B14 132 284
	ROUND_16_63	C D A B G H E F 0xEA1E7629 136 288
	ROUND_16_63	B C D A F G H E 0xD43CEC53 140 292
	MESSAGE_SCHEDULER_FAST	96 X0 X1 X5 X2 X4 X3
	ROUND_16_63	A B C D E F G H	0xA879D8A7 144 284
	ROUND_16_63	D A B C H E F G 0x50F3B14F 148 288
	ROUND_16_63	C D A B G H E F 0xA1E7629E 152 292
	MESSAGE_SCHEDULER_FAST	108 X1 X0 X2 X3 X4 X5
	ROUND_16_63	B C D A F G H E 0x43CEC53D 156 284
	ROUND_16_63	A B C D E F G H	0x879D8A7A 160 288
	ROUND_16_63	D A B C H E F G 0x0F3B14F5 164 292
	MESSAGE_SCHEDULER_FAST	120 X0 X1 X3 X5 X4 X2
	ROUND_16_63	C D A B G H E F 0x1E7629EA 168 284
	ROUND_16_63	B C D A F G H E 0x3CEC53D4 172 288
	ROUND_16_63	A B C D E F G H	0x79D8A7A8 176 292
	MESSAGE_SCHEDULER_FAST	132 X1 X0 X5 X2 X4 X3
	ROUND_16_63	D A B C H E F G 0xF3B14F50 180 284
	ROUND_16_63	C D A B G H E F 0xE7629EA1 184 288
	ROUND_16_63	B C D A F G H E 0xCEC53D43 188 292
	MESSAGE_SCHEDULER_FAST	144 X0 X1 X2 X3 X4 X5
	ROUND_16_63	A B C D E F G H	0x9D8A7A87 192 284
	ROUND_16_63	D A B C H E F G 0x3B14F50F 196 288
	ROUND_16_63	C D A B G H E F 0x7629EA1E 200 292
	MESSAGE_SCHEDULER_FAST	156 X1 X0 X3 X5 X4 X2
	ROUND_16_63	B C D A F G H E 0xEC53D43C 204 284
	ROUND_16_63	A B C D E F G H	0xD8A7A879 208 288
	ROUND_16_63	D A B C H E F G 0xB14F50F3 212 292
	MESSAGE_SCHEDULER_FAST	168 X0 X1 X5 X2 X4 X3
	ROUND_16_63	C D A B G H E F 0x629EA1E7 216 284
	ROUND_16_63	B C D A F G H E 0xC53D43CE 220 288
	ROUND_16_63	A B C D E F G H	0x8A7A879D 224 292
	MESSAGE_SCHEDULER_FAST	180 X1 X0 X2 X3 X4 X5
	ROUND_16_63	D A B C H E F G 0x14F50F3B 228 284
	ROUND_16_63	C D A B G H E F 0x29EA1E76 232 288
	ROUND_16_63	B C D A F G H E 0x53D43CEC 236 292
	MESSAGE_SCHEDULER_FAST	192 X0 X1 X3 X5 X4 X2
	ROUND_16_63	A B C D E F G H	0xA7A879D8 240 284
	ROUND_16_63	D A B C H E F G 0x4F50F3B1 244 288
	ROUND_16_63	C D A B G H E F 0x9EA1E762 248 292
	# Only W'(63) is still missing; produce it with the scalar scheduler.
	WORD_SCHEDULER_12_63	204
	ROUND_16_63	B C D A F G H E 0x3D43CEC5 252 284

	# Feed-forward: state <- state ^ (A..H)
	xorl	A,(STATE)
	xorl	B,4(STATE)
	xorl	C,8(STATE)
	xorl	D,12(STATE)
	xorl	E,16(STATE)
	xorl	F,20(STATE)
	xorl	G,24(STATE)
	xorl	H,28(STATE)

	# Advance to the next block; loop while blocks remain
	leaq	64(DATA),DATA
	decq	NUM
	jnz		.Lsm3_avx_update

.Lsm3_avx_final:
	# Wipe every vector register used for message words
	vzeroall

	# Clear Context
	xorq	%r8,%r8
	xorq	%r9,%r9
	xorq	%r10,%r10
	xorq	%r11,%r11
	# Restore Registers
	movq	300(%rsp),%rbx
	movq	8+300(%rsp),%rbp
	movq	16+300(%rsp),%r12
	movq	24+300(%rsp),%r13
	movq	32+300(%rsp),%r14
	movq	40+300(%rsp),%r15
	addq	$348,%rsp

.Lsm3_avx_ret:
	ret
.size	SM3_CompressSIMD, .-SM3_CompressSIMD
392
##### SM3 #####
# void SM3_CompressAsm(uint32_t state[8], const uint8_t *data, uint32_t blockCnt)
# ABI: System V AMD64, leaf function.
# state (in/out)	%rdi	8 x 32-bit chaining words, updated once per block
# data				%rsi	blockCnt consecutive 64-byte message blocks
# blockCnt			%edx	number of blocks; returns immediately when zero
#
# Scalar variant: uses MOVBE, BMI1 (andn) and BMI2 (rorx), no vector registers.
# Frame (348 bytes below the return address, offsets from %rsp):
#   W(0..67)		at offsets up to 271, four bytes per word
#   W' scratch		at 284 (one word, rewritten before every round)
#   saved registers	at 300..347 (rbx, rbp, r12, r13, r14, r15)
.globl	SM3_CompressAsm
.type	SM3_CompressAsm, @function
.align	64
SM3_CompressAsm:
	# blockCnt is a 32-bit argument, so bits 63:32 of %rdx are unspecified
	# on entry. Zero-extend before the 64-bit test and loop counter use it.
	movl	%edx,%edx
	testq	NUM,NUM
	jz		.Lsm3_ret

	# Store Registers
	subq	$348,%rsp
	movq	%rbx,300(%rsp)
	movq	%rbp,8+300(%rsp)
	movq	%r12,16+300(%rsp)
	movq	%r13,24+300(%rsp)
	movq	%r14,32+300(%rsp)
	movq	%r15,40+300(%rsp)

.Lsm3_loop:
	# Load Data (Big Endian): movbe byte-swaps each word on the store to W(0..15)
	movl	(DATA),%r8d
	movl	4(DATA),%r9d
	movl	8(DATA),%r10d
	movl	12(DATA),%r11d
	movbe	%r8d,(%rsp)
	movbe	%r9d,4(%rsp)
	movbe	%r10d,8(%rsp)
	movbe	%r11d,12(%rsp)
	movl	16(DATA),%r8d
	movl	20(DATA),%r9d
	movl	24(DATA),%r10d
	movl	28(DATA),%r11d
	movbe	%r8d,16(%rsp)
	movbe	%r9d,20(%rsp)
	movbe	%r10d,24(%rsp)
	movbe	%r11d,28(%rsp)
	movl	32(DATA),%r8d
	movl	36(DATA),%r9d
	movl	40(DATA),%r10d
	movl	44(DATA),%r11d
	movbe	%r8d,32(%rsp)
	movbe	%r9d,36(%rsp)
	movbe	%r10d,40(%rsp)
	movbe	%r11d,44(%rsp)
	movl	48(DATA),%r8d
	movl	52(DATA),%r9d
	movl	56(DATA),%r10d
	movl	60(DATA),%r11d
	movbe	%r8d,48(%rsp)
	movbe	%r9d,52(%rsp)
	movbe	%r10d,56(%rsp)
	movbe	%r11d,60(%rsp)

	# Load State
	movl	(STATE),A
	movl	4(STATE),B
	movl	8(STATE),C
	movl	12(STATE),D
	movl	16(STATE),E
	movl	20(STATE),F
	movl	24(STATE),G
	movl	28(STATE),H

	# ROUND 0-11
	# Each scheduler call leaves the W' word for the following round at 284(%rsp).
	WORD_SCHEDULER_00_11	0
	ROUND_00_15	A B C D E F G H	0x79CC4519 0 284
	WORD_SCHEDULER_00_11	4
	ROUND_00_15	D A B C H E F G 0xF3988A32 4 284
	WORD_SCHEDULER_00_11	8
	ROUND_00_15	C D A B G H E F 0xE7311465 8 284
	WORD_SCHEDULER_00_11	12
	ROUND_00_15	B C D A F G H E 0xCE6228CB 12 284
	WORD_SCHEDULER_00_11	16
	ROUND_00_15	A B C D E F G H	0x9CC45197 16 284
	WORD_SCHEDULER_00_11	20
	ROUND_00_15	D A B C H E F G 0x3988A32F 20 284
	WORD_SCHEDULER_00_11	24
	ROUND_00_15	C D A B G H E F 0x7311465E 24 284
	WORD_SCHEDULER_00_11	28
	ROUND_00_15	B C D A F G H E 0xE6228CBC 28 284
	WORD_SCHEDULER_00_11	32
	ROUND_00_15	A B C D E F G H	0xCC451979 32 284
	WORD_SCHEDULER_00_11	36
	ROUND_00_15	D A B C H E F G 0x988A32F3 36 284
	WORD_SCHEDULER_00_11	40
	ROUND_00_15	C D A B G H E F 0x311465E7 40 284
	WORD_SCHEDULER_00_11	44
	ROUND_00_15	B C D A F G H E 0x6228CBCE 44 284
	# ROUND 12-15
	WORD_SCHEDULER_12_63	0
	ROUND_00_15	A B C D E F G H	0xC451979C 48 284
	WORD_SCHEDULER_12_63	4
	ROUND_00_15	D A B C H E F G 0x88A32F39 52 284
	WORD_SCHEDULER_12_63	8
	ROUND_00_15	C D A B G H E F 0x11465E73 56 284
	WORD_SCHEDULER_12_63	12
	ROUND_00_15	B C D A F G H E 0x228CBCE6 60 284
	# ROUND 16-63
	WORD_SCHEDULER_12_63	16
	ROUND_16_63	A B C D E F G H	0x9D8A7A87 64 284
	WORD_SCHEDULER_12_63	20
	ROUND_16_63	D A B C H E F G 0x3B14F50F 68 284
	WORD_SCHEDULER_12_63	24
	ROUND_16_63	C D A B G H E F 0x7629EA1E 72 284
	WORD_SCHEDULER_12_63	28
	ROUND_16_63	B C D A F G H E 0xEC53D43C 76 284
	WORD_SCHEDULER_12_63	32
	ROUND_16_63	A B C D E F G H	0xD8A7A879 80 284
	WORD_SCHEDULER_12_63	36
	ROUND_16_63	D A B C H E F G 0xB14F50F3 84 284
	WORD_SCHEDULER_12_63	40
	ROUND_16_63	C D A B G H E F 0x629EA1E7 88 284
	WORD_SCHEDULER_12_63	44
	ROUND_16_63	B C D A F G H E 0xC53D43CE 92 284
	WORD_SCHEDULER_12_63	48
	ROUND_16_63	A B C D E F G H	0x8A7A879D 96 284
	WORD_SCHEDULER_12_63	52
	ROUND_16_63	D A B C H E F G 0x14F50F3B 100 284
	WORD_SCHEDULER_12_63	56
	ROUND_16_63	C D A B G H E F 0x29EA1E76 104 284
	WORD_SCHEDULER_12_63	60
	ROUND_16_63	B C D A F G H E 0x53D43CEC 108 284
	WORD_SCHEDULER_12_63	64
	ROUND_16_63	A B C D E F G H	0xA7A879D8 112 284
	WORD_SCHEDULER_12_63	68
	ROUND_16_63	D A B C H E F G 0x4F50F3B1 116 284
	WORD_SCHEDULER_12_63	72
	ROUND_16_63	C D A B G H E F 0x9EA1E762 120 284
	WORD_SCHEDULER_12_63	76
	ROUND_16_63	B C D A F G H E 0x3D43CEC5 124 284
	WORD_SCHEDULER_12_63	80
	ROUND_16_63	A B C D E F G H	0x7A879D8A 128 284
	WORD_SCHEDULER_12_63	84
	ROUND_16_63	D A B C H E F G 0xF50F3B14 132 284
	WORD_SCHEDULER_12_63	88
	ROUND_16_63	C D A B G H E F 0xEA1E7629 136 284
	WORD_SCHEDULER_12_63	92
	ROUND_16_63	B C D A F G H E 0xD43CEC53 140 284
	WORD_SCHEDULER_12_63	96
	ROUND_16_63	A B C D E F G H	0xA879D8A7 144 284
	WORD_SCHEDULER_12_63	100
	ROUND_16_63	D A B C H E F G 0x50F3B14F 148 284
	WORD_SCHEDULER_12_63	104
	ROUND_16_63	C D A B G H E F 0xA1E7629E 152 284
	WORD_SCHEDULER_12_63	108
	ROUND_16_63	B C D A F G H E 0x43CEC53D 156 284
	WORD_SCHEDULER_12_63	112
	ROUND_16_63	A B C D E F G H	0x879D8A7A 160 284
	WORD_SCHEDULER_12_63	116
	ROUND_16_63	D A B C H E F G 0x0F3B14F5 164 284
	WORD_SCHEDULER_12_63	120
	ROUND_16_63	C D A B G H E F 0x1E7629EA 168 284
	WORD_SCHEDULER_12_63	124
	ROUND_16_63	B C D A F G H E 0x3CEC53D4 172 284
	WORD_SCHEDULER_12_63	128
	ROUND_16_63	A B C D E F G H	0x79D8A7A8 176 284
	WORD_SCHEDULER_12_63	132
	ROUND_16_63	D A B C H E F G 0xF3B14F50 180 284
	WORD_SCHEDULER_12_63	136
	ROUND_16_63	C D A B G H E F 0xE7629EA1 184 284
	WORD_SCHEDULER_12_63	140
	ROUND_16_63	B C D A F G H E 0xCEC53D43 188 284
	WORD_SCHEDULER_12_63	144
	ROUND_16_63	A B C D E F G H	0x9D8A7A87 192 284
	WORD_SCHEDULER_12_63	148
	ROUND_16_63	D A B C H E F G 0x3B14F50F 196 284
	WORD_SCHEDULER_12_63	152
	ROUND_16_63	C D A B G H E F 0x7629EA1E 200 284
	WORD_SCHEDULER_12_63	156
	ROUND_16_63	B C D A F G H E 0xEC53D43C 204 284
	WORD_SCHEDULER_12_63	160
	ROUND_16_63	A B C D E F G H	0xD8A7A879 208 284
	WORD_SCHEDULER_12_63	164
	ROUND_16_63	D A B C H E F G 0xB14F50F3 212 284
	WORD_SCHEDULER_12_63	168
	ROUND_16_63	C D A B G H E F 0x629EA1E7 216 284
	WORD_SCHEDULER_12_63	172
	ROUND_16_63	B C D A F G H E 0xC53D43CE 220 284
	WORD_SCHEDULER_12_63	176
	ROUND_16_63	A B C D E F G H	0x8A7A879D 224 284
	WORD_SCHEDULER_12_63	180
	ROUND_16_63	D A B C H E F G 0x14F50F3B 228 284
	WORD_SCHEDULER_12_63	184
	ROUND_16_63	C D A B G H E F 0x29EA1E76 232 284
	WORD_SCHEDULER_12_63	188
	ROUND_16_63	B C D A F G H E 0x53D43CEC 236 284
	WORD_SCHEDULER_12_63	192
	ROUND_16_63	A B C D E F G H	0xA7A879D8 240 284
	WORD_SCHEDULER_12_63	196
	ROUND_16_63	D A B C H E F G 0x4F50F3B1 244 284
	WORD_SCHEDULER_12_63	200
	ROUND_16_63	C D A B G H E F 0x9EA1E762 248 284
	WORD_SCHEDULER_12_63	204
	ROUND_16_63	B C D A F G H E 0x3D43CEC5 252 284

	# Feed-forward: state <- state ^ (A..H)
	xorl	A,(STATE)
	xorl	B,4(STATE)
	xorl	C,8(STATE)
	xorl	D,12(STATE)
	xorl	E,16(STATE)
	xorl	F,20(STATE)
	xorl	G,24(STATE)
	xorl	H,28(STATE)

	# Advance to the next block; loop while blocks remain
	leaq	64(DATA),DATA
	decq	NUM
	jnz		.Lsm3_loop

.Lsm3_final:
	# Clear Context
	xorq	%r8,%r8
	xorq	%r9,%r9
	xorq	%r10,%r10
	xorq	%r11,%r11
	# Restore Registers
	movq	300(%rsp),%rbx
	movq	8+300(%rsp),%rbp
	movq	16+300(%rsp),%r12
	movq	24+300(%rsp),%r13
	movq	32+300(%rsp),%r14
	movq	40+300(%rsp),%r15
	addq	$348,%rsp

.Lsm3_ret:
	ret
.size	SM3_CompressAsm, .-SM3_CompressAsm
623
.section	.rodata
.balign	64
# Three 16-byte vpshufb control vectors, read with aligned loads at
# byte offsets 0, 16 and 32 from MASKS. Each entry names the source byte
# that lands in that destination byte position.
MASKS:
# offset 0, SHUFFLEMASK: reverse the bytes inside every 32-bit lane
.byte	3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12
# offset 16, R16: rotate every 32-bit lane left by 16 bits
.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
# offset 32, R24: rotate every 32-bit lane left by 24 bits
.byte	1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12
634
635#endif
636