/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_SM4

#include "crypt_sm4_modes_macro_x86_64.s"

.file	"crypt_sm4_modes_x86_64.S"
.text
.extern g_cpuState
.hidden g_cpuState

.set	X0,%ymm0
.set	X1,%ymm1
.set	X2,%ymm2
.set	X3,%ymm3
.set	Y0,%ymm4
.set	Y1,%ymm5
.set	Y2,%ymm6
.set	Y3,%ymm7

.set	ADDR,%rax
.set	IN,%rdi
.set	OUT,%rsi
.set	LEN,%rdx
.set	BLOCKS,%rdx
.set	RK,%rcx
.set	IV,%r8
.set	TWEAK,%r8
.set	TWEAK_MASK,%r9
.set	ENC,%r9d
.set	HI,%r12
.set	LO,%r13
.set	HI_TMP,%r14
.set	LO_TMP,%r15

.set	T0,%r10d
.set	T0BL,%r10b
.set	T1,%r11d

.set	T0_64,%r10
.set	T1_64,%r11

.set	W0,%r12d
.set	W1,%r13d
.set	W2,%r14d
.set	W3,%r15d

.macro	LOAD_DATA
	vmovdqu		(IN),X0
	vmovdqu		32(IN),X1
	vmovdqu		64(IN),X2
	vmovdqu		96(IN),X3
	vmovdqu		128(IN),Y0
	vmovdqu		128+32(IN),Y1
	vmovdqu		128+64(IN),Y2
	vmovdqu		128+96(IN),Y3
.endm

.macro	XOR_DATA
	vpxor	(IN),X0,X0
	vpxor	32(IN),X1,X1
	vpxor	64(IN),X2,X2
	vpxor	96(IN),X3,X3
	vpxor	128(IN),Y0,Y0
	vpxor	128+32(IN),Y1,Y1
	vpxor	128+64(IN),Y2,Y2
	vpxor	128+96(IN),Y3,Y3
.endm

.macro CHECK_GFNI re tmp
	xorl	\re, \re

	movl	$0x100, \tmp
	andl	g_cpuState+24(%rip), \tmp  # get gfni flag
	orl		\tmp, \re

	movl	$0x20, \tmp
	andl	g_cpuState+20(%rip), \tmp  # check avx2 flag
	orl		\tmp, \re

	cmpl	$0x120, \re   # (code7Out[EAX] & (1<<5)) | (code7Out[ECX_OUT_IDX] & (1<<8))
.endm
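
# CHECK_GFNI leaves EFLAGS set by its final `cmpl $0x120`: equal when both
# the AVX2 bit (g_cpuState+20, bit 5) and the GFNI bit (g_cpuState+24,
# bit 8) are present. Callers follow it with `jl` to take the AES-NI
# fallback path when either feature is missing.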
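# Encrypt or decrypt 16 blocks (2 per ymm register) in parallel, with the
# S-box evaluated through GFNI affine transforms (.PreAffinT/.PostAffinT).
# The direction is determined by the round-key order supplied in RK.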
.macro	SM4_CRYPT_GFNI_BLOCK16
	# load affine matrices
	vpbroadcastq	.PreAffinT(%rip),PreAffineTRegBLOCK16
	vpbroadcastq	.PostAffinT(%rip),PostAffineTRegBLOCK16

	vmovdqa		32+4096(ADDR),TMP0
	# vmovdqa		64+4096(ADDR),AES_MASK
	# vmovdqa		96+4096(ADDR),AES_AND_MASK

	vpshufb		TMP0,X0,X0
	vpshufb		TMP0,X1,X1
	vpshufb		TMP0,X2,X2
	vpshufb		TMP0,X3,X3
	vpshufb		TMP0,Y0,Y0
	vpshufb		TMP0,Y1,Y1
	vpshufb		TMP0,Y2,Y2
	vpshufb		TMP0,Y3,Y3

	# Pack SIMD Vectors
	MATRIX_TRANSPOSE	X0 X1 X2 X3
	MATRIX_TRANSPOSE	Y0 Y1 Y2 Y3

	# AVX2 Rounds
	SM4_AVX2_GFNI_2_ROUNDS

	# Restore SIMD Vectors
	MATRIX_TRANSPOSE	X0 X1 X2 X3
	MATRIX_TRANSPOSE	Y0 Y1 Y2 Y3

	# Reverse Transformation
	vmovdqa		4096(ADDR),TMP0
	vpshufb		TMP0,X0,X0
	vpshufb		TMP0,X1,X1
	vpshufb		TMP0,X2,X2
	vpshufb		TMP0,X3,X3
	vpshufb		TMP0,Y0,Y0
	vpshufb		TMP0,Y1,Y1
	vpshufb		TMP0,Y2,Y2
	vpshufb		TMP0,Y3,Y3
.endm
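# Same 16-block dataflow as SM4_CRYPT_GFNI_BLOCK16, but the S-box is
# evaluated via AES-NI using the AES_MASK/AES_AND_MASK constants from the
# table at ADDR.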
.macro	SM4_CRYPT_AESNI_BLOCK16

	vmovdqa		32+4096(ADDR),TMP0
	vmovdqa		64+4096(ADDR),AES_MASK
	vmovdqa		96+4096(ADDR),AES_AND_MASK

	vpshufb		TMP0,X0,X0
	vpshufb		TMP0,X1,X1
	vpshufb		TMP0,X2,X2
	vpshufb		TMP0,X3,X3
	vpshufb		TMP0,Y0,Y0
	vpshufb		TMP0,Y1,Y1
	vpshufb		TMP0,Y2,Y2
	vpshufb		TMP0,Y3,Y3

	# Pack SIMD Vectors
	MATRIX_TRANSPOSE	X0 X1 X2 X3
	MATRIX_TRANSPOSE	Y0 Y1 Y2 Y3

	# AVX2 Rounds
	SM4_AVX2_AES_2_ROUNDS

	# Restore SIMD Vectors
	MATRIX_TRANSPOSE	X0 X1 X2 X3
	MATRIX_TRANSPOSE	Y0 Y1 Y2 Y3

	# Reverse Transformation
	vmovdqa		4096(ADDR),TMP0
	vpshufb		TMP0,X0,X0
	vpshufb		TMP0,X1,X1
	vpshufb		TMP0,X2,X2
	vpshufb		TMP0,X3,X3
	vpshufb		TMP0,Y0,Y0
	vpshufb		TMP0,Y1,Y1
	vpshufb		TMP0,Y2,Y2
	vpshufb		TMP0,Y3,Y3
.endm

.macro	STORE_RESULTS
	vmovdqu		X0,0(OUT)
	vmovdqu		X1,32(OUT)
	vmovdqu		X2,64(OUT)
	vmovdqu		X3,96(OUT)
	vmovdqu		Y0,128(OUT)
	vmovdqu		Y1,128+32(OUT)
	vmovdqu		Y2,128+64(OUT)
	vmovdqu		Y3,128+96(OUT)
.endm

.macro	CLEAR_CONTEXT
	xorl	T0,T0
	xorl	T1,T1
	xorl	W0,W0
	xorl	W1,W1
	xorl	W2,W2
	xorl	W3,W3
.endm

##### SM4-CBC #####
	# void SM4_CBC_Encrypt(const unsigned char *in, unsigned char *out, size_t len, const SM4_KEY *key, unsigned char *iv, const int enc)
	# in		%rdi
	# out		%rsi
	# len		%rdx
	# rk		%rcx
	# iv		%r8
	# enc		%r9d
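	#
	# Illustrative call sequence (hypothetical C driver; the SM4_KEY round
	# keys are assumed to be expanded by the caller):
	#   SM4_CBC_Encrypt(in, out, len, &key, iv, 1);  /* enc != 0: encrypt */
	#   SM4_CBC_Encrypt(in, out, len, &key, iv, 0);  /* enc == 0: decrypt */
	# iv is updated in place, so back-to-back calls chain correctly.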
	.globl	SM4_CBC_Encrypt
	.type	SM4_CBC_Encrypt, @function
	.align	64

SM4_CBC_Encrypt:

	# Store Registers
	subq	$72,%rsp
	movq	%rbx,(%rsp)
	movq	%rbp,8(%rsp)
	movq	%r9,16(%rsp)
	movq	%r10,24(%rsp)
	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)
	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)
	movq	%r15,64(%rsp)

	# Get Address
	leaq	SBOX4X_MASK(%rip),ADDR

	testl	ENC,ENC
	jz		.Lcbc_decrypt

.Lcbc_encrypt:

	cmpq	$16,LEN
	jl		.Lcbc_ret

	# Load Data
	movl	(IN),W0
	movl	4(IN),W1
	movl	8(IN),W2
	movl	12(IN),W3

	# XOR IV
	xorl	(IV),W0
	xorl	4(IV),W1
	xorl	8(IV),W2
	xorl	12(IV),W3

	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	# Serial Rounds
	SM4_SERIAL_ROUNDS

	# Store Results
	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	movl	W3,(OUT)
	movl	W2,4(OUT)
	movl	W1,8(OUT)
	movl	W0,12(OUT)

	movl	W3,(IV)
	movl	W2,4(IV)
	movl	W1,8(IV)
	movl	W0,12(IV)

	leaq	16(IN),IN
	leaq	16(OUT),OUT
	subq	$16,LEN

	jmp		.Lcbc_encrypt

.Lcbc_decrypt:

	cmpq	$256,LEN
	jl		.Lcbc_dec

.Lcbc_dec16:

	LOAD_DATA
	CHECK_GFNI %r9d %r10d
	jl .Lcbc_dec_aesni
.Lcbc_dec_gfni:
	SM4_CRYPT_GFNI_BLOCK16
	jmp .Lafter_cbc_dec
.Lcbc_dec_aesni:
	SM4_CRYPT_AESNI_BLOCK16
.Lafter_cbc_dec:

	vmovdqu		(IV),TMP0x
	vmovdqu		(IN),TMP1x
	vinserti128	$1,TMP1x,TMP0,TMP0
	vmovdqu		240(IN),TMP2x
	vmovdqu		TMP2x,(IV)

	vpxor	TMP0,X0,X0
	vpxor	16(IN),X1,X1
	vpxor	32+16(IN),X2,X2
	vpxor	64+16(IN),X3,X3
	vpxor	96+16(IN),Y0,Y0
	vpxor	128+16(IN),Y1,Y1
	vpxor	160+16(IN),Y2,Y2
	vpxor	192+16(IN),Y3,Y3

	STORE_RESULTS

	leaq	256(IN),IN
	leaq	256(OUT),OUT
	subq	$256,LEN
	cmpq	$256,LEN
	jl		.Lcbc_dec16_ret
	jmp		.Lcbc_dec16

.Lcbc_dec16_ret:

	vzeroall

.Lcbc_dec:

	cmpq	$16,LEN
	jl		.Lcbc_ret

	# Load Data
	movl	(IN),W0
	movl	4(IN),W1
	movl	8(IN),W2
	movl	12(IN),W3

	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	# Serial Rounds
	SM4_SERIAL_ROUNDS

	# Store Result
	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	xorl	(IV),W3
	xorl	4(IV),W2
	xorl	8(IV),W1
	xorl	12(IV),W0

	movq	(IN),%r10
	movq	%r10,(IV)
	movq	8(IN),%r10
	movq	%r10,8(IV)

	movl	W3,(OUT)
	movl	W2,4(OUT)
	movl	W1,8(OUT)
	movl	W0,12(OUT)

	leaq	16(IN),IN
	leaq	16(OUT),OUT
	subq	$16,LEN

	jmp		.Lcbc_dec

.Lcbc_ret:

	CLEAR_CONTEXT

	# Restore Registers
	movq	(%rsp),%rbx
	movq	8(%rsp),%rbp
	movq	16(%rsp),%rax
	movq	24(%rsp),%r10
	movq	32(%rsp),%r11
	movq	40(%rsp),%r12
	movq	48(%rsp),%r13
	movq	56(%rsp),%r14
	movq	64(%rsp),%r15
	addq	$72,%rsp

	ret
	.size	SM4_CBC_Encrypt, .-SM4_CBC_Encrypt

##### SM4-ECB #####
	# void SM4_ECB_Encrypt(const unsigned char *in, unsigned char *out, size_t len, const SM4_KEY *key)
	# in		%rdi
	# out		%rsi
	# len		%rdx
	# key		%rcx
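	#
	# The vector loop consumes 16 blocks (256 bytes) per iteration; any
	# remaining whole blocks run through the scalar SM4_SERIAL_ROUNDS tail,
	# and a trailing fragment shorter than 16 bytes is left unprocessed.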
	.globl	SM4_ECB_Encrypt
	.type	SM4_ECB_Encrypt, @function
	.align	64

SM4_ECB_Encrypt:

	# Store Registers
	subq	$32,%rsp
	movq	%r12,(%rsp)
	movq	%r13,8(%rsp)
	movq	%r14,16(%rsp)
	movq	%r15,24(%rsp)

	# Get Address
	leaq	SBOX4X_MASK(%rip),ADDR

.Lecb_encrypt:

	cmpq	$256,LEN
	jl		.Lecb_enc

.Lecb_enc16:

	LOAD_DATA

	CHECK_GFNI %r12d %r13d
	jl .Lecb_enc_aesni
.Lecb_enc_gfni:
	SM4_CRYPT_GFNI_BLOCK16
	jmp .Lafter_ecb_enc
.Lecb_enc_aesni:
	SM4_CRYPT_AESNI_BLOCK16
.Lafter_ecb_enc:
	STORE_RESULTS

	leaq	256(IN),IN
	leaq	256(OUT),OUT
	subq	$256,LEN
	cmpq	$256,LEN
	jl		.Lecb_enc16_ret
	jmp		.Lecb_enc16

.Lecb_enc16_ret:

	vzeroall

.Lecb_enc:

	cmpq	$16,LEN
	jl		.Lecb_ret

	# Load Data
	movl	(IN),W0
	movl	4(IN),W1
	movl	8(IN),W2
	movl	12(IN),W3

	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	# Serial Rounds
	SM4_SERIAL_ROUNDS

	# Store Result
	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	movl	W3,(OUT)
	movl	W2,4(OUT)
	movl	W1,8(OUT)
	movl	W0,12(OUT)

	leaq	16(IN),IN
	leaq	16(OUT),OUT
	subq	$16,LEN

	jmp		.Lecb_enc

.Lecb_ret:

	CLEAR_CONTEXT

	# Restore Registers
	movq	(%rsp),%r12
	movq	8(%rsp),%r13
	movq	16(%rsp),%r14
	movq	24(%rsp),%r15
	addq	$32,%rsp

	ret
	.size	SM4_ECB_Encrypt, .-SM4_ECB_Encrypt

##### SM4-CFB ENC #####
	# void SM4_CFB128_Encrypt(const unsigned char *in, unsigned char *out, size_t len, const SM4_KEY *key, unsigned char *iv, int *num)
	# in		%rdi
	# out		%rsi
	# len		%rdx
	# rk		%rcx
	# iv		%r8
	# num		%r9d
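	#
	# *num is the byte offset (0..15) into the current keystream block in
	# iv, so calls of arbitrary length can resume mid-block; the updated
	# offset is written back through the saved pointer on return.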
	.globl	SM4_CFB128_Encrypt
	.type	SM4_CFB128_Encrypt, @function
	.align	64

SM4_CFB128_Encrypt:

	# Store Registers
	subq	$72,%rsp
	movq	%rbx,(%rsp)
	movq	%rbp,8(%rsp)
	movq	%r9,16(%rsp)
	movq	%r10,24(%rsp)
	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)
	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)
	movq	%r15,64(%rsp)

	# Load Num
	movl	(%r9),%r9d
	cmpl	$0,%r9d
	je		.Lcfb128_enc_update

.Lcfb128_enc_init:

	movb	0(IV,%r9,1),%al
	xorb	(IN),%al
	movb	%al,(OUT)
	movb	%al,0(IV,%r9,1)

	leaq	1(IN),IN
	leaq	1(OUT),OUT

	incl	%r9d
	decq	LEN
	cmpl	$16,%r9d
	je		.Lcfb128_enc_update
	cmpq	$0,LEN
	je		.Lcfb128_enc_ret

	jmp		.Lcfb128_enc_init

.Lcfb128_enc_update:

	movl	$0,%r9d

	# Get Address
	leaq	SBOX4X_MASK(%rip),ADDR

.Lcfb128_enc_loop:

	cmpq	$0,LEN
	je		.Lcfb128_enc_ret

	movl	$0,%r9d

	# Load IV
	movl	(IV),W0
	movl	4(IV),W1
	movl	8(IV),W2
	movl	12(IV),W3

	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	# Serial Rounds
	SM4_SERIAL_ROUNDS

	# Store Results
	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	movl	W3,(IV)
	movl	W2,4(IV)
	movl	W1,8(IV)
	movl	W0,12(IV)

	cmpq	$16,LEN
	jl		.Lcfb128_enc_final

	xorl	(IN),W3
	xorl	4(IN),W2
	xorl	8(IN),W1
	xorl	12(IN),W0

	movl	W3,(OUT)
	movl	W2,4(OUT)
	movl	W1,8(OUT)
	movl	W0,12(OUT)

	movl	W3,(IV)
	movl	W2,4(IV)
	movl	W1,8(IV)
	movl	W0,12(IV)

	leaq	16(IN),IN
	leaq	16(OUT),OUT
	subq	$16,LEN

	jmp		.Lcfb128_enc_loop

.Lcfb128_enc_final:

	movb	0(IV,%r9,1),%al
	xorb	(IN),%al
	movb	%al,(OUT)
	movb	%al,0(IV,%r9,1)

	leaq	1(IN),IN
	leaq	1(OUT),OUT

	incl	%r9d
	decq	LEN
	jnz		.Lcfb128_enc_final

.Lcfb128_enc_ret:

	CLEAR_CONTEXT

	# Restore Registers
	movq	(%rsp),%rbx
	movq	8(%rsp),%rbp
	movq	16(%rsp),%rax
	movq	24(%rsp),%r10
	movq	32(%rsp),%r11
	movq	40(%rsp),%r12
	movq	48(%rsp),%r13
	movq	56(%rsp),%r14
	movq	64(%rsp),%r15
	addq	$72,%rsp

	# Store Num
	movl	%r9d,(%rax)

	ret
	.size	SM4_CFB128_Encrypt, .-SM4_CFB128_Encrypt

##### SM4-CFB DEC #####
	# void SM4_CFB128_Decrypt(const unsigned char *in, unsigned char *out, size_t len, const SM4_KEY *key, unsigned char *iv, int *num)
	# in		%rdi
	# out		%rsi
	# len		%rdx
	# rk		%rcx
	# iv		%r8
	# num		%r9d
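	#
	# Mirrors SM4_CFB128_Encrypt, except that the *ciphertext* byte is fed
	# back into iv, and the bulk path can decrypt 16 blocks in parallel
	# because the feedback chain depends only on ciphertext.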
	.globl	SM4_CFB128_Decrypt
	.type	SM4_CFB128_Decrypt, @function
	.align	64

SM4_CFB128_Decrypt:

	# Store Registers
	subq	$72,%rsp
	movq	%rbx,(%rsp)
	movq	%rbp,8(%rsp)
	movq	%r9,16(%rsp)
	movq	%r10,24(%rsp)
	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)
	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)
	movq	%r15,64(%rsp)

	# Load Num
	movl	(%r9),%r9d
	cmpl	$0,%r9d
	je		.Lcfb128_dec_update

.Lcfb128_dec_init:

	movb	0(IV,%r9,1),%al
	movb	(IN),%bl
	xorb	%bl,%al
	movb	%al,(OUT)
	movb	%bl,0(IV,%r9,1)

	leaq	1(IN),IN
	leaq	1(OUT),OUT

	incl	%r9d
	decq	LEN
	cmpl	$16,%r9d
	je		.Lcfb128_dec_update
	cmpq	$0,LEN
	je		.Lcfb128_dec_ret

	jmp		.Lcfb128_dec_init

.Lcfb128_dec_update:

	# Get Address
	leaq	SBOX4X_MASK(%rip),ADDR

	movl	$0,%r9d

	cmpq	$256,LEN
	jl		.Lcfb128_dec

.Lcfb128_dec16:

	vmovdqu		(IV),TMP0x
	vmovdqu		(IN),TMP1x
	vinserti128	$1,TMP1x,TMP0,TMP0
	vmovdqu		240(IN),TMP2x
	vmovdqu		TMP2x,(IV)

	vmovdqu		TMP0,X0
	vmovdqu		16(IN),X1
	vmovdqu		32+16(IN),X2
	vmovdqu		64+16(IN),X3
	vmovdqu		96+16(IN),Y0
	vmovdqu		128+16(IN),Y1
	vmovdqu		160+16(IN),Y2
	vmovdqu		192+16(IN),Y3

	CHECK_GFNI %r10d %r11d
	jl .Lcfb128_dec_aesni
.Lcfb128_dec_gfni:
	SM4_CRYPT_GFNI_BLOCK16
	jmp .Lafter_cfb128_dec
.Lcfb128_dec_aesni:
	SM4_CRYPT_AESNI_BLOCK16
.Lafter_cfb128_dec:
	XOR_DATA
	STORE_RESULTS

	leaq	256(IN),IN
	leaq	256(OUT),OUT
	subq	$256,LEN
	cmpq	$256,LEN
	jl		.Lcfb128_dec16_ret
	jmp		.Lcfb128_dec16

.Lcfb128_dec16_ret:

	vzeroall

.Lcfb128_dec:

	cmpq	$0,LEN
	je		.Lcfb128_dec_ret

.Lcfb128_dec1:

	# Load IV
	movl	(IV),W0
	movl	4(IV),W1
	movl	8(IV),W2
	movl	12(IV),W3

	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	# Serial Rounds
	SM4_SERIAL_ROUNDS

	# Store Results
	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	movl	W3,(IV)
	movl	W2,4(IV)
	movl	W1,8(IV)
	movl	W0,12(IV)

	cmpq	$16,LEN
	jl		.Lcfb128_dec_final

	movq	(IN),%rbx
	movq	%rbx,(IV)
	movq	8(IN),%rbx
	movq	%rbx,8(IV)
	xorq	%rbx,%rbx

	xorl	(IN),W3
	xorl	4(IN),W2
	xorl	8(IN),W1
	xorl	12(IN),W0

	movl	W3,(OUT)
	movl	W2,4(OUT)
	movl	W1,8(OUT)
	movl	W0,12(OUT)

	leaq	16(IN),IN
	leaq	16(OUT),OUT
	subq	$16,LEN
	cmpq	$0,LEN
	je		.Lcfb128_dec_ret
	jmp		.Lcfb128_dec1

.Lcfb128_dec_final:

	movb	0(IV,%r9,1),%al
	movb	(IN),%bl
	xorb	%bl,%al
	movb	%al,(OUT)
	movb	%bl,0(IV,%r9,1)

	leaq	1(IN),IN
	leaq	1(OUT),OUT

	incl	%r9d
	decq	LEN
	jnz		.Lcfb128_dec_final

.Lcfb128_dec_ret:

	CLEAR_CONTEXT

	# Restore Registers
	movq	(%rsp),%rbx
	movq	8(%rsp),%rbp
	movq	16(%rsp),%rax
	movq	24(%rsp),%r10
	movq	32(%rsp),%r11
	movq	40(%rsp),%r12
	movq	48(%rsp),%r13
	movq	56(%rsp),%r14
	movq	64(%rsp),%r15
	addq	$72,%rsp

	# Store Num
	movl	%r9d,(%rax)

	ret
	.size	SM4_CFB128_Decrypt, .-SM4_CFB128_Decrypt

##### SM4-OFB #####
	# void SM4_OFB_Encrypt(const unsigned char *in, unsigned char *out, size_t len, const SM4_KEY *key, unsigned char *iv, int *num)
	# in		%rdi
	# out		%rsi
	# len		%rdx
	# rk		%rcx
	# iv		%r8
	# num		%r9d
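	#
	# The OFB keystream depends only on the key and iv, so unlike CFB no
	# data bytes are written back into iv; encryption and decryption are
	# the same operation.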
	.globl	SM4_OFB_Encrypt
	.type	SM4_OFB_Encrypt, @function
	.align	64

SM4_OFB_Encrypt:

	# Store Registers
	subq	$72,%rsp
	movq	%rbx,(%rsp)
	movq	%rbp,8(%rsp)
	movq	%r9,16(%rsp)
	movq	%r10,24(%rsp)
	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)
	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)
	movq	%r15,64(%rsp)

	# Load Num
	movl	(%r9),%r9d
	cmpl	$0,%r9d
	jz		.Lofb128_enc_update

.Lofb128_enc_init:

	movb	0(IV,%r9,1),%al
	xorb	(IN),%al
	movb	%al,(OUT)

	leaq	1(IN),IN
	leaq	1(OUT),OUT

	incl	%r9d
	decq	LEN
	cmpl	$16,%r9d
	je		.Lofb128_enc_update
	cmpq	$0,LEN
	je		.Lofb128_enc_ret

	jmp		.Lofb128_enc_init

.Lofb128_enc_update:

	movl	$0,%r9d

	# Get Address
	leaq	SBOX4X_MASK(%rip),ADDR

.Lofb128_enc_loop:

	cmpq	$0,LEN
	je		.Lofb128_enc_ret

	# Load IV
	movl	(IV),W0
	movl	4(IV),W1
	movl	8(IV),W2
	movl	12(IV),W3

	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	# Serial Rounds
	SM4_SERIAL_ROUNDS

	# Store Results
	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	movl	W3,(IV)
	movl	W2,4(IV)
	movl	W1,8(IV)
	movl	W0,12(IV)

	cmpq	$16,LEN
	jl		.Lofb128_enc_final

	xorl	(IN),W3
	xorl	4(IN),W2
	xorl	8(IN),W1
	xorl	12(IN),W0

	movl	W3,(OUT)
	movl	W2,4(OUT)
	movl	W1,8(OUT)
	movl	W0,12(OUT)

	leaq	16(IN),IN
	leaq	16(OUT),OUT
	subq	$16,LEN

	jmp		.Lofb128_enc_loop

.Lofb128_enc_final:

	movb	0(IV,%r9,1),%al
	xorb	(IN),%al
	movb	%al,(OUT)

	leaq	1(IN),IN
	leaq	1(OUT),OUT

	incl	%r9d
	decq	LEN
	jnz		.Lofb128_enc_final

.Lofb128_enc_ret:

	CLEAR_CONTEXT

	# Restore Registers
	movq	(%rsp),%rbx
	movq	8(%rsp),%rbp
	movq	16(%rsp),%rax
	movq	24(%rsp),%r10
	movq	32(%rsp),%r11
	movq	40(%rsp),%r12
	movq	48(%rsp),%r13
	movq	56(%rsp),%r14
	movq	64(%rsp),%r15
	addq	$72,%rsp

	# Store Num
	movl	%r9d,(%rax)

	ret
	.size	SM4_OFB_Encrypt, .-SM4_OFB_Encrypt

##### SM4-CTR32 #####
# NOTE: the IV/counter in CTR mode is big-endian.
.align	64
.Lmovbe12:
.byte	0,1,2,3,4,5,6,7,8,9,10,11,15,14,13,12,0,1,2,3,4,5,6,7,8,9,10,11,15,14,13,12
.Lone:
.long	0,0,0,1
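
# .Lmovbe12 swaps only bytes 12..15 of each 128-bit lane, i.e. the 32-bit
# big-endian counter word; .Lone then increments that word as a
# little-endian dword in lane 3.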

.macro	INCREMENT_COUNTER
	movbe	12(IV),%ebx
	incl	%ebx
	movbe	%ebx,12(IV)
.endm

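# LOAD_ECOUNT_BUF produces two consecutive counter blocks per call: TMP2x
# holds counter i, TMP3x = i+1 is packed into the high lane, and TMP4x =
# i+2 is carried forward for the next call.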
.macro	LOAD_ECOUNT_BUF	SINK
	vpaddd		TMP1x,TMP2x,TMP3x
	vpaddd		TMP1x,TMP3x,TMP4x
	vinserti128	$1,TMP3x,TMP2,TMP2
	vpshufb		TMP0,TMP2,TMP2
	vmovdqa		TMP2,\SINK
	vmovdqa		TMP4x,TMP2x
.endm

.macro	LOAD_ECOUNT_BUF_ALL
	vmovdqa	.Lmovbe12(%rip),TMP0
	vmovdqa	.Lone(%rip),TMP1x
	vmovdqu	(IV),TMP2x
	vpshufb	TMP0x,TMP2x,TMP2x
	LOAD_ECOUNT_BUF		X0
	LOAD_ECOUNT_BUF		X1
	LOAD_ECOUNT_BUF		X2
	LOAD_ECOUNT_BUF		X3
	LOAD_ECOUNT_BUF		Y0
	LOAD_ECOUNT_BUF		Y1
	LOAD_ECOUNT_BUF		Y2
	LOAD_ECOUNT_BUF		Y3
	vpshufb	TMP0x,TMP2x,TMP2x
	vmovdqu	TMP2x,(IV)
.endm
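
# After the eight expansions above, TMP2x holds counter+16; it is swapped
# back to big-endian and stored, so iv always reflects the next unused
# counter value.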

	# void SM4_CTR_EncryptBlocks(const unsigned char *in, unsigned char *out, size_t blocks, const SM4_KEY *key, const unsigned char *iv)
	# in		%rdi
	# out		%rsi
	# blocks	%rdx
	# rk		%rcx
	# iv		%r8
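	#
	# blocks counts 16-byte blocks, not bytes. Only the low 32 bits of the
	# counter are incremented (CTR32 semantics), matching INCREMENT_COUNTER
	# and the .Lmovbe12/.Lone constants above.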
	.globl	SM4_CTR_EncryptBlocks
	.type	SM4_CTR_EncryptBlocks, @function
	.align	64

SM4_CTR_EncryptBlocks:

	# Get Address
	leaq	SBOX4X_MASK(%rip),ADDR

	# Store Registers
	subq	$88,%rsp
	movq	%rbx,(%rsp)
	movq	%rbp,8(%rsp)
	movq	%r8,16(%rsp)
	movq	%r9,24(%rsp)
	movq	%r10,32(%rsp)
	movq	%r11,40(%rsp)
	movq	%r12,48(%rsp)
	movq	%r13,56(%rsp)
	movq	%r14,64(%rsp)
	movq	%r15,72(%rsp)
	movq	%rdx,80(%rsp)

	cmpq	$16,BLOCKS
	jl		.Lctr32_enc

.Lctr32_enc16:

	LOAD_ECOUNT_BUF_ALL
	CHECK_GFNI %r9d %r10d
	jl .Lctr32_enc_aesni
.Lctr32_enc_gfni:
	SM4_CRYPT_GFNI_BLOCK16
	jmp .Lafter_ctr32_enc
.Lctr32_enc_aesni:
	SM4_CRYPT_AESNI_BLOCK16
.Lafter_ctr32_enc:

	XOR_DATA
	STORE_RESULTS

	leaq	256(IN),IN
	leaq	256(OUT),OUT
	subq	$16,BLOCKS
	cmpq	$16,BLOCKS
	jl		.Lctr32_enc16_ret
	jmp		.Lctr32_enc16

.Lctr32_enc16_ret:

	vzeroall

.Lctr32_enc:

	cmpq	$0,BLOCKS
	je		.Lctr32_ret

	# Load IV
	movl	(IV),W0
	movl	4(IV),W1
	movl	8(IV),W2
	movl	12(IV),W3

	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	# Serial Rounds
	SM4_SERIAL_ROUNDS

	# Store Results
	bswap	W0
	bswap	W1
	bswap	W2
	bswap	W3

	xorl	(IN),W3
	xorl	4(IN),W2
	xorl	8(IN),W1
	xorl	12(IN),W0

	movl	W3,(OUT)
	movl	W2,4(OUT)
	movl	W1,8(OUT)
	movl	W0,12(OUT)

	leaq	16(IN),IN
	leaq	16(OUT),OUT
	decq	BLOCKS

	INCREMENT_COUNTER

	jmp		.Lctr32_enc

.Lctr32_ret:

	CLEAR_CONTEXT

	# Restore Registers
	movq	(%rsp),%rbx
	movq	8(%rsp),%rbp
	movq	16(%rsp),%r8
	movq	24(%rsp),%r9
	movq	32(%rsp),%r10
	movq	40(%rsp),%r11
	movq	48(%rsp),%r12
	movq	56(%rsp),%r13
	movq	64(%rsp),%r14
	movq	72(%rsp),%r15
	movq	80(%rsp),%rdx
	addq	$88,%rsp

	ret
	.size	SM4_CTR_EncryptBlocks, .-SM4_CTR_EncryptBlocks

##### SM4-XTS #####

.align	16
.Lxts_tweak_mask:
.long	0,0xe1000000
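# GALOIS_FIELD_MUL advances the XTS tweak: HI:LO (held byte-swapped via
# movbe) is multiplied by x in GF(2^128) with a right shift, folding the
# carried-out bit back in through TWEAK_MASK, and the result is stored at
# offset \Idx so the TWEAK buffer accumulates consecutive tweak values.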
.macro GALOIS_FIELD_MUL	Idx
	xorq	LO_TMP,LO_TMP

	testq	$1,LO
	cmovnzq	TWEAK_MASK,LO_TMP
	shrd	$1,HI,LO
	shrq	$1,HI
	xorq	LO_TMP,HI

	movbe	HI,\Idx(TWEAK)
	movbe	LO,\Idx+8(TWEAK)
.endm

.macro GALOIS_FIELD_MUL_16_INNER
	# T1:T0->T1
	GALOIS_FIELD_MUL 16
	# T2:T1->T2
	GALOIS_FIELD_MUL 32
	# T3:T2->T3
	GALOIS_FIELD_MUL 48
	# T4:T3->T4
	GALOIS_FIELD_MUL 64
	# T5:T4->T5
	GALOIS_FIELD_MUL 80
	# T6:T5->T6
	GALOIS_FIELD_MUL 96
	# T7:T6->T7
	GALOIS_FIELD_MUL 112
	# T8:T7->T8
	GALOIS_FIELD_MUL 128
	# T9:T8->T9
	GALOIS_FIELD_MUL 144
	# T10:T9->T10
	GALOIS_FIELD_MUL 160
	# T11:T10->T11
	GALOIS_FIELD_MUL 176
	# T12:T11->T12
	GALOIS_FIELD_MUL 192
	# T13:T12->T13
	GALOIS_FIELD_MUL 208
	# T14:T13->T14
	GALOIS_FIELD_MUL 224
	# T15:T14->T15
	GALOIS_FIELD_MUL 240
.endm
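
# Starting from T0 in HI:LO, derive T1..T15 in sequence and store them at
# TWEAK+16 .. TWEAK+240, one tweak per block of the 16-block batch.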

.macro XOR_TWEAK
	vpxor	(TWEAK),X0,X0
	vpxor	32(TWEAK),X1,X1
	vpxor	64(TWEAK),X2,X2
	vpxor	96(TWEAK),X3,X3
	vpxor	128(TWEAK),Y0,Y0
	vpxor	128+32(TWEAK),Y1,Y1
	vpxor	128+64(TWEAK),Y2,Y2
	vpxor	128+96(TWEAK),Y3,Y3
.endm

.macro SM4_XTS_16_EN_INNER
	LOAD_DATA
	XOR_TWEAK
	CHECK_GFNI %r15d %r14d
	jl .Lxts_enc_aesni
.Lxts_enc_gfni:
	SM4_CRYPT_GFNI_BLOCK16
	jmp .Lafter_xts_enc
.Lxts_enc_aesni:
	SM4_CRYPT_AESNI_BLOCK16
.Lafter_xts_enc:
	XOR_TWEAK
	STORE_RESULTS
.endm

	# void SM4_XTS_Encrypt_Blocks(const unsigned char *in, unsigned char *out, size_t len, const SM4_KEY *key, unsigned char *t)
	# in		%rdi
	# out		%rsi
	# len		%rdx
	# key		%rcx
	# t			%r8
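	#
	# Only whole 256-byte (16-block) batches are handled: if len < 256 the
	# routine returns immediately, and the caller is expected to process
	# any remaining blocks or partial tail.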
	.globl	SM4_XTS_Encrypt_Blocks
	.type	SM4_XTS_Encrypt_Blocks, @function
	.align	64

SM4_XTS_Encrypt_Blocks:

	cmpq	$256,LEN
	jl		.Lxts_ret

	# Store Registers
	subq	$56,%rsp
	movq	%r9,(%rsp)
	movq	%r10,8(%rsp)
	movq	%r11,16(%rsp)
	movq	%r12,24(%rsp)
	movq	%r13,32(%rsp)
	movq	%r14,40(%rsp)
	movq	%r15,48(%rsp)

	# Get Address
	leaq	SBOX4X_MASK(%rip),ADDR

	# Load tweak mask
	movq	.Lxts_tweak_mask(%rip),TWEAK_MASK

	# T0: Initial
	movbe	(TWEAK),HI
	movbe	8(TWEAK),LO

.Lxts_update:

	GALOIS_FIELD_MUL_16_INNER
	SM4_XTS_16_EN_INNER

	leaq	256(IN),IN
	leaq	256(OUT),OUT
	subq	$256,LEN
	cmpq	$256,LEN
	jl		.Lxts_final

	# T15: Initial
	movbe	240(TWEAK),HI
	movbe	248(TWEAK),LO
	# T0:T15->T0
	GALOIS_FIELD_MUL 0

	jmp		.Lxts_update

.Lxts_final:

	# Clear Context
	vzeroall

	# Restore Registers
	movq	(%rsp),%r9
	movq	8(%rsp),%r10
	movq	16(%rsp),%r11
	movq	24(%rsp),%r12
	movq	32(%rsp),%r13
	movq	40(%rsp),%r14
	movq	48(%rsp),%r15
	addq	$56,%rsp

.Lxts_ret:

	ret
	.size	SM4_XTS_Encrypt_Blocks, .-SM4_XTS_Encrypt_Blocks

#endif