• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright 2014-2022 The GmSSL Project. All Rights Reserved.
3  *
4  *  Licensed under the Apache License, Version 2.0 (the License); you may
5  *  not use this file except in compliance with the License.
6  *
7  *  http://www.apache.org/licenses/LICENSE-2.0
8  */
9 
10 
11 #include <string.h>
12 #include <gmssl/sm3.h>
13 #include <gmssl/endian.h>
14 #include <gmssl/error.h>
15 
16 
#ifdef SM3_SSE3
# include <x86intrin.h>
# include <immintrin.h>

/* 32-bit rotate-left of every lane of an __m128i (SSE has no native rotate) */
# define _mm_rotl_epi32(X,i) \
	_mm_xor_si128(_mm_slli_epi32((X),(i)), _mm_srli_epi32((X),32-(i)))
#endif


/* Generic 32-bit rotate-left. NOTE(review): unused below — the code relies
 * on ROL32, presumably provided by <gmssl/endian.h>; confirm before removal. */
#define ROTL(x,n)  (((x)<<(n)) | ((x)>>(32-(n))))
/* SM3 permutation P0 (used in the state update) */
#define P0(x) ((x) ^ ROL32((x), 9) ^ ROL32((x),17))
/* SM3 permutation P1 (used in the message expansion) */
#define P1(x) ((x) ^ ROL32((x),15) ^ ROL32((x),23))

/* Boolean functions: FF00/GG00 apply to rounds 0..15, FF16/GG16 to 16..63 */
#define FF00(x,y,z)  ((x) ^ (y) ^ (z))
#define FF16(x,y,z)  (((x)&(y)) | ((x)&(z)) | ((y)&(z)))
#define GG00(x,y,z)  ((x) ^ (y) ^ (z))
/* equivalent to ((x)&(y)) | (~(x)&(z)), rewritten with one AND and two XORs */
#define GG16(x,y,z)  ((((y)^(z)) & (x)) ^ (z))

/* One SM3 round. Only four registers are written (B, F rotated in place;
 * H, D receive TT1 / P0(TT2)); the rest of the register shuffle is done by
 * rotating the argument list in R8, avoiding eight moves per round.
 * W[j] ^ W[j + 4] is the derived word W'[j] of the SM3 specification. */
#define R(A, B, C, D, E, F, G, H, xx)				\
	SS1 = ROL32((ROL32(A, 12) + E + K[j]), 7);		\
	SS2 = SS1 ^ ROL32(A, 12);				\
	TT1 = FF##xx(A, B, C) + D + SS2 + (W[j] ^ W[j + 4]);	\
	TT2 = GG##xx(E, F, G) + H + SS1 + W[j];			\
	B = ROL32(B, 9);					\
	H = TT1;						\
	F = ROL32(F, 19);					\
	D = P0(TT2);						\
	j++

/* Eight consecutive rounds, rotating the register names one slot per round
 * so that after R8 the registers are back in their starting positions. */
#define R8(A, B, C, D, E, F, G, H, xx)				\
	R(A, B, C, D, E, F, G, H, xx);				\
	R(H, A, B, C, D, E, F, G, xx);				\
	R(G, H, A, B, C, D, E, F, xx);				\
	R(F, G, H, A, B, C, D, E, xx);				\
	R(E, F, G, H, A, B, C, D, xx);				\
	R(D, E, F, G, H, A, B, C, xx);				\
	R(C, D, E, F, G, H, A, B, xx);				\
	R(B, C, D, E, F, G, H, A, xx)
55 
56 
57 
/* Round-constant seeds from the SM3 standard: T00 for rounds 0..15,
 * T16 for rounds 16..63. */
#define T00 0x79cc4519U
#define T16 0x7a879d8aU

/* Precomputed per-round constants: Kj = ROL32(T00, j) for j = 0..15 and
 * ROL32(T16, j mod 32) for j = 16..63, so each round adds a table entry
 * instead of rotating T at runtime. */
#define K0	0x79cc4519U
#define K1	0xf3988a32U
#define K2	0xe7311465U
#define K3	0xce6228cbU
#define K4	0x9cc45197U
#define K5	0x3988a32fU
#define K6	0x7311465eU
#define K7	0xe6228cbcU
#define K8	0xcc451979U
#define K9	0x988a32f3U
#define K10	0x311465e7U
#define K11	0x6228cbceU
#define K12	0xc451979cU
#define K13	0x88a32f39U
#define K14	0x11465e73U
#define K15	0x228cbce6U
#define K16	0x9d8a7a87U
#define K17	0x3b14f50fU
#define K18	0x7629ea1eU
#define K19	0xec53d43cU
#define K20	0xd8a7a879U
#define K21	0xb14f50f3U
#define K22	0x629ea1e7U
#define K23	0xc53d43ceU
#define K24	0x8a7a879dU
#define K25	0x14f50f3bU
#define K26	0x29ea1e76U
#define K27	0x53d43cecU
#define K28	0xa7a879d8U
#define K29	0x4f50f3b1U
#define K30	0x9ea1e762U
#define K31	0x3d43cec5U
#define K32	0x7a879d8aU
#define K33	0xf50f3b14U
#define K34	0xea1e7629U
#define K35	0xd43cec53U
#define K36	0xa879d8a7U
#define K37	0x50f3b14fU
#define K38	0xa1e7629eU
#define K39	0x43cec53dU
#define K40	0x879d8a7aU
#define K41	0x0f3b14f5U
#define K42	0x1e7629eaU
#define K43	0x3cec53d4U
#define K44	0x79d8a7a8U
#define K45	0xf3b14f50U
#define K46	0xe7629ea1U
#define K47	0xcec53d43U
#define K48	0x9d8a7a87U
#define K49	0x3b14f50fU
#define K50	0x7629ea1eU
#define K51	0xec53d43cU
#define K52	0xd8a7a879U
#define K53	0xb14f50f3U
#define K54	0x629ea1e7U
#define K55	0xc53d43ceU
#define K56	0x8a7a879dU
#define K57	0x14f50f3bU
#define K58	0x29ea1e76U
#define K59	0x53d43cecU
#define K60	0xa7a879d8U
#define K61	0x4f50f3b1U
#define K62	0x9ea1e762U
#define K63	0x3d43cec5U
125 
/*
 * Per-round constant table: K[j] = ROL32(T00, j) for rounds 0..15 and
 * ROL32(T16, j mod 32) for rounds 16..63 (so K[48..63] repeats K[16..31]).
 * const-qualified: the table is read-only round data and belongs in .rodata.
 */
static const uint32_t K[64] = {
	0x79cc4519U, 0xf3988a32U, 0xe7311465U, 0xce6228cbU,
	0x9cc45197U, 0x3988a32fU, 0x7311465eU, 0xe6228cbcU,
	0xcc451979U, 0x988a32f3U, 0x311465e7U, 0x6228cbceU,
	0xc451979cU, 0x88a32f39U, 0x11465e73U, 0x228cbce6U,
	0x9d8a7a87U, 0x3b14f50fU, 0x7629ea1eU, 0xec53d43cU,
	0xd8a7a879U, 0xb14f50f3U, 0x629ea1e7U, 0xc53d43ceU,
	0x8a7a879dU, 0x14f50f3bU, 0x29ea1e76U, 0x53d43cecU,
	0xa7a879d8U, 0x4f50f3b1U, 0x9ea1e762U, 0x3d43cec5U,
	0x7a879d8aU, 0xf50f3b14U, 0xea1e7629U, 0xd43cec53U,
	0xa879d8a7U, 0x50f3b14fU, 0xa1e7629eU, 0x43cec53dU,
	0x879d8a7aU, 0x0f3b14f5U, 0x1e7629eaU, 0x3cec53d4U,
	0x79d8a7a8U, 0xf3b14f50U, 0xe7629ea1U, 0xcec53d43U,
	0x9d8a7a87U, 0x3b14f50fU, 0x7629ea1eU, 0xec53d43cU,
	0xd8a7a879U, 0xb14f50f3U, 0x629ea1e7U, 0xc53d43ceU,
	0x8a7a879dU, 0x14f50f3bU, 0x29ea1e76U, 0x53d43cecU,
	0xa7a879d8U, 0x4f50f3b1U, 0x9ea1e762U, 0x3d43cec5U,
};
154 
/*
 * SM3 compression function: absorb `blocks` consecutive 64-byte message
 * blocks from `data` into the 8-word chaining state `digest`.
 *
 * Each block is expanded into 68 message words W[0..67]; 64 rounds then
 * update the working registers A..H, and the result is XORed back into the
 * chaining state (the feed-forward step of the SM3 design).
 *
 * Fix: the non-FULL_UNROLL fallback loops indexed the constant table with
 * function-call syntax K(j); K is an array, so this must be K[j].  The bug
 * was latent because FULL_UNROLL is defined unconditionally below.
 */
void sm3_compress_blocks(uint32_t digest[8], const uint8_t *data, size_t blocks)
{
	uint32_t A;
	uint32_t B;
	uint32_t C;
	uint32_t D;
	uint32_t E;
	uint32_t F;
	uint32_t G;
	uint32_t H;
	uint32_t W[68];
	uint32_t SS1, SS2, TT1, TT2;
	int j;

#ifdef SM3_SSE3
	__m128i X, T, R;
	/* M selects the highest 32-bit lane; V byte-swaps each lane so that
	 * big-endian message words can be loaded with one shuffle */
	__m128i M = _mm_setr_epi32(0, 0, 0, 0xffffffff);
	__m128i V = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
#endif

	while (blocks--) {

		A = digest[0];
		B = digest[1];
		C = digest[2];
		D = digest[3];
		E = digest[4];
		F = digest[5];
		G = digest[6];
		H = digest[7];


#ifdef SM3_SSE3

		/* load W[0..15] from the block, byte-swapping to host order */
		for (j = 0; j < 16; j += 4) {
			X = _mm_loadu_si128((__m128i *)(data + j * 4));
			X = _mm_shuffle_epi8(X, V);
			_mm_storeu_si128((__m128i *)(W + j), X);
		}

		/* message expansion, four words per iteration */
		for (j = 16; j < 68; j += 4) {
			/* X = (W[j - 3], W[j - 2], W[j - 1], 0) */
			X = _mm_loadu_si128((__m128i *)(W + j - 3));
			X = _mm_andnot_si128(M, X);

			X = _mm_rotl_epi32(X, 15);
			T = _mm_loadu_si128((__m128i *)(W + j - 9));
			X = _mm_xor_si128(X, T);
			T = _mm_loadu_si128((__m128i *)(W + j - 16));
			X = _mm_xor_si128(X, T);

			/* P1() */
			T = _mm_rotl_epi32(X, (23 - 15));
			T = _mm_xor_si128(T, X);
			T = _mm_rotl_epi32(T, 15);
			X = _mm_xor_si128(X, T);

			T = _mm_loadu_si128((__m128i *)(W + j - 13));
			T = _mm_rotl_epi32(T, 7);
			X = _mm_xor_si128(X, T);
			T = _mm_loadu_si128((__m128i *)(W + j - 6));
			X = _mm_xor_si128(X, T);

			/* the last lane depends on W[j], computed in this very
			 * iteration: W[j + 3] ^= P1(ROL32(W[j + 1], 15)) */
			R = _mm_shuffle_epi32(X, 0);
			R = _mm_and_si128(R, M);
			T = _mm_rotl_epi32(R, 15);
			T = _mm_xor_si128(T, R);
			T = _mm_rotl_epi32(T, 9);
			R = _mm_xor_si128(R, T);
			R = _mm_rotl_epi32(R, 6);
			X = _mm_xor_si128(X, R);

			_mm_storeu_si128((__m128i *)(W + j), X);
		}
#else
		/* scalar expansion: W[0..15] straight from the block ... */
		for (j = 0; j < 16; j++)
			W[j] = GETU32(data + j*4);

		/* ... W[16..67] derived per the SM3 expansion recurrence */
		for (; j < 68; j++)
			W[j] = P1(W[j - 16] ^ W[j - 9] ^ ROL32(W[j - 3], 15))
				^ ROL32(W[j - 13], 7) ^ W[j - 6];
#endif


		j = 0;

#define FULL_UNROLL
#ifdef FULL_UNROLL
		/* 64 rounds fully unrolled: FF00/GG00 for rounds 0..15,
		 * FF16/GG16 for rounds 16..63; R8 advances j by 8 */
		R8(A, B, C, D, E, F, G, H, 00);
		R8(A, B, C, D, E, F, G, H, 00);
		R8(A, B, C, D, E, F, G, H, 16);
		R8(A, B, C, D, E, F, G, H, 16);
		R8(A, B, C, D, E, F, G, H, 16);
		R8(A, B, C, D, E, F, G, H, 16);
		R8(A, B, C, D, E, F, G, H, 16);
		R8(A, B, C, D, E, F, G, H, 16);
#else
		for (; j < 16; j++) {
			SS1 = ROL32((ROL32(A, 12) + E + K[j]), 7);
			SS2 = SS1 ^ ROL32(A, 12);
			TT1 = FF00(A, B, C) + D + SS2 + (W[j] ^ W[j + 4]);
			TT2 = GG00(E, F, G) + H + SS1 + W[j];
			D = C;
			C = ROL32(B, 9);
			B = A;
			A = TT1;
			H = G;
			G = ROL32(F, 19);
			F = E;
			E = P0(TT2);
		}

		for (; j < 64; j++) {
			SS1 = ROL32((ROL32(A, 12) + E + K[j]), 7);
			SS2 = SS1 ^ ROL32(A, 12);
			TT1 = FF16(A, B, C) + D + SS2 + (W[j] ^ W[j + 4]);
			TT2 = GG16(E, F, G) + H + SS1 + W[j];
			D = C;
			C = ROL32(B, 9);
			B = A;
			A = TT1;
			H = G;
			G = ROL32(F, 19);
			F = E;
			E = P0(TT2);
		}
#endif

		/* feed-forward: XOR the round output into the chaining state */
		digest[0] ^= A;
		digest[1] ^= B;
		digest[2] ^= C;
		digest[3] ^= D;
		digest[4] ^= E;
		digest[5] ^= F;
		digest[6] ^= G;
		digest[7] ^= H;

		data += 64;
	}
}
296 
297 
/*
 * Reset `ctx` for a fresh hash: zero all fields, then load the SM3
 * initial chaining value into the digest words.
 */
void sm3_init(SM3_CTX *ctx)
{
	/* initial value (IV) from the SM3 standard */
	static const uint32_t IV[8] = {
		0x7380166F, 0x4914B2B9, 0x172442D7, 0xDA8A0600,
		0xA96F30BC, 0x163138AA, 0xE38DEE4D, 0xB0FB0E4E,
	};
	size_t i;

	memset(ctx, 0, sizeof(*ctx));
	for (i = 0; i < 8; i++) {
		ctx->digest[i] = IV[i];
	}
}
310 
/*
 * Absorb `data_len` bytes of message into the hash state.  Any bytes that
 * do not fill a complete 64-byte block are buffered in ctx->block until the
 * next call (or sm3_finish).
 */
void sm3_update(SM3_CTX *ctx, const uint8_t *data, size_t data_len)
{
	size_t nblocks;

	ctx->num &= 0x3f;

	/* first, top up a partially filled buffer from a previous call */
	if (ctx->num) {
		size_t left = SM3_BLOCK_SIZE - ctx->num;
		if (data_len < left) {
			/* still not a full block: just buffer and return */
			memcpy(ctx->block + ctx->num, data, data_len);
			ctx->num += data_len;
			return;
		}
		memcpy(ctx->block + ctx->num, data, left);
		sm3_compress_blocks(ctx->digest, ctx->block, 1);
		ctx->nblocks++;
		data += left;
		data_len -= left;
	}

	/* bulk-compress every complete block directly from the input */
	nblocks = data_len / SM3_BLOCK_SIZE;
	sm3_compress_blocks(ctx->digest, data, nblocks);
	ctx->nblocks += nblocks;
	data += nblocks * SM3_BLOCK_SIZE;
	data_len -= nblocks * SM3_BLOCK_SIZE;

	/* stash the tail (0..63 bytes) for the next call */
	ctx->num = data_len;
	if (data_len) {
		memcpy(ctx->block, data, data_len);
	}
}
343 
/*
 * Apply the SM3 padding (0x80, zeros, 64-bit big-endian bit length), run
 * the final compression(s), write the 32-byte digest, and wipe the context.
 */
void sm3_finish(SM3_CTX *ctx, uint8_t *digest)
{
	int i;

	ctx->num &= 0x3f;
	/* mandatory 0x80 byte right after the message */
	ctx->block[ctx->num] = 0x80;

	if (ctx->num > SM3_BLOCK_SIZE - 9) {
		/* no room for the 8-byte length: pad out this block,
		 * compress it, then start a zeroed final block */
		memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - 1 - ctx->num);
		sm3_compress_blocks(ctx->digest, ctx->block, 1);
		memset(ctx->block, 0, SM3_BLOCK_SIZE - 8);
	} else {
		/* zero-fill up to the length field at offset 56 */
		memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - 9 - ctx->num);
	}

	/* 64-bit message length in bits = nblocks * 512 + num * 8,
	 * stored big-endian in the last 8 bytes */
	PUTU32(ctx->block + 56, ctx->nblocks >> 23);
	PUTU32(ctx->block + 60, (ctx->nblocks << 9) + (ctx->num << 3));

	sm3_compress_blocks(ctx->digest, ctx->block, 1);
	for (i = 0; i < 8; i++) {
		PUTU32(digest + 4 * i, ctx->digest[i]);
	}
	/* wipe the context so no message/state residue is left behind */
	memset(ctx, 0, sizeof(SM3_CTX));
}
367 
sm3_digest(const uint8_t * msg,size_t msglen,uint8_t dgst[SM3_DIGEST_SIZE])368 void sm3_digest(const uint8_t *msg, size_t msglen,
369 	uint8_t dgst[SM3_DIGEST_SIZE])
370 {
371 	SM3_CTX ctx;
372 	sm3_init(&ctx);
373 	sm3_update(&ctx, msg, msglen);
374 	sm3_finish(&ctx, dgst);
375 }
376