1 /*
2 * Copyright 2014-2022 The GmSSL Project. All Rights Reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the License); you may
5 * not use this file except in compliance with the License.
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 */
9
10
11 #include <string.h>
12 #include <gmssl/sm3.h>
13 #include <gmssl/endian.h>
14 #include <gmssl/error.h>
15
16
17 #ifdef SM3_SSE3
18 # include <x86intrin.h>
19 # include <immintrin.h>
20
21 # define _mm_rotl_epi32(X,i) \
22 _mm_xor_si128(_mm_slli_epi32((X),(i)), _mm_srli_epi32((X),32-(i)))
23 #endif
24
25
/* 32-bit left rotation; n must be in 1..31 so neither shift is by 0 or 32 */
#define ROTL(x,n)  (((x)<<(n)) | ((x)>>(32-(n))))

/* SM3 permutations; ROL32 is not defined in this file (expected from an included header) */
#define P0(x) ((x) ^ ROL32((x), 9) ^ ROL32((x),17))
#define P1(x) ((x) ^ ROL32((x),15) ^ ROL32((x),23))

/*
 * Boolean functions. The 00 variants are selected for the first two R8 groups
 * (rounds 0..15), the 16 variants for the remaining rounds.
 * FF16 is the bitwise majority; GG16 selects y where x is set, z elsewhere.
 */
#define FF00(x,y,z)  ((x) ^ (y) ^ (z))
#define FF16(x,y,z)  (((x)&(y)) | ((x)&(z)) | ((y)&(z)))
#define GG00(x,y,z)  ((x) ^ (y) ^ (z))
#define GG16(x,y,z)  ((((y)^(z)) & (x)) ^ (z))

/*
 * One compression round, written without moving the eight working words:
 * only B, H, F and D are overwritten, and the caller rotates the argument
 * order for the next round instead (see R8).
 *
 * Expects SS1, SS2, TT1, TT2, W[], K[] and the round counter j to be in scope
 * at the expansion site; j is incremented once per round. `xx` is pasted onto
 * FF/GG to pick the 00 or 16 boolean functions.
 *
 * Wrapped in do { } while (0) so that an expansion is a single statement.
 */
#define R(A, B, C, D, E, F, G, H, xx)					\
	do {								\
		SS1 = ROL32((ROL32((A), 12) + (E) + K[j]), 7);		\
		SS2 = SS1 ^ ROL32((A), 12);				\
		TT1 = FF##xx((A), (B), (C)) + (D) + SS2 + (W[j] ^ W[j + 4]);	\
		TT2 = GG##xx((E), (F), (G)) + (H) + SS1 + W[j];		\
		(B) = ROL32((B), 9);					\
		(H) = TT1;						\
		(F) = ROL32((F), 19);					\
		(D) = P0(TT2);						\
		j++;							\
	} while (0)

/*
 * Eight consecutive rounds. The argument list is rotated by one position on
 * every line, so after the eighth round each name holds its usual role again.
 */
#define R8(A, B, C, D, E, F, G, H, xx)				\
	do {							\
		R(A, B, C, D, E, F, G, H, xx);			\
		R(H, A, B, C, D, E, F, G, xx);			\
		R(G, H, A, B, C, D, E, F, xx);			\
		R(F, G, H, A, B, C, D, E, xx);			\
		R(E, F, G, H, A, B, C, D, xx);			\
		R(D, E, F, G, H, A, B, C, xx);			\
		R(C, D, E, F, G, H, A, B, xx);			\
		R(B, C, D, E, F, G, H, A, xx);			\
	} while (0)
55
56
57
/* Base round constants: T00 for rounds 0..15, T16 for rounds 16..63 */
#define T00 0x79cc4519U
#define T16 0x7a879d8aU

/*
 * Pre-rotated round constants: Kj is the base constant for round j rotated
 * left by (j mod 32) bits, so the round function can index a table instead of
 * rotating at run time.
 */
#define K0  0x79cc4519U
#define K1  0xf3988a32U
#define K2  0xe7311465U
#define K3  0xce6228cbU
#define K4  0x9cc45197U
#define K5  0x3988a32fU
#define K6  0x7311465eU
#define K7  0xe6228cbcU
#define K8  0xcc451979U
#define K9  0x988a32f3U
#define K10 0x311465e7U
#define K11 0x6228cbceU
#define K12 0xc451979cU
#define K13 0x88a32f39U
#define K14 0x11465e73U
#define K15 0x228cbce6U
#define K16 0x9d8a7a87U
#define K17 0x3b14f50fU
#define K18 0x7629ea1eU
#define K19 0xec53d43cU
#define K20 0xd8a7a879U
#define K21 0xb14f50f3U
#define K22 0x629ea1e7U
#define K23 0xc53d43ceU
#define K24 0x8a7a879dU
#define K25 0x14f50f3bU
#define K26 0x29ea1e76U
#define K27 0x53d43cecU
#define K28 0xa7a879d8U
#define K29 0x4f50f3b1U
#define K30 0x9ea1e762U
#define K31 0x3d43cec5U
#define K32 0x7a879d8aU
#define K33 0xf50f3b14U
#define K34 0xea1e7629U
#define K35 0xd43cec53U
#define K36 0xa879d8a7U
#define K37 0x50f3b14fU
#define K38 0xa1e7629eU
#define K39 0x43cec53dU
#define K40 0x879d8a7aU
#define K41 0x0f3b14f5U
#define K42 0x1e7629eaU
#define K43 0x3cec53d4U
#define K44 0x79d8a7a8U
#define K45 0xf3b14f50U
#define K46 0xe7629ea1U
#define K47 0xcec53d43U
#define K48 0x9d8a7a87U
#define K49 0x3b14f50fU
#define K50 0x7629ea1eU
#define K51 0xec53d43cU
#define K52 0xd8a7a879U
#define K53 0xb14f50f3U
#define K54 0x629ea1e7U
#define K55 0xc53d43ceU
#define K56 0x8a7a879dU
#define K57 0x14f50f3bU
#define K58 0x29ea1e76U
#define K59 0x53d43cecU
#define K60 0xa7a879d8U
#define K61 0x4f50f3b1U
#define K62 0x9ea1e762U
#define K63 0x3d43cec5U

/* Round-constant lookup table, indexed by round number; read-only */
static const uint32_t K[64] = {
	K0,  K1,  K2,  K3,  K4,  K5,  K6,  K7,
	K8,  K9,  K10, K11, K12, K13, K14, K15,
	K16, K17, K18, K19, K20, K21, K22, K23,
	K24, K25, K26, K27, K28, K29, K30, K31,
	K32, K33, K34, K35, K36, K37, K38, K39,
	K40, K41, K42, K43, K44, K45, K46, K47,
	K48, K49, K50, K51, K52, K53, K54, K55,
	K56, K57, K58, K59, K60, K61, K62, K63,
};
154
/*
 * Run the SM3 compression function over `blocks` consecutive 64-byte blocks.
 *
 * digest  eight 32-bit chaining words; loaded at the start of every block and
 *         updated in place by XOR-ing in the result of the 64 rounds
 * data    input bytes; 64 bytes are consumed per block
 * blocks  number of blocks to process; 0 leaves `digest` untouched
 *
 * Relies on ROL32 and GETU32, which are not defined in this file (expected
 * from the included gmssl headers), and on the K[] table and R8 round macros
 * defined above.
 */
void sm3_compress_blocks(uint32_t digest[8], const uint8_t *data, size_t blocks)
{
	uint32_t A;
	uint32_t B;
	uint32_t C;
	uint32_t D;
	uint32_t E;
	uint32_t F;
	uint32_t G;
	uint32_t H;
	/* 68 expanded words: round j reads W[j] and W[j + 4], j = 0..63 */
	uint32_t W[68];
	uint32_t SS1, SS2, TT1, TT2;
	int j;

#ifdef SM3_SSE3
	__m128i X, T, R;
	/* M selects lane 3 only; V reverses the bytes inside each 32-bit lane */
	__m128i M = _mm_setr_epi32(0, 0, 0, 0xffffffff);
	__m128i V = _mm_setr_epi8(3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12);
#endif

	while (blocks--) {

		A = digest[0];
		B = digest[1];
		C = digest[2];
		D = digest[3];
		E = digest[4];
		F = digest[5];
		G = digest[6];
		H = digest[7];


#ifdef SM3_SSE3

		/* Load the 16 message words, four at a time, byte-swapping each word */
		for (j = 0; j < 16; j += 4) {
			X = _mm_loadu_si128((const __m128i *)(data + j * 4));
			X = _mm_shuffle_epi8(X, V);
			_mm_storeu_si128((__m128i *)(W + j), X);
		}

		/* Expand four words W[j..j+3] per iteration */
		for (j = 16; j < 68; j += 4) {
			/* X = (W[j - 3], W[j - 2], W[j - 1], 0); W[j] is not computed yet */
			X = _mm_loadu_si128((__m128i *)(W + j - 3));
			X = _mm_andnot_si128(M, X);

			X = _mm_rotl_epi32(X, 15);
			T = _mm_loadu_si128((__m128i *)(W + j - 9));
			X = _mm_xor_si128(X, T);
			T = _mm_loadu_si128((__m128i *)(W + j - 16));
			X = _mm_xor_si128(X, T);

			/* P1(): X ^ rotl(X ^ rotl(X, 8), 15) == X ^ rotl(X, 15) ^ rotl(X, 23) */
			T = _mm_rotl_epi32(X, (23 - 15));
			T = _mm_xor_si128(T, X);
			T = _mm_rotl_epi32(T, 15);
			X = _mm_xor_si128(X, T);

			T = _mm_loadu_si128((__m128i *)(W + j - 13));
			T = _mm_rotl_epi32(T, 7);
			X = _mm_xor_si128(X, T);
			T = _mm_loadu_si128((__m128i *)(W + j - 6));
			X = _mm_xor_si128(X, T);

			/*
			 * Lane 3 was computed with W[j] masked to zero. Lane 0 now holds
			 * W[j], so add the missing term:
			 * W[j + 3] ^= P1(ROL32(W[j], 15))
			 */
			R = _mm_shuffle_epi32(X, 0);
			R = _mm_and_si128(R, M);
			T = _mm_rotl_epi32(R, 15);
			T = _mm_xor_si128(T, R);
			T = _mm_rotl_epi32(T, 9);
			R = _mm_xor_si128(R, T);
			R = _mm_rotl_epi32(R, 6);
			X = _mm_xor_si128(X, R);

			_mm_storeu_si128((__m128i *)(W + j), X);
		}
#else
		/* Load the 16 message words of this block */
		for (j = 0; j < 16; j++)
			W[j] = GETU32(data + j*4);

		/* Message expansion for W[16..67] */
		for (; j < 68; j++)
			W[j] = P1(W[j - 16] ^ W[j - 9] ^ ROL32(W[j - 3], 15))
				^ ROL32(W[j - 13], 7) ^ W[j - 6];
#endif


		/* Round counter; advanced by the R macro / the loops below */
		j = 0;

#define FULL_UNROLL
#ifdef FULL_UNROLL
		/* Rounds 0..15 use FF00/GG00, rounds 16..63 use FF16/GG16 */
		R8(A, B, C, D, E, F, G, H, 00);
		R8(A, B, C, D, E, F, G, H, 00);
		R8(A, B, C, D, E, F, G, H, 16);
		R8(A, B, C, D, E, F, G, H, 16);
		R8(A, B, C, D, E, F, G, H, 16);
		R8(A, B, C, D, E, F, G, H, 16);
		R8(A, B, C, D, E, F, G, H, 16);
		R8(A, B, C, D, E, F, G, H, 16);
#else
		/* Rolled fallback: same rounds, moving the working words explicitly */
		for (; j < 16; j++) {
			SS1 = ROL32((ROL32(A, 12) + E + K[j]), 7);
			SS2 = SS1 ^ ROL32(A, 12);
			TT1 = FF00(A, B, C) + D + SS2 + (W[j] ^ W[j + 4]);
			TT2 = GG00(E, F, G) + H + SS1 + W[j];
			D = C;
			C = ROL32(B, 9);
			B = A;
			A = TT1;
			H = G;
			G = ROL32(F, 19);
			F = E;
			E = P0(TT2);
		}

		for (; j < 64; j++) {
			SS1 = ROL32((ROL32(A, 12) + E + K[j]), 7);
			SS2 = SS1 ^ ROL32(A, 12);
			TT1 = FF16(A, B, C) + D + SS2 + (W[j] ^ W[j + 4]);
			TT2 = GG16(E, F, G) + H + SS1 + W[j];
			D = C;
			C = ROL32(B, 9);
			B = A;
			A = TT1;
			H = G;
			G = ROL32(F, 19);
			F = E;
			E = P0(TT2);
		}
#endif

		/* Feed-forward: fold the round output into the chaining value */
		digest[0] ^= A;
		digest[1] ^= B;
		digest[2] ^= C;
		digest[3] ^= D;
		digest[4] ^= E;
		digest[5] ^= F;
		digest[6] ^= G;
		digest[7] ^= H;

		data += 64;
	}
}
296
297
/*
 * Reset a hashing context: every field is zeroed, then the eight digest words
 * are set to the fixed initial values below.
 */
void sm3_init(SM3_CTX *ctx)
{
	static const uint32_t initial_state[8] = {
		0x7380166F, 0x4914B2B9, 0x172442D7, 0xDA8A0600,
		0xA96F30BC, 0x163138AA, 0xE38DEE4D, 0xB0FB0E4E,
	};
	size_t w;

	memset(ctx, 0, sizeof(*ctx));
	for (w = 0; w < 8; w++) {
		ctx->digest[w] = initial_state[w];
	}
}
310
/*
 * Absorb `data_len` bytes of input into the hashing context.
 *
 * ctx       context previously set up by sm3_init()
 * data      input bytes; may be NULL when data_len is 0
 * data_len  number of bytes at `data`
 *
 * Complete 64-byte blocks are compressed immediately. A trailing partial
 * block is kept in ctx->block, with its length in ctx->num, until more input
 * arrives or sm3_finish() is called.
 */
void sm3_update(SM3_CTX *ctx, const uint8_t *data, size_t data_len)
{
	size_t blocks;

	ctx->num &= 0x3f;

	/*
	 * Nothing to absorb. Returning here also avoids passing a possibly-NULL
	 * `data` to memcpy or doing pointer arithmetic on it.
	 */
	if (data_len == 0) {
		return;
	}

	/* Top up a partially filled buffer first */
	if (ctx->num) {
		size_t left = SM3_BLOCK_SIZE - ctx->num;
		if (data_len < left) {
			/* Still not a full block: just buffer the input */
			memcpy(ctx->block + ctx->num, data, data_len);
			ctx->num += data_len;
			return;
		}
		memcpy(ctx->block + ctx->num, data, left);
		sm3_compress_blocks(ctx->digest, ctx->block, 1);
		ctx->nblocks++;
		data += left;
		data_len -= left;
	}

	/* Compress all complete blocks straight from the caller's buffer */
	blocks = data_len / SM3_BLOCK_SIZE;
	if (blocks) {
		sm3_compress_blocks(ctx->digest, data, blocks);
		ctx->nblocks += blocks;
		data += SM3_BLOCK_SIZE * blocks;
		data_len -= SM3_BLOCK_SIZE * blocks;
	}

	/* Keep the tail (fewer than SM3_BLOCK_SIZE bytes) for the next call */
	ctx->num = data_len;
	if (data_len) {
		memcpy(ctx->block, data, data_len);
	}
}
343
/*
 * Pad the buffered tail, run the final compression and write out the digest.
 *
 * ctx     context fed through sm3_update(); it is zeroed before returning, so
 *         it must be set up again with sm3_init() before reuse
 * digest  output buffer; the eight state words are stored with PUTU32 at
 *         4-byte strides (PUTU32 is not defined in this file)
 */
void sm3_finish(SM3_CTX *ctx, uint8_t *digest)
{
	uint8_t *out = digest;
	int word;

	/* Padding starts with a single 0x80 byte right after the buffered data */
	ctx->num &= 0x3f;
	ctx->block[ctx->num] = 0x80;

	if (ctx->num > SM3_BLOCK_SIZE - 9) {
		/*
		 * No room left for the 8-byte length field: zero-fill and compress
		 * this block, then start a fresh all-zero block for the length.
		 */
		memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - ctx->num - 1);
		sm3_compress_blocks(ctx->digest, ctx->block, 1);
		memset(ctx->block, 0, SM3_BLOCK_SIZE - 8);
	} else {
		/* Zero-fill up to, but not including, the last 8 bytes */
		memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - ctx->num - 9);
	}

	/*
	 * Last 8 bytes: the bit count nblocks * 512 + num * 8, written as two
	 * 32-bit words at offsets 56 (upper part) and 60 (lower part).
	 */
	PUTU32(ctx->block + 56, ctx->nblocks >> 23);
	PUTU32(ctx->block + 60, (ctx->nblocks << 9) + (ctx->num << 3));
	sm3_compress_blocks(ctx->digest, ctx->block, 1);

	/* Serialise the eight state words into the caller's buffer */
	for (word = 0; word < 8; word++) {
		PUTU32(out, ctx->digest[word]);
		out += 4;
	}

	/* Wipe the whole context */
	memset(ctx, 0, sizeof(*ctx));
}
367
sm3_digest(const uint8_t * msg,size_t msglen,uint8_t dgst[SM3_DIGEST_SIZE])368 void sm3_digest(const uint8_t *msg, size_t msglen,
369 uint8_t dgst[SM3_DIGEST_SIZE])
370 {
371 SM3_CTX ctx;
372 sm3_init(&ctx);
373 sm3_update(&ctx, msg, msglen);
374 sm3_finish(&ctx, dgst);
375 }
376