/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
2024-03-01 : Igor Pavlov : Public domain */

#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"

#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif

// #define Z7_USE_HW_SHA_STUB // for debug

#ifdef MY_CPU_X86_OR_AMD64
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
  #define USE_HW_SHA
#elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
   || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
   || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
  #define USE_HW_SHA
  #if !defined(__INTEL_COMPILER)
  // icc defines __GNUC__, but icc doesn't support __attribute__((__target__))
  #if !defined(__SHA__) || !defined(__SSSE3__)
    #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
  #endif
  #endif
#elif defined(_MSC_VER)
  #ifdef USE_MY_MM
    #define USE_VER_MIN 1300
  #else
    #define USE_VER_MIN 1900
  #endif
  #if (_MSC_VER >= USE_VER_MIN)
    #define USE_HW_SHA
  #else
    #define Z7_USE_HW_SHA_STUB
  #endif
#endif
// #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
// #define Z7_USE_HW_SHA_STUB // for debug
#endif

#ifdef USE_HW_SHA

// #pragma message("Sha256 HW")

// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#include <immintrin.h>

#if defined (__clang__) && defined(_MSC_VER)
// #if !defined(__SSSE3__)
// #endif
#if !defined(__SHA__)
#include <shaintrin.h>
#endif
#else

#ifdef USE_MY_MM
#include "My_mm.h"
#endif

#endif

/*
SHA256 uses:
SSE2:
  _mm_loadu_si128
  _mm_storeu_si128
  _mm_set_epi32
  _mm_add_epi32
  _mm_shuffle_epi32 / pshufd

SSSE3:
  _mm_shuffle_epi8 / pshufb
  _mm_alignr_epi8
SHA:
  _mm_sha256*
*/
// The K array must be aligned to at least 16 bytes.
// The compiler can see the align attribute and select:
//   movdqu - for code without the align attribute
//   movdqa - for code with the align attribute
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY
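// SHA256_K_ARRAY holds the 64 SHA-256 round constants (the fractional parts
// of the cube roots of the first 64 primes); it is defined in another
// translation unit (Sha256.c in this codebase).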


#define ADD_EPI32(dest, src)      dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src)    dest = _mm_sha256msg1_epu32(dest, src);
#define SHA256_MSG2(dest, src)    dest = _mm_sha256msg2_epu32(dest, src);


#define LOAD_SHUFFLE(m, k) \
    m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
    m = _mm_shuffle_epi8(m, mask);

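// LOAD_SHUFFLE loads 16 message bytes and byte-swaps each 32-bit word:
// SHA-256 message words are big-endian, while x86 loads are little-endian.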
#define SM1(g0, g1, g2, g3) \
    SHA256_MSG1(g3, g0);

#define SM2(g0, g1, g2, g3) \
    tmp = _mm_alignr_epi8(g1, g0, 4); \
    ADD_EPI32(g2, tmp) \
    SHA256_MSG2(g2, g1);

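// SM1 and SM2 compute the message schedule 4 words at a time:
// sha256msg1 adds the sigma0 terms, alignr shifts in the W[t-7] words,
// and sha256msg2 adds the sigma1 terms to complete the new words.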
// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)


#define NNN(g0, g1, g2, g3)


#define RND2(t0, t1) \
    t0 = _mm_sha256rnds2_epu32(t0, t1, msg);

#define RND2_0(m, k) \
    msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
    RND2(state0, state1); \
    msg = _mm_shuffle_epi32(msg, 0x0E);

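// Each _mm_sha256rnds2_epu32 call performs 2 rounds and consumes the low
// 64 bits of msg; the pshufd with 0x0E moves the upper two (W+K) values
// into the low half so that RND2_1 can process the next 2 rounds.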

#define RND2_1 \
    RND2(state1, state0);


// We use a scheduling scheme that runs SHA256_MSG1 3 rounds ahead and SHA256_MSG2 2 rounds ahead.

#define R4(k, g0, g1, g2, g3, OP0, OP1) \
    RND2_0(g0, k) \
    OP0(g0, g1, g2, g3) \
    RND2_1 \
    OP1(g0, g1, g2, g3)

#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 )

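// R16 processes 16 rounds; the message registers m0..m3 rotate one position
// per R4 group, so each register is rescheduled once every 16 rounds.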
#define PREPARE_STATE \
    tmp    = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
    state0 = _mm_shuffle_epi32(state1, 0x1B); /* efgh */ \
    state1 = state0; \
    state0 = _mm_unpacklo_epi64(state0, tmp); /* cdgh */ \
    state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */

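// sha256rnds2 expects the state split as (cdgh, abef) rather than the linear
// a..h order stored in state[8]. PREPARE_STATE converts between the two
// layouts; applying it twice restores the original order, which is why it is
// used both before and after the main loop.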

void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
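  // mask for _mm_shuffle_epi8 that reverses the byte order inside each
  // 32-bit lane (the message is interpreted as big-endian words)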
  const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
  __m128i tmp;
  __m128i state0, state1;

  if (numBlocks == 0)
    return;

  state0 = _mm_loadu_si128((const __m128i *) (const void *) &state[0]);
  state1 = _mm_loadu_si128((const __m128i *) (const void *) &state[4]);

  PREPARE_STATE

  do
  {
    __m128i state0_save, state1_save;
    __m128i m0, m1, m2, m3;
    __m128i msg;
    // #define msg tmp

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

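    // NNN rounds have no message-schedule work: the schedule pipeline has
    // not started yet in the first rounds and has already drained in the
    // last rounds of the block.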
    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )

    ADD_EPI32(state0, state0_save)
    ADD_EPI32(state1, state1_save)

    data += 64;
  }
  while (--numBlocks);

  PREPARE_STATE

  _mm_storeu_si128((__m128i *) (void *) &state[0], state0);
  _mm_storeu_si128((__m128i *) (void *) &state[4], state1);
}
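/*
Illustrative usage sketch (not part of this file; `buf` and `size` are
assumed caller variables): the function consumes whole 64-byte blocks,
here starting from the standard SHA-256 initial state:

  UInt32 st[8] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
  Sha256_UpdateBlocks_HW(st, buf, size / 64); // size must be a multiple of 64
*/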

#endif // USE_HW_SHA

#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)

#if defined(__ARM_FEATURE_SHA2) \
 || defined(__ARM_FEATURE_CRYPTO)
  #define USE_HW_SHA
#else
#if defined(MY_CPU_ARM64) \
 || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
 || defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
    ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
   || defined(__GNUC__) && (__GNUC__ >= 6) \
    ) \
 || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
 || !defined(Z7_CLANG_VERSION) \
 || defined(__ARM_NEON) && \
    (Z7_CLANG_VERSION < 170000 || \
     Z7_CLANG_VERSION > 170001)
  #define USE_HW_SHA
#endif
#endif
#endif
#endif

#ifdef USE_HW_SHA

// #pragma message("=== Sha256 HW === ")

#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA2) && \
    !defined(__ARM_FEATURE_CRYPTO)
#ifdef MY_CPU_ARM64
#if defined(__clang__)
    #define ATTRIB_SHA __attribute__((__target__("crypto")))
#else
    #define ATTRIB_SHA __attribute__((__target__("+crypto")))
#endif
#else
#if defined(__clang__) && (__clang_major__ >= 1)
    #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2")))
#else
    #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
#endif
#endif
#else
// _MSC_VER
// for arm32
#define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else

#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA2) && \
    !defined(__ARM_FEATURE_CRYPTO)
// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
#define __ARM_FEATURE_CRYPTO 1
// #else
#define __ARM_FEATURE_SHA2 1
// #endif
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang

#if defined(__clang__)

#if defined(__ARM_ARCH) && __ARM_ARCH < 8
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
// #pragma message("#define __ARM_ARCH 8")
#undef  __ARM_ARCH
#define __ARM_ARCH 8
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif

#endif // clang

#include <arm_neon.h>

#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
    defined(__ARM_FEATURE_CRYPTO) && \
    defined(__ARM_FEATURE_SHA2)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#undef __ARM_FEATURE_CRYPTO
#undef __ARM_FEATURE_SHA2
#undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif

#endif // Z7_MSC_VER_ORIGINAL

typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC

#ifdef MY_CPU_BE
  #define MY_rev32_for_LE(x)
#else
  #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#endif
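// On little-endian targets, vrev32q_u8 byte-swaps each 32-bit lane so the
// message bytes are read as big-endian words, as SHA-256 requires; on
// big-endian targets the loads already produce the right word values.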

#define LOAD_128(_p)      (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)

#define LOAD_SHUFFLE(m, k) \
    m = LOAD_128((data + (k) * 16)); \
    MY_rev32_for_LE(m);

// The K array must be aligned to at least 16 bytes.
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define SHA256_SU0(dest, src)        dest = vsha256su0q_u32(dest, src);
#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);

#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0)
#define SM2(g0, g1, g2, g3) SHA256_SU1(g2, g0, g1)
#define NNN(g0, g1, g2, g3)
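// SM1/SM2 mirror the x86 pairing above: vsha256su0q_u32 adds the sigma0
// part of the message schedule and vsha256su1q_u32 completes it with the
// sigma1 and W[t-7] terms.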

#define R4(k, g0, g1, g2, g3, OP0, OP1) \
    msg    = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
    tmp    = state0; \
    state0 = vsha256hq_u32 ( state0, state1, msg ); \
    state1 = vsha256h2q_u32( state1, tmp,    msg ); \
    OP0(g0, g1, g2, g3); \
    OP1(g0, g1, g2, g3);

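// Each R4 performs 4 rounds: vsha256hq_u32 updates the abcd half and
// vsha256h2q_u32 the efgh half; tmp preserves the pre-round abcd value
// because both instructions need the same input state.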

#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 )


void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  v128 state0, state1;

  if (numBlocks == 0)
    return;

  state0 = LOAD_128(&state[0]);
  state1 = LOAD_128(&state[4]);

  do
  {
    v128 state0_save, state1_save;
    v128 m0, m1, m2, m3;
    v128 msg, tmp;

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )

    state0 = vaddq_u32(state0, state0_save);
    state1 = vaddq_u32(state1, state1_save);

    data += 64;
  }
  while (--numBlocks);

  STORE_128(&state[0], state0);
  STORE_128(&state[4], state1);
}

#endif // USE_HW_SHA

#endif // MY_CPU_ARM_OR_ARM64


#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>
// This file can be compiled with another C compiler, or an asm version
// can be compiled instead, so that real code replaces this stub function.
// #include "Sha256.h"
// #if defined(_MSC_VER)
#pragma message("Sha256 HW-SW stub was used")
// #endif
void Z7_FASTCALL Sha256_UpdateBlocks   (UInt32 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
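  // The stub simply forwards to the portable software implementation.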
  Sha256_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}
#endif


#undef K
#undef RND2
#undef RND2_0
#undef RND2_1

#undef MY_rev32_for_LE
#undef NNN
#undef LOAD_128
#undef STORE_128
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2

#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB