/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions
2024-03-01 : Igor Pavlov : Public domain */

#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"

#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif
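
// USE_MY_MM (disabled above) appears intended for old MSVC versions that
// lack the SHA intrinsic headers; when it is defined, the project-local
// "My_mm.h" shim is included below and the MSVC version floor drops to 1300.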

// #define Z7_USE_HW_SHA_STUB // for debug

#ifdef MY_CPU_X86_OR_AMD64
  #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
      #define USE_HW_SHA
  #elif defined(Z7_LLVM_CLANG_VERSION)  && (Z7_LLVM_CLANG_VERSION  >= 30800) \
     || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
     || defined(Z7_GCC_VERSION)         && (Z7_GCC_VERSION         >= 40900)
      #define USE_HW_SHA
      #if !defined(__INTEL_COMPILER)
      // icc defines __GNUC__, but icc doesn't support __attribute__((__target__))
      #if !defined(__SHA__) || !defined(__SSSE3__)
        #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
      #endif
      #endif
  #elif defined(_MSC_VER)
    #ifdef USE_MY_MM
      #define USE_VER_MIN 1300
    #else
      #define USE_VER_MIN 1900
    #endif
    #if (_MSC_VER >= USE_VER_MIN)
      #define USE_HW_SHA
    #else
      #define Z7_USE_HW_SHA_STUB
    #endif
  #endif
// #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
  // #define Z7_USE_HW_SHA_STUB // for debug
#endif

#ifdef USE_HW_SHA

// #pragma message("Sha1 HW")

// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#include <immintrin.h>

#if defined (__clang__) && defined(_MSC_VER)
  // #if !defined(__SSSE3__)
  // #endif
  #if !defined(__SHA__)
    #include <shaintrin.h>
  #endif
#else

#ifdef USE_MY_MM
#include "My_mm.h"
#endif

#endif

/*
SHA1 uses:
SSE2:
  _mm_loadu_si128
  _mm_storeu_si128
  _mm_set_epi32
  _mm_add_epi32
  _mm_shuffle_epi32 / pshufd
  _mm_xor_si128
  _mm_cvtsi128_si32
  _mm_cvtsi32_si128
SSSE3:
  _mm_shuffle_epi8 / pshufb

SHA:
  _mm_sha1*
*/
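
/*
How the SHA instructions are used below (summary, not a full spec):
  _mm_sha1rnds4_epu32(abcd, e0, f) runs 4 SHA-1 rounds on the state in abcd;
    the immediate f (0..3) selects the round function and constant for the
    current 20-round segment (Ch/K0, Parity/K1, Maj/K2, Parity/K3).
  _mm_sha1nexte_epu32(e, m) derives the E input for the next 4 rounds
    (rotated A from the preceding round group) and adds it to the next
    4 message words in m.
  _mm_sha1msg1_epu32 / _mm_sha1msg2_epu32 compute the two halves of the
    message schedule recurrence
    W[t] = rol1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]).
*/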


#define XOR_SI128(dest, src)      dest = _mm_xor_si128(dest, src);
#define SHUFFLE_EPI8(dest, mask)  dest = _mm_shuffle_epi8(dest, mask);
#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask);
#ifdef __clang__
#define SHA1_RNDS4_RET_TYPE_CAST (__m128i)
#else
#define SHA1_RNDS4_RET_TYPE_CAST
#endif
#define SHA1_RND4(abcd, e0, f)    abcd = SHA1_RNDS4_RET_TYPE_CAST _mm_sha1rnds4_epu32(abcd, e0, f);
#define SHA1_NEXTE(e, m)          e = _mm_sha1nexte_epu32(e, m);
#define ADD_EPI32(dest, src)      dest = _mm_add_epi32(dest, src);
#define SHA1_MSG1(dest, src)      dest = _mm_sha1msg1_epu32(dest, src);
#define SHA1_MSG2(dest, src)      dest = _mm_sha1msg2_epu32(dest, src);


#define LOAD_SHUFFLE(m, k) \
    m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
    SHUFFLE_EPI8(m, mask) \

#define SM1(m0, m1, m2, m3) \
    SHA1_MSG1(m0, m1) \

#define SM2(m0, m1, m2, m3) \
    XOR_SI128(m3, m1) \
    SHA1_MSG2(m3, m2) \

#define SM3(m0, m1, m2, m3) \
    XOR_SI128(m3, m1) \
    SM1(m0, m1, m2, m3) \
    SHA1_MSG2(m3, m2) \

#define NNN(m0, m1, m2, m3)
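
/*
SM1/SM2/SM3 advance the message schedule while rounds execute:
SM1 starts a schedule step (sha1msg1 on the older words), SM2 finishes one
(the xor plus sha1msg2), SM3 does both, and NNN is a no-op used in the last
round groups, where no further message words are needed.
*/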


#define R4(k, e0, e1, m0, m1, m2, m3, OP) \
    e1 = abcd; \
    SHA1_RND4(abcd, e0, (k) / 5) \
    SHA1_NEXTE(e1, m1) \
    OP(m0, m1, m2, m3) \

#define R16(k, mx, OP0, OP1, OP2, OP3) \
    R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \
    R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \
    R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \
    R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \

#define PREPARE_STATE \
    SHUFFLE_EPI32 (abcd, 0x1B) \
    SHUFFLE_EPI32 (e0,   0x1B) \
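
/*
R4 executes 4 rounds: it captures abcd (whose A lane, rotated, becomes the
next E via sha1nexte), runs sha1rnds4, and prepares E for the following
group; (k) / 5 maps the four-round groups k = 0..19 to the function
selector 0..3 expected by sha1rnds4. PREPARE_STATE (pshufd with 0x1B)
reverses the 32-bit lanes, converting between the memory order of state[]
and the lane order the SHA instructions expect; it is applied once on
entry and once on exit.
*/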


void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
{
  // mask reverses the bytes within each 32-bit lane (SHA-1 is big-endian)
  const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);

  __m128i abcd, e0;

  if (numBlocks == 0)
    return;

  abcd = _mm_loadu_si128((const __m128i *) (const void *) &state[0]); // dbca
  e0 = _mm_cvtsi32_si128((int)state[4]); // 000e

  PREPARE_STATE

  do
  {
    __m128i abcd_save, e2;
    __m128i m0, m1, m2, m3;
    __m128i e1;

    abcd_save = abcd;
    e2 = e0; // saved E; added back through sha1nexte in the last round group

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    ADD_EPI32(e0, m0)

    R16 ( 0, m0, SM1, SM3, SM3, SM3 )
    R16 ( 1, m0, SM3, SM3, SM3, SM3 )
    R16 ( 2, m0, SM3, SM3, SM3, SM3 )
    R16 ( 3, m0, SM3, SM3, SM3, SM3 )
    R16 ( 4, e2, SM2, NNN, NNN, NNN )

    // feed-forward: add the saved input state (Davies-Meyer)
    ADD_EPI32(abcd, abcd_save)

    data += 64;
  }
  while (--numBlocks);

  PREPARE_STATE

  _mm_storeu_si128((__m128i *) (void *) state, abcd);
  *(state+4) = (UInt32)_mm_cvtsi128_si32(e0);
}
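
/*
Illustrative call (a sketch, not part of this file; real callers normally go
through the dispatch in Sha1.c, which selects the HW routine only when CPU
support has been detected):

  UInt32 st[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
  Byte block[64];
  // ... fill block with one complete 64-byte SHA-1 message block ...
  Sha1_UpdateBlocks_HW(st, block, 1); // st now holds the updated 5-word state
*/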

#endif // USE_HW_SHA

#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) \
   && (!defined(Z7_MSC_VER_ORIGINAL) || (_MSC_VER >= 1929) && (_MSC_FULL_VER >= 192930037))
  #if   defined(__ARM_FEATURE_SHA2) \
     || defined(__ARM_FEATURE_CRYPTO)
    #define USE_HW_SHA
  #else
    #if  defined(MY_CPU_ARM64) \
      || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
      || defined(Z7_MSC_VER_ORIGINAL)
    #if  defined(__ARM_FP) && \
          (   defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
           || defined(__GNUC__) && (__GNUC__ >= 6) \
          ) \
      || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
    #if  defined(MY_CPU_ARM64) \
      || !defined(Z7_CLANG_VERSION) \
      || defined(__ARM_NEON) && \
          (Z7_CLANG_VERSION < 170000 || \
           Z7_CLANG_VERSION > 170001)
      #define USE_HW_SHA
    #endif
    #endif
    #endif
  #endif
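
// The cascade above enables the HW path either when the compiler already
// advertises SHA support (__ARM_FEATURE_SHA2 / __ARM_FEATURE_CRYPTO), or for
// toolchains new enough to accept the target attributes defined below; the
// Z7_CLANG_VERSION 170000/170001 exclusion presumably sidesteps an arm32
// regression in clang 17.0.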

#ifdef USE_HW_SHA

// #pragma message("=== Sha1 HW === ")
// __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_SHA2

#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA2) && \
    !defined(__ARM_FEATURE_CRYPTO)
  #ifdef MY_CPU_ARM64
#if defined(__clang__)
    #define ATTRIB_SHA __attribute__((__target__("crypto")))
#else
    #define ATTRIB_SHA __attribute__((__target__("+crypto")))
#endif
  #else
#if defined(__clang__) && (__clang_major__ >= 1)
    #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2")))
#else
    #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
  #endif
#endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif


#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else


#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA2) && \
    !defined(__ARM_FEATURE_CRYPTO)
//     #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
    Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
    #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
    #define __ARM_FEATURE_CRYPTO 1
// #else
    #define __ARM_FEATURE_SHA2 1
// #endif
    Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang

#if defined(__clang__)

#if defined(__ARM_ARCH) && __ARM_ARCH < 8
    Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
//    #pragma message("#define __ARM_ARCH 8")
    #undef  __ARM_ARCH
    #define __ARM_ARCH 8
    Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif

#endif // clang

#include <arm_neon.h>

#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
    defined(__ARM_FEATURE_CRYPTO) && \
    defined(__ARM_FEATURE_SHA2)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
    #undef __ARM_FEATURE_CRYPTO
    #undef __ARM_FEATURE_SHA2
    #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
//    #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif

#endif // Z7_MSC_VER_ORIGINAL

typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC
// the bug in clang 3.8.1:
// __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1);
#if defined(__clang__) && (__clang_major__ <= 9)
#pragma GCC diagnostic ignored "-Wvector-conversion"
#endif
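
// SHA-1 treats message words as big-endian; on little-endian targets each
// 32-bit lane is byte-swapped after loading: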
#ifdef MY_CPU_BE
  #define MY_rev32_for_LE(x)
#else
  #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#endif

#define LOAD_128(_p)      (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)

#define LOAD_SHUFFLE(m, k) \
    m = LOAD_128((data + (k) * 16)); \
    MY_rev32_for_LE(m); \

#define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3)
#define SU1(dest, src)        dest = vsha1su1q_u32(dest, src)
#define C(e)                  abcd = vsha1cq_u32(abcd, e, t)
#define P(e)                  abcd = vsha1pq_u32(abcd, e, t)
#define M(e)                  abcd = vsha1mq_u32(abcd, e, t)
#define H(e)                  e = vsha1h_u32(vgetq_lane_u32(abcd, 0))
#define T(m, c)               t = vaddq_u32(m, c)
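
/*
Mapping of the macros above to the Armv8 SHA-1 intrinsics:
C / P / M run 4 rounds with the Choose / Parity / Majority round function,
H computes the rotated E for the next group from lane 0 of abcd,
T adds a round constant to 4 message words, and SU0 / SU1 are the two
halves of the message schedule update. The round sequence below ping-pongs
e0/e1 so that one E value is consumed while the next one is produced.
*/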

void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
{
  v128 abcd;
  v128 c0, c1, c2, c3;
  uint32_t e0;

  if (numBlocks == 0)
    return;

  // the four SHA-1 round constants, splatted across all lanes
  c0 = vdupq_n_u32(0x5a827999);
  c1 = vdupq_n_u32(0x6ed9eba1);
  c2 = vdupq_n_u32(0x8f1bbcdc);
  c3 = vdupq_n_u32(0xca62c1d6);

  abcd = LOAD_128(&state[0]);
  e0 = state[4];

  do
  {
    v128 abcd_save;
    v128 m0, m1, m2, m3;
    v128 t;
    uint32_t e0_save, e1;

    abcd_save = abcd;
    e0_save = e0;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    T(m0, c0);                                  H(e1); C(e0);
    T(m1, c0);  SU0(m0, m1, m2);                H(e0); C(e1);
    T(m2, c0);  SU0(m1, m2, m3);  SU1(m0, m3);  H(e1); C(e0);
    T(m3, c0);  SU0(m2, m3, m0);  SU1(m1, m0);  H(e0); C(e1);
    T(m0, c0);  SU0(m3, m0, m1);  SU1(m2, m1);  H(e1); C(e0);
    T(m1, c1);  SU0(m0, m1, m2);  SU1(m3, m2);  H(e0); P(e1);
    T(m2, c1);  SU0(m1, m2, m3);  SU1(m0, m3);  H(e1); P(e0);
    T(m3, c1);  SU0(m2, m3, m0);  SU1(m1, m0);  H(e0); P(e1);
    T(m0, c1);  SU0(m3, m0, m1);  SU1(m2, m1);  H(e1); P(e0);
    T(m1, c1);  SU0(m0, m1, m2);  SU1(m3, m2);  H(e0); P(e1);
    T(m2, c2);  SU0(m1, m2, m3);  SU1(m0, m3);  H(e1); M(e0);
    T(m3, c2);  SU0(m2, m3, m0);  SU1(m1, m0);  H(e0); M(e1);
    T(m0, c2);  SU0(m3, m0, m1);  SU1(m2, m1);  H(e1); M(e0);
    T(m1, c2);  SU0(m0, m1, m2);  SU1(m3, m2);  H(e0); M(e1);
    T(m2, c2);  SU0(m1, m2, m3);  SU1(m0, m3);  H(e1); M(e0);
    T(m3, c3);  SU0(m2, m3, m0);  SU1(m1, m0);  H(e0); P(e1);
    T(m0, c3);  SU0(m3, m0, m1);  SU1(m2, m1);  H(e1); P(e0);
    T(m1, c3);                    SU1(m3, m2);  H(e0); P(e1);
    T(m2, c3);                                  H(e1); P(e0);
    T(m3, c3);                                  H(e0); P(e1);

    // feed-forward: add the saved input state (Davies-Meyer)
    abcd = vaddq_u32(abcd, abcd_save);
    e0 += e0_save;

    data += 64;
  }
  while (--numBlocks);

  STORE_128(&state[0], abcd);
  state[4] = e0;
}

#endif // USE_HW_SHA

#endif // MY_CPU_ARM_OR_ARM64


#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>

// #include "Sha1.h"
// #if defined(_MSC_VER)
#pragma message("Sha1   HW-SW stub was used")
// #endif
void Z7_FASTCALL Sha1_UpdateBlocks   (UInt32 state[5], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
{
  Sha1_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}
#endif
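
// The stub above forwards to the portable Sha1_UpdateBlocks so that code
// referencing the _HW symbol still links on targets where no hardware
// implementation was compiled in.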

#undef SU0
#undef SU1
#undef C
#undef P
#undef M
#undef H
#undef T
#undef MY_rev32_for_LE
#undef LOAD_128
#undef STORE_128
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2
#undef SM3
#undef NNN
#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB