/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
2023-04-02 : Igor Pavlov : Public domain */

#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"

#ifndef _IS_TRY_USE_HW_SHA
#define _IS_TRY_USE_HW_SHA 1
#endif

#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif

#if (_IS_TRY_USE_HW_SHA) && defined(MY_CPU_X86_OR_AMD64)
  #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
      #define USE_HW_SHA
  #elif defined(Z7_LLVM_CLANG_VERSION)  && (Z7_LLVM_CLANG_VERSION  >= 30800) \
     || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
     || defined(Z7_GCC_VERSION)         && (Z7_GCC_VERSION         >= 40900)
      #define USE_HW_SHA
      #if !defined(__INTEL_COMPILER)
      // icc defines __GNUC__, but icc doesn't support __attribute__((__target__))
      #if !defined(__SHA__) || !defined(__SSSE3__)
        #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
      #endif
      #endif
  #elif defined(_MSC_VER)
    #ifdef USE_MY_MM
      #define USE_VER_MIN 1300
    #else
      #define USE_VER_MIN 1900
    #endif
    #if (_MSC_VER >= USE_VER_MIN)
      #define USE_HW_SHA
    #endif
  #endif
// #endif // MY_CPU_X86_OR_AMD64

#ifdef USE_HW_SHA

// #pragma message("Sha256 HW")

// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#include <immintrin.h>

#if defined (__clang__) && defined(_MSC_VER)
  // #if !defined(__SSSE3__)
  // #endif
  #if !defined(__SHA__)
    #include <shaintrin.h>
  #endif
#else

#ifdef USE_MY_MM
#include "My_mm.h"
#endif

#endif

/*
SHA256 uses:
SSE2:
  _mm_loadu_si128
  _mm_storeu_si128
  _mm_set_epi32
  _mm_add_epi32
  _mm_shuffle_epi32 / pshufd

SSSE3:
  _mm_shuffle_epi8 / pshufb
  _mm_alignr_epi8
SHA:
  _mm_sha256*
*/

// The K array must be aligned to at least 16 bytes.
// The compiler can see the align attribute and select
//   movdqu - for code without the align attribute
//   movdqa - for code with    the align attribute
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];
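// (The array itself, the 64 SHA-256 round constants, is defined in Sha256.c.)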

#define K SHA256_K_ARRAY


#define ADD_EPI32(dest, src)      dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src)    dest = _mm_sha256msg1_epu32(dest, src);
#define SHA256_MSG2(dest, src)    dest = _mm_sha256msg2_epu32(dest, src);


#define LOAD_SHUFFLE(m, k) \
    m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
    m = _mm_shuffle_epi8(m, mask); \

#define SM1(g0, g1, g2, g3) \
    SHA256_MSG1(g3, g0); \

#define SM2(g0, g1, g2, g3) \
    tmp = _mm_alignr_epi8(g1, g0, 4); \
    ADD_EPI32(g2, tmp) \
    SHA256_MSG2(g2, g1); \

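/*
SM1/SM2 implement the SHA-256 message schedule,
  W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]  :
_mm_sha256msg1_epu32 adds the sigma0() terms, the _mm_alignr_epi8 + ADD_EPI32
pair supplies the W[i-7] words, and _mm_sha256msg2_epu32 completes the sum
with the sigma1() terms.
*/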
// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)


#define NNN(g0, g1, g2, g3)


#define RND2(t0, t1) \
    t0 = _mm_sha256rnds2_epu32(t0, t1, msg);

#define RND2_0(m, k) \
    msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
    RND2(state0, state1); \
    msg = _mm_shuffle_epi32(msg, 0x0E); \


#define RND2_1 \
    RND2(state1, state0); \

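/*
_mm_sha256rnds2_epu32 performs two rounds and reads its two W+K message words
from the low 64 bits of msg, so RND2_0 shuffles msg by 0x0E to move the high
two words down before RND2_1 consumes them.
*/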

// We use a scheme that runs 3 rounds ahead for SHA256_MSG1 and 2 rounds ahead for SHA256_MSG2.

#define R4(k, g0, g1, g2, g3, OP0, OP1) \
    RND2_0(g0, k) \
    OP0(g0, g1, g2, g3) \
    RND2_1 \
    OP1(g0, g1, g2, g3) \

#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0,        m0,m1,m2,m3, OP0, OP1 ) \
    R4 ( (k)*4+1,        m1,m2,m3,m0, OP2, OP3 ) \
    R4 ( (k)*4+2,        m2,m3,m0,m1, OP4, OP5 ) \
    R4 ( (k)*4+3,        m3,m0,m1,m2, OP6, OP7 ) \

#define PREPARE_STATE \
    tmp    = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
    state0 = _mm_shuffle_epi32(state1, 0x1B); /* efgh */ \
    state1 = state0; \
    state0 = _mm_unpacklo_epi64(state0, tmp); /* cdgh */ \
    state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \

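/*
The SHA256RNDS2 instruction expects the state split across two registers as
CDGH / ABEF pairs rather than the linear a..h order of state[8]. PREPARE_STATE
converts between the two layouts; applying the same transform again converts
back, which is why it runs once before and once after the block loop.
*/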

void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
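  // pshufb mask that byte-swaps each 32-bit lane: SHA-256 message words are big-endian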
  const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
  __m128i tmp;
  __m128i state0, state1;

  if (numBlocks == 0)
    return;

  state0 = _mm_loadu_si128((const __m128i *) (const void *) &state[0]);
  state1 = _mm_loadu_si128((const __m128i *) (const void *) &state[4]);

  PREPARE_STATE

  do
  {
    __m128i state0_save, state1_save;
    __m128i m0, m1, m2, m3;
    __m128i msg;
    // #define msg tmp

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )

    ADD_EPI32(state0, state0_save)
    ADD_EPI32(state1, state1_save)

    data += 64;
  }
  while (--numBlocks);

  PREPARE_STATE

  _mm_storeu_si128((__m128i *) (void *) &state[0], state0);
  _mm_storeu_si128((__m128i *) (void *) &state[4], state1);
}
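
/*
Callers are expected to pick this routine at run time. A minimal sketch of a
hypothetical dispatcher (CPU_IsSupported_SHA() is the CpuArch.h feature test;
Sha256_UpdateBlocks is the portable fallback in Sha256.c):

  void Sha256_UpdateBlocks_Auto(UInt32 state[8], const Byte *data, size_t numBlocks)
  {
    if (CPU_IsSupported_SHA())
      Sha256_UpdateBlocks_HW(state, data, numBlocks);
    else
      Sha256_UpdateBlocks(state, data, numBlocks);
  }
*/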

#endif // USE_HW_SHA

#elif (_IS_TRY_USE_HW_SHA) && defined(MY_CPU_ARM64)

  #if defined(__clang__)
    #if (__clang_major__ >= 8) && (!defined(_MSC_VER)) // fix that check
      #define USE_HW_SHA
    #endif
  #elif defined(__GNUC__)
    #if (__GNUC__ >= 6) // fix that check
      #define USE_HW_SHA
    #endif
  #elif defined(_MSC_VER)
    #if _MSC_VER >= 1910
      #define USE_HW_SHA
    #endif
  #endif

#ifdef USE_HW_SHA

// #pragma message("=== Sha256 HW === ")

#if defined(__clang__) || defined(__GNUC__)
  #ifdef MY_CPU_ARM64
    #define ATTRIB_SHA __attribute__((__target__("+crypto")))
  #else
    #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
  #endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC

#ifdef MY_CPU_BE
  #define MY_rev32_for_LE(x)
#else
  #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#endif
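// SHA-256 message words are big-endian; on a little-endian CPU, vrev32q_u8
// byte-swaps each 32-bit lane so the schedule sees the correct word values.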

#define LOAD_128(_p)      (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)

#define LOAD_SHUFFLE(m, k) \
    m = LOAD_128((data + (k) * 16)); \
    MY_rev32_for_LE(m); \

// The K array must be aligned to at least 16 bytes.
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define SHA256_SU0(dest, src)        dest = vsha256su0q_u32(dest, src);
#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);

#define SM1(g0, g1, g2, g3)  SHA256_SU0(g3, g0)
#define SM2(g0, g1, g2, g3)  SHA256_SU1(g2, g0, g1)
#define NNN(g0, g1, g2, g3)


#define R4(k, g0, g1, g2, g3, OP0, OP1) \
    msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
    tmp = state0; \
    state0 = vsha256hq_u32( state0, state1, msg ); \
    state1 = vsha256h2q_u32( state1, tmp, msg ); \
    OP0(g0, g1, g2, g3); \
    OP1(g0, g1, g2, g3); \

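/*
vsha256hq_u32 / vsha256h2q_u32 each advance four rounds, updating the ABCD and
EFGH state halves respectively. Unlike the x86 path, the ARM instructions take
the state in its natural word order, so no PREPARE_STATE shuffle is needed.
*/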

#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \


void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  v128 state0, state1;

  if (numBlocks == 0)
    return;

  state0 = LOAD_128(&state[0]);
  state1 = LOAD_128(&state[4]);

  do
  {
    v128 state0_save, state1_save;
    v128 m0, m1, m2, m3;
    v128 msg, tmp;

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 );
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN );

    state0 = vaddq_u32(state0, state0_save);
    state1 = vaddq_u32(state1, state1_save);

    data += 64;
  }
  while (--numBlocks);

  STORE_128(&state[0], state0);
  STORE_128(&state[4], state1);
}

#endif // USE_HW_SHA

#endif // MY_CPU_X86_OR_AMD64 / MY_CPU_ARM64


#ifndef USE_HW_SHA

// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>

// #include "Sha256.h"
void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);

#pragma message("Sha256 HW-SW stub was used")

void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  Sha256_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}

#endif


#undef K
#undef RND2
#undef RND2_0
#undef RND2_1

#undef MY_rev32_for_LE
#undef NNN
#undef LOAD_128
#undef STORE_128
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2

#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA
#undef USE_VER_MIN