• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
2 2021-04-01 : Igor Pavlov : Public domain */
3 
4 #include "Precomp.h"
5 
6 #if defined(_MSC_VER)
7 #if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
8 // #define USE_MY_MM
9 #endif
10 #endif
11 
12 #include "CpuArch.h"
13 
14 #ifdef MY_CPU_X86_OR_AMD64
15   #if defined(__clang__)
16     #if (__clang_major__ >= 16) // fix that check
17       #define USE_HW_SHA
18       #ifndef __SHA__
19         #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
20         #if defined(_MSC_VER)
21           // SSSE3: for clang-cl:
22           #include <tmmintrin.h>
23           #define __SHA__
24         #endif
25       #endif
26 
27     #endif
28   #elif defined(__GNUC__)
29     #if (__GNUC__ >= 8) // fix that check
30       #define USE_HW_SHA
31       #ifndef __SHA__
32         #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
33         // #pragma GCC target("sha,ssse3")
34       #endif
35     #endif
36   #elif defined(__INTEL_COMPILER)
37     #if (__INTEL_COMPILER >= 1800) // fix that check
38       #define USE_HW_SHA
39     #endif
40   #elif defined(_MSC_VER)
41     #ifdef USE_MY_MM
42       #define USE_VER_MIN 1300
43     #else
44       #define USE_VER_MIN 1910
45     #endif
46     #if _MSC_VER >= USE_VER_MIN
47       #define USE_HW_SHA
48     #endif
49   #endif
50 // #endif // MY_CPU_X86_OR_AMD64
51 
52 #ifdef USE_HW_SHA
53 
54 // #pragma message("Sha256 HW")
55 // #include <wmmintrin.h>
56 
57 #if !defined(_MSC_VER) || (_MSC_VER >= 1900)
58 #include <immintrin.h>
59 #else
60 #include <emmintrin.h>
61 
62 #if defined(_MSC_VER) && (_MSC_VER >= 1600)
63 // #include <intrin.h>
64 #endif
65 
66 #ifdef USE_MY_MM
67 #include "My_mm.h"
68 #endif
69 
70 #endif
71 
72 /*
73 SHA256 uses:
74 SSE2:
75   _mm_loadu_si128
76   _mm_storeu_si128
77   _mm_set_epi32
78   _mm_add_epi32
79   _mm_shuffle_epi32 / pshufd
80 
81 
82 
83 SSSE3:
84   _mm_shuffle_epi8 / pshufb
85   _mm_alignr_epi8
86 SHA:
87   _mm_sha256*
88 */
89 
90 // K array must be aligned for 16-bytes at least.
91 // The compiler can look align attribute and selects
92 //   movdqu - for code without align attribute
93 //   movdqa - for code with    align attribute
94 extern
95 MY_ALIGN(64)
96 const UInt32 SHA256_K_ARRAY[64];
97 
98 #define K SHA256_K_ARRAY
99 
100 
101 #define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
102 #define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
103 #define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);
104 
105 
106 #define LOAD_SHUFFLE(m, k) \
107     m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
108     m = _mm_shuffle_epi8(m, mask); \
109 
110 #define SM1(g0, g1, g2, g3) \
111     SHA256_MSG1(g3, g0); \
112 
113 #define SM2(g0, g1, g2, g3) \
114     tmp = _mm_alignr_epi8(g1, g0, 4); \
115     ADD_EPI32(g2, tmp); \
116     SHA25G_MSG2(g2, g1); \
117 
118 // #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
119 // #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)
120 
121 
122 #define NNN(g0, g1, g2, g3)
123 
124 
125 #define RND2(t0, t1) \
126     t0 = _mm_sha256rnds2_epu32(t0, t1, msg);
127 
128 #define RND2_0(m, k) \
129     msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
130     RND2(state0, state1); \
131     msg = _mm_shuffle_epi32(msg, 0x0E); \
132 
133 
134 #define RND2_1 \
135     RND2(state1, state0); \
136 
137 
138 // We use scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2
139 
140 #define R4(k, g0, g1, g2, g3, OP0, OP1) \
141     RND2_0(g0, k); \
142     OP0(g0, g1, g2, g3); \
143     RND2_1; \
144     OP1(g0, g1, g2, g3); \
145 
146 #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
147     R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
148     R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
149     R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
150     R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \
151 
152 #define PREPARE_STATE \
153     tmp    = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
154     state0 = _mm_shuffle_epi32(state1, 0x1B); /* efgh */ \
155     state1 = state0; \
156     state0 = _mm_unpacklo_epi64(state0, tmp); /* cdgh */ \
157     state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \
158 
159 
// Process (numBlocks) 64-byte blocks using the x86 SHA extension
// instructions. state holds the 8 chaining words and is updated in place.
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  // pshufb pattern that byte-swaps each 32-bit lane (message words are big-endian)
  const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
  __m128i tmp;
  __m128i state0, state1;

  if (numBlocks == 0)
    return;

  // load the 8 state words into two 128-bit halves
  state0 = _mm_loadu_si128((const __m128i *) (const void *) &state[0]);
  state1 = _mm_loadu_si128((const __m128i *) (const void *) &state[4]);

  // reorder into the {cdgh}/{abef} layout expected by sha256rnds2
  PREPARE_STATE

  do
  {
    __m128i state0_save, state1_save;
    __m128i m0, m1, m2, m3;
    __m128i msg;
    // #define msg tmp

    // keep the input state for the final feed-forward addition
    state0_save = state0;
    state1_save = state1;

    // load and byte-swap the 16 message words of this block
    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)



    // 64 rounds; the SM1/SM2/NNN placement runs the message schedule
    // ahead of the rounds that consume it (see the scheme comment above R4)
    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 );
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN );

    // feed-forward: add the saved input state to the compressed state
    ADD_EPI32(state0, state0_save);
    ADD_EPI32(state1, state1_save);

    data += 64;
  }
  while (--numBlocks);

  // restore the linear {a..h} layout before storing
  PREPARE_STATE

  _mm_storeu_si128((__m128i *) (void *) &state[0], state0);
  _mm_storeu_si128((__m128i *) (void *) &state[4], state1);
}
212 
213 #endif // USE_HW_SHA
214 
215 #elif defined(MY_CPU_ARM_OR_ARM64)
216 
217   #if defined(__clang__)
218     #if (__clang_major__ >= 16) // fix that check
219       #define USE_HW_SHA
220     #endif
221   #elif defined(__GNUC__)
222     #if (__GNUC__ >= 6) // fix that check
223       #define USE_HW_SHA
224     #endif
225   #elif defined(_MSC_VER)
226     #if _MSC_VER >= 1910
227       #define USE_HW_SHA
228     #endif
229   #endif
230 
231 #ifdef USE_HW_SHA
232 
233 // #pragma message("=== Sha256 HW === ")
234 
235 #if defined(__clang__) || defined(__GNUC__)
236   #ifdef MY_CPU_ARM64
237     #define ATTRIB_SHA __attribute__((__target__("+crypto")))
238   #else
239     #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
240   #endif
241 #else
242   // _MSC_VER
243   // for arm32
244   #define _ARM_USE_NEW_NEON_INTRINSICS
245 #endif
246 
247 #if defined(_MSC_VER) && defined(MY_CPU_ARM64)
248 #include <arm64_neon.h>
249 #else
250 #include <arm_neon.h>
251 #endif
252 
typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC

// Byte-swap each 32-bit lane on little-endian targets
// (SHA-256 message words are big-endian); no-op on big-endian builds.
#ifdef MY_CPU_BE
  #define MY_rev32_for_LE(x)
#else
  #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#endif

#define LOAD_128(_p)      (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)

// Load one 16-byte group of message words and normalize it to host order.
#define LOAD_SHUFFLE(m, k) \
    m = LOAD_128((data + (k) * 16)); \
    MY_rev32_for_LE(m); \

// K array must be aligned for 16-bytes at least.
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define SHA256_SU0(dest, src)        dest = vsha256su0q_u32(dest, src);
// NOTE(review): renamed from SHA25G_SU1 (typo) for consistency with SHA256_SU0.
#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);

// Message-schedule steps; NNN is a no-op placeholder for R4/R16.
#define SM1(g0, g1, g2, g3)  SHA256_SU0(g3, g0)
#define SM2(g0, g1, g2, g3)  SHA256_SU1(g2, g0, g1)
#define NNN(g0, g1, g2, g3)


// Four SHA-256 rounds plus two optional message-schedule operations.
// tmp preserves state0 because vsha256h2q_u32 needs the pre-round value.
#define R4(k, g0, g1, g2, g3, OP0, OP1) \
    msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
    tmp = state0; \
    state0 = vsha256hq_u32( state0, state1, msg ); \
    state1 = vsha256h2q_u32( state1, tmp, msg ); \
    OP0(g0, g1, g2, g3); \
    OP1(g0, g1, g2, g3); \


// Sixteen rounds: four R4 groups with the message registers rotated each time.
#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \
300 
// Process (numBlocks) 64-byte blocks using the ARMv8 SHA-256 crypto
// instructions. state holds the 8 chaining words and is updated in place.
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  v128 state0, state1;

  if (numBlocks == 0)
    return;

  // unlike the x86 path, no layout reshuffle is performed: the state is
  // used directly as two 4-word halves
  state0 = LOAD_128(&state[0]);
  state1 = LOAD_128(&state[4]);

  do
  {
    v128 state0_save, state1_save;
    v128 m0, m1, m2, m3;
    v128 msg, tmp;

    // keep the input state for the final feed-forward addition
    state0_save = state0;
    state1_save = state1;

    // load and byte-order-normalize the 16 message words of this block
    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    // 64 rounds; the SM1/SM2/NNN placement runs the message schedule
    // ahead of the rounds that consume it
    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 );
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN );

    // feed-forward: add the saved input state to the compressed state
    state0 = vaddq_u32(state0, state0_save);
    state1 = vaddq_u32(state1, state1_save);

    data += 64;
  }
  while (--numBlocks);

  STORE_128(&state[0], state0);
  STORE_128(&state[4], state1);
}
344 
345 #endif // USE_HW_SHA
346 
347 #endif // MY_CPU_ARM_OR_ARM64
348 
349 
350 #ifndef USE_HW_SHA
351 
352 // #error Stop_Compiling_UNSUPPORTED_SHA
353 // #include <stdlib.h>
354 
355 // #include "Sha256.h"
// Forward declaration of the portable software implementation
// (defined elsewhere in the project; see the commented Sha256.h include above).
void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);

#pragma message("Sha256 HW-SW stub was used")

// Stub used when no hardware SHA path was selected above: the _HW entry
// point simply forwards to the portable implementation, so callers can
// unconditionally link against Sha256_UpdateBlocks_HW.
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  Sha256_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}
372 
373 #endif
374