/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
2023-04-02 : Igor Pavlov : Public domain */

#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"

#ifndef _IS_TRY_USE_HW_SHA
#define _IS_TRY_USE_HW_SHA 1
#endif

#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif

#if (_IS_TRY_USE_HW_SHA) && defined(MY_CPU_X86_OR_AMD64)
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
#define USE_HW_SHA
#elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
   || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
   || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
#define USE_HW_SHA
#if !defined(__INTEL_COMPILER)
// icc defines __GNUC__, but icc doesn't support __attribute__((__target__))
#if !defined(__SHA__) || !defined(__SSSE3__)
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
#endif
#endif
#elif defined(_MSC_VER)
#ifdef USE_MY_MM
#define USE_VER_MIN 1300
#else
#define USE_VER_MIN 1900
#endif
#if (_MSC_VER >= USE_VER_MIN)
#define USE_HW_SHA
#endif
#endif
// #endif // MY_CPU_X86_OR_AMD64

#ifdef USE_HW_SHA

// #pragma message("Sha256 HW")

// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#include <immintrin.h>

#if defined (__clang__) && defined(_MSC_VER)
// #if !defined(__SSSE3__)
// #endif
#if !defined(__SHA__)
#include <shaintrin.h>
#endif
#else

#ifdef USE_MY_MM
#include "My_mm.h"
#endif

#endif

/*
SHA256 uses:
SSE2:
  _mm_loadu_si128
  _mm_storeu_si128
  _mm_set_epi32
  _mm_add_epi32
  _mm_shuffle_epi32 / pshufd
SSSE3:
  _mm_shuffle_epi8 / pshufb
  _mm_alignr_epi8
SHA:
  _mm_sha256*
*/

// The K array must be aligned to at least 16 bytes.
// The compiler can use the align attribute to select the load instruction:
//   movdqu - for code without the align attribute
//   movdqa - for code with the align attribute
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define ADD_EPI32(dest, src)   dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);

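// LOAD_SHUFFLE loads one 16-byte chunk of the message block and byte-swaps
// each 32-bit word (pshufb with the mask set in the function body),
// converting the big-endian SHA-256 message words to host order.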
#define LOAD_SHUFFLE(m, k) \
    m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
    m = _mm_shuffle_epi8(m, mask); \

#define SM1(g0, g1, g2, g3) \
    SHA256_MSG1(g3, g0); \

#define SM2(g0, g1, g2, g3) \
    tmp = _mm_alignr_epi8(g1, g0, 4); \
    ADD_EPI32(g2, tmp) \
    SHA256_MSG2(g2, g1); \

// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)


#define NNN(g0, g1, g2, g3)


#define RND2(t0, t1) \
    t0 = _mm_sha256rnds2_epu32(t0, t1, msg);

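// _mm_sha256rnds2_epu32 performs two SHA-256 rounds and takes the two
// pre-added (W + K) values from the low 64 bits of msg. So RND2_0 runs
// rounds k and k+1, and the 0x0E shuffle moves the upper two (W + K)
// values into the low half of msg for the next two rounds in RND2_1.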
#define RND2_0(m, k) \
    msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
    RND2(state0, state1); \
    msg = _mm_shuffle_epi32(msg, 0x0E); \

#define RND2_1 \
    RND2(state1, state0); \


// We run SHA256_MSG1 3 groups of 4 rounds ahead and SHA256_MSG2 2 groups
// ahead of the rounds that consume the corresponding message words.

#define R4(k, g0, g1, g2, g3, OP0, OP1) \
    RND2_0(g0, k) \
    OP0(g0, g1, g2, g3) \
    RND2_1 \
    OP1(g0, g1, g2, g3) \

#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \

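// _mm_sha256rnds2_epu32 expects the state split as {C,D,G,H} and {A,B,E,F}
// rather than the linear state[0..7] = {A..H} layout. PREPARE_STATE converts
// between the two layouts; running it a second time after the loop restores
// the linear order.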
#define PREPARE_STATE \
    tmp    = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
    state0 = _mm_shuffle_epi32(state1, 0x1B); /* efgh */ \
    state1 = state0; \
    state0 = _mm_unpacklo_epi64(state0, tmp); /* cdgh */ \
    state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \


void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
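  // pshufb mask that reverses the byte order within each 32-bit word:
  // the message words are big-endian in the input block.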
  __m128i tmp;
  __m128i state0, state1;

  if (numBlocks == 0)
    return;

  state0 = _mm_loadu_si128((const __m128i *) (const void *) &state[0]);
  state1 = _mm_loadu_si128((const __m128i *) (const void *) &state[4]);

  PREPARE_STATE

  do
  {
    __m128i state0_save, state1_save;
    __m128i m0, m1, m2, m3;
    __m128i msg;
    // #define msg tmp

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )

    ADD_EPI32(state0, state0_save)
    ADD_EPI32(state1, state1_save)

    data += 64;
  }
  while (--numBlocks);

  PREPARE_STATE

  _mm_storeu_si128((__m128i *) (void *) &state[0], state0);
  _mm_storeu_si128((__m128i *) (void *) &state[4], state1);
}
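
/*
Usage sketch (hypothetical caller): in 7-Zip, the dispatcher in Sha256.c
selects this routine after a runtime CPU-feature check. The state array
holds the standard SHA-256 initial hash values:

  UInt32 state[8] = {
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
  Byte block[64];
  // fill block with one 64-byte message block (padding applied by the caller)
  Sha256_UpdateBlocks_HW(state, block, 1);
*/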

#endif // USE_HW_SHA

#elif (_IS_TRY_USE_HW_SHA) && defined(MY_CPU_ARM64)

#if defined(__clang__)
#if (__clang_major__ >= 8) && (!defined(_MSC_VER)) // fix that check
#define USE_HW_SHA
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 6) // fix that check
#define USE_HW_SHA
#endif
#elif defined(_MSC_VER)
#if _MSC_VER >= 1910
#define USE_HW_SHA
#endif
#endif

#ifdef USE_HW_SHA

// #pragma message("=== Sha256 HW === ")

#if defined(__clang__) || defined(__GNUC__)
#ifdef MY_CPU_ARM64
#define ATTRIB_SHA __attribute__((__target__("+crypto")))
#else
#define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
#else
// _MSC_VER
// for arm32
#define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC

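// On little-endian targets, vrev32q_u8 byte-swaps within each 32-bit lane
// to convert the big-endian message words to host order; on big-endian
// targets the loaded words are already in the right order, so the macro
// below expands to nothing.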
#ifdef MY_CPU_BE
#define MY_rev32_for_LE(x)
#else
#define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#endif

#define LOAD_128(_p)      (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)

#define LOAD_SHUFFLE(m, k) \
    m = LOAD_128((data + (k) * 16)); \
    MY_rev32_for_LE(m); \

// The K array must be aligned to at least 16 bytes.
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define SHA256_SU0(dest, src)        dest = vsha256su0q_u32(dest, src);
#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);

#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0)
#define SM2(g0, g1, g2, g3) SHA256_SU1(g2, g0, g1)
#define NNN(g0, g1, g2, g3)


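// vsha256hq_u32 updates the {A,B,C,D} half of the state and vsha256h2q_u32
// updates the {E,F,G,H} half; each instruction performs 4 rounds. state0 is
// saved in tmp because vsha256h2q_u32 needs the pre-update value of state0.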
#define R4(k, g0, g1, g2, g3, OP0, OP1) \
    msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
    tmp = state0; \
    state0 = vsha256hq_u32( state0, state1, msg ); \
    state1 = vsha256h2q_u32( state1, tmp, msg ); \
    OP0(g0, g1, g2, g3); \
    OP1(g0, g1, g2, g3); \


#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \


void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  v128 state0, state1;

  if (numBlocks == 0)
    return;

  state0 = LOAD_128(&state[0]);
  state1 = LOAD_128(&state[4]);

  do
  {
    v128 state0_save, state1_save;
    v128 m0, m1, m2, m3;
    v128 msg, tmp;

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 );
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN );

    state0 = vaddq_u32(state0, state0_save);
    state1 = vaddq_u32(state1, state1_save);

    data += 64;
  }
  while (--numBlocks);

  STORE_128(&state[0], state0);
  STORE_128(&state[4], state1);
}

#endif // USE_HW_SHA

#endif // MY_CPU_ARM64


#ifndef USE_HW_SHA

// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>

// #include "Sha256.h"
void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);

#pragma message("Sha256 HW-SW stub was used")

void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  Sha256_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}

#endif


#undef K
#undef RND2
#undef RND2_0
#undef RND2_1

#undef MY_rev32_for_LE
#undef NNN
#undef LOAD_128
#undef STORE_128
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2

#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA
#undef USE_VER_MIN