/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
2021-04-01 : Igor Pavlov : Public domain */

#include "Precomp.h"

#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif

#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64
  #if defined(__clang__)
    #if (__clang_major__ >= 16) // fix that check
      #define USE_HW_SHA
      #ifndef __SHA__
        #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
        #if defined(_MSC_VER)
          // SSSE3: for clang-cl:
          #include <tmmintrin.h>
          #define __SHA__
        #endif
      #endif

    #endif
  #elif defined(__GNUC__)
    #if (__GNUC__ >= 8) // fix that check
      #define USE_HW_SHA
      #ifndef __SHA__
        #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
        // #pragma GCC target("sha,ssse3")
      #endif
    #endif
  #elif defined(__INTEL_COMPILER)
    #if (__INTEL_COMPILER >= 1800) // fix that check
      #define USE_HW_SHA
    #endif
  #elif defined(_MSC_VER)
    #ifdef USE_MY_MM
      #define USE_VER_MIN 1300
    #else
      #define USE_VER_MIN 1910
    #endif
    #if _MSC_VER >= USE_VER_MIN
      #define USE_HW_SHA
    #endif
  #endif
// #endif // MY_CPU_X86_OR_AMD64

#ifdef USE_HW_SHA

// #pragma message("Sha256 HW")
// #include <wmmintrin.h>

#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
#include <immintrin.h>
#else
#include <emmintrin.h>

#if defined(_MSC_VER) && (_MSC_VER >= 1600)
// #include <intrin.h>
#endif

#ifdef USE_MY_MM
#include "My_mm.h"
#endif

#endif

/*
SHA256 uses:
SSE2:
_mm_loadu_si128
_mm_storeu_si128
_mm_set_epi32
_mm_add_epi32
_mm_shuffle_epi32 / pshufd

SSSE3:
_mm_shuffle_epi8 / pshufb
_mm_alignr_epi8
SHA:
_mm_sha256*
*/

// The K array must be aligned to at least 16 bytes.
// The compiler can use the alignment attribute to select the load instruction:
//   movdqu - for code without the alignment attribute
//   movdqa - for code with the alignment attribute
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define ADD_EPI32(dest, src)      dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src)    dest = _mm_sha256msg1_epu32(dest, src);
#define SHA256_MSG2(dest, src)    dest = _mm_sha256msg2_epu32(dest, src);


#define LOAD_SHUFFLE(m, k) \
    m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
    m = _mm_shuffle_epi8(m, mask); \

#define SM1(g0, g1, g2, g3) \
    SHA256_MSG1(g3, g0); \

#define SM2(g0, g1, g2, g3) \
    tmp = _mm_alignr_epi8(g1, g0, 4); \
    ADD_EPI32(g2, tmp); \
    SHA256_MSG2(g2, g1); \

// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)


#define NNN(g0, g1, g2, g3)


#define RND2(t0, t1) \
    t0 = _mm_sha256rnds2_epu32(t0, t1, msg);

#define RND2_0(m, k) \
    msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
    RND2(state0, state1); \
    msg = _mm_shuffle_epi32(msg, 0x0E); \


#define RND2_1 \
    RND2(state1, state0); \

// We use a scheme that runs SHA256_MSG1 3 rounds ahead and SHA256_MSG2 2 rounds ahead.
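/*
  Editorial note (a sketch of the schedule, not from the original source):
  R4(k, g0, g1, g2, g3, OP0, OP1) consumes g0 as the message quadword for
  the four rounds starting at round 4*k. At the same time, SM1 runs
  SHA256_MSG1 on g3, the quadword that will be consumed 3 groups of 4
  rounds later, and SM2 runs SHA256_MSG2 to finish g2, the quadword
  consumed 2 groups later. This keeps the message-schedule computation
  ahead of the round computation.
*/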

#define R4(k, g0, g1, g2, g3, OP0, OP1) \
    RND2_0(g0, k); \
    OP0(g0, g1, g2, g3); \
    RND2_1; \
    OP1(g0, g1, g2, g3); \

#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \

#define PREPARE_STATE \
    tmp    = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
    state0 = _mm_shuffle_epi32(state1, 0x1B); /* efgh */ \
    state1 = state0; \
    state0 = _mm_unpacklo_epi64(state0, tmp); /* cdgh */ \
    state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \

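/*
  Editorial note: _mm_sha256rnds2_epu32 expects the eight state words split
  across two registers in {cdgh} / {abef} order rather than the linear
  {abcd} / {efgh} layout stored in state[8]. PREPARE_STATE converts between
  the two layouts; it is its own inverse, which is why it is applied once
  before the block loop and once again after it.
*/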

void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
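  // Editorial note: this shuffle mask byte-reverses each 32-bit lane, so
  // _mm_shuffle_epi8 converts the block's big-endian message words to the
  // little-endian lane order used by the SHA-256 intrinsics.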
  const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
  __m128i tmp;
  __m128i state0, state1;

  if (numBlocks == 0)
    return;

  state0 = _mm_loadu_si128((const __m128i *) (const void *) &state[0]);
  state1 = _mm_loadu_si128((const __m128i *) (const void *) &state[4]);

  PREPARE_STATE

  do
  {
    __m128i state0_save, state1_save;
    __m128i m0, m1, m2, m3;
    __m128i msg;
    // #define msg tmp

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 );
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN );

    ADD_EPI32(state0, state0_save);
    ADD_EPI32(state1, state1_save);

    data += 64;
  }
  while (--numBlocks);

  PREPARE_STATE

  _mm_storeu_si128((__m128i *) (void *) &state[0], state0);
  _mm_storeu_si128((__m128i *) (void *) &state[4], state1);
}

#endif // USE_HW_SHA

#elif defined(MY_CPU_ARM_OR_ARM64)

  #if defined(__clang__)
    #if (__clang_major__ >= 16) // fix that check
      #define USE_HW_SHA
    #endif
  #elif defined(__GNUC__)
    #if (__GNUC__ >= 6) // fix that check
      #define USE_HW_SHA
    #endif
  #elif defined(_MSC_VER)
    #if _MSC_VER >= 1910
      #define USE_HW_SHA
    #endif
  #endif

#ifdef USE_HW_SHA

// #pragma message("=== Sha256 HW === ")

#if defined(__clang__) || defined(__GNUC__)
  #ifdef MY_CPU_ARM64
    #define ATTRIB_SHA __attribute__((__target__("+crypto")))
  #else
    #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
  #endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC

#ifdef MY_CPU_BE
  #define MY_rev32_for_LE(x)
#else
  #define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#endif
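// Editorial note: SHA-256 message words are big-endian in the input block.
// On little-endian targets, vrev32q_u8 byte-reverses each 32-bit lane to
// restore the word values; on big-endian targets the loads need no fixup.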

#define LOAD_128(_p)      (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)

#define LOAD_SHUFFLE(m, k) \
    m = LOAD_128((data + (k) * 16)); \
    MY_rev32_for_LE(m); \

// The K array must be aligned to at least 16 bytes.
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];

#define K SHA256_K_ARRAY


#define SHA256_SU0(dest, src)        dest = vsha256su0q_u32(dest, src);
#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);

#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0)
#define SM2(g0, g1, g2, g3) SHA256_SU1(g2, g0, g1)
#define NNN(g0, g1, g2, g3)


#define R4(k, g0, g1, g2, g3, OP0, OP1) \
    msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
    tmp = state0; \
    state0 = vsha256hq_u32( state0, state1, msg ); \
    state1 = vsha256h2q_u32( state1, tmp, msg ); \
    OP0(g0, g1, g2, g3); \
    OP1(g0, g1, g2, g3); \

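/*
  Editorial note: unlike the x86 path, the ARM round instructions take the
  state halves in plain {abcd} / {efgh} order, so no PREPARE_STATE-style
  relayout is needed. For each group of 4 rounds, vsha256hq_u32 produces
  the new {abcd} half and vsha256h2q_u32 the new {efgh} half; the old
  state0 is saved in tmp because vsha256h2q_u32 needs the pre-update value.
*/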

#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
    R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
    R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
    R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
    R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \


void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  v128 state0, state1;

  if (numBlocks == 0)
    return;

  state0 = LOAD_128(&state[0]);
  state1 = LOAD_128(&state[4]);

  do
  {
    v128 state0_save, state1_save;
    v128 m0, m1, m2, m3;
    v128 msg, tmp;

    state0_save = state0;
    state1_save = state1;

    LOAD_SHUFFLE (m0, 0)
    LOAD_SHUFFLE (m1, 1)
    LOAD_SHUFFLE (m2, 2)
    LOAD_SHUFFLE (m3, 3)

    R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 );
    R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
    R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN );

    state0 = vaddq_u32(state0, state0_save);
    state1 = vaddq_u32(state1, state1_save);

    data += 64;
  }
  while (--numBlocks);

  STORE_128(&state[0], state0);
  STORE_128(&state[4], state1);
}

#endif // USE_HW_SHA

#endif // MY_CPU_ARM_OR_ARM64

#ifndef USE_HW_SHA

// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>

// #include "Sha256.h"
void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);

#pragma message("Sha256 HW-SW stub was used")

void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
  Sha256_UpdateBlocks(state, data, numBlocks);
  /*
  UNUSED_VAR(state);
  UNUSED_VAR(data);
  UNUSED_VAR(numBlocks);
  exit(1);
  return;
  */
}
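
/*
  Editorial note (a sketch of the expected dispatch, not part of this file):
  the caller is expected to select the HW routine only after a runtime CPU
  check, e.g.:

    if (CPU_IsSupported_SHA())
      func_UpdateBlocks = Sha256_UpdateBlocks_HW;

  CPU_IsSupported_SHA() is declared in CpuArch.h; the exact selection logic
  lives in Sha256.c and may differ.
*/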

#endif