1 /* Blake2s.c -- BLAKE2sp Hash
2 2024-01-29 : Igor Pavlov : Public domain
3 2015-2019 : Samuel Neves : original code : CC0 1.0 Universal (CC0 1.0). */
4 
5 #include "Precomp.h"
6 
7 // #include <stdio.h>
8 #include <string.h>
9 
10 #include "Blake2.h"
11 #include "RotateDefs.h"
12 #include "Compiler.h"
13 #include "CpuArch.h"
14 
15 #if defined(__SSE2__)
16     #define Z7_BLAKE2S_USE_VECTORS
17 #elif defined(MY_CPU_X86_OR_AMD64)
18   #if  defined(_MSC_VER) && _MSC_VER > 1200 \
19     || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
20     || defined(__clang__) \
21     || defined(__INTEL_COMPILER)
22     #define Z7_BLAKE2S_USE_VECTORS
23   #endif
24 #endif
25 
26 #ifdef Z7_BLAKE2S_USE_VECTORS
27 
28 #define Z7_BLAKE2SP_USE_FUNCTIONS
29 
30 //  define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED, if CBlake2sp can be unaligned for 32 bytes.
31 // #define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
32 
33 // SSSE3 : for _mm_shuffle_epi8 (pshufb), which improves performance by 5-15%.
34 #if defined(__SSSE3__)
35   #define Z7_BLAKE2S_USE_SSSE3
36 #elif  defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
37     || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
38     || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
39     || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
40     || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
41   #define Z7_BLAKE2S_USE_SSSE3
42 #endif
43 
44 #ifdef Z7_BLAKE2S_USE_SSSE3
45 /* SSE41 : for _mm_insert_epi32 (pinsrd)
46   it can slightly reduce code size and improve performance in some cases.
47     it's used only for the last 512-1024 bytes, if the FAST versions (2 or 3) of the vector algos are used.
48     it can be used for all blocks in the other algos (4+).
49 */
50 #if defined(__SSE4_1__)
51   #define Z7_BLAKE2S_USE_SSE41
52 #elif  defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
53     || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
54     || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
55     || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
56     || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
57   #define Z7_BLAKE2S_USE_SSE41
58 #endif
59 #endif // SSSE3
60 
61 #if defined(__GNUC__) || defined(__clang__)
62   #if defined(Z7_BLAKE2S_USE_SSE41)
63     #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("sse4.1")))
64   #elif defined(Z7_BLAKE2S_USE_SSSE3)
65     #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("ssse3")))
66   #else
67     #define BLAKE2S_ATTRIB_128BIT  __attribute__((__target__("sse2")))
68   #endif
69 #endif
70 
71 
72 #if defined(__AVX2__)
73   #define Z7_BLAKE2S_USE_AVX2
74 #else
75   #if    defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
76       || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
77       || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100)
78     #define Z7_BLAKE2S_USE_AVX2
79     #ifdef Z7_BLAKE2S_USE_AVX2
80       #define BLAKE2S_ATTRIB_AVX2  __attribute__((__target__("avx2")))
81     #endif
82   #elif  defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
83       || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
84     #if (Z7_MSC_VER_ORIGINAL == 1900)
85       #pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
86     #endif
87     #define Z7_BLAKE2S_USE_AVX2
88   #endif
89 #endif
90 
91 #ifdef Z7_BLAKE2S_USE_SSE41
92 #include <smmintrin.h> // SSE4.1
93 #elif defined(Z7_BLAKE2S_USE_SSSE3)
94 #include <tmmintrin.h> // SSSE3
95 #else
96 #include <emmintrin.h> // SSE2
97 #endif
98 
99 #ifdef Z7_BLAKE2S_USE_AVX2
100 #include <immintrin.h>
101 #if defined(__clang__)
102 #include <avxintrin.h>
103 #include <avx2intrin.h>
104 #endif
105 #endif // avx2
106 
107 
108 #if defined(__AVX512F__) && defined(__AVX512VL__)
109    // && defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL > 1930)
110   #define Z7_BLAKE2S_USE_AVX512_ALWAYS
111   // #pragma message ("=== Blake2s AVX512")
112 #endif
113 
114 
115 #define Z7_BLAKE2S_USE_V128_FAST
116 // for speed optimization for small messages:
117 // #define Z7_BLAKE2S_USE_V128_WAY2
118 
119 #ifdef Z7_BLAKE2S_USE_AVX2
120 
121 // for debug:
122 // gather is slow
123 // #define Z7_BLAKE2S_USE_GATHER
124 
125   #define   Z7_BLAKE2S_USE_AVX2_FAST
126 // for speed optimization for small messages:
127 //   #define   Z7_BLAKE2S_USE_AVX2_WAY2
128 //   #define   Z7_BLAKE2S_USE_AVX2_WAY4
129 #if defined(Z7_BLAKE2S_USE_AVX2_WAY2) || \
130     defined(Z7_BLAKE2S_USE_AVX2_WAY4)
131   #define   Z7_BLAKE2S_USE_AVX2_WAY_SLOW
132 #endif
133 #endif
134 
135   #define Z7_BLAKE2SP_ALGO_DEFAULT    0
136   #define Z7_BLAKE2SP_ALGO_SCALAR     1
137 #ifdef Z7_BLAKE2S_USE_V128_FAST
138   #define Z7_BLAKE2SP_ALGO_V128_FAST  2
139 #endif
140 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
141   #define Z7_BLAKE2SP_ALGO_V256_FAST  3
142 #endif
143   #define Z7_BLAKE2SP_ALGO_V128_WAY1  4
144 #ifdef Z7_BLAKE2S_USE_V128_WAY2
145   #define Z7_BLAKE2SP_ALGO_V128_WAY2  5
146 #endif
147 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
148   #define Z7_BLAKE2SP_ALGO_V256_WAY2  6
149 #endif
150 #ifdef Z7_BLAKE2S_USE_AVX2_WAY4
151   #define Z7_BLAKE2SP_ALGO_V256_WAY4  7
152 #endif
153 
154 #endif // Z7_BLAKE2S_USE_VECTORS
155 
156 
157 
158 
159 #define BLAKE2S_FINAL_FLAG  (~(UInt32)0)
160 #define NSW                 Z7_BLAKE2SP_NUM_STRUCT_WORDS
161 #define SUPER_BLOCK_SIZE    (Z7_BLAKE2S_BLOCK_SIZE * Z7_BLAKE2SP_PARALLEL_DEGREE)
162 #define SUPER_BLOCK_MASK    (SUPER_BLOCK_SIZE - 1)
163 
164 #define V_INDEX_0_0   0
165 #define V_INDEX_1_0   1
166 #define V_INDEX_2_0   2
167 #define V_INDEX_3_0   3
168 #define V_INDEX_0_1   4
169 #define V_INDEX_1_1   5
170 #define V_INDEX_2_1   6
171 #define V_INDEX_3_1   7
172 #define V_INDEX_0_2   8
173 #define V_INDEX_1_2   9
174 #define V_INDEX_2_2  10
175 #define V_INDEX_3_2  11
176 #define V_INDEX_0_3  12
177 #define V_INDEX_1_3  13
178 #define V_INDEX_2_3  14
179 #define V_INDEX_3_3  15
180 #define V_INDEX_4_0   0
181 #define V_INDEX_5_0   1
182 #define V_INDEX_6_0   2
183 #define V_INDEX_7_0   3
184 #define V_INDEX_7_1   4
185 #define V_INDEX_4_1   5
186 #define V_INDEX_5_1   6
187 #define V_INDEX_6_1   7
188 #define V_INDEX_6_2   8
189 #define V_INDEX_7_2   9
190 #define V_INDEX_4_2  10
191 #define V_INDEX_5_2  11
192 #define V_INDEX_5_3  12
193 #define V_INDEX_6_3  13
194 #define V_INDEX_7_3  14
195 #define V_INDEX_4_3  15
196 
197 #define V(row, col)  v[V_INDEX_ ## row ## _ ## col]
198 
199 #define k_Blake2s_IV_0  0x6A09E667UL
200 #define k_Blake2s_IV_1  0xBB67AE85UL
201 #define k_Blake2s_IV_2  0x3C6EF372UL
202 #define k_Blake2s_IV_3  0xA54FF53AUL
203 #define k_Blake2s_IV_4  0x510E527FUL
204 #define k_Blake2s_IV_5  0x9B05688CUL
205 #define k_Blake2s_IV_6  0x1F83D9ABUL
206 #define k_Blake2s_IV_7  0x5BE0CD19UL
207 
208 #define KIV(n)  (k_Blake2s_IV_## n)
209 
210 #ifdef Z7_BLAKE2S_USE_VECTORS
211 MY_ALIGN(16)
212 static const UInt32 k_Blake2s_IV[8] =
213 {
214   KIV(0), KIV(1), KIV(2), KIV(3), KIV(4), KIV(5), KIV(6), KIV(7)
215 };
216 #endif
217 
218 #define STATE_T(s)        ((s) + 8)
219 #define STATE_F(s)        ((s) + 10)
220 
221 #ifdef Z7_BLAKE2S_USE_VECTORS
222 
223 #define LOAD_128(p)    _mm_load_si128 ((const __m128i *)(const void *)(p))
224 #define LOADU_128(p)   _mm_loadu_si128((const __m128i *)(const void *)(p))
225 #ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
226   // here we use unaligned loads and stores
227   // use this branch if CBlake2sp can be unaligned for 16 bytes
228   #define STOREU_128(p, r)  _mm_storeu_si128((__m128i *)(void *)(p), r)
229   #define LOAD_128_FROM_STRUCT(p)     LOADU_128(p)
230   #define STORE_128_TO_STRUCT(p, r)   STOREU_128(p, r)
231 #else
232   // here we use aligned loads and stores
233   // use this branch if CBlake2sp is aligned for 16 bytes
234   #define STORE_128(p, r)  _mm_store_si128((__m128i *)(void *)(p), r)
235   #define LOAD_128_FROM_STRUCT(p)     LOAD_128(p)
236   #define STORE_128_TO_STRUCT(p, r)   STORE_128(p, r)
237 #endif
238 
239 #endif // Z7_BLAKE2S_USE_VECTORS
240 
241 
242 #if 0
243 static void PrintState(const UInt32 *s, unsigned num)
244 {
245   unsigned i;
246   printf("\n");
247   for (i = 0; i < num; i++)
248     printf(" %08x", (unsigned)s[i]);
249 }
250 static void PrintStates2(const UInt32 *s, unsigned x, unsigned y)
251 {
252   unsigned i;
253   for (i = 0; i < y; i++)
254     PrintState(s + i * x, x);
255   printf("\n");
256 }
257 #endif
258 
259 
260 #define REP8_MACRO(m)  { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) }
261 
262 #define BLAKE2S_NUM_ROUNDS  10
263 
264 #if defined(Z7_BLAKE2S_USE_VECTORS)
265 #define ROUNDS_LOOP(mac) \
266   { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) mac(r) }
267 #endif
268 /*
269 #define ROUNDS_LOOP_2(mac) \
270   { unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r += 2) { mac(r) mac(r + 1) } }
271 */
272 #if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
273 #define ROUNDS_LOOP_UNROLLED(m) \
274   { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) }
275 #endif
276 
277 #define SIGMA_TABLE(M) \
278   M(  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 ), \
279   M( 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 ), \
280   M( 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 ), \
281   M(  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 ), \
282   M(  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 ), \
283   M(  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 ), \
284   M( 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 ), \
285   M( 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 ), \
286   M(  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 ), \
287   M( 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 )
288 
289 #define SIGMA_TABLE_MULT(m, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
290   { a0*m,a1*m,a2*m,a3*m,a4*m,a5*m,a6*m,a7*m,a8*m,a9*m,a10*m,a11*m,a12*m,a13*m,a14*m,a15*m }
291 #define SIGMA_TABLE_MULT_4( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
292         SIGMA_TABLE_MULT(4, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
293 
294 // MY_ALIGN(32)
295 MY_ALIGN(16)
296 static const Byte k_Blake2s_Sigma_4[BLAKE2S_NUM_ROUNDS][16] =
297   { SIGMA_TABLE(SIGMA_TABLE_MULT_4) };
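
/* Illustrative sketch, kept disabled: a plain scalar version of the BLAKE2s G function,
   showing how the sigma schedule selects the two message words of each quarter-round
   and the rotation amounts 16/12/8/7 used by the vector code below.
   Note: k_Blake2s_Sigma_4 above stores the SIGMA_TABLE values pre-multiplied by 4,
   so they can be added directly to a byte pointer; this sketch assumes an
   unmultiplied sigma row with values 0..15 (helper names are illustrative only). */
#if 0
static UInt32 Blake2s_Rotr32_Ref(UInt32 x, unsigned n)
{
  return (x >> n) | (x << (32 - n));
}
static void Blake2s_G_Ref(UInt32 *v, const UInt32 *m, const Byte *sigma,
    unsigned i, unsigned a, unsigned b, unsigned c, unsigned d)
{
  v[a] += v[b] + m[sigma[2 * i]];
  v[d] = Blake2s_Rotr32_Ref(v[d] ^ v[a], 16);
  v[c] += v[d];
  v[b] = Blake2s_Rotr32_Ref(v[b] ^ v[c], 12);
  v[a] += v[b] + m[sigma[2 * i + 1]];
  v[d] = Blake2s_Rotr32_Ref(v[d] ^ v[a], 8);
  v[c] += v[d];
  v[b] = Blake2s_Rotr32_Ref(v[b] ^ v[c], 7);
}
#endif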
298 
299 #define GET_SIGMA_PTR(p, index) \
300     ((const void *)((const Byte *)(const void *)(p) + (index)))
301 
302 #define GET_STATE_TABLE_PTR_FROM_BYTE_POS(s, pos) \
303     ((UInt32 *)(void *)((Byte *)(void *)(s) + (pos)))
304 
305 
306 #ifdef Z7_BLAKE2S_USE_VECTORS
307 
308 
309 #if 0
310   // loading constants from memory
311   // is faster for some compilers.
312   #define KK4(n)  KIV(n), KIV(n), KIV(n), KIV(n)
313 MY_ALIGN(64)
314 static const UInt32 k_Blake2s_IV_WAY4[]=
315 {
316   KK4(0), KK4(1), KK4(2), KK4(3), KK4(4), KK4(5), KK4(6), KK4(7)
317 };
318   #define GET_128_IV_WAY4(i)  LOAD_128(k_Blake2s_IV_WAY4 + 4 * (i))
319 #else
320   // use constant generation:
321   #define GET_128_IV_WAY4(i)  _mm_set1_epi32((Int32)KIV(i))
322 #endif
323 
324 
325 #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
326 #define GET_CONST_128_FROM_ARRAY32(k) \
327     _mm_set_epi32((Int32)(k)[3], (Int32)(k)[2], (Int32)(k)[1], (Int32)(k)[0])
328 #endif
329 
330 
331 #if 0
332 #define k_r8    _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
333 #define k_r16   _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
334 #define k_inc   _mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)
335 #define k_iv0_128  GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0)
336 #define k_iv4_128  GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4)
337 #else
338 #if  defined(Z7_BLAKE2S_USE_SSSE3) && \
339     !defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
340 MY_ALIGN(16) static const Byte k_r8_arr [16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12 };
341 MY_ALIGN(16) static const Byte k_r16_arr[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 };
342 #define k_r8    LOAD_128(k_r8_arr)
343 #define k_r16   LOAD_128(k_r16_arr)
344 #endif
345 MY_ALIGN(16) static const UInt32 k_inc_arr[4] = { Z7_BLAKE2S_BLOCK_SIZE, 0, 0, 0 };
346 #define k_inc   LOAD_128(k_inc_arr)
347 #define k_iv0_128  LOAD_128(k_Blake2s_IV + 0)
348 #define k_iv4_128  LOAD_128(k_Blake2s_IV + 4)
349 #endif
350 
351 
352 #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
353 
354 #ifdef Z7_BLAKE2S_USE_AVX2
355 #if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
356   #define MY_mm256_set_m128i(hi, lo)  _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
357 #else
358   #define MY_mm256_set_m128i  _mm256_set_m128i
359 #endif
360 
361 #define SET_FROM_128(a)  MY_mm256_set_m128i(a, a)
362 
363 #ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS
364 MY_ALIGN(32) static const Byte k_r8_arr_256 [32] =
365 {
366   1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12,
367   1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12
368 };
369 MY_ALIGN(32) static const Byte k_r16_arr_256[32] =
370 {
371   2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13,
372   2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
373 };
374 #define k_r8_256    LOAD_256(k_r8_arr_256)
375 #define k_r16_256   LOAD_256(k_r16_arr_256)
376 #endif
377 
378 // #define k_r8_256    SET_FROM_128(_mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1))
379 // #define k_r16_256   SET_FROM_128(_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2))
380 // #define k_inc_256   SET_FROM_128(_mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE))
381 // #define k_iv0_256   SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0))
382 #define k_iv4_256   SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4))
383 #endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
384 #endif
385 
386 
387 /*
388 IPC(TP) ports:
389 1 p__5  : skl-      : SSE   : shufps  : _mm_shuffle_ps
390 2 p_15  : icl+
391 1 p__5  : nhm-bdw   : SSE   : xorps   : _mm_xor_ps
392 3 p015  : skl+
393 
394 3 p015              : SSE2  : pxor    : _mm_xor_si128
395 2 p_15:   snb-bdw   : SSE2  : padd    : _mm_add_epi32
396 2 p0_5:   mrm-wsm   :
397 3 p015  : skl+
398 
399 2 p_15  : ivb-,icl+ : SSE2  : punpcklqdq, punpckhqdq, punpckldq, punpckhdq
400 2 p_15  :           : SSE2  : pshufd  : _mm_shuffle_epi32
401 2 p_15  :           : SSE2  : pshuflw : _mm_shufflelo_epi16
402 2 p_15  :           : SSE2  : psrldq  :
403 2 p_15  :           : SSE3  : pshufb  : _mm_shuffle_epi8
404 2 p_15  :           : SSE4  : pblendw : _mm_blend_epi16
405 1 p__5  : hsw-skl   : *
406 
407 1 p0                : SSE2  : pslld (i8) : _mm_slli_si128
408 2 p01   : skl+      :
409 
410 2 p_15  : ivb-      : SSE3  : palignr
411 1 p__5  : hsw+
412 
413 2 p_15 + p23 : ivb-, icl+ : SSE4   : pinsrd  : _mm_insert_epi32(xmm, m32, i8)
414 1 p__5 + p23 : hsw-skl
415 1 p_15 + p5  : ivb-, ice+ : SSE4   : pinsrd  : _mm_insert_epi32(xmm, r32, i8)
416 0.5    2*p5  : hsw-skl
417 
418 2 p23               : SSE2   : movd (m32)
419 3 p23A  : adl       :
420 1 p5:               : SSE2   : movd (r32)
421 */
422 
423 #if 0 && defined(__XOP__)
424 // the __XOP__ instruction path still must be debugged and tested
425 #include <x86intrin.h>
426 #include <ammintrin.h>
427     #define LOAD_ROTATE_CONSTS
428     #define MM_ROR_EPI32(r, c)  _mm_roti_epi32(r, -(c))
429     #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
430 #elif 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
431     #define LOAD_ROTATE_CONSTS
432     #define MM_ROR_EPI32(r, c)  _mm_ror_epi32(r, c)
433     #define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
434 #else
435 
436 // MSVC_1937+ uses the "orps" instruction for _mm_or_si128().
437 // But "orps" has low throughput: TP=1 for bdw-nhm.
438 // So it can be better to use _mm_add_epi32()/"paddd" (TP=2 for bdw-nhm) instead of "orps".
439 // But "orps" is fast on modern cpus (skl+).
440 // So we default to the "or" version:
441 #if 0 || 0 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1937
442   // minor optimization for some old cpus, if "orps" is slow.
443   #define MM128_EPI32_OR_or_ADD  _mm_add_epi32
444 #else
445   #define MM128_EPI32_OR_or_ADD  _mm_or_si128
446 #endif
447 
448   #define MM_ROR_EPI32_VIA_SHIFT(r, c)( \
449     MM128_EPI32_OR_or_ADD( \
450       _mm_srli_epi32((r), (c)), \
451       _mm_slli_epi32((r), 32-(c))))
452   #if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
453     #define LOAD_ROTATE_CONSTS \
454       const __m128i  r8 = k_r8; \
455       const __m128i r16 = k_r16;
456     #define MM_ROR_EPI32(r, c) ( \
457       ( 8==(c)) ? _mm_shuffle_epi8(r,r8) \
458     : (16==(c)) ? _mm_shuffle_epi8(r,r16) \
459     : MM_ROR_EPI32_VIA_SHIFT(r, c))
460   #else
461     #define LOAD_ROTATE_CONSTS
462     #define  MM_ROR_EPI32(r, c) ( \
463       (16==(c)) ? _mm_shufflehi_epi16(_mm_shufflelo_epi16(r, 0xb1), 0xb1) \
464     : MM_ROR_EPI32_VIA_SHIFT(r, c))
465   #endif
466 #endif
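
/* Illustrative sketch, kept disabled: the scalar equivalent of MM_ROR_EPI32_VIA_SHIFT
   (helper name is illustrative only).  A 32-bit rotate-right is built from two shifts
   whose results occupy disjoint bit ranges, so they can be combined with OR or with ADD,
   which is why MM128_EPI32_OR_or_ADD above may be _mm_add_epi32.  The byte-granular
   rotations (by 8 and 16) can instead use a single shuffle (pshufb / pshuflw+pshufhw),
   as the macros above do. */
#if 0
static UInt32 Rotr32_ViaShift_Ref(UInt32 x, unsigned c)
{
  // valid for 0 < c < 32; the two shifted values have no common set bits,
  // so '|' and '+' produce the same result here
  return (x >> c) | (x << (32 - c));
}
#endif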
467 
468 /*
469 we have 3 main ways to load 4 32-bit integers to __m128i:
470   1) SSE2:  _mm_set_epi32()
471   2) SSE2:  _mm_unpacklo_epi64() / _mm_unpacklo_epi32 / _mm_cvtsi32_si128()
472   3) SSE41: _mm_insert_epi32() and _mm_cvtsi32_si128()
473 a good compiler generates these instructions for _mm_set_epi32():
474 {
475   movd xmm, [m32]; vpunpckldq; vpunpckldq; vpunpcklqdq;
476 }
477 a good new compiler generates one instruction:
478 {
479   for _mm_insert_epi32()  : { pinsrd xmm, [m32], i }
480   for _mm_cvtsi32_si128() : { movd xmm, [m32] }
481 }
482 but vc2010 generates a slow pair of instructions:
483 {
484   for _mm_insert_epi32()  : { mov r32, [m32];  pinsrd xmm, r32, i  }
485   for _mm_cvtsi32_si128() : { mov r32, [m32];  movd  xmm, r32 }
486 }
487 _mm_insert_epi32() (pinsrd) code reduces xmm register pressure
488 in comparison with _mm_set_epi32() (movd + vpunpckld) code.
489 Note that the variant with "movd xmm, r32" can be slower,
490 but register pressure can be more important.
491 So we can force "pinsrd" always.
492 */
493 // #if !defined(Z7_MSC_VER_ORIGINAL) || Z7_MSC_VER_ORIGINAL > 1600 || defined(MY_CPU_X86)
494 #ifdef Z7_BLAKE2S_USE_SSE41
495   /* _mm_set_epi32() can be more efficient for GCC and CLANG;
496      _mm_insert_epi32() is more efficient for MSVC */
497   #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
498     #define Z7_BLAKE2S_USE_INSERT_INSTRUCTION
499   #endif
500 #endif // USE_SSE41
501 // #endif
502 
503 #ifdef Z7_BLAKE2S_USE_INSERT_INSTRUCTION
504   // for SSE4.1
505 #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3)  \
506     _mm_insert_epi32( \
507     _mm_insert_epi32( \
508     _mm_insert_epi32( \
509     _mm_cvtsi32_si128( \
510         *(const Int32 *)p0), \
511         *(const Int32 *)p1, 1), \
512         *(const Int32 *)p2, 2), \
513         *(const Int32 *)p3, 3)
514 #elif 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
515 /* MSVC 1400 implements _mm_set_epi32() via a slow memory write/read.
516    Also, _mm_unpacklo_epi32 is more efficient for other MSVC versions.
517    But _mm_set_epi32() is more efficient for GCC and CLANG.
518    So we use _mm_unpacklo_epi32 for MSVC only */
519 #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3)  \
520     _mm_unpacklo_epi64(  \
521         _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p0),  \
522                             _mm_cvtsi32_si128(*(const Int32 *)p1)), \
523         _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p2),  \
524                             _mm_cvtsi32_si128(*(const Int32 *)p3)))
525 #else
526 #define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3)  \
527     _mm_set_epi32( \
528         *(const Int32 *)p3, \
529         *(const Int32 *)p2, \
530         *(const Int32 *)p1, \
531         *(const Int32 *)p0)
532 #endif
533 
534 #define SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3)  \
535       MM_LOAD_EPI32_FROM_4_POINTERS( \
536         GET_SIGMA_PTR(input, i0), \
537         GET_SIGMA_PTR(input, i1), \
538         GET_SIGMA_PTR(input, i2), \
539         GET_SIGMA_PTR(input, i3))
540 
541 #define SET_ROW_FROM_SIGMA(input, sigma_index)  \
542         SET_ROW_FROM_SIGMA_BASE(input, \
543             sigma[(sigma_index)        ], \
544             sigma[(sigma_index) + 2 * 1], \
545             sigma[(sigma_index) + 2 * 2], \
546             sigma[(sigma_index) + 2 * 3]) \
547 
548 
549 #define ADD_128(a, b)   _mm_add_epi32(a, b)
550 #define XOR_128(a, b)   _mm_xor_si128(a, b)
551 
552 #define D_ADD_128(dest, src)        dest = ADD_128(dest, src)
553 #define D_XOR_128(dest, src)        dest = XOR_128(dest, src)
554 #define D_ROR_128(dest, shift)      dest = MM_ROR_EPI32(dest, shift)
555 #define D_ADD_EPI64_128(dest, src)  dest = _mm_add_epi64(dest, src)
556 
557 
558 #define AXR(a, b, d, shift) \
559     D_ADD_128(a, b); \
560     D_XOR_128(d, a); \
561     D_ROR_128(d, shift);
562 
563 #define AXR2(a, b, c, d, input, sigma_index, shift1, shift2) \
564     a = _mm_add_epi32 (a, SET_ROW_FROM_SIGMA(input, sigma_index)); \
565     AXR(a, b, d, shift1) \
566     AXR(c, d, b, shift2)
567 
568 #define ROTATE_WORDS_TO_RIGHT(a, n) \
569     a = _mm_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
570 
571 #define AXR4(a, b, c, d, input, sigma_index)  \
572     AXR2(a, b, c, d, input, sigma_index,     16, 12) \
573     AXR2(a, b, c, d, input, sigma_index + 1,  8,  7) \
574 
575 #define RR2(a, b, c, d, input) \
576   { \
577     AXR4(a, b, c, d, input, 0) \
578       ROTATE_WORDS_TO_RIGHT(b, 1) \
579       ROTATE_WORDS_TO_RIGHT(c, 2) \
580       ROTATE_WORDS_TO_RIGHT(d, 3) \
581     AXR4(a, b, c, d, input, 8) \
582       ROTATE_WORDS_TO_RIGHT(b, 3) \
583       ROTATE_WORDS_TO_RIGHT(c, 2) \
584       ROTATE_WORDS_TO_RIGHT(d, 1) \
585   }
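
/* RR2 performs one full BLAKE2s round on a state held in 4 xmm registers:
   AXR4(..., 0) is the column step; rows b, c and d are then rotated by 1, 2 and 3
   word positions, so that AXR4(..., 8) can run the diagonal step with the same
   column-wise code, and the final rotations (3, 2, 1) restore the word order. */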
586 
587 
588 /*
589 Way1:
590 per 64-byte block:
591 10 rounds * 4 iters * (7 + 2) = 360 cycles, if pslld TP=1
592                     * (7 + 1) = 320 cycles, if pslld TP=2 (skl+)
593 additional operations per 7_op_iter :
594 4 movzx   byte mem
595 1 movd    mem
596 3 pinsrd  mem
597 1.5 pshufd
598 */
599 
600 static
601 #if 0 || 0 && (defined(Z7_BLAKE2S_USE_V128_WAY2) || \
602                defined(Z7_BLAKE2S_USE_V256_WAY2))
603   Z7_NO_INLINE
604 #else
605   Z7_FORCE_INLINE
606 #endif
607 #ifdef BLAKE2S_ATTRIB_128BIT
608        BLAKE2S_ATTRIB_128BIT
609 #endif
610 void
611 Z7_FASTCALL
612 Blake2s_Compress_V128_Way1(UInt32 * const s, const Byte * const input)
613 {
614   __m128i a, b, c, d;
615   __m128i f0, f1;
616 
617   LOAD_ROTATE_CONSTS
618   d = LOAD_128_FROM_STRUCT(STATE_T(s));
619   c = k_iv0_128;
620   a = f0 = LOAD_128_FROM_STRUCT(s);
621   b = f1 = LOAD_128_FROM_STRUCT(s + 4);
622   D_ADD_EPI64_128(d, k_inc);
623   STORE_128_TO_STRUCT (STATE_T(s), d);
624   D_XOR_128(d, k_iv4_128);
625 
626 #define RR(r) { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
627       RR2(a, b, c, d, input) }
628 
629   ROUNDS_LOOP(RR)
630 #undef RR
631 
632   STORE_128_TO_STRUCT(s    , XOR_128(f0, XOR_128(a, c)));
633   STORE_128_TO_STRUCT(s + 4, XOR_128(f1, XOR_128(b, d)));
634 }
635 
636 
637 static
638 Z7_NO_INLINE
639 #ifdef BLAKE2S_ATTRIB_128BIT
640        BLAKE2S_ATTRIB_128BIT
641 #endif
642 void
643 Z7_FASTCALL
644 Blake2sp_Compress2_V128_Way1(UInt32 *s_items, const Byte *data, const Byte *end)
645 {
646   size_t pos = 0;
647   do
648   {
649     UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
650     Blake2s_Compress_V128_Way1(s, data);
651     data += Z7_BLAKE2S_BLOCK_SIZE;
652     pos  += Z7_BLAKE2S_BLOCK_SIZE;
653     pos &= SUPER_BLOCK_MASK;
654   }
655   while (data != end);
656 }
657 
658 
659 #if defined(Z7_BLAKE2S_USE_V128_WAY2) || \
660     defined(Z7_BLAKE2S_USE_AVX2_WAY2)
661 #if 1
662   #define Z7_BLAKE2S_CompressSingleBlock(s, data) \
663     Blake2sp_Compress2_V128_Way1(s, data, \
664         (const Byte *)(const void *)(data) + Z7_BLAKE2S_BLOCK_SIZE)
665 #else
666   #define Z7_BLAKE2S_CompressSingleBlock  Blake2s_Compress_V128_Way1
667 #endif
668 #endif
669 
670 
671 #if (defined(Z7_BLAKE2S_USE_AVX2_WAY_SLOW) || \
672      defined(Z7_BLAKE2S_USE_V128_WAY2)) && \
673     !defined(Z7_BLAKE2S_USE_GATHER)
674 #define AXR2_LOAD_INDEXES(sigma_index) \
675       const unsigned i0 = sigma[(sigma_index)]; \
676       const unsigned i1 = sigma[(sigma_index) + 2 * 1]; \
677       const unsigned i2 = sigma[(sigma_index) + 2 * 2]; \
678       const unsigned i3 = sigma[(sigma_index) + 2 * 3]; \
679 
680 #define SET_ROW_FROM_SIGMA_W(input) \
681     SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3)
682 #endif
683 
684 
685 #ifdef Z7_BLAKE2S_USE_V128_WAY2
686 
687 #if 1 || !defined(Z7_BLAKE2S_USE_SSE41)
688 /* we use SET_ROW_FROM_SIGMA_BASE, which uses
689    (SSE4) _mm_insert_epi32(), if Z7_BLAKE2S_USE_INSERT_INSTRUCTION is defined,
690    (SSE2) _mm_set_epi32() otherwise.
691    MSVC can be faster with this branch:
692 */
693 #define AXR2_W(sigma_index, shift1, shift2) \
694   { \
695     AXR2_LOAD_INDEXES(sigma_index) \
696     a0 = _mm_add_epi32(a0, SET_ROW_FROM_SIGMA_W(data)); \
697     a1 = _mm_add_epi32(a1, SET_ROW_FROM_SIGMA_W(data + Z7_BLAKE2S_BLOCK_SIZE)); \
698     AXR(a0, b0, d0, shift1) \
699     AXR(a1, b1, d1, shift1) \
700     AXR(c0, d0, b0, shift2) \
701     AXR(c1, d1, b1, shift2) \
702   }
703 #else
704 /* we use interleaved _mm_insert_epi32():
705    GCC can be faster for this branch:
706 */
707 #define AXR2_W_PRE_INSERT(sigma_index, i) \
708   { const unsigned ii = sigma[(sigma_index) + i * 2]; \
709     t0 = _mm_insert_epi32(t0, *(const Int32 *)GET_SIGMA_PTR(data, ii),                      i); \
710     t1 = _mm_insert_epi32(t1, *(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii), i); \
711   }
712 #define AXR2_W(sigma_index, shift1, shift2) \
713   { __m128i t0, t1; \
714     { const unsigned ii = sigma[sigma_index]; \
715       t0 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, ii)); \
716       t1 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii)); \
717     } \
718     AXR2_W_PRE_INSERT(sigma_index, 1) \
719     AXR2_W_PRE_INSERT(sigma_index, 2) \
720     AXR2_W_PRE_INSERT(sigma_index, 3) \
721     a0 = _mm_add_epi32(a0, t0); \
722     a1 = _mm_add_epi32(a1, t1); \
723     AXR(a0, b0, d0, shift1) \
724     AXR(a1, b1, d1, shift1) \
725     AXR(c0, d0, b0, shift2) \
726     AXR(c1, d1, b1, shift2) \
727   }
728 #endif
729 
730 
731 #define AXR4_W(sigma_index) \
732     AXR2_W(sigma_index,     16, 12) \
733     AXR2_W(sigma_index + 1,  8,  7) \
734 
735 #define WW(r) \
736   { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
737     AXR4_W(0) \
738       ROTATE_WORDS_TO_RIGHT(b0, 1) \
739       ROTATE_WORDS_TO_RIGHT(b1, 1) \
740       ROTATE_WORDS_TO_RIGHT(c0, 2) \
741       ROTATE_WORDS_TO_RIGHT(c1, 2) \
742       ROTATE_WORDS_TO_RIGHT(d0, 3) \
743       ROTATE_WORDS_TO_RIGHT(d1, 3) \
744     AXR4_W(8) \
745       ROTATE_WORDS_TO_RIGHT(b0, 3) \
746       ROTATE_WORDS_TO_RIGHT(b1, 3) \
747       ROTATE_WORDS_TO_RIGHT(c0, 2) \
748       ROTATE_WORDS_TO_RIGHT(c1, 2) \
749       ROTATE_WORDS_TO_RIGHT(d0, 1) \
750       ROTATE_WORDS_TO_RIGHT(d1, 1) \
751   }
752 
753 
754 static
755 Z7_NO_INLINE
756 #ifdef BLAKE2S_ATTRIB_128BIT
757        BLAKE2S_ATTRIB_128BIT
758 #endif
759 void
760 Z7_FASTCALL
761 Blake2sp_Compress2_V128_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
762 {
763   size_t pos = 0;
764   end -= Z7_BLAKE2S_BLOCK_SIZE;
765 
766   if (data != end)
767   {
768     LOAD_ROTATE_CONSTS
769     do
770     {
771       UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
772       __m128i a0, b0, c0, d0;
773       __m128i a1, b1, c1, d1;
774       {
775         const __m128i inc = k_inc;
776         const __m128i temp = k_iv4_128;
777         d0 = LOAD_128_FROM_STRUCT (STATE_T(s));
778         d1 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
779         D_ADD_EPI64_128(d0, inc);
780         D_ADD_EPI64_128(d1, inc);
781         STORE_128_TO_STRUCT (STATE_T(s      ), d0);
782         STORE_128_TO_STRUCT (STATE_T(s + NSW), d1);
783         D_XOR_128(d0, temp);
784         D_XOR_128(d1, temp);
785       }
786       c1 = c0 = k_iv0_128;
787       a0 = LOAD_128_FROM_STRUCT(s);
788       b0 = LOAD_128_FROM_STRUCT(s + 4);
789       a1 = LOAD_128_FROM_STRUCT(s + NSW);
790       b1 = LOAD_128_FROM_STRUCT(s + NSW + 4);
791 
792       ROUNDS_LOOP (WW)
793 
794 #undef WW
795 
796       D_XOR_128(a0, c0);
797       D_XOR_128(b0, d0);
798       D_XOR_128(a1, c1);
799       D_XOR_128(b1, d1);
800 
801       D_XOR_128(a0, LOAD_128_FROM_STRUCT(s));
802       D_XOR_128(b0, LOAD_128_FROM_STRUCT(s + 4));
803       D_XOR_128(a1, LOAD_128_FROM_STRUCT(s + NSW));
804       D_XOR_128(b1, LOAD_128_FROM_STRUCT(s + NSW + 4));
805 
806       STORE_128_TO_STRUCT(s,           a0);
807       STORE_128_TO_STRUCT(s + 4,       b0);
808       STORE_128_TO_STRUCT(s + NSW,     a1);
809       STORE_128_TO_STRUCT(s + NSW + 4, b1);
810 
811       data += Z7_BLAKE2S_BLOCK_SIZE * 2;
812       pos  += Z7_BLAKE2S_BLOCK_SIZE * 2;
813       pos &= SUPER_BLOCK_MASK;
814     }
815     while (data < end);
816     if (data != end)
817       return;
818   }
819   {
820     UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
821     Z7_BLAKE2S_CompressSingleBlock(s, data);
822   }
823 }
824 #endif // Z7_BLAKE2S_USE_V128_WAY2
825 
826 
827 #ifdef Z7_BLAKE2S_USE_V128_WAY2
828   #define Z7_BLAKE2S_Compress2_V128  Blake2sp_Compress2_V128_Way2
829 #else
830   #define Z7_BLAKE2S_Compress2_V128  Blake2sp_Compress2_V128_Way1
831 #endif
832 
833 
834 
835 #ifdef Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
836   #define ROT_128_8(x)    MM_ROR_EPI32(x, 8)
837   #define ROT_128_16(x)   MM_ROR_EPI32(x, 16)
838   #define ROT_128_7(x)    MM_ROR_EPI32(x, 7)
839   #define ROT_128_12(x)   MM_ROR_EPI32(x, 12)
840 #else
841 #if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
842   #define ROT_128_8(x)    _mm_shuffle_epi8(x, r8)   // k_r8
843   #define ROT_128_16(x)   _mm_shuffle_epi8(x, r16)  // k_r16
844 #else
845   #define ROT_128_8(x)    MM_ROR_EPI32_VIA_SHIFT(x, 8)
846   #define ROT_128_16(x)   MM_ROR_EPI32_VIA_SHIFT(x, 16)
847 #endif
848   #define ROT_128_7(x)    MM_ROR_EPI32_VIA_SHIFT(x, 7)
849   #define ROT_128_12(x)   MM_ROR_EPI32_VIA_SHIFT(x, 12)
850 #endif
851 
852 
853 #if 1
854 // this branch can provide similar speed on x86* in most cases,
855 // because [base + index*4] provides the same speed as [base + index].
856 // but some compilers can generate different code for this branch, which can be faster sometimes.
857 // this branch uses an additional table of 10*16=160 bytes.
858 #define SIGMA_TABLE_MULT_16( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
859         SIGMA_TABLE_MULT(16, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
860 MY_ALIGN(16)
861 static const Byte k_Blake2s_Sigma_16[BLAKE2S_NUM_ROUNDS][16] =
862   { SIGMA_TABLE(SIGMA_TABLE_MULT_16) };
863 #define GET_SIGMA_PTR_128(r)  const Byte * const sigma = k_Blake2s_Sigma_16[r];
864 #define GET_SIGMA_VAL_128(n)  (sigma[n])
865 #else
866 #define GET_SIGMA_PTR_128(r)  const Byte * const sigma = k_Blake2s_Sigma_4[r];
867 #define GET_SIGMA_VAL_128(n)  (4 * (size_t)sigma[n])
868 #endif
869 
870 
871 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
872 #if 1
873 #define SIGMA_TABLE_MULT_32( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
874         SIGMA_TABLE_MULT(32, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
875 MY_ALIGN(64)
876 static const UInt16 k_Blake2s_Sigma_32[BLAKE2S_NUM_ROUNDS][16] =
877   { SIGMA_TABLE(SIGMA_TABLE_MULT_32) };
878 #define GET_SIGMA_PTR_256(r)  const UInt16 * const sigma = k_Blake2s_Sigma_32[r];
879 #define GET_SIGMA_VAL_256(n)  (sigma[n])
880 #else
881 #define GET_SIGMA_PTR_256(r)  const Byte * const sigma = k_Blake2s_Sigma_4[r];
882 #define GET_SIGMA_VAL_256(n)  (8 * (size_t)sigma[n])
883 #endif
884 #endif // Z7_BLAKE2S_USE_AVX2_FAST
885 
886 
887 #define D_ROT_128_7(dest)     dest = ROT_128_7(dest)
888 #define D_ROT_128_8(dest)     dest = ROT_128_8(dest)
889 #define D_ROT_128_12(dest)    dest = ROT_128_12(dest)
890 #define D_ROT_128_16(dest)    dest = ROT_128_16(dest)
891 
892 #define OP_L(a, i)   D_ADD_128 (V(a, 0), \
893     LOAD_128((const Byte *)(w) + GET_SIGMA_VAL_128(2*(a)+(i))));
894 
895 #define OP_0(a)   OP_L(a, 0)
896 #define OP_7(a)   OP_L(a, 1)
897 
898 #define OP_1(a)   D_ADD_128 (V(a, 0), V(a, 1));
899 #define OP_2(a)   D_XOR_128 (V(a, 3), V(a, 0));
900 #define OP_4(a)   D_ADD_128 (V(a, 2), V(a, 3));
901 #define OP_5(a)   D_XOR_128 (V(a, 1), V(a, 2));
902 
903 #define OP_3(a)   D_ROT_128_16 (V(a, 3));
904 #define OP_6(a)   D_ROT_128_12 (V(a, 1));
905 #define OP_8(a)   D_ROT_128_8  (V(a, 3));
906 #define OP_9(a)   D_ROT_128_7  (V(a, 1));
907 
908 
909 // for 32-bit x86 : the interleaved mode works slower, because of register pressure.
910 
911 #if 0 || 1 && (defined(MY_CPU_X86) \
912   || defined(__GNUC__) && !defined(__clang__))
913 // non-interleaved version:
914 // is fast for x86 32-bit.
915 // is fast for GCC x86-64.
916 
917 #define V4G(a) \
918   OP_0 (a) \
919   OP_1 (a) \
920   OP_2 (a) \
921   OP_3 (a) \
922   OP_4 (a) \
923   OP_5 (a) \
924   OP_6 (a) \
925   OP_7 (a) \
926   OP_1 (a) \
927   OP_2 (a) \
928   OP_8 (a) \
929   OP_4 (a) \
930   OP_5 (a) \
931   OP_9 (a) \
932 
933 #define V4R \
934 { \
935   V4G (0) \
936   V4G (1) \
937   V4G (2) \
938   V4G (3) \
939   V4G (4) \
940   V4G (5) \
941   V4G (6) \
942   V4G (7) \
943 }
944 
945 #elif 0 || 1 && defined(MY_CPU_X86)
946 
947 #define OP_INTER_2(op, a,b) \
948   op (a) \
949   op (b) \
950 
951 #define V4G(a,b) \
952   OP_INTER_2 (OP_0, a,b) \
953   OP_INTER_2 (OP_1, a,b) \
954   OP_INTER_2 (OP_2, a,b) \
955   OP_INTER_2 (OP_3, a,b) \
956   OP_INTER_2 (OP_4, a,b) \
957   OP_INTER_2 (OP_5, a,b) \
958   OP_INTER_2 (OP_6, a,b) \
959   OP_INTER_2 (OP_7, a,b) \
960   OP_INTER_2 (OP_1, a,b) \
961   OP_INTER_2 (OP_2, a,b) \
962   OP_INTER_2 (OP_8, a,b) \
963   OP_INTER_2 (OP_4, a,b) \
964   OP_INTER_2 (OP_5, a,b) \
965   OP_INTER_2 (OP_9, a,b) \
966 
967 #define V4R \
968 { \
969   V4G (0, 1) \
970   V4G (2, 3) \
971   V4G (4, 5) \
972   V4G (6, 7) \
973 }
974 
975 #else
976 // interleave-4 version is fast for x64 (MSVC/CLANG)
977 
978 #define OP_INTER_4(op, a,b,c,d) \
979   op (a) \
980   op (b) \
981   op (c) \
982   op (d) \
983 
984 #define V4G(a,b,c,d) \
985   OP_INTER_4 (OP_0, a,b,c,d) \
986   OP_INTER_4 (OP_1, a,b,c,d) \
987   OP_INTER_4 (OP_2, a,b,c,d) \
988   OP_INTER_4 (OP_3, a,b,c,d) \
989   OP_INTER_4 (OP_4, a,b,c,d) \
990   OP_INTER_4 (OP_5, a,b,c,d) \
991   OP_INTER_4 (OP_6, a,b,c,d) \
992   OP_INTER_4 (OP_7, a,b,c,d) \
993   OP_INTER_4 (OP_1, a,b,c,d) \
994   OP_INTER_4 (OP_2, a,b,c,d) \
995   OP_INTER_4 (OP_8, a,b,c,d) \
996   OP_INTER_4 (OP_4, a,b,c,d) \
997   OP_INTER_4 (OP_5, a,b,c,d) \
998   OP_INTER_4 (OP_9, a,b,c,d) \
999 
1000 #define V4R \
1001 { \
1002   V4G (0, 1, 2, 3) \
1003   V4G (4, 5, 6, 7) \
1004 }
1005 
1006 #endif
1007 
1008 #define V4_ROUND(r)  { GET_SIGMA_PTR_128(r); V4R }
1009 
1010 
1011 #define V4_LOAD_MSG_1(w, m, i) \
1012 { \
1013   __m128i m0, m1, m2, m3; \
1014   __m128i t0, t1, t2, t3; \
1015   m0 = LOADU_128((m) + ((i) + 0 * 4) * 16); \
1016   m1 = LOADU_128((m) + ((i) + 1 * 4) * 16); \
1017   m2 = LOADU_128((m) + ((i) + 2 * 4) * 16); \
1018   m3 = LOADU_128((m) + ((i) + 3 * 4) * 16); \
1019   t0 = _mm_unpacklo_epi32(m0, m1); \
1020   t1 = _mm_unpackhi_epi32(m0, m1); \
1021   t2 = _mm_unpacklo_epi32(m2, m3); \
1022   t3 = _mm_unpackhi_epi32(m2, m3); \
1023   w[(i) * 4 + 0] = _mm_unpacklo_epi64(t0, t2); \
1024   w[(i) * 4 + 1] = _mm_unpackhi_epi64(t0, t2); \
1025   w[(i) * 4 + 2] = _mm_unpacklo_epi64(t1, t3); \
1026   w[(i) * 4 + 3] = _mm_unpackhi_epi64(t1, t3); \
1027 }
1028 
1029 #define V4_LOAD_MSG(w, m) \
1030 { \
1031   V4_LOAD_MSG_1 (w, m, 0) \
1032   V4_LOAD_MSG_1 (w, m, 1) \
1033   V4_LOAD_MSG_1 (w, m, 2) \
1034   V4_LOAD_MSG_1 (w, m, 3) \
1035 }
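
/* V4_LOAD_MSG transposes 4 consecutive 64-byte input blocks into word-sliced form:
   after the unpacklo/unpackhi 4x4 transposes, w[k] (k = 0..15) holds message word k
   of blocks 0..3, one block per 32-bit lane, so each vector instruction below
   processes the same word of 4 lanes at once.
   Illustrative scalar equivalent, kept disabled (it assumes the GetUi32 helper
   from CpuArch.h and uses an illustrative array layout and helper name): */
#if 0
static void V4_LoadMsg_Ref(UInt32 w_ref[16][4], const Byte *m)
{
  unsigned k, lane;
  for (k = 0; k < 16; k++)
    for (lane = 0; lane < 4; lane++)
      w_ref[k][lane] = GetUi32(m + lane * Z7_BLAKE2S_BLOCK_SIZE + k * 4);
}
#endif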
1036 
1037 #define V4_LOAD_UNPACK_PAIR_128(src32, i, d0, d1) \
1038 { \
1039   const __m128i v0 = LOAD_128_FROM_STRUCT((src32) + (i    ) * 4);  \
1040   const __m128i v1 = LOAD_128_FROM_STRUCT((src32) + (i + 1) * 4);  \
1041   d0 = _mm_unpacklo_epi32(v0, v1);  \
1042   d1 = _mm_unpackhi_epi32(v0, v1);  \
1043 }
1044 
1045 #define V4_UNPACK_PAIR_128(dest32, i, s0, s1) \
1046 { \
1047   STORE_128_TO_STRUCT((dest32) + i * 4     , _mm_unpacklo_epi64(s0, s1));  \
1048   STORE_128_TO_STRUCT((dest32) + i * 4 + 16, _mm_unpackhi_epi64(s0, s1));  \
1049 }
1050 
1051 #define V4_UNPACK_STATE(dest32, src32) \
1052 { \
1053   __m128i t0, t1, t2, t3, t4, t5, t6, t7; \
1054   V4_LOAD_UNPACK_PAIR_128(src32, 0, t0, t1)  \
1055   V4_LOAD_UNPACK_PAIR_128(src32, 2, t2, t3)  \
1056   V4_LOAD_UNPACK_PAIR_128(src32, 4, t4, t5)  \
1057   V4_LOAD_UNPACK_PAIR_128(src32, 6, t6, t7)  \
1058   V4_UNPACK_PAIR_128(dest32, 0, t0, t2)  \
1059   V4_UNPACK_PAIR_128(dest32, 8, t1, t3)  \
1060   V4_UNPACK_PAIR_128(dest32, 1, t4, t6)  \
1061   V4_UNPACK_PAIR_128(dest32, 9, t5, t7)  \
1062 }
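
/* V4_UNPACK_STATE is the inverse transpose for the state: it converts the word-sliced
   layout used by the FAST code (each vector holds one state word of 4 lanes) back into
   4 separate per-lane hash states; it is used by Blake2sp_Final_V128_Fast below. */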
1063 
1064 
1065 static
1066 Z7_NO_INLINE
1067 #ifdef BLAKE2S_ATTRIB_128BIT
1068        BLAKE2S_ATTRIB_128BIT
1069 #endif
1070 void
1071 Z7_FASTCALL
1072 Blake2sp_Compress2_V128_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
1073 {
1074   // PrintStates2(s_items, 8, 16);
1075   size_t pos = 0;
1076   pos /= 2;
1077   do
1078   {
1079 #if defined(Z7_BLAKE2S_USE_SSSE3) && \
1080    !defined(Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED)
1081     const __m128i  r8 = k_r8;
1082     const __m128i r16 = k_r16;
1083 #endif
1084     __m128i w[16];
1085     __m128i v[16];
1086     UInt32 *s;
1087     V4_LOAD_MSG(w, data)
1088     s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1089     {
1090       __m128i ctr = LOAD_128_FROM_STRUCT(s + 64);
1091       D_ADD_EPI64_128 (ctr, k_inc);
1092       STORE_128_TO_STRUCT(s + 64, ctr);
1093       v[12] = XOR_128 (GET_128_IV_WAY4(4), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
1094       v[13] = XOR_128 (GET_128_IV_WAY4(5), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
1095     }
1096     v[ 8] = GET_128_IV_WAY4(0);
1097     v[ 9] = GET_128_IV_WAY4(1);
1098     v[10] = GET_128_IV_WAY4(2);
1099     v[11] = GET_128_IV_WAY4(3);
1100     v[14] = GET_128_IV_WAY4(6);
1101     v[15] = GET_128_IV_WAY4(7);
1102 
1103 #define LOAD_STATE_128_FROM_STRUCT(i) \
1104       v[i] = LOAD_128_FROM_STRUCT(s + (i) * 4);
1105 
1106 #define UPDATE_STATE_128_IN_STRUCT(i) \
1107       STORE_128_TO_STRUCT(s + (i) * 4, XOR_128( \
1108       XOR_128(v[i], v[(i) + 8]), \
1109       LOAD_128_FROM_STRUCT(s + (i) * 4)));
1110 
1111     REP8_MACRO (LOAD_STATE_128_FROM_STRUCT)
1112     ROUNDS_LOOP (V4_ROUND)
1113     REP8_MACRO (UPDATE_STATE_128_IN_STRUCT)
1114 
1115     data += Z7_BLAKE2S_BLOCK_SIZE * 4;
1116     pos  += Z7_BLAKE2S_BLOCK_SIZE * 4 / 2;
1117     pos &= SUPER_BLOCK_SIZE / 2 - 1;
1118   }
1119   while (data != end);
1120 }
1121 
1122 
1123 static
1124 Z7_NO_INLINE
1125 #ifdef BLAKE2S_ATTRIB_128BIT
1126        BLAKE2S_ATTRIB_128BIT
1127 #endif
1128 void
1129 Z7_FASTCALL
1130 Blake2sp_Final_V128_Fast(UInt32 *states)
1131 {
1132   const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
1133   // printf("\nBlake2sp_Compress2_V128_Fast_Final4\n");
1134   // PrintStates2(states, 8, 16);
1135   {
1136     ptrdiff_t pos = 8 * 4;
1137     do
1138     {
1139       UInt32 *src32  = states + (size_t)(pos * 1);
1140       UInt32 *dest32 = states + (size_t)(pos * 2);
1141       V4_UNPACK_STATE(dest32, src32)
1142       pos -= 8 * 4;
1143     }
1144     while (pos >= 0);
1145   }
1146   {
1147     unsigned k;
1148     for (k = 0; k < 8; k++)
1149     {
1150       UInt32 *s = states + (size_t)k * 16;
1151       STORE_128_TO_STRUCT (STATE_T(s), ctr);
1152     }
1153   }
1154   // PrintStates2(states, 8, 16);
1155 }
1156 
1157 
1158 
1159 #ifdef Z7_BLAKE2S_USE_AVX2
1160 
1161 #define ADD_256(a, b)  _mm256_add_epi32(a, b)
1162 #define XOR_256(a, b)  _mm256_xor_si256(a, b)
1163 
1164 #if 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
1165   #define MM256_ROR_EPI32  _mm256_ror_epi32
1166   #define Z7_MM256_ROR_EPI32_IS_SUPPORTED
1167   #define LOAD_ROTATE_CONSTS_256
1168 #else
1169 #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1170 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
1171   #define LOAD_ROTATE_CONSTS_256 \
1172       const __m256i  r8 = k_r8_256; \
1173       const __m256i r16 = k_r16_256;
1174 #endif // AVX2_WAY2
1175 
1176   #define MM256_ROR_EPI32(r, c) ( \
1177       ( 8==(c)) ? _mm256_shuffle_epi8(r,r8) \
1178     : (16==(c)) ? _mm256_shuffle_epi8(r,r16) \
1179     : _mm256_or_si256( \
1180       _mm256_srli_epi32((r), (c)), \
1181       _mm256_slli_epi32((r), 32-(c))))
1182 #endif // WAY_SLOW
1183 #endif
1184 
1185 
1186 #define D_ADD_256(dest, src)  dest = ADD_256(dest, src)
1187 #define D_XOR_256(dest, src)  dest = XOR_256(dest, src)
1188 
1189 #define LOADU_256(p)     _mm256_loadu_si256((const __m256i *)(const void *)(p))
1190 
1191 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
1192 
1193 #ifdef Z7_MM256_ROR_EPI32_IS_SUPPORTED
1194 #define ROT_256_16(x) MM256_ROR_EPI32((x), 16)
1195 #define ROT_256_12(x) MM256_ROR_EPI32((x), 12)
1196 #define ROT_256_8(x)  MM256_ROR_EPI32((x),  8)
1197 #define ROT_256_7(x)  MM256_ROR_EPI32((x),  7)
1198 #else
1199 #define ROTATE8  _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, \
1200                                  12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
1201 #define ROTATE16 _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, \
1202                                  13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
1203 #define ROT_256_16(x) _mm256_shuffle_epi8((x), ROTATE16)
1204 #define ROT_256_12(x) _mm256_or_si256(_mm256_srli_epi32((x), 12), _mm256_slli_epi32((x), 20))
1205 #define ROT_256_8(x)  _mm256_shuffle_epi8((x), ROTATE8)
1206 #define ROT_256_7(x)  _mm256_or_si256(_mm256_srli_epi32((x),  7), _mm256_slli_epi32((x), 25))
1207 #endif
1208 
1209 #define D_ROT_256_7(dest)     dest = ROT_256_7(dest)
1210 #define D_ROT_256_8(dest)     dest = ROT_256_8(dest)
1211 #define D_ROT_256_12(dest)    dest = ROT_256_12(dest)
1212 #define D_ROT_256_16(dest)    dest = ROT_256_16(dest)
1213 
1214 #define LOAD_256(p)      _mm256_load_si256((const __m256i *)(const void *)(p))
1215 #ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
1216   #define STOREU_256(p, r) _mm256_storeu_si256((__m256i *)(void *)(p), r)
1217   #define LOAD_256_FROM_STRUCT(p)     LOADU_256(p)
1218   #define STORE_256_TO_STRUCT(p, r)   STOREU_256(p, r)
1219 #else
1220   // if the struct is aligned for 32 bytes
1221   #define STORE_256(p, r)  _mm256_store_si256((__m256i *)(void *)(p), r)
1222   #define LOAD_256_FROM_STRUCT(p)     LOAD_256(p)
1223   #define STORE_256_TO_STRUCT(p, r)   STORE_256(p, r)
1224 #endif
1225 
1226 #endif // Z7_BLAKE2S_USE_AVX2_FAST
1227 
1228 
1229 
1230 #ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1231 
1232 #if 0
1233     #define DIAG_PERM2(s) \
1234     { \
1235       const __m256i a = LOAD_256_FROM_STRUCT((s)      ); \
1236       const __m256i b = LOAD_256_FROM_STRUCT((s) + NSW); \
1237       STORE_256_TO_STRUCT((s      ), _mm256_permute2x128_si256(a, b, 0x20)); \
1238       STORE_256_TO_STRUCT((s + NSW), _mm256_permute2x128_si256(a, b, 0x31)); \
1239     }
1240 #else
1241     #define DIAG_PERM2(s) \
1242     { \
1243       const __m128i a = LOAD_128_FROM_STRUCT((s) + 4); \
1244       const __m128i b = LOAD_128_FROM_STRUCT((s) + NSW); \
1245       STORE_128_TO_STRUCT((s) + NSW, a); \
1246       STORE_128_TO_STRUCT((s) + 4  , b); \
1247     }
1248 #endif
1249     #define DIAG_PERM8(s_items) \
1250     { \
1251       DIAG_PERM2(s_items) \
1252       DIAG_PERM2(s_items + NSW * 2) \
1253       DIAG_PERM2(s_items + NSW * 4) \
1254       DIAG_PERM2(s_items + NSW * 6) \
1255     }
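
/* DIAG_PERM2 swaps words 4..7 of one lane state with words 0..3 of the next lane state,
   so that a single 256-bit load or store covers the same row (a or b) of two lanes;
   applying DIAG_PERM8 again after the main loop restores the normal layout. */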
1256 
1257 
1258 #define AXR256(a, b, d, shift) \
1259     D_ADD_256(a, b); \
1260     D_XOR_256(d, a); \
1261     d = MM256_ROR_EPI32(d, shift); \
1262 
1263 
1264 
1265 #ifdef Z7_BLAKE2S_USE_GATHER
1266 
1267   #define TABLE_GATHER_256_4(a0,a1,a2,a3) \
1268     a0,a1,a2,a3, a0+16,a1+16,a2+16,a3+16
1269   #define TABLE_GATHER_256( \
1270     a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
1271   { TABLE_GATHER_256_4(a0,a2,a4,a6), \
1272     TABLE_GATHER_256_4(a1,a3,a5,a7), \
1273     TABLE_GATHER_256_4(a8,a10,a12,a14), \
1274     TABLE_GATHER_256_4(a9,a11,a13,a15) }
1275 MY_ALIGN(64)
1276 static const UInt32 k_Blake2s_Sigma_gather256[BLAKE2S_NUM_ROUNDS][16 * 2] =
1277   { SIGMA_TABLE(TABLE_GATHER_256) };
1278   #define GET_SIGMA(r) \
1279     const UInt32 * const sigma = k_Blake2s_Sigma_gather256[r];
1280   #define AXR2_LOAD_INDEXES_AVX(sigma_index) \
1281     const __m256i i01234567 = LOAD_256(sigma + (sigma_index));
1282   #define SET_ROW_FROM_SIGMA_AVX(in) \
1283     _mm256_i32gather_epi32((const void *)(in), i01234567, 4)
1284   #define SIGMA_INTERLEAVE    8
1285   #define SIGMA_HALF_ROW_SIZE 16
1286 
1287 #else // !Z7_BLAKE2S_USE_GATHER
1288 
1289   #define GET_SIGMA(r) \
1290     const Byte * const sigma = k_Blake2s_Sigma_4[r];
1291   #define AXR2_LOAD_INDEXES_AVX(sigma_index) \
1292     AXR2_LOAD_INDEXES(sigma_index)
1293   #define SET_ROW_FROM_SIGMA_AVX(in) \
1294     MY_mm256_set_m128i( \
1295         SET_ROW_FROM_SIGMA_W((in) + Z7_BLAKE2S_BLOCK_SIZE), \
1296         SET_ROW_FROM_SIGMA_W(in))
1297   #define SIGMA_INTERLEAVE    1
1298   #define SIGMA_HALF_ROW_SIZE 8
1299 #endif // !Z7_BLAKE2S_USE_GATHER
1300 
1301 
1302 #define ROTATE_WORDS_TO_RIGHT_256(a, n) \
1303     a = _mm256_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
1304 
1305 
1306 
1307 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
1308 
1309 #define AXR2_A(sigma_index, shift1, shift2) \
1310     AXR2_LOAD_INDEXES_AVX(sigma_index) \
1311     D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
1312     AXR256(a0, b0, d0, shift1) \
1313     AXR256(c0, d0, b0, shift2) \
1314 
1315 #define AXR4_A(sigma_index) \
1316     { AXR2_A(sigma_index, 16, 12) } \
1317     { AXR2_A(sigma_index + SIGMA_INTERLEAVE, 8, 7) }
1318 
1319 #define EE1(r) \
1320     { GET_SIGMA(r) \
1321       AXR4_A(0) \
1322         ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
1323         ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1324         ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
1325       AXR4_A(SIGMA_HALF_ROW_SIZE) \
1326         ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
1327         ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1328         ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
1329     }
1330 
1331 static
1332 Z7_NO_INLINE
1333 #ifdef BLAKE2S_ATTRIB_AVX2
1334        BLAKE2S_ATTRIB_AVX2
1335 #endif
1336 void
1337 Z7_FASTCALL
1338 Blake2sp_Compress2_AVX2_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
1339 {
1340   size_t pos = 0;
1341   end -= Z7_BLAKE2S_BLOCK_SIZE;
1342 
1343   if (data != end)
1344   {
1345     LOAD_ROTATE_CONSTS_256
1346     DIAG_PERM8(s_items)
1347     do
1348     {
1349       UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1350       __m256i a0, b0, c0, d0;
1351       {
1352         const __m128i inc = k_inc;
1353         __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
1354         __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
1355         D_ADD_EPI64_128(d0_128, inc);
1356         D_ADD_EPI64_128(d1_128, inc);
1357         STORE_128_TO_STRUCT (STATE_T(s      ), d0_128);
1358         STORE_128_TO_STRUCT (STATE_T(s + NSW), d1_128);
1359         d0 = MY_mm256_set_m128i(d1_128, d0_128);
1360         D_XOR_256(d0, k_iv4_256);
1361       }
1362       c0 = SET_FROM_128(k_iv0_128);
1363       a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
1364       b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
1365 
1366       ROUNDS_LOOP (EE1)
1367 
1368       D_XOR_256(a0, c0);
1369       D_XOR_256(b0, d0);
1370 
1371       D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
1372       D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
1373 
1374       STORE_256_TO_STRUCT(s + NSW * 0, a0);
1375       STORE_256_TO_STRUCT(s + NSW * 1, b0);
1376 
1377       data += Z7_BLAKE2S_BLOCK_SIZE * 2;
1378       pos  += Z7_BLAKE2S_BLOCK_SIZE * 2;
1379       pos &= SUPER_BLOCK_MASK;
1380     }
1381     while (data < end);
1382     DIAG_PERM8(s_items)
1383     if (data != end)
1384       return;
1385   }
1386   {
1387     UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1388     Z7_BLAKE2S_CompressSingleBlock(s, data);
1389   }
1390 }
1391 
1392 #endif // Z7_BLAKE2S_USE_AVX2_WAY2
1393 
1394 
1395 
1396 #ifdef Z7_BLAKE2S_USE_AVX2_WAY4
1397 
1398 #define AXR2_X(sigma_index, shift1, shift2) \
1399     AXR2_LOAD_INDEXES_AVX(sigma_index) \
1400     D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
1401     D_ADD_256( a1, SET_ROW_FROM_SIGMA_AVX((data) + Z7_BLAKE2S_BLOCK_SIZE * 2)); \
1402     AXR256(a0, b0, d0, shift1) \
1403     AXR256(a1, b1, d1, shift1) \
1404     AXR256(c0, d0, b0, shift2) \
1405     AXR256(c1, d1, b1, shift2) \
1406 
1407 #define AXR4_X(sigma_index) \
1408     { AXR2_X(sigma_index, 16, 12) } \
1409     { AXR2_X(sigma_index + SIGMA_INTERLEAVE, 8, 7) }
1410 
1411 #define EE2(r) \
1412     { GET_SIGMA(r) \
1413       AXR4_X(0) \
1414         ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
1415         ROTATE_WORDS_TO_RIGHT_256(b1, 1) \
1416         ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1417         ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
1418         ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
1419         ROTATE_WORDS_TO_RIGHT_256(d1, 3) \
1420       AXR4_X(SIGMA_HALF_ROW_SIZE) \
1421         ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
1422         ROTATE_WORDS_TO_RIGHT_256(b1, 3) \
1423         ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
1424         ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
1425         ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
1426         ROTATE_WORDS_TO_RIGHT_256(d1, 1) \
1427     }
1428 
1429 static
1430 Z7_NO_INLINE
1431 #ifdef BLAKE2S_ATTRIB_AVX2
1432        BLAKE2S_ATTRIB_AVX2
1433 #endif
1434 void
1435 Z7_FASTCALL
1436 Blake2sp_Compress2_AVX2_Way4(UInt32 *s_items, const Byte *data, const Byte *end)
1437 {
1438   size_t pos = 0;
1439 
1440   if ((size_t)(end - data) >= Z7_BLAKE2S_BLOCK_SIZE * 4)
1441   {
1442 #ifndef Z7_MM256_ROR_EPI32_IS_SUPPORTED
1443     const __m256i  r8 = k_r8_256;
1444     const __m256i r16 = k_r16_256;
1445 #endif
1446     end -= Z7_BLAKE2S_BLOCK_SIZE * 3;
1447     DIAG_PERM8(s_items)
1448     do
1449     {
1450       UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1451       __m256i a0, b0, c0, d0;
1452       __m256i a1, b1, c1, d1;
1453       {
1454         const __m128i inc = k_inc;
1455         __m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
1456         __m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
1457         __m128i d2_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 2));
1458         __m128i d3_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 3));
1459         D_ADD_EPI64_128(d0_128, inc);
1460         D_ADD_EPI64_128(d1_128, inc);
1461         D_ADD_EPI64_128(d2_128, inc);
1462         D_ADD_EPI64_128(d3_128, inc);
1463         STORE_128_TO_STRUCT (STATE_T(s          ), d0_128);
1464         STORE_128_TO_STRUCT (STATE_T(s + NSW * 1), d1_128);
1465         STORE_128_TO_STRUCT (STATE_T(s + NSW * 2), d2_128);
1466         STORE_128_TO_STRUCT (STATE_T(s + NSW * 3), d3_128);
1467         d0 = MY_mm256_set_m128i(d1_128, d0_128);
1468         d1 = MY_mm256_set_m128i(d3_128, d2_128);
1469         D_XOR_256(d0, k_iv4_256);
1470         D_XOR_256(d1, k_iv4_256);
1471       }
1472       c1 = c0 = SET_FROM_128(k_iv0_128);
1473       a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
1474       b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
1475       a1 = LOAD_256_FROM_STRUCT(s + NSW * 2);
1476       b1 = LOAD_256_FROM_STRUCT(s + NSW * 3);
1477 
1478       ROUNDS_LOOP (EE2)
1479 
1480       D_XOR_256(a0, c0);
1481       D_XOR_256(b0, d0);
1482       D_XOR_256(a1, c1);
1483       D_XOR_256(b1, d1);
1484 
1485       D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
1486       D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
1487       D_XOR_256(a1, LOAD_256_FROM_STRUCT(s + NSW * 2));
1488       D_XOR_256(b1, LOAD_256_FROM_STRUCT(s + NSW * 3));
1489 
1490       STORE_256_TO_STRUCT(s + NSW * 0, a0);
1491       STORE_256_TO_STRUCT(s + NSW * 1, b0);
1492       STORE_256_TO_STRUCT(s + NSW * 2, a1);
1493       STORE_256_TO_STRUCT(s + NSW * 3, b1);
1494 
1495       data += Z7_BLAKE2S_BLOCK_SIZE * 4;
1496       pos  += Z7_BLAKE2S_BLOCK_SIZE * 4;
1497       pos &= SUPER_BLOCK_MASK;
1498     }
1499     while (data < end);
1500     DIAG_PERM8(s_items)
1501     end += Z7_BLAKE2S_BLOCK_SIZE * 3;
1502   }
1503   if (data == end)
1504     return;
1505   // Z7_BLAKE2S_Compress2_V128(s_items, data, end, pos);
1506   do
1507   {
1508     UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
1509     Z7_BLAKE2S_CompressSingleBlock(s, data);
1510     data += Z7_BLAKE2S_BLOCK_SIZE;
1511     pos  += Z7_BLAKE2S_BLOCK_SIZE;
1512     pos &= SUPER_BLOCK_MASK;
1513   }
1514   while (data != end);
1515 }
1516 
1517 #endif // Z7_BLAKE2S_USE_AVX2_WAY4
1518 #endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
1519 
1520 
1521 // ---------------------------------------------------------
1522 
1523 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
1524 
1525 #define OP256_L(a, i)   D_ADD_256 (V(a, 0), \
1526     LOAD_256((const Byte *)(w) + GET_SIGMA_VAL_256(2*(a)+(i))));
1527 
1528 #define OP256_0(a)   OP256_L(a, 0)
1529 #define OP256_7(a)   OP256_L(a, 1)
1530 
1531 #define OP256_1(a)   D_ADD_256 (V(a, 0), V(a, 1));
1532 #define OP256_2(a)   D_XOR_256 (V(a, 3), V(a, 0));
1533 #define OP256_4(a)   D_ADD_256 (V(a, 2), V(a, 3));
1534 #define OP256_5(a)   D_XOR_256 (V(a, 1), V(a, 2));
1535 
1536 #define OP256_3(a)   D_ROT_256_16 (V(a, 3));
1537 #define OP256_6(a)   D_ROT_256_12 (V(a, 1));
1538 #define OP256_8(a)   D_ROT_256_8  (V(a, 3));
1539 #define OP256_9(a)   D_ROT_256_7  (V(a, 1));
1540 
1541 
1542 #if 0 || 1 && defined(MY_CPU_X86)
1543 
1544 #define V8_G(a) \
1545   OP256_0 (a) \
1546   OP256_1 (a) \
1547   OP256_2 (a) \
1548   OP256_3 (a) \
1549   OP256_4 (a) \
1550   OP256_5 (a) \
1551   OP256_6 (a) \
1552   OP256_7 (a) \
1553   OP256_1 (a) \
1554   OP256_2 (a) \
1555   OP256_8 (a) \
1556   OP256_4 (a) \
1557   OP256_5 (a) \
1558   OP256_9 (a) \
1559 
1560 #define V8R { \
1561   V8_G (0); \
1562   V8_G (1); \
1563   V8_G (2); \
1564   V8_G (3); \
1565   V8_G (4); \
1566   V8_G (5); \
1567   V8_G (6); \
1568   V8_G (7); \
1569 }
1570 
1571 #else
1572 
1573 #define OP256_INTER_4(op, a,b,c,d) \
1574   op (a) \
1575   op (b) \
1576   op (c) \
1577   op (d) \
1578 
1579 #define V8_G(a,b,c,d) \
1580   OP256_INTER_4 (OP256_0, a,b,c,d) \
1581   OP256_INTER_4 (OP256_1, a,b,c,d) \
1582   OP256_INTER_4 (OP256_2, a,b,c,d) \
1583   OP256_INTER_4 (OP256_3, a,b,c,d) \
1584   OP256_INTER_4 (OP256_4, a,b,c,d) \
1585   OP256_INTER_4 (OP256_5, a,b,c,d) \
1586   OP256_INTER_4 (OP256_6, a,b,c,d) \
1587   OP256_INTER_4 (OP256_7, a,b,c,d) \
1588   OP256_INTER_4 (OP256_1, a,b,c,d) \
1589   OP256_INTER_4 (OP256_2, a,b,c,d) \
1590   OP256_INTER_4 (OP256_8, a,b,c,d) \
1591   OP256_INTER_4 (OP256_4, a,b,c,d) \
1592   OP256_INTER_4 (OP256_5, a,b,c,d) \
1593   OP256_INTER_4 (OP256_9, a,b,c,d) \
1594 
1595 #define V8R { \
1596   V8_G (0, 1, 2, 3) \
1597   V8_G (4, 5, 6, 7) \
1598 }
1599 #endif
1600 
1601 #define V8_ROUND(r)  { GET_SIGMA_PTR_256(r); V8R }
1602 
1603 
1604 // for debug:
1605 // #define Z7_BLAKE2S_PERMUTE_WITH_GATHER
1606 #if defined(Z7_BLAKE2S_PERMUTE_WITH_GATHER)
1607 // gather instruction is slow.
1608 #define V8_LOAD_MSG(w, m) \
1609 { \
1610   unsigned i; \
1611   for (i = 0; i < 16; ++i) { \
1612     w[i] = _mm256_i32gather_epi32( \
1613       (const void *)((m) + i * sizeof(UInt32)),\
1614       _mm256_set_epi32(0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00), \
1615       sizeof(UInt32)); \
1616   } \
1617 }
1618 #else // !Z7_BLAKE2S_PERMUTE_WITH_GATHER
1619 
1620 #define V8_LOAD_MSG_2(w, a0, a1) \
1621 { \
1622   (w)[0] = _mm256_permute2x128_si256(a0, a1, 0x20);  \
1623   (w)[4] = _mm256_permute2x128_si256(a0, a1, 0x31);  \
1624 }
1625 
1626 #define V8_LOAD_MSG_4(w, z0, z1, z2, z3) \
1627 { \
1628   __m256i s0, s1, s2, s3;  \
1629   s0 = _mm256_unpacklo_epi64(z0, z1);  \
1630   s1 = _mm256_unpackhi_epi64(z0, z1);  \
1631   s2 = _mm256_unpacklo_epi64(z2, z3);  \
1632   s3 = _mm256_unpackhi_epi64(z2, z3);  \
1633   V8_LOAD_MSG_2((w) + 0, s0, s2)   \
1634   V8_LOAD_MSG_2((w) + 1, s1, s3)   \
1635 }
1636 
1637 #define V8_LOAD_MSG_0(t0, t1, m) \
1638 { \
1639   __m256i m0, m1;  \
1640   m0 = LOADU_256(m);  \
1641   m1 = LOADU_256((m) + 2 * 32);  \
1642   t0 = _mm256_unpacklo_epi32(m0, m1);  \
1643   t1 = _mm256_unpackhi_epi32(m0, m1);  \
1644 }
1645 
1646 #define V8_LOAD_MSG_8(w, m) \
1647 { \
1648   __m256i t0, t1, t2, t3, t4, t5, t6, t7;  \
1649   V8_LOAD_MSG_0(t0, t4, (m) + 0 * 4 * 32)  \
1650   V8_LOAD_MSG_0(t1, t5, (m) + 1 * 4 * 32)  \
1651   V8_LOAD_MSG_0(t2, t6, (m) + 2 * 4 * 32)  \
1652   V8_LOAD_MSG_0(t3, t7, (m) + 3 * 4 * 32)  \
1653   V8_LOAD_MSG_4((w)    , t0, t1, t2, t3)   \
1654   V8_LOAD_MSG_4((w) + 2, t4, t5, t6, t7)   \
1655 }
1656 
1657 #define V8_LOAD_MSG(w, m) \
1658 { \
1659   V8_LOAD_MSG_8(w, m)  \
1660   V8_LOAD_MSG_8((w) + 8, (m) + 32)  \
1661 }
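/* Note: the unpack/permute sequence above is an 8-lane transpose of 8
   consecutive 64-byte message blocks: after V8_LOAD_MSG, w[i] holds message
   word i of all 8 lanes, one lane per 32-bit element, which is the layout
   consumed by the OP256_L / GET_SIGMA_VAL_256 loads above. */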
1662 
1663 #endif // !Z7_BLAKE2S_PERMUTE_WITH_GATHER
1664 
1665 
1666 #define V8_PERM_PAIR_STORE(u, a0, a2) \
1667 { \
1668   STORE_256_TO_STRUCT((u),     _mm256_permute2x128_si256(a0, a2, 0x20));  \
1669   STORE_256_TO_STRUCT((u) + 8, _mm256_permute2x128_si256(a0, a2, 0x31));  \
1670 }
1671 
1672 #define V8_UNPACK_STORE_4(u, z0, z1, z2, z3) \
1673 { \
1674   __m256i s0, s1, s2, s3;  \
1675   s0 = _mm256_unpacklo_epi64(z0, z1);  \
1676   s1 = _mm256_unpackhi_epi64(z0, z1);  \
1677   s2 = _mm256_unpacklo_epi64(z2, z3);  \
1678   s3 = _mm256_unpackhi_epi64(z2, z3);  \
1679   V8_PERM_PAIR_STORE(u + 0, s0, s2)  \
1680   V8_PERM_PAIR_STORE(u + 2, s1, s3)  \
1681 }
1682 
1683 #define V8_UNPACK_STORE_0(src32, d0, d1) \
1684 { \
1685   const __m256i v0 = LOAD_256_FROM_STRUCT ((src32)    );  \
1686   const __m256i v1 = LOAD_256_FROM_STRUCT ((src32) + 8);  \
1687   d0 = _mm256_unpacklo_epi32(v0, v1);  \
1688   d1 = _mm256_unpackhi_epi32(v0, v1);  \
1689 }
1690 
1691 #define V8_UNPACK_STATE(dest32, src32) \
1692 { \
1693   __m256i t0, t1, t2, t3, t4, t5, t6, t7;  \
1694   V8_UNPACK_STORE_0 ((src32) + 16 * 0, t0, t4)  \
1695   V8_UNPACK_STORE_0 ((src32) + 16 * 1, t1, t5)  \
1696   V8_UNPACK_STORE_0 ((src32) + 16 * 2, t2, t6)  \
1697   V8_UNPACK_STORE_0 ((src32) + 16 * 3, t3, t7)  \
1698   V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32)    , t0, t1, t2, t3)  \
1699   V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) + 4, t4, t5, t6, t7)  \
1700 }
1701 
1702 
1703 
1704 #define V8_LOAD_STATE_256_FROM_STRUCT(i) \
1705       v[i] = LOAD_256_FROM_STRUCT(s_items + (i) * 8);
1706 
1707 #if 0 || 0 && defined(MY_CPU_X86)
1708 #define Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1709 #endif
1710 
1711 #ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1712 // this branch doesn't use the (iv) array,
1713 // so register pressure can be lower.
1714 // it can be faster in some cases.
1715 #define V8_LOAD_STATE_256(i)  V8_LOAD_STATE_256_FROM_STRUCT(i)
1716 #define V8_UPDATE_STATE_256(i) \
1717 { \
1718     STORE_256_TO_STRUCT(s_items + (i) * 8, XOR_256( \
1719     XOR_256(v[i], v[(i) + 8]), \
1720     LOAD_256_FROM_STRUCT(s_items + (i) * 8))); \
1721 }
1722 #else
1723 // this branch uses more variables (the (iv) registers);
1724 // it's better for gcc.
1725 // it may also become the better choice if register pressure gets lower (avx512).
1726 #define V8_LOAD_STATE_256(i)   { iv[i] = v[i]; }
1727 #define V8_UPDATE_STATE_256(i) { v[i] = XOR_256(XOR_256(v[i], v[i + 8]), iv[i]); }
1728 #define V8_STORE_STATE_256(i)  { STORE_256_TO_STRUCT(s_items + (i) * 8, v[i]); }
1729 #endif
1730 
1731 
1732 #if 0
1733   // use loading constants from memory
1734   #define KK8(n)  KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n)
1735 MY_ALIGN(64)
1736 static const UInt32 k_Blake2s_IV_WAY8[]=
1737 {
1738   KK8(0), KK8(1), KK8(2), KK8(3), KK8(4), KK8(5), KK8(6), KK8(7)
1739 };
1740   #define GET_256_IV_WAY8(i)  LOAD_256(k_Blake2s_IV_WAY8 + 8 * (i))
1741 #else
1742   // use constant generation:
1743   #define GET_256_IV_WAY8(i)  _mm256_set1_epi32((Int32)KIV(i))
1744 #endif
1745 
1746 
1747 static
1748 Z7_NO_INLINE
1749 #ifdef BLAKE2S_ATTRIB_AVX2
1750        BLAKE2S_ATTRIB_AVX2
1751 #endif
1752 void
1753 Z7_FASTCALL
1754 Blake2sp_Compress2_AVX2_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
1755 {
1756 #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1757   __m256i v[16];
1758 #endif
1759 
1760   // PrintStates2(s_items, 8, 16);
1761 
1762 #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1763   REP8_MACRO (V8_LOAD_STATE_256_FROM_STRUCT)
1764 #endif
1765 
1766   do
1767   {
1768     __m256i w[16];
1769 #ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1770     __m256i v[16];
1771 #else
1772     __m256i iv[8];
1773 #endif
1774     V8_LOAD_MSG(w, data)
1775     {
1776       // we load/store ctr inside the loop to reduce register pressure:
1777 #if 1 || 1 && defined(MY_CPU_X86)
1778       const __m256i ctr = _mm256_add_epi64(
1779           LOAD_256_FROM_STRUCT(s_items + 64),
1780           _mm256_set_epi32(
1781               0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE,
1782               0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE));
1783       STORE_256_TO_STRUCT(s_items + 64, ctr);
1784 #else
1785       const UInt64 ctr64 = *(const UInt64 *)(const void *)(s_items + 64)
1786           + Z7_BLAKE2S_BLOCK_SIZE;
1787       const __m256i ctr = _mm256_set_epi64x(0, (Int64)ctr64, 0, (Int64)ctr64);
1788       *(UInt64 *)(void *)(s_items + 64) = ctr64;
1789 #endif
1790       v[12] = XOR_256 (GET_256_IV_WAY8(4), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
1791       v[13] = XOR_256 (GET_256_IV_WAY8(5), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
1792     }
1793     v[ 8] = GET_256_IV_WAY8(0);
1794     v[ 9] = GET_256_IV_WAY8(1);
1795     v[10] = GET_256_IV_WAY8(2);
1796     v[11] = GET_256_IV_WAY8(3);
1797     v[14] = GET_256_IV_WAY8(6);
1798     v[15] = GET_256_IV_WAY8(7);
1799 
1800     REP8_MACRO (V8_LOAD_STATE_256)
1801     ROUNDS_LOOP (V8_ROUND)
1802     REP8_MACRO (V8_UPDATE_STATE_256)
1803     data += SUPER_BLOCK_SIZE;
1804   }
1805   while (data != end);
1806 
1807 #ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
1808   REP8_MACRO (V8_STORE_STATE_256)
1809 #endif
1810 }
1811 
1812 
1813 static
1814 Z7_NO_INLINE
1815 #ifdef BLAKE2S_ATTRIB_AVX2
1816        BLAKE2S_ATTRIB_AVX2
1817 #endif
1818 void
1819 Z7_FASTCALL
1820 Blake2sp_Final_AVX2_Fast(UInt32 *states)
1821 {
1822   const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
1823   // PrintStates2(states, 8, 16);
1824   V8_UNPACK_STATE(states, states)
1825   // PrintStates2(states, 8, 16);
1826   {
1827     unsigned k;
1828     for (k = 0; k < 8; k++)
1829     {
1830       UInt32 *s = states + (size_t)k * 16;
1831       STORE_128_TO_STRUCT (STATE_T(s), ctr);
1832     }
1833   }
1834   // PrintStates2(states, 8, 16);
1835   // printf("\nafter V8_UNPACK_STATE \n");
1836 }
1837 
1838 #endif // Z7_BLAKE2S_USE_AVX2_FAST
1839 #endif // avx2
1840 #endif // vector
1841 
1842 
1843 /*
1844 #define Blake2s_Increment_Counter(s, inc) \
1845   { STATE_T(s)[0] += (inc);  STATE_T(s)[1] += (STATE_T(s)[0] < (inc)); }
1846 #define Blake2s_Increment_Counter_Small(s, inc) \
1847   { STATE_T(s)[0] += (inc); }
1848 */
1849 
1850 #define Blake2s_Set_LastBlock(s) \
1851   { STATE_F(s)[0] = BLAKE2S_FINAL_FLAG; /* STATE_F(s)[1] = p->u.header.lastNode_f1; */ }
1852 
1853 
1854 #if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL >= 1600
1855   // good for vs2022
1856   #define LOOP_8(mac) { unsigned kkk; for (kkk = 0; kkk < 8; kkk++) mac(kkk) }
1857 #else
1858    // good with Z7_BLAKE2S_UNROLL for GCC9 (arm*/x86*) and MSC_VER_1400-x64.
1859   #define LOOP_8(mac) { REP8_MACRO(mac) }
1860 #endif
1861 
1862 
1863 static
1864 Z7_FORCE_INLINE
1865 // Z7_NO_INLINE
1866 void
1867 Z7_FASTCALL
1868 Blake2s_Compress(UInt32 *s, const Byte *input)
1869 {
1870   UInt32 m[16];
1871   UInt32 v[16];
1872   {
1873     unsigned i;
1874     for (i = 0; i < 16; i++)
1875       m[i] = GetUi32(input + i * 4);
1876   }
1877 
1878 #define INIT_v_FROM_s(i)  v[i] = s[i];
1879 
1880   LOOP_8(INIT_v_FROM_s)
1881 
1882   // Blake2s_Increment_Counter(s, Z7_BLAKE2S_BLOCK_SIZE)
1883   {
1884     const UInt32 t0 = STATE_T(s)[0] + Z7_BLAKE2S_BLOCK_SIZE;
1885     const UInt32 t1 = STATE_T(s)[1] + (t0 < Z7_BLAKE2S_BLOCK_SIZE);
1886     STATE_T(s)[0] = t0;
1887     STATE_T(s)[1] = t1;
1888     v[12] = t0 ^ KIV(4);
1889     v[13] = t1 ^ KIV(5);
1890   }
1891   // v[12] = STATE_T(s)[0] ^ KIV(4);
1892   // v[13] = STATE_T(s)[1] ^ KIV(5);
1893   v[14] = STATE_F(s)[0] ^ KIV(6);
1894   v[15] = STATE_F(s)[1] ^ KIV(7);
1895 
1896   v[ 8] = KIV(0);
1897   v[ 9] = KIV(1);
1898   v[10] = KIV(2);
1899   v[11] = KIV(3);
1900   // PrintStates2((const UInt32 *)v, 1, 16);
1901 
1902   #define ADD_SIGMA(a, index)  V(a, 0) += *(const UInt32 *)GET_SIGMA_PTR(m, sigma[index]);
1903   #define ADD32M(dest, src, a)    V(a, dest) += V(a, src);
1904   #define XOR32M(dest, src, a)    V(a, dest) ^= V(a, src);
1905   #define RTR32M(dest, shift, a)  V(a, dest) = rotrFixed(V(a, dest), shift);
1906 
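  /*
    For reference: each G(a) below is the standard BLAKE2s mixing function
    applied to the four 32-bit state words selected by V(a, 0..3).
    A plain-C sketch of one G application (illustration only, not compiled;
    "rotr32" stands for the rotrFixed used above, and "sigma" for the round
    permutation that the code reads through GET_SIGMA_PTR / k_Blake2s_Sigma_4):

      a += m[sigma[2 * i + 0]];  a += b;  d ^= a;  d = rotr32(d, 16);
      c += d;                             b ^= c;  b = rotr32(b, 12);
      a += m[sigma[2 * i + 1]];  a += b;  d ^= a;  d = rotr32(d,  8);
      c += d;                             b ^= c;  b = rotr32(b,  7);

    The interleave-1/2/4 branches below perform exactly these steps, but batch
    each step across 1, 2 or 4 G applications to expose more independent work
    to the CPU.
  */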
1907 // big interleaving can provide a big performance gain, if scheduler queues are small.
1908 #if 0 || 1 && defined(MY_CPU_X86)
1909   // interleave-1: for a small number of registers (x86-32bit)
1910   #define G2(index, a, x, y) \
1911     ADD_SIGMA (a, (index) + 2 * 0) \
1912     ADD32M (0, 1, a) \
1913     XOR32M (3, 0, a) \
1914     RTR32M (3, x, a) \
1915     ADD32M (2, 3, a) \
1916     XOR32M (1, 2, a) \
1917     RTR32M (1, y, a) \
1918 
1919   #define G(a) \
1920     G2(a * 2    , a, 16, 12) \
1921     G2(a * 2 + 1, a,  8,  7) \
1922 
1923   #define R2 \
1924     G(0) \
1925     G(1) \
1926     G(2) \
1927     G(3) \
1928     G(4) \
1929     G(5) \
1930     G(6) \
1931     G(7) \
1932 
1933 #elif 0 || 1 && defined(MY_CPU_X86_OR_AMD64)
1934   // interleave-2: good if the number of registers is not big (x86-64).
1935 
1936   #define REP2(mac, dest, src, a, b) \
1937       mac(dest, src, a) \
1938       mac(dest, src, b)
1939 
1940   #define G2(index, a, b, x, y) \
1941     ADD_SIGMA (a, (index) + 2 * 0) \
1942     ADD_SIGMA (b, (index) + 2 * 1) \
1943     REP2 (ADD32M, 0, 1, a, b) \
1944     REP2 (XOR32M, 3, 0, a, b) \
1945     REP2 (RTR32M, 3, x, a, b) \
1946     REP2 (ADD32M, 2, 3, a, b) \
1947     REP2 (XOR32M, 1, 2, a, b) \
1948     REP2 (RTR32M, 1, y, a, b) \
1949 
1950   #define G(a, b) \
1951     G2(a * 2    , a, b, 16, 12) \
1952     G2(a * 2 + 1, a, b,  8,  7) \
1953 
1954   #define R2 \
1955     G(0, 1) \
1956     G(2, 3) \
1957     G(4, 5) \
1958     G(6, 7) \
1959 
1960 #else
1961   // interleave-4:
1962   // it creates high register pressure for x86/x64,
1963   // and MSVC compilers for x86/x64 are slow for this branch,
1964   // but if we have a big number of registers, this branch can be faster.
1965 
1966   #define REP4(mac, dest, src, a, b, c, d) \
1967       mac(dest, src, a) \
1968       mac(dest, src, b) \
1969       mac(dest, src, c) \
1970       mac(dest, src, d)
1971 
1972   #define G2(index, a, b, c, d, x, y) \
1973     ADD_SIGMA (a, (index) + 2 * 0) \
1974     ADD_SIGMA (b, (index) + 2 * 1) \
1975     ADD_SIGMA (c, (index) + 2 * 2) \
1976     ADD_SIGMA (d, (index) + 2 * 3) \
1977     REP4 (ADD32M, 0, 1, a, b, c, d) \
1978     REP4 (XOR32M, 3, 0, a, b, c, d) \
1979     REP4 (RTR32M, 3, x, a, b, c, d) \
1980     REP4 (ADD32M, 2, 3, a, b, c, d) \
1981     REP4 (XOR32M, 1, 2, a, b, c, d) \
1982     REP4 (RTR32M, 1, y, a, b, c, d) \
1983 
1984   #define G(a, b, c, d) \
1985     G2(a * 2    , a, b, c, d, 16, 12) \
1986     G2(a * 2 + 1, a, b, c, d,  8,  7) \
1987 
1988   #define R2 \
1989     G(0, 1, 2, 3) \
1990     G(4, 5, 6, 7) \
1991 
1992 #endif
1993 
1994   #define R(r)  { const Byte *sigma = k_Blake2s_Sigma_4[r];  R2 }
1995 
1996   // Z7_BLAKE2S_UNROLL gives 5-6 KB larger code, but faster:
1997   //   20-40% faster for (x86/x64) VC2010+/GCC/CLANG.
1998   //   30-60% faster for (arm64-arm32) GCC.
1999   //    5-11% faster for (arm64) CLANG-MAC.
2000   // so Z7_BLAKE2S_UNROLL is a good optimization, if there is no vector branch.
2001   // But if there is a vector branch (for x86*), this scalar code will mostly be unused.
2002   // So we prefer smaller code (without unrolling) in that case (x86*).
2003 #if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
2004   #define Z7_BLAKE2S_UNROLL
2005 #endif
2006 
2007 #ifdef Z7_BLAKE2S_UNROLL
2008     ROUNDS_LOOP_UNROLLED (R)
2009 #else
2010     ROUNDS_LOOP (R)
2011 #endif
2012 
2013   #undef G
2014   #undef G2
2015   #undef R
2016   #undef R2
2017 
2018   // printf("\n v after: \n");
2019   // PrintStates2((const UInt32 *)v, 1, 16);
2020 #define XOR_s_PAIR_v(i)  s[i] ^= v[i] ^ v[i + 8];
2021 
2022   LOOP_8(XOR_s_PAIR_v)
2023   // printf("\n s after:\n");
2024   // PrintStates2((const UInt32 *)s, 1, 16);
2025 }
2026 
2027 
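// Scalar fallback for the whole buffer: consecutive 64-byte blocks are fed
// round-robin to the Z7_BLAKE2SP_PARALLEL_DEGREE lane states (pos wraps via
// SUPER_BLOCK_MASK); the vector branches above compute the same block-to-lane
// mapping, just several lanes at a time.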
2028 static
2029 Z7_NO_INLINE
2030 void
2031 Z7_FASTCALL
2032 Blake2sp_Compress2(UInt32 *s_items, const Byte *data, const Byte *end)
2033 {
2034   size_t pos = 0;
2035   // PrintStates2(s_items, 8, 16);
2036   do
2037   {
2038     UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
2039     Blake2s_Compress(s, data);
2040     data += Z7_BLAKE2S_BLOCK_SIZE;
2041     pos  += Z7_BLAKE2S_BLOCK_SIZE;
2042     pos &= SUPER_BLOCK_MASK;
2043   }
2044   while (data != end);
2045 }
2046 
2047 
2048 #ifdef Z7_BLAKE2S_USE_VECTORS
2049 
2050 static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast   = Blake2sp_Compress2;
2051 static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = Blake2sp_Compress2;
2052 static Z7_BLAKE2SP_FUNC_INIT     g_Z7_BLAKE2SP_FUNC_INIT_Init;
2053 static Z7_BLAKE2SP_FUNC_INIT     g_Z7_BLAKE2SP_FUNC_INIT_Final;
2054 static unsigned g_z7_Blake2sp_SupportedFlags;
2055 
2056   #define Z7_BLAKE2SP_Compress_Fast(p)   (p)->u.header.func_Compress_Fast
2057   #define Z7_BLAKE2SP_Compress_Single(p) (p)->u.header.func_Compress_Single
2058 #else
2059   #define Z7_BLAKE2SP_Compress_Fast(p)   Blake2sp_Compress2
2060   #define Z7_BLAKE2SP_Compress_Single(p) Blake2sp_Compress2
2061 #endif // Z7_BLAKE2S_USE_VECTORS
2062 
2063 
2064 #if 1 && defined(MY_CPU_LE)
2065     #define GET_DIGEST(_s, _digest) \
2066       { memcpy(_digest, _s, Z7_BLAKE2S_DIGEST_SIZE); }
2067 #else
2068     #define GET_DIGEST(_s, _digest) \
2069     { unsigned _i; for (_i = 0; _i < 8; _i++) \
2070         { SetUi32((_digest) + 4 * _i, (_s)[_i]) } \
2071     }
2072 #endif
2073 
2074 
2075 /* ---------- BLAKE2s ---------- */
2076 /*
2077 // we need to xor CBlake2s::h[i] with the input parameter block after Blake2s_Init0()
2078 typedef struct
2079 {
2080   Byte  digest_length;
2081   Byte  key_length;
2082   Byte  fanout;               // = 1 : in sequential mode
2083   Byte  depth;                // = 1 : in sequential mode
2084   UInt32 leaf_length;
2085   Byte  node_offset[6];       // 0 for the first, leftmost, leaf, or in sequential mode
2086   Byte  node_depth;           // 0 for the leaves, or in sequential mode
2087   Byte  inner_length;         // [0, 32], 0 in sequential mode
2088   Byte  salt[BLAKE2S_SALTBYTES];
2089   Byte  personal[BLAKE2S_PERSONALBYTES];
2090 } CBlake2sParam;
2091 */
2092 
2093 #define k_Blake2sp_IV_0  \
2094     (KIV(0) ^ (Z7_BLAKE2S_DIGEST_SIZE | ((UInt32)Z7_BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24)))
2095 #define k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth)  \
2096     (KIV(3) ^ ((UInt32)(node_depth) << 16) ^ ((UInt32)Z7_BLAKE2S_DIGEST_SIZE << 24))
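/* Note on the two constants above: they fold the relevant fields of the
   (commented) CBlake2sParam block directly into the IV words, instead of
   building a 32-byte parameter block and xoring it word by word.
   Parameter word 0 packs digest_length | (key_length << 8) | (fanout << 16)
   | (depth << 24); here key_length = 0, fanout = Z7_BLAKE2SP_PARALLEL_DEGREE
   and depth = 2, which gives k_Blake2sp_IV_0.  Parameter word 3 packs the
   high bytes of node_offset, node_depth << 16 and inner_length << 24
   (inner_length = Z7_BLAKE2S_DIGEST_SIZE for BLAKE2sp), which gives
   k_Blake2sp_IV_3_FROM_NODE_DEPTH.  Blake2sp_Init_Spec() below xors the low
   32 bits of node_offset into s[2] (parameter word 2). */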
2097 
2098 Z7_FORCE_INLINE
2099 static void Blake2sp_Init_Spec(UInt32 *s, unsigned node_offset, unsigned node_depth)
2100 {
2101   s[0] = k_Blake2sp_IV_0;
2102   s[1] = KIV(1);
2103   s[2] = KIV(2) ^ (UInt32)node_offset;
2104   s[3] = k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth);
2105   s[4] = KIV(4);
2106   s[5] = KIV(5);
2107   s[6] = KIV(6);
2108   s[7] = KIV(7);
2109 
2110   STATE_T(s)[0] = 0;
2111   STATE_T(s)[1] = 0;
2112   STATE_F(s)[0] = 0;
2113   STATE_F(s)[1] = 0;
2114 }
2115 
2116 
2117 #ifdef Z7_BLAKE2S_USE_V128_FAST
2118 
2119 static
2120 Z7_NO_INLINE
2121 #ifdef BLAKE2S_ATTRIB_128BIT
2122        BLAKE2S_ATTRIB_128BIT
2123 #endif
2124 void
2125 Z7_FASTCALL
2126 Blake2sp_InitState_V128_Fast(UInt32 *states)
2127 {
2128 #define STORE_128_PAIR_INIT_STATES_2(i, t0, t1) \
2129   { STORE_128_TO_STRUCT(states +  0 + 4 * (i), (t0)); \
2130     STORE_128_TO_STRUCT(states + 32 + 4 * (i), (t1)); \
2131   }
2132 #define STORE_128_PAIR_INIT_STATES_1(i, mac) \
2133   { const __m128i t = mac; \
2134     STORE_128_PAIR_INIT_STATES_2(i, t, t) \
2135   }
2136 #define STORE_128_PAIR_INIT_STATES_IV(i) \
2137     STORE_128_PAIR_INIT_STATES_1(i, GET_128_IV_WAY4(i))
2138 
2139   STORE_128_PAIR_INIT_STATES_1  (0, _mm_set1_epi32((Int32)k_Blake2sp_IV_0))
2140   STORE_128_PAIR_INIT_STATES_IV (1)
2141   {
2142     const __m128i t = GET_128_IV_WAY4(2);
2143     STORE_128_PAIR_INIT_STATES_2 (2,
2144         XOR_128(t, _mm_set_epi32(3, 2, 1, 0)),
2145         XOR_128(t, _mm_set_epi32(7, 6, 5, 4)))
2146   }
2147   STORE_128_PAIR_INIT_STATES_1  (3, _mm_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
2148   STORE_128_PAIR_INIT_STATES_IV (4)
2149   STORE_128_PAIR_INIT_STATES_IV (5)
2150   STORE_128_PAIR_INIT_STATES_IV (6)
2151   STORE_128_PAIR_INIT_STATES_IV (7)
2152   STORE_128_PAIR_INIT_STATES_1  (16, _mm_set_epi32(0, 0, 0, 0))
2153   // printf("\n== exit Blake2sp_InitState_V128_Fast ctr=%d\n", states[64]);
2154 }
2155 
2156 #endif // Z7_BLAKE2S_USE_V128_FAST
2157 
2158 
2159 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
2160 
2161 static
2162 Z7_NO_INLINE
2163 #ifdef BLAKE2S_ATTRIB_AVX2
2164        BLAKE2S_ATTRIB_AVX2
2165 #endif
2166 void
2167 Z7_FASTCALL
2168 Blake2sp_InitState_AVX2_Fast(UInt32 *states)
2169 {
2170 #define STORE_256_INIT_STATES(i, t) \
2171     STORE_256_TO_STRUCT(states + 8 * (i), t);
2172 #define STORE_256_INIT_STATES_IV(i) \
2173     STORE_256_INIT_STATES(i, GET_256_IV_WAY8(i))
2174 
2175   STORE_256_INIT_STATES    (0,  _mm256_set1_epi32((Int32)k_Blake2sp_IV_0))
2176   STORE_256_INIT_STATES_IV (1)
2177   STORE_256_INIT_STATES    (2, XOR_256( GET_256_IV_WAY8(2),
2178                                 _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)))
2179   STORE_256_INIT_STATES    (3,  _mm256_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
2180   STORE_256_INIT_STATES_IV (4)
2181   STORE_256_INIT_STATES_IV (5)
2182   STORE_256_INIT_STATES_IV (6)
2183   STORE_256_INIT_STATES_IV (7)
2184   STORE_256_INIT_STATES    (8, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0))
2185   // printf("\n== exit Blake2sp_InitState_AVX2_Fast\n");
2186 }
2187 
2188 #endif // Z7_BLAKE2S_USE_AVX2_FAST
2189 
2190 
2191 
2192 Z7_NO_INLINE
2193 void Blake2sp_InitState(CBlake2sp *p)
2194 {
2195   size_t i;
2196   // memset(p->states, 0, sizeof(p->states)); // for debug
2197   p->u.header.cycPos = 0;
2198 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2199   if (p->u.header.func_Init)
2200   {
2201     p->u.header.func_Init(p->states);
2202     return;
2203   }
2204 #endif
2205   for (i = 0; i < Z7_BLAKE2SP_PARALLEL_DEGREE; i++)
2206     Blake2sp_Init_Spec(p->states + i * NSW, (unsigned)i, 0);
2207 }
2208 
2209 void Blake2sp_Init(CBlake2sp *p)
2210 {
2211 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2212   p->u.header.func_Compress_Fast =
2213 #ifdef Z7_BLAKE2S_USE_VECTORS
2214     g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
2215 #else
2216     NULL;
2217 #endif
2218 
2219   p->u.header.func_Compress_Single =
2220 #ifdef Z7_BLAKE2S_USE_VECTORS
2221     g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
2222 #else
2223     NULL;
2224 #endif
2225 
2226   p->u.header.func_Init =
2227 #ifdef Z7_BLAKE2S_USE_VECTORS
2228     g_Z7_BLAKE2SP_FUNC_INIT_Init;
2229 #else
2230     NULL;
2231 #endif
2232 
2233   p->u.header.func_Final =
2234 #ifdef Z7_BLAKE2S_USE_VECTORS
2235     g_Z7_BLAKE2SP_FUNC_INIT_Final;
2236 #else
2237     NULL;
2238 #endif
2239 #endif
2240 
2241   Blake2sp_InitState(p);
2242 }
2243 
2244 
2245 void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size)
2246 {
2247   size_t pos;
2248   // printf("\nsize = 0x%6x, cycPos = %5u data = %p\n", (unsigned)size, (unsigned)p->u.header.cycPos, data);
2249   if (size == 0)
2250     return;
2251   pos = p->u.header.cycPos;
2252   // pos <  SUPER_BLOCK_SIZE * 2  : is expected
2253 // pos == SUPER_BLOCK_SIZE * 2  : is not expected, but is also supported
2254   {
2255     const size_t pos2 = pos & SUPER_BLOCK_MASK;
2256     if (pos2)
2257     {
2258       const size_t rem = SUPER_BLOCK_SIZE - pos2;
2259       if (rem > size)
2260       {
2261         p->u.header.cycPos = (unsigned)(pos + size);
2262         // cycPos < SUPER_BLOCK_SIZE * 2
2263         memcpy((Byte *)(void *)p->buf32 + pos, data, size);
2264         /* to simplify the code here we don't try to process the first superblock,
2265            if (cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) */
2266         return;
2267       }
2268       // (rem <= size)
2269       memcpy((Byte *)(void *)p->buf32 + pos, data, rem);
2270       pos += rem;
2271       data += rem;
2272       size -= rem;
2273     }
2274   }
2275 
2276   // pos <= SUPER_BLOCK_SIZE * 2
2277   // pos  % SUPER_BLOCK_SIZE == 0
2278   if (pos)
2279   {
2280     /* pos == SUPER_BLOCK_SIZE ||
2281        pos == SUPER_BLOCK_SIZE * 2 */
2282     size_t end = pos;
2283     if (size > SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE
2284         || (end -= SUPER_BLOCK_SIZE))
2285     {
2286       Z7_BLAKE2SP_Compress_Fast(p)(p->states,
2287           (const Byte *)(const void *)p->buf32,
2288           (const Byte *)(const void *)p->buf32 + end);
2289       if (pos -= end)
2290         memcpy(p->buf32, (const Byte *)(const void *)p->buf32
2291             + SUPER_BLOCK_SIZE, SUPER_BLOCK_SIZE);
2292     }
2293   }
2294 
2295   // pos == 0 || (pos == SUPER_BLOCK_SIZE && size <= SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE)
2296   if (size > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2297   {
2298     // pos == 0
2299     const Byte *end;
2300     const size_t size2 = (size - (SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE + 1))
2301         & ~(size_t)SUPER_BLOCK_MASK;
2302     size -= size2;
2303     // size < SUPER_BLOCK_SIZE * 2
2304     end = data + size2;
2305     Z7_BLAKE2SP_Compress_Fast(p)(p->states, data, end);
2306     data = end;
2307   }
2308 
2309   if (size != 0)
2310   {
2311     memcpy((Byte *)(void *)p->buf32 + pos, data, size);
2312     pos += size;
2313   }
2314   p->u.header.cycPos = (unsigned)pos;
2315   // cycPos < SUPER_BLOCK_SIZE * 2
2316 }
2317 
2318 
2319 void Blake2sp_Final(CBlake2sp *p, Byte *digest)
2320 {
2321   // UInt32 * const R_states = p->states;
2322   // printf("\nBlake2sp_Final \n");
2323 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2324   if (p->u.header.func_Final)
2325       p->u.header.func_Final(p->states);
2326 #endif
2327   // printf("\n=====\nBlake2sp_Final \n");
2328   // PrintStates(p->states, 32);
2329 
2330   // (p->u.header.cycPos == SUPER_BLOCK_SIZE) can be processed in any branch:
2331   if (p->u.header.cycPos <= SUPER_BLOCK_SIZE)
2332   {
2333     unsigned pos;
2334     memset((Byte *)(void *)p->buf32 + p->u.header.cycPos,
2335         0, SUPER_BLOCK_SIZE - p->u.header.cycPos);
2336     STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2337     for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2338     {
2339       UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
2340       Blake2s_Set_LastBlock(s)
2341       if (pos + Z7_BLAKE2S_BLOCK_SIZE > p->u.header.cycPos)
2342       {
2343         UInt32 delta = Z7_BLAKE2S_BLOCK_SIZE;
2344         if (pos < p->u.header.cycPos)
2345           delta -= p->u.header.cycPos & (Z7_BLAKE2S_BLOCK_SIZE - 1);
2346         // 0 < delta <= Z7_BLAKE2S_BLOCK_SIZE
2347         {
2348           const UInt32 v = STATE_T(s)[0];
2349           STATE_T(s)[1] -= v < delta; // (v < delta) is the same condition here as (v == 0)
2350           STATE_T(s)[0]  = v - delta;
2351         }
2352       }
2353     }
2354     // PrintStates(p->states, 16);
2355     Z7_BLAKE2SP_Compress_Single(p)(p->states,
2356         (Byte *)(void *)p->buf32,
2357         (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
2358     // PrintStates(p->states, 16);
2359   }
2360   else
2361   {
2362     // (p->u.header.cycPos > SUPER_BLOCK_SIZE)
2363     unsigned pos;
2364     for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2365     {
2366       UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
2367       if (pos + SUPER_BLOCK_SIZE >= p->u.header.cycPos)
2368         Blake2s_Set_LastBlock(s)
2369     }
2370     if (p->u.header.cycPos <= SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2371       STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2372 
2373     Z7_BLAKE2SP_Compress_Single(p)(p->states,
2374         (Byte *)(void *)p->buf32,
2375         (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
2376 
2377     // if (p->u.header.cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
2378       STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
2379 
2380     // if (p->u.header.cycPos != SUPER_BLOCK_SIZE)
2381     {
2382       pos = SUPER_BLOCK_SIZE;
2383       for (;;)
2384       {
2385         UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos & SUPER_BLOCK_MASK);
2386         Blake2s_Set_LastBlock(s)
2387         pos += Z7_BLAKE2S_BLOCK_SIZE;
2388         if (pos >= p->u.header.cycPos)
2389         {
2390           if (pos != p->u.header.cycPos)
2391           {
2392             const UInt32 delta = pos - p->u.header.cycPos;
2393             const UInt32 v = STATE_T(s)[0];
2394             STATE_T(s)[1] -= v < delta;
2395             STATE_T(s)[0]  = v - delta;
2396             memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, 0, delta);
2397           }
2398           break;
2399         }
2400       }
2401       Z7_BLAKE2SP_Compress_Single(p)(p->states,
2402           (Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE,
2403           (Byte *)(void *)p->buf32 + pos);
2404     }
2405   }
2406 
2407   {
2408     size_t pos;
2409     for (pos = 0; pos < SUPER_BLOCK_SIZE / 2; pos += Z7_BLAKE2S_BLOCK_SIZE / 2)
2410     {
2411       const UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, (pos * 2));
2412       Byte *dest = (Byte *)(void *)p->buf32 + pos;
2413       GET_DIGEST(s, dest)
2414     }
2415   }
2416   Blake2sp_Init_Spec(p->states, 0, 1);
2417   {
2418     size_t pos;
2419     for (pos = 0; pos < (Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2S_DIGEST_SIZE)
2420         - Z7_BLAKE2S_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
2421     {
2422       Z7_BLAKE2SP_Compress_Single(p)(p->states,
2423           (const Byte *)(const void *)p->buf32 + pos,
2424           (const Byte *)(const void *)p->buf32 + pos + Z7_BLAKE2S_BLOCK_SIZE);
2425     }
2426   }
2427   // Blake2s_Final(p->states, 0, digest, p, (Byte *)(void *)p->buf32 + i);
2428   Blake2s_Set_LastBlock(p->states)
2429   STATE_F(p->states)[1] = BLAKE2S_FINAL_FLAG;
2430   {
2431     Z7_BLAKE2SP_Compress_Single(p)(p->states,
2432         (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE,
2433         (const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE);
2434   }
2435   GET_DIGEST(p->states, digest)
2436   // printf("\n Blake2sp_Final 555 numDataInBufs = %5u\n", (unsigned)p->u.header.numDataInBufs);
2437 }
2438 
2439 
2440 BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo)
2441 {
2442   // printf("\n========== setfunction = %d ======== \n",  algo);
2443 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2444   Z7_BLAKE2SP_FUNC_COMPRESS func = NULL;
2445   Z7_BLAKE2SP_FUNC_COMPRESS func_Single = NULL;
2446   Z7_BLAKE2SP_FUNC_INIT     func_Final = NULL;
2447   Z7_BLAKE2SP_FUNC_INIT     func_Init = NULL;
2448 #else
2449   UNUSED_VAR(p)
2450 #endif
2451 
2452 #ifdef Z7_BLAKE2S_USE_VECTORS
2453 
2454   func = func_Single = Blake2sp_Compress2;
2455 
2456   if (algo != Z7_BLAKE2SP_ALGO_SCALAR)
2457   {
2458     // printf("\n========== setfunction NON-SCALER ======== \n");
2459     if (algo == Z7_BLAKE2SP_ALGO_DEFAULT)
2460     {
2461       func        = g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
2462       func_Single = g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
2463       func_Init   = g_Z7_BLAKE2SP_FUNC_INIT_Init;
2464       func_Final  = g_Z7_BLAKE2SP_FUNC_INIT_Final;
2465     }
2466     else
2467     {
2468       if ((g_z7_Blake2sp_SupportedFlags & (1u << algo)) == 0)
2469         return False;
2470 
2471 #ifdef Z7_BLAKE2S_USE_AVX2
2472 
2473       func_Single =
2474 #if defined(Z7_BLAKE2S_USE_AVX2_WAY2)
2475         Blake2sp_Compress2_AVX2_Way2;
2476 #else
2477         Z7_BLAKE2S_Compress2_V128;
2478 #endif
2479 
2480 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
2481       if (algo == Z7_BLAKE2SP_ALGO_V256_FAST)
2482       {
2483         func = Blake2sp_Compress2_AVX2_Fast;
2484         func_Final = Blake2sp_Final_AVX2_Fast;
2485         func_Init  = Blake2sp_InitState_AVX2_Fast;
2486       }
2487       else
2488 #endif
2489 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
2490       if (algo == Z7_BLAKE2SP_ALGO_V256_WAY2)
2491         func = Blake2sp_Compress2_AVX2_Way2;
2492       else
2493 #endif
2494 #ifdef Z7_BLAKE2S_USE_AVX2_WAY4
2495       if (algo == Z7_BLAKE2SP_ALGO_V256_WAY4)
2496       {
2497         func_Single = func = Blake2sp_Compress2_AVX2_Way4;
2498       }
2499       else
2500 #endif
2501 #endif // avx2
2502       {
2503         if (algo == Z7_BLAKE2SP_ALGO_V128_FAST)
2504         {
2505           func       = Blake2sp_Compress2_V128_Fast;
2506           func_Final = Blake2sp_Final_V128_Fast;
2507           func_Init  = Blake2sp_InitState_V128_Fast;
2508           func_Single = Z7_BLAKE2S_Compress2_V128;
2509         }
2510         else
2511 #ifdef Z7_BLAKE2S_USE_V128_WAY2
2512         if (algo == Z7_BLAKE2SP_ALGO_V128_WAY2)
2513           func = func_Single = Blake2sp_Compress2_V128_Way2;
2514         else
2515 #endif
2516         {
2517           if (algo != Z7_BLAKE2SP_ALGO_V128_WAY1)
2518             return False;
2519           func = func_Single = Blake2sp_Compress2_V128_Way1;
2520         }
2521       }
2522     }
2523   }
2524 #else // !VECTORS
2525   if (algo > 1) // Z7_BLAKE2SP_ALGO_SCALAR
2526     return False;
2527 #endif // !VECTORS
2528 
2529 #ifdef Z7_BLAKE2SP_USE_FUNCTIONS
2530   p->u.header.func_Compress_Fast = func;
2531   p->u.header.func_Compress_Single = func_Single;
2532   p->u.header.func_Final = func_Final;
2533   p->u.header.func_Init = func_Init;
2534 #endif
2535   // printf("\n p->u.header.func_Compress = %p", p->u.header.func_Compress);
2536   return True;
2537 }
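/* Illustration (not a quoted 7-Zip caller): one call order that is consistent
   with the functions in this file, for forcing a particular algo, e.g. for
   benchmarking:

     CBlake2sp p;
     z7_Black2sp_Prepare();                    // fill the g_* dispatch globals
     Blake2sp_Init(&p);                        // default functions + state
     if (Blake2sp_SetFunction(&p, Z7_BLAKE2SP_ALGO_V128_FAST))
       Blake2sp_InitState(&p);                 // rebuild the state in the layout
                                               // expected by the selected branch
     // then Blake2sp_Update() / Blake2sp_Final() as usual.
*/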
2538 
2539 
2540 void z7_Black2sp_Prepare(void)
2541 {
2542 #ifdef Z7_BLAKE2S_USE_VECTORS
2543   unsigned flags = 0; // (1u << Z7_BLAKE2SP_ALGO_V128_SCALAR);
2544 
2545   Z7_BLAKE2SP_FUNC_COMPRESS func_Fast = Blake2sp_Compress2;
2546   Z7_BLAKE2SP_FUNC_COMPRESS func_Single = Blake2sp_Compress2;
2547   Z7_BLAKE2SP_FUNC_INIT func_Init = NULL;
2548   Z7_BLAKE2SP_FUNC_INIT func_Final = NULL;
2549 
2550 #if defined(MY_CPU_X86_OR_AMD64)
2551     #if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
2552       if (CPU_IsSupported_AVX512F_AVX512VL())
2553     #endif
2554     #if defined(Z7_BLAKE2S_USE_SSE41)
2555       if (CPU_IsSupported_SSE41())
2556     #elif defined(Z7_BLAKE2S_USE_SSSE3)
2557       if (CPU_IsSupported_SSSE3())
2558     #elif !defined(MY_CPU_AMD64)
2559       if (CPU_IsSupported_SSE2())
2560     #endif
2561 #endif
2562   {
2563     #if defined(Z7_BLAKE2S_USE_SSE41)
2564       // printf("\n========== Blake2s SSE41 128-bit\n");
2565     #elif defined(Z7_BLAKE2S_USE_SSSE3)
2566       // printf("\n========== Blake2s SSSE3 128-bit\n");
2567     #else
2568       // printf("\n========== Blake2s SSE2 128-bit\n");
2569     #endif
2570     // func_Fast = f_vector = Blake2sp_Compress2_V128_Way2;
2571     // printf("\n========== Blake2sp_Compress2_V128_Way2\n");
2572     func_Fast   =
2573     func_Single = Z7_BLAKE2S_Compress2_V128;
2574     flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY1);
2575 #ifdef Z7_BLAKE2S_USE_V128_WAY2
2576     flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY2);
2577 #endif
2578 #ifdef Z7_BLAKE2S_USE_V128_FAST
2579     flags |= (1u << Z7_BLAKE2SP_ALGO_V128_FAST);
2580     func_Fast  = Blake2sp_Compress2_V128_Fast;
2581     func_Init  = Blake2sp_InitState_V128_Fast;
2582     func_Final = Blake2sp_Final_V128_Fast;
2583 #endif
2584 
2585 #ifdef Z7_BLAKE2S_USE_AVX2
2586 #if defined(MY_CPU_X86_OR_AMD64)
2587     if (
2588     #if 0 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
2589         CPU_IsSupported_AVX512F_AVX512VL() &&
2590     #endif
2591         CPU_IsSupported_AVX2()
2592         )
2593 #endif
2594     {
2595     // #pragma message ("=== Blake2s AVX2")
2596     // printf("\n========== Blake2s AVX2\n");
2597 
2598 #ifdef Z7_BLAKE2S_USE_AVX2_WAY2
2599       func_Single = Blake2sp_Compress2_AVX2_Way2;
2600       flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY2);
2601 #endif
2602 #ifdef Z7_BLAKE2S_USE_AVX2_WAY4
2603       flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY4);
2604 #endif
2605 
2606 #ifdef Z7_BLAKE2S_USE_AVX2_FAST
2607       flags |= (1u << Z7_BLAKE2SP_ALGO_V256_FAST);
2608       func_Fast  = Blake2sp_Compress2_AVX2_Fast;
2609       func_Init  = Blake2sp_InitState_AVX2_Fast;
2610       func_Final = Blake2sp_Final_AVX2_Fast;
2611 #elif defined(Z7_BLAKE2S_USE_AVX2_WAY4)
2612       func_Fast  = Blake2sp_Compress2_AVX2_Way4;
2613 #elif defined(Z7_BLAKE2S_USE_AVX2_WAY2)
2614       func_Fast  = Blake2sp_Compress2_AVX2_Way2;
2615 #endif
2616     } // avx2
2617 #endif // avx2
2618   } // sse*
2619   g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast   = func_Fast;
2620   g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = func_Single;
2621   g_Z7_BLAKE2SP_FUNC_INIT_Init       = func_Init;
2622   g_Z7_BLAKE2SP_FUNC_INIT_Final      = func_Final;
2623   g_z7_Blake2sp_SupportedFlags = flags;
2624   // printf("\nflags=%x\n", flags);
2625 #endif // vectors
2626 }
2627 
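/* Minimal usage sketch of the public API defined above (illustration only;
   CBlake2sp and Z7_BLAKE2S_DIGEST_SIZE come from Blake2.h, as used in this
   file):

     Byte digest[Z7_BLAKE2S_DIGEST_SIZE];
     CBlake2sp p;
     z7_Black2sp_Prepare();            // typically once per process: CPU dispatch
     Blake2sp_Init(&p);
     Blake2sp_Update(&p, data, size);
     Blake2sp_Final(&p, digest);       // 32-byte BLAKE2sp digest of data[0..size)
*/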
2628 /*
2629 #ifdef Z7_BLAKE2S_USE_VECTORS
2630 void align_test2(CBlake2sp *sp);
2631 void align_test2(CBlake2sp *sp)
2632 {
2633   __m128i a = LOAD_128(sp->states);
2634   D_XOR_128(a, LOAD_128(sp->states + 4));
2635   STORE_128(sp->states, a);
2636 }
2637 void align_test2(void);
2638 void align_test2(void)
2639 {
2640   CBlake2sp sp;
2641   Blake2sp_Init(&sp);
2642   Blake2sp_Update(&sp, NULL, 0);
2643 }
2644 #endif
2645 */
2646