/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
2024-03-01 : Igor Pavlov : Public domain */

#include "Precomp.h"

#include "Aes.h"
#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64

  #if defined(__INTEL_COMPILER)
    #if (__INTEL_COMPILER >= 1110)
      #define USE_INTEL_AES
      #if (__INTEL_COMPILER >= 1900)
        #define USE_INTEL_VAES
      #endif
    #endif
  #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
     || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400)
    #define USE_INTEL_AES
    #if !defined(__AES__)
      #define ATTRIB_AES __attribute__((__target__("aes")))
    #endif
    #if defined(__clang__) && (__clang_major__ >= 8) \
       || defined(__GNUC__) && (__GNUC__ >= 8)
      #define USE_INTEL_VAES
      #if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__)
        #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2")))
      #endif
    #endif
  #elif defined(_MSC_VER)
    #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
      #define USE_INTEL_AES
      #if (_MSC_VER >= 1910)
        #define USE_INTEL_VAES
      #endif
    #endif
    #ifndef USE_INTEL_AES
      #define Z7_USE_AES_HW_STUB
    #endif
    #ifndef USE_INTEL_VAES
      #define Z7_USE_VAES_HW_STUB
    #endif
  #endif

#ifndef USE_INTEL_AES
// #define Z7_USE_AES_HW_STUB // for debug
#endif
#ifndef USE_INTEL_VAES
// #define Z7_USE_VAES_HW_STUB // for debug
#endif


#ifdef USE_INTEL_AES

#include <wmmintrin.h>

#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte
// #define AES_TYPE_keys __m128i
// #define AES_TYPE_data __m128i
#endif

#ifndef ATTRIB_AES
#define ATTRIB_AES
#endif

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
    // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

#define MM_OP(op, dest, src)  dest = op(dest, src);
#define MM_OP_m(op, src)      MM_OP(op, m, src)

#define MM_XOR( dest, src)    MM_OP(_mm_xor_si128, dest, src)
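
/* For illustration, an expansion of the helper macros above:
     MM_OP_m (_mm_aesenc_si128, k1)
   expands to
     m = _mm_aesenc_si128(m, k1);
   so "m" always names the running block state in the scalar loops below. */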

AES_FUNC_START2 (AesCbc_Encode_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i m = *p;
  const __m128i k0 = p[2];
  const __m128i k1 = p[3];
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
  for (; numBlocks != 0; numBlocks--, data++)
  {
    UInt32 r = numRounds2;
    const __m128i *w = p + 4;
    __m128i temp = *data;
    MM_XOR (temp, k0)
    MM_XOR (m, temp)
    MM_OP_m (_mm_aesenc_si128, k1)
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--r);
    MM_OP_m (_mm_aesenclast_si128, w[0])
    *data = m;
  }
  *p = m;
}
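
/* A sketch of the key-schedule layout implied by the loads above (derived
   from this file, not a documented interface): p[0] holds the IV / CBC
   chaining value, the UInt32 at (p + 1) appears to hold numRounds / 2, and
   p[2], p[3], ... hold the round keys. CBC encryption is inherently
   sequential (each ciphertext block feeds the next XOR), so this loop keeps
   only one block in flight, unlike the wide loops below. */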


#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
#ifdef MY_CPU_AMD64
#define WOP_5(op)   WOP_4 (op)  op (m4, 4)
#define WOP_6(op)   WOP_5 (op)  op (m5, 5)
#define WOP_7(op)   WOP_6 (op)  op (m6, 6)
#define WOP_8(op)   WOP_7 (op)  op (m7, 7)
#endif
/*
#define WOP_9(op)   WOP_8 (op)  op (m8, 8);
#define WOP_10(op)  WOP_9 (op)  op (m9, 9);
#define WOP_11(op)  WOP_10(op)  op (m10, 10);
#define WOP_12(op)  WOP_11(op)  op (m11, 11);
#define WOP_13(op)  WOP_12(op)  op (m12, 12);
#define WOP_14(op)  WOP_13(op)  op (m13, 13);
*/

#ifdef MY_CPU_AMD64
  #define NUM_WAYS  8
  #define WOP_M1    WOP_8
#else
  #define NUM_WAYS  4
  #define WOP_M1    WOP_4
#endif

#define WOP(op)  op (m0, 0)  WOP_M1(op)
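
/* For illustration: with NUM_WAYS == 8, WOP(op) expands to
     op (m0, 0)  op (m1, 1)  op (m2, 2)  op (m3, 3)
     op (m4, 4)  op (m5, 5)  op (m6, 6)  op (m7, 7)
   that is, the same operation applied to NUM_WAYS independent block
   registers, which lets the CPU overlap the latencies of the AES
   instructions. */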


#define DECLARE_VAR(reg, ii)  __m128i reg;
#define LOAD_data(  reg, ii)  reg = data[ii];
#define STORE_data( reg, ii)  data[ii] = reg;
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii - 1])
#endif

#define MM_OP_key(op, reg)  MM_OP(op, reg, key);

#define AES_DEC(      reg, ii)  MM_OP_key (_mm_aesdec_si128,     reg)
#define AES_DEC_LAST( reg, ii)  MM_OP_key (_mm_aesdeclast_si128, reg)
#define AES_ENC(      reg, ii)  MM_OP_key (_mm_aesenc_si128,     reg)
#define AES_ENC_LAST( reg, ii)  MM_OP_key (_mm_aesenclast_si128, reg)
#define AES_XOR(      reg, ii)  MM_OP_key (_mm_xor_si128,        reg)

#define CTR_START(reg, ii)  MM_OP (_mm_add_epi64, ctr, one)  reg = ctr;
#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)

#define WOP_KEY(op, n) { \
    const __m128i key = w[n]; \
    WOP(op); }
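
/* For illustration: WOP_KEY (AES_ENC, 0) expands (with NUM_WAYS == 8) to
     { const __m128i key = w[0];
       m0 = _mm_aesenc_si128(m0, key);
       ...
       m7 = _mm_aesenc_si128(m7, key); }
   one round key applied across all NUM_WAYS blocks. */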


#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \


#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }  \


#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)

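
/* A sketch of the skeleton that the loop macros above produce:
     dataEnd = data + numBlocks;
     if (numBlocks >= NUM_WAYS)
     { dataEnd -= NUM_WAYS;
       do { ... process NUM_WAYS blocks ...  data += NUM_WAYS; }
       while (data <= dataEnd);
       dataEnd += NUM_WAYS; }
     for (; data < dataEnd; data++) { ... process 1 block ... }
   so the tail of (numBlocks % NUM_WAYS) blocks is handled one at a time. */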


#ifdef USE_INTEL_VAES

#define AVX_XOR(dest, src)        MM_OP(_mm256_xor_si256, dest, src)
#define AVX_DECLARE_VAR(reg, ii)  __m256i reg;
#define AVX_LOAD_data(  reg, ii)  reg = ((const __m256i *)(const void *)data)[ii];
#define AVX_STORE_data( reg, ii)  ((__m256i *)(void *)data)[ii] = reg;
/*
  AVX_XOR_data_M1() requires an unaligned memory load.
  If we don't use _mm256_loadu_si256() here, then:
    Most compilers with optimizations enabled generate a fused AVX (LOAD + OP)
    instruction, which can load unaligned data.
    But GCC and CLANG without -O1/-O2 optimization can generate a separate
    aligned load (vmovdqa) instruction, which will fault at execution.
  Note: some compilers generate more instructions if we use _mm256_loadu_si256() here.
  v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler.
*/
#define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii])))
// for debug only: the following code will fail at execution, if compiled by some compilers:
// #define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))

#define AVX_AES_DEC(      reg, ii)  MM_OP_key (_mm256_aesdec_epi128,     reg)
#define AVX_AES_DEC_LAST( reg, ii)  MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX_AES_ENC(      reg, ii)  MM_OP_key (_mm256_aesenc_epi128,     reg)
#define AVX_AES_ENC_LAST( reg, ii)  MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX_AES_XOR(      reg, ii)  MM_OP_key (_mm256_xor_si256,         reg)
#define AVX_CTR_START(reg, ii)  MM_OP (_mm256_add_epi64, ctr2, two)  reg = _mm256_xor_si256(ctr2, key);
#define AVX_CTR_END(  reg, ii)  AVX_XOR (((__m256i *)(void *)data)[ii], reg)
#define AVX_WOP_KEY(op, n) { \
    const __m256i key = w[n]; \
    WOP(op); }

#define NUM_AES_KEYS_MAX 15

#define WIDE_LOOP_START_AVX(OP)  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS * 2)  \
    { __m256i keys[NUM_AES_KEYS_MAX];  \
      UInt32 ii;  \
      OP  \
      for (ii = 0; ii < numRounds; ii++)  \
        keys[ii] = _mm256_broadcastsi128_si256(p[ii]);  \
      dataEnd -= NUM_WAYS * 2; do {  \


#define WIDE_LOOP_END_AVX(OP)  \
    data += NUM_WAYS * 2;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS * 2;  \
    OP  \
    _mm256_zeroupper();  \
    }  \

/* MSVC for x86: if we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
   MSVC can still insert a vzeroupper instruction. */
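
/* In this AVX/VAES path each __m256i carries two 128-bit AES blocks, so one
   wide iteration handles NUM_WAYS * 2 blocks, and WIDE_LOOP_START_AVX()
   broadcasts every 128-bit round key to both lanes with
   _mm256_broadcastsi128_si256() before entering the loop. */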

#endif



AES_FUNC_START2 (AesCbc_Decode_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i iv = *p;
  const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
  const __m128i *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const __m128i *w = wStart;

    WOP (DECLARE_VAR)
    WOP (LOAD_data)
    WOP_KEY (AES_XOR, 1)

    do
    {
      WOP_KEY (AES_DEC, 0)
      w--;
    }
    while (w != p);
    WOP_KEY (AES_DEC_LAST, 0)

    MM_XOR (m0, iv)
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];
    WOP (STORE_data)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const __m128i *w = wStart - 1;
    __m128i m = _mm_xor_si128 (w[2], *data);
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1])
      MM_OP_m (_mm_aesdec_si128, w[0])
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128,     w[1])
    MM_OP_m (_mm_aesdeclast_si128, w[0])

    MM_XOR (m, iv)
    iv = *data;
    *data = m;
  }

  p[-2] = iv;
}
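
/* Why CBC decoding can go wide while CBC encoding cannot: each plaintext is
   P[i] = Decrypt(C[i]) XOR C[i-1], so the block decryptions are all
   independent, and the final XOR uses the previous ciphertext, which is
   still available in memory (XOR_data_M1 above). */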


AES_FUNC_START2 (AesCtr_Code_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i ctr = *p;
  // (numRounds - 1): the count of aesenc rounds before the final aesenclast
  UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);

  p += 2;

  WIDE_LOOP_START
  {
    const __m128i *w = p;
    UInt32 r = numRoundsMinus2;
    WOP (DECLARE_VAR)
    WOP (CTR_START)
    WOP_KEY (AES_XOR, 0)
    w += 1;
    do
    {
      WOP_KEY (AES_ENC, 0)
      w += 1;
    }
    while (--r);
    WOP_KEY (AES_ENC_LAST, 0)

    WOP (CTR_END)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one)
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128,     w[0])
    MM_OP_m (_mm_aesenclast_si128, w[1])
    MM_XOR (*data, m)
  }

  p[-2] = ctr;
}
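
/* CTR mode here encrypts an incrementing counter and XORs the result into
   the data (CTR_START / CTR_END). Note that one = _mm_cvtsi32_si128(1) is
   zero in the upper 64-bit lane, and _mm_add_epi64 adds the lanes
   independently, so effectively only the low 64 bits of the counter block
   are incremented (no carry into the high 64 bits). */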



#ifdef USE_INTEL_VAES

/*
  GCC before 2013-Jun:
    <immintrin.h>:
      #ifdef __AVX__
        #include <avxintrin.h>
      #endif
  GCC after 2013-Jun:
    <immintrin.h>:
      #include <avxintrin.h>
  CLANG 3.8+:
  {
    <immintrin.h>:
      #if !defined(_MSC_VER) || defined(__AVX__)
        #include <avxintrin.h>
      #endif

    if (the compiler is clang for Windows and the global arch is not set for __AVX__)
    [ i.e. if (defined(_MSC_VER) && !defined(__AVX__)) ]
    {
      <immintrin.h> doesn't include <avxintrin.h>,
      and we have 2 ways to fix it:
        1) we can define the required __AVX__ before <immintrin.h>, or
        2) we can include <avxintrin.h> after <immintrin.h>.
    }
  }

  If we include <avxintrin.h> manually for GCC/CLANG, then
  <immintrin.h> must be included before <avxintrin.h>.
*/

/*
#if defined(__clang__) && defined(_MSC_VER)
#define __AVX__
#define __AVX2__
#define __VAES__
#endif
*/

#include <immintrin.h>
#if defined(__clang__) && defined(_MSC_VER)
  #if !defined(__AVX__)
    #include <avxintrin.h>
  #endif
  #if !defined(__AVX2__)
    #include <avx2intrin.h>
  #endif
  #if !defined(__VAES__)
    #include <vaesintrin.h>
  #endif
#endif // __clang__ && _MSC_VER

#ifndef ATTRIB_VAES
#define ATTRIB_VAES
#endif

#define VAES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_VAES \
AES_FUNC_START (name)

VAES_FUNC_START2 (AesCbc_Decode_HW_256)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i iv = *p;
  const __m128i *dataEnd;
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  p += 2;

  WIDE_LOOP_START_AVX(;)
  {
    const __m256i *w = keys + numRounds - 2;

    WOP (AVX_DECLARE_VAR)
    WOP (AVX_LOAD_data)
    AVX_WOP_KEY (AVX_AES_XOR, 1)

    do
    {
      AVX_WOP_KEY (AVX_AES_DEC, 0)
      w--;
    }
    while (w != keys);
    AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)

    AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]))
    WOP_M1 (AVX_XOR_data_M1)
    iv = data[NUM_WAYS * 2 - 1];
    WOP (AVX_STORE_data)
  }
  WIDE_LOOP_END_AVX(;)

  SINGLE_LOOP
  {
    const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
    __m128i m = _mm_xor_si128 (w[2], *data);
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1])
      MM_OP_m (_mm_aesdec_si128, w[0])
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128,     w[1])
    MM_OP_m (_mm_aesdeclast_si128, w[0])

    MM_XOR (m, iv)
    iv = *data;
    *data = m;
  }

  p[-2] = iv;
}


/*
  SSE2: _mm_cvtsi32_si128            : movd
  AVX:  _mm256_setr_m128i            : vinsertf128
  AVX2: _mm256_add_epi64             : vpaddq ymm, ymm, ymm
        _mm256_extracti128_si256     : vextracti128
        _mm256_broadcastsi128_si256  : vbroadcasti128
*/

#define AVX_CTR_LOOP_START  \
    ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr);  \
    two = _mm256_setr_m128i(one, one);  \
    two = _mm256_add_epi64(two, two);  \

// two = _mm256_setr_epi64x(2, 0, 2, 0);

#define AVX_CTR_LOOP_ENC  \
    ctr = _mm256_extracti128_si256 (ctr2, 1);  \
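
/* A sketch of the counter handling above: AVX_CTR_LOOP_START seeds
   ctr2 = [ctr - 1, ctr] (two 128-bit lanes) and two = +2 per 128-bit lane,
   so the first AVX_CTR_START yields the counter pair [ctr + 1, ctr + 2];
   AVX_CTR_LOOP_ENC writes the high lane back to the 128-bit ctr for the
   single-block tail loop. */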

VAES_FUNC_START2 (AesCtr_Code_HW_256)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i ctr = *p;
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);
  __m256i ctr2, two;
  p += 2;

  WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START)
  {
    const __m256i *w = keys;
    UInt32 r = numRounds - 2;
    WOP (AVX_DECLARE_VAR)
    AVX_WOP_KEY (AVX_CTR_START, 0)

    w += 1;
    do
    {
      AVX_WOP_KEY (AVX_AES_ENC, 0)
      w += 1;
    }
    while (--r);
    AVX_WOP_KEY (AVX_AES_ENC_LAST, 0)

    WOP (AVX_CTR_END)
  }
  WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC)

  SINGLE_LOOP
  {
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one)
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128,     w[0])
    MM_OP_m (_mm_aesenclast_si128, w[1])
    MM_XOR (*data, m)
  }

  p[-2] = ctr;
}

#endif // USE_INTEL_VAES

#else // USE_INTEL_AES

/* no USE_INTEL_AES */

#if defined(Z7_USE_AES_HW_STUB)
// We can compile this file with another C compiler,
// or we can compile an asm version,
// so real code can be generated instead of this stub function.
// #if defined(_MSC_VER)
#pragma message("AES HW_SW stub was used")
// #endif

#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte
#endif

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \

#define AES_COMPAT_STUB(name) \
    AES_FUNC_START(name); \
    AES_FUNC_START(name ## _HW) \
    { name(p, data, numBlocks); }

AES_COMPAT_STUB (AesCbc_Encode)
AES_COMPAT_STUB (AesCbc_Decode)
AES_COMPAT_STUB (AesCtr_Code)
#endif // Z7_USE_AES_HW_STUB

#endif // USE_INTEL_AES


#ifndef USE_INTEL_VAES
#if defined(Z7_USE_VAES_HW_STUB)
// #if defined(_MSC_VER)
#pragma message("VAES HW_SW stub was used")
// #endif

#define VAES_COMPAT_STUB(name) \
    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
    { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }

VAES_COMPAT_STUB (AesCbc_Decode_HW)
VAES_COMPAT_STUB (AesCtr_Code_HW)
#endif
#endif // ! USE_INTEL_VAES



#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)

  #if defined(__ARM_FEATURE_AES) \
   || defined(__ARM_FEATURE_CRYPTO)
    #define USE_HW_AES
  #else
    #if defined(MY_CPU_ARM64) \
     || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
     || defined(Z7_MSC_VER_ORIGINAL)
      #if defined(__ARM_FP) && \
            ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
           || defined(__GNUC__) && (__GNUC__ >= 6) \
            ) \
       || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
        #if defined(MY_CPU_ARM64) \
         || !defined(Z7_CLANG_VERSION) \
         || defined(__ARM_NEON) && \
              (Z7_CLANG_VERSION < 170000 || \
               Z7_CLANG_VERSION > 170001)
          #define USE_HW_AES
        #endif
      #endif
    #endif
  #endif

#ifdef USE_HW_AES

// #pragma message("=== AES HW === ")
// The __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer-grained feature macro __ARM_FEATURE_AES.

#if defined(__clang__) || defined(__GNUC__)
  #if !defined(__ARM_FEATURE_AES) && \
      !defined(__ARM_FEATURE_CRYPTO)
    #ifdef MY_CPU_ARM64
      #if defined(__clang__)
        #define ATTRIB_AES __attribute__((__target__("crypto")))
      #else
        #define ATTRIB_AES __attribute__((__target__("+crypto")))
      #endif
    #else
      #if defined(__clang__)
        #define ATTRIB_AES __attribute__((__target__("armv8-a,aes")))
      #else
        #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
      #endif
    #endif
  #endif
#else
// _MSC_VER
// for arm32
#define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#ifndef ATTRIB_AES
#define ATTRIB_AES
#endif

#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else

/*
  clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese
  clang:
    3.8.1  : __ARM_NEON              : defined(__ARM_FEATURE_CRYPTO)
    7.0.1  : __ARM_NEON              : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
    11.?.0 : __ARM_NEON && __ARM_FP  : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
    13.0.1 : __ARM_NEON && __ARM_FP  : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES)
    16     : __ARM_NEON && __ARM_FP  : __ARM_ARCH >= 8
*/
#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_AES) && \
    !defined(__ARM_FEATURE_CRYPTO)
// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
#define __ARM_FEATURE_CRYPTO 1
// #else
#define __ARM_FEATURE_AES 1
// #endif
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang

#if defined(__clang__)

#if defined(__ARM_ARCH) && __ARM_ARCH < 8
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
// #pragma message("#define __ARM_ARCH 8")
#undef  __ARM_ARCH
#define __ARM_ARCH 8
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif

#endif // clang

#include <arm_neon.h>

#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
    defined(__ARM_FEATURE_CRYPTO) && \
    defined(__ARM_FEATURE_AES)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#undef __ARM_FEATURE_CRYPTO
#undef __ARM_FEATURE_AES
#undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif

#endif // Z7_MSC_VER_ORIGINAL

typedef uint8x16_t v128;

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
    // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

#define MM_OP(op, dest, src)  dest = op(dest, src);
#define MM_OP_m(op, src)      MM_OP(op, m, src)
#define MM_OP1_m(op)          m = op(m);

#define MM_XOR( dest, src)    MM_OP(veorq_u8, dest, src)
#define MM_XOR_m( src)        MM_XOR(m, src)

#define AES_E_m(k)     MM_OP_m (vaeseq_u8, k)
#define AES_E_MC_m(k)  AES_E_m (k)  MM_OP1_m(vaesmcq_u8)
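
/* ARM AES intrinsics split a round differently than x86: vaeseq_u8 performs
   AddRoundKey + SubBytes + ShiftRows, and vaesmcq_u8 applies MixColumns as
   a separate step (combined in AES_E_MC_m above). That is why a final round
   is AES_E_m followed by a plain XOR with the last round key, with no
   MixColumns. */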


AES_FUNC_START2 (AesCbc_Encode_HW)
{
  v128 * const p = (v128 *)(void *)ivAes;
  v128 *data = (v128 *)(void *)data8;
  v128 m = *p;
  const v128 k0 = p[2];
  const v128 k1 = p[3];
  const v128 k2 = p[4];
  const v128 k3 = p[5];
  const v128 k4 = p[6];
  const v128 k5 = p[7];
  const v128 k6 = p[8];
  const v128 k7 = p[9];
  const v128 k8 = p[10];
  const v128 k9 = p[11];
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  const v128 *w = p + ((size_t)numRounds2 * 2);
  const v128 k_z1 = w[1];
  const v128 k_z0 = w[2];
  for (; numBlocks != 0; numBlocks--, data++)
  {
    MM_XOR_m (*data)
    AES_E_MC_m (k0)
    AES_E_MC_m (k1)
    AES_E_MC_m (k2)
    AES_E_MC_m (k3)
    AES_E_MC_m (k4)
    AES_E_MC_m (k5)
    AES_E_MC_m (k6)
    AES_E_MC_m (k7)
    AES_E_MC_m (k8)
    if (numRounds2 >= 6)
    {
      AES_E_MC_m (k9)
      AES_E_MC_m (p[12])
      if (numRounds2 != 6)
      {
        AES_E_MC_m (p[13])
        AES_E_MC_m (p[14])
      }
    }
    AES_E_m  (k_z1)
    MM_XOR_m (k_z0)
    *data = m;
  }
  *p = m;
}


#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
#define WOP_5(op)   WOP_4 (op)  op (m4, 4)
#define WOP_6(op)   WOP_5 (op)  op (m5, 5)
#define WOP_7(op)   WOP_6 (op)  op (m6, 6)
#define WOP_8(op)   WOP_7 (op)  op (m7, 7)

#define NUM_WAYS  8
#define WOP_M1    WOP_8

#define WOP(op)  op (m0, 0)  WOP_M1(op)

#define DECLARE_VAR(reg, ii)  v128 reg;
#define LOAD_data(  reg, ii)  reg = data[ii];
#define STORE_data( reg, ii)  data[ii] = reg;
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii - 1])
#endif

#define MM_OP_key(op, reg)  MM_OP (op, reg, key)

#define AES_D_m(k)      MM_OP_m (vaesdq_u8, k)
#define AES_D_IMC_m(k)  AES_D_m (k)  MM_OP1_m (vaesimcq_u8)

#define AES_XOR( reg, ii)  MM_OP_key (veorq_u8,  reg)
#define AES_D(   reg, ii)  MM_OP_key (vaesdq_u8, reg)
#define AES_E(   reg, ii)  MM_OP_key (vaeseq_u8, reg)

#define AES_D_IMC( reg, ii)  AES_D (reg, ii)  reg = vaesimcq_u8(reg);
#define AES_E_MC(  reg, ii)  AES_E (reg, ii)  reg = vaesmcq_u8(reg);

#define CTR_START(reg, ii)  MM_OP (vaddq_u64, ctr, one)  reg = vreinterpretq_u8_u64(ctr);
#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)

#define WOP_KEY(op, n) { \
    const v128 key = w[n]; \
    WOP(op) }

#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \

#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }  \

#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)


AES_FUNC_START2 (AesCbc_Decode_HW)
{
  v128 *p = (v128 *)(void *)ivAes;
  v128 *data = (v128 *)(void *)data8;
  v128 iv = *p;
  const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = wStart;
    WOP (DECLARE_VAR)
    WOP (LOAD_data)
    WOP_KEY (AES_D_IMC, 2)
    do
    {
      WOP_KEY (AES_D_IMC, 1)
      WOP_KEY (AES_D_IMC, 0)
      w -= 2;
    }
    while (w != p);
    WOP_KEY (AES_D,   1)
    WOP_KEY (AES_XOR, 0)
    MM_XOR (m0, iv)
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];
    WOP (STORE_data)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = wStart;
    v128 m = *data;
    AES_D_IMC_m (w[2])
    do
    {
      AES_D_IMC_m (w[1])
      AES_D_IMC_m (w[0])
      w -= 2;
    }
    while (w != p);
    AES_D_m  (w[1])
    MM_XOR_m (w[0])
    MM_XOR_m (iv)
    iv = *data;
    *data = m;
  }

  p[-2] = iv;
}


AES_FUNC_START2 (AesCtr_Code_HW)
{
  v128 *p = (v128 *)(void *)ivAes;
  v128 *data = (v128 *)(void *)data8;
  uint64x2_t ctr = vreinterpretq_u64_u8(*p);
  const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  uint64x2_t one = vdupq_n_u64(0);

  // a bug in clang:
  // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
#if defined(__clang__) && (__clang_major__ <= 9)
#pragma GCC diagnostic ignored "-Wvector-conversion"
#endif
  one = vsetq_lane_u64(1, one, 0);
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = p;
    WOP (DECLARE_VAR)
    WOP (CTR_START)
    do
    {
      WOP_KEY (AES_E_MC, 0)
      WOP_KEY (AES_E_MC, 1)
      w += 2;
    }
    while (w != wEnd);
    WOP_KEY (AES_E_MC, 0)
    WOP_KEY (AES_E,    1)
    WOP_KEY (AES_XOR,  2)
    WOP (CTR_END)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = p;
    v128 m;
    CTR_START (m, 0)
    do
    {
      AES_E_MC_m (w[0])
      AES_E_MC_m (w[1])
      w += 2;
    }
    while (w != wEnd);
    AES_E_MC_m (w[0])
    AES_E_m    (w[1])
    MM_XOR_m   (w[2])
    CTR_END (m, 0)
  }

  p[-2] = vreinterpretq_u8_u64(ctr);
}

#endif // USE_HW_AES

#endif // MY_CPU_ARM_OR_ARM64

#undef NUM_WAYS
#undef WOP_M1
#undef WOP
#undef DECLARE_VAR
#undef LOAD_data
#undef STORE_data
#undef USE_INTEL_AES
#undef USE_HW_AES