/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
2024-03-01 : Igor Pavlov : Public domain */

#include "Precomp.h"

#include "Aes.h"
#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64

  #if defined(__INTEL_COMPILER)
    #if (__INTEL_COMPILER >= 1110)
      #define USE_INTEL_AES
      #if (__INTEL_COMPILER >= 1900)
        #define USE_INTEL_VAES
      #endif
    #endif
  #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
     || defined(Z7_GCC_VERSION)   && (Z7_GCC_VERSION   >= 40400)
        #define USE_INTEL_AES
        #if !defined(__AES__)
          #define ATTRIB_AES __attribute__((__target__("aes")))
        #endif
      #if defined(__clang__) && (__clang_major__ >= 8) \
          || defined(__GNUC__) && (__GNUC__ >= 8)
        #define USE_INTEL_VAES
        #if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__)
          #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2")))
        #endif
      #endif
  #elif defined(_MSC_VER)
    #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
      #define USE_INTEL_AES
      #if (_MSC_VER >= 1910)
        #define USE_INTEL_VAES
      #endif
    #endif
    #ifndef USE_INTEL_AES
      #define Z7_USE_AES_HW_STUB
    #endif
    #ifndef USE_INTEL_VAES
      #define Z7_USE_VAES_HW_STUB
    #endif
  #endif

    #ifndef USE_INTEL_AES
      // #define Z7_USE_AES_HW_STUB // for debug
    #endif
    #ifndef USE_INTEL_VAES
      // #define Z7_USE_VAES_HW_STUB // for debug
    #endif


#ifdef USE_INTEL_AES

#include <wmmintrin.h>

#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte
// #define AES_TYPE_keys __m128i
// #define AES_TYPE_data __m128i
#endif

#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
    // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

#define MM_OP(op, dest, src)  dest = op(dest, src);
#define MM_OP_m(op, src)      MM_OP(op, m, src)

#define MM_XOR( dest, src)    MM_OP(_mm_xor_si128,    dest, src)

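/*
Layout of the ivAes / p buffer, as implied by the index arithmetic in the
functions below (the buffer is prepared by the key-setup code in Aes.c):
  p[0]                     : IV (CBC) or counter block (CTR)
  *(const UInt32 *)(p + 1) : numRounds / 2  (5, 6 or 7 for AES-128/192/256)
  p[2], p[3], ...          : expanded round keys, one 16-byte key per round
*/
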
AES_FUNC_START2 (AesCbc_Encode_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i m = *p;
  const __m128i k0 = p[2];
  const __m128i k1 = p[3];
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
  for (; numBlocks != 0; numBlocks--, data++)
  {
    UInt32 r = numRounds2;
    const __m128i *w = p + 4;
    __m128i temp = *data;
    MM_XOR (temp, k0)
    MM_XOR (m, temp)
    MM_OP_m (_mm_aesenc_si128, k1)
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--r);
    MM_OP_m (_mm_aesenclast_si128, w[0])
    *data = m;
  }
  *p = m;
}


#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
#ifdef MY_CPU_AMD64
#define WOP_5(op)   WOP_4 (op)  op (m4, 4)
#define WOP_6(op)   WOP_5 (op)  op (m5, 5)
#define WOP_7(op)   WOP_6 (op)  op (m6, 6)
#define WOP_8(op)   WOP_7 (op)  op (m7, 7)
#endif
/*
#define WOP_9(op)   WOP_8 (op)  op (m8, 8);
#define WOP_10(op)  WOP_9 (op)  op (m9, 9);
#define WOP_11(op)  WOP_10(op)  op (m10, 10);
#define WOP_12(op)  WOP_11(op)  op (m11, 11);
#define WOP_13(op)  WOP_12(op)  op (m12, 12);
#define WOP_14(op)  WOP_13(op)  op (m13, 13);
*/

#ifdef MY_CPU_AMD64
  #define NUM_WAYS      8
  #define WOP_M1    WOP_8
#else
  #define NUM_WAYS      4
  #define WOP_M1    WOP_4
#endif

#define WOP(op)  op (m0, 0)  WOP_M1(op)


#define DECLARE_VAR(reg, ii)  __m128i reg;
#define LOAD_data(  reg, ii)  reg = data[ii];
#define STORE_data( reg, ii)  data[ii] = reg;
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii - 1])
#endif

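/*
Example expansion (x86-64, NUM_WAYS == 8):
  WOP (DECLARE_VAR)  ->  __m128i m0; __m128i m1; ... __m128i m7;
  WOP (LOAD_data)    ->  m0 = data[0]; m1 = data[1]; ... m7 = data[7];
So each WOP(op) applies op to all NUM_WAYS block registers m0..m(NUM_WAYS-1).
*/
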
#define MM_OP_key(op, reg)  MM_OP(op, reg, key);

#define AES_DEC(      reg, ii)   MM_OP_key (_mm_aesdec_si128,     reg)
#define AES_DEC_LAST( reg, ii)   MM_OP_key (_mm_aesdeclast_si128, reg)
#define AES_ENC(      reg, ii)   MM_OP_key (_mm_aesenc_si128,     reg)
#define AES_ENC_LAST( reg, ii)   MM_OP_key (_mm_aesenclast_si128, reg)
#define AES_XOR(      reg, ii)   MM_OP_key (_mm_xor_si128,        reg)

#define CTR_START(reg, ii)  MM_OP (_mm_add_epi64, ctr, one)  reg = ctr;
#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)

#define WOP_KEY(op, n) { \
    const __m128i key = w[n]; \
    WOP(op); }


#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \


#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }  \


#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)

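/*
Block-processing pattern used by the functions below:
  WIDE_LOOP_START / WIDE_LOOP_END process NUM_WAYS blocks per iteration while at
  least NUM_WAYS blocks remain (dataEnd is temporarily moved back by NUM_WAYS to
  form the loop bound, then restored), and SINGLE_LOOP handles the remaining
  0..NUM_WAYS-1 blocks one at a time.
  WOP_KEY(op, n) loads round key w[n] once into a local and applies op with that
  key to all NUM_WAYS block registers.
*/
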


#ifdef USE_INTEL_VAES

#define AVX_XOR(dest, src)    MM_OP(_mm256_xor_si256, dest, src)
#define AVX_DECLARE_VAR(reg, ii)  __m256i reg;
#define AVX_LOAD_data(  reg, ii)  reg = ((const __m256i *)(const void *)data)[ii];
#define AVX_STORE_data( reg, ii)  ((__m256i *)(void *)data)[ii] = reg;
/*
AVX_XOR_data_M1() needs an unaligned memory load.
If we don't use _mm256_loadu_si256() here:
  Most compilers, with optimizations enabled, generate a fused AVX (LOAD + OP)
  instruction that can load unaligned data.
  But GCC and CLANG without -O1/-O2 optimizations can generate a separate
  aligned load (vmovdqa) instruction that will fail on execution.
Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here.
v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler.
*/
#define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii])))
// for debug only: the following code will fail on execution, if compiled by some compilers:
// #define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))

#define AVX_AES_DEC(      reg, ii)   MM_OP_key (_mm256_aesdec_epi128,     reg)
#define AVX_AES_DEC_LAST( reg, ii)   MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX_AES_ENC(      reg, ii)   MM_OP_key (_mm256_aesenc_epi128,     reg)
#define AVX_AES_ENC_LAST( reg, ii)   MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX_AES_XOR(      reg, ii)   MM_OP_key (_mm256_xor_si256,         reg)
#define AVX_CTR_START(reg, ii)  MM_OP (_mm256_add_epi64, ctr2, two)  reg = _mm256_xor_si256(ctr2, key);
#define AVX_CTR_END(  reg, ii)  AVX_XOR (((__m256i *)(void *)data)[ii], reg)
#define AVX_WOP_KEY(op, n) { \
    const __m256i key = w[n]; \
    WOP(op); }

#define NUM_AES_KEYS_MAX 15

#define WIDE_LOOP_START_AVX(OP)  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS * 2)  \
    { __m256i keys[NUM_AES_KEYS_MAX]; \
    UInt32 ii; \
    OP \
    for (ii = 0; ii < numRounds; ii++) \
      keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \
    dataEnd -= NUM_WAYS * 2; do {  \


#define WIDE_LOOP_END_AVX(OP)  \
    data += NUM_WAYS * 2;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS * 2;  \
    OP  \
    _mm256_zeroupper();  \
    }  \

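/*
VAES path: each __m256i holds two consecutive 16-byte blocks, so one pass of the
AVX wide loop processes NUM_WAYS * 2 blocks. WIDE_LOOP_START_AVX pre-broadcasts
every 128-bit round key into both 128-bit lanes of keys[] with
_mm256_broadcastsi128_si256, so the _mm256_aes*_epi128 steps process the two
lanes independently with the same round key.
*/
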
/* MSVC for x86: if we don't call _mm256_zeroupper() and -arch:IA32 is not specified,
   MSVC can still insert a vzeroupper instruction itself. */

#endif



AES_FUNC_START2 (AesCbc_Decode_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i iv = *p;
  const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
  const __m128i *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const __m128i *w = wStart;

    WOP (DECLARE_VAR)
    WOP (LOAD_data)
    WOP_KEY (AES_XOR, 1)

    do
    {
      WOP_KEY (AES_DEC, 0)
      w--;
    }
    while (w != p);
    WOP_KEY (AES_DEC_LAST, 0)

    MM_XOR (m0, iv)
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];
    WOP (STORE_data)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const __m128i *w = wStart - 1;
    __m128i m = _mm_xor_si128 (w[2], *data);
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1])
      MM_OP_m (_mm_aesdec_si128, w[0])
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128,     w[1])
    MM_OP_m (_mm_aesdeclast_si128, w[0])

    MM_XOR (m, iv)
    iv = *data;
    *data = m;
  }

  p[-2] = iv;
}


AES_FUNC_START2 (AesCtr_Code_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i ctr = *p;
  UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);

  p += 2;

  WIDE_LOOP_START
  {
    const __m128i *w = p;
    UInt32 r = numRoundsMinus2;
    WOP (DECLARE_VAR)
    WOP (CTR_START)
    WOP_KEY (AES_XOR, 0)
    w += 1;
    do
    {
      WOP_KEY (AES_ENC, 0)
      w += 1;
    }
    while (--r);
    WOP_KEY (AES_ENC_LAST, 0)

    WOP (CTR_END)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one)
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128,     w[0])
    MM_OP_m (_mm_aesenclast_si128, w[1])
    MM_XOR (*data, m)
  }

  p[-2] = ctr;
}

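/*
Note on the CTR variant used here: the counter is advanced by adding 1 to the
low 64-bit lane of the counter block (`one` holds 1 only in lane 0 of
_mm_add_epi64), i.e. the low 8 bytes of the IV act as a little-endian 64-bit
counter and the high 8 bytes are left unchanged. The counter is incremented
before each block is processed, and the updated value is written back via p[-2].
*/
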


#ifdef USE_INTEL_VAES

/*
GCC before 2013-Jun:
  <immintrin.h>:
    #ifdef __AVX__
     #include <avxintrin.h>
    #endif
GCC after 2013-Jun:
  <immintrin.h>:
    #include <avxintrin.h>
CLANG 3.8+:
{
  <immintrin.h>:
    #if !defined(_MSC_VER) || defined(__AVX__)
      #include <avxintrin.h>
    #endif

  if the compiler is clang for Windows and the global arch does not enable __AVX__
    [ i.e. if (defined(_MSC_VER) && !defined(__AVX__)) ]
  {
    <immintrin.h> doesn't include <avxintrin.h>,
    and we have 2 ways to fix it:
      1) we can define the required __AVX__ before <immintrin.h>
      or
      2) we can include <avxintrin.h> after <immintrin.h>
  }
}

If we include <avxintrin.h> manually for GCC/CLANG, then <immintrin.h>
must be included before <avxintrin.h>.
*/

/*
#if defined(__clang__) && defined(_MSC_VER)
#define __AVX__
#define __AVX2__
#define __VAES__
#endif
*/

#include <immintrin.h>
#if defined(__clang__) && defined(_MSC_VER)
  #if !defined(__AVX__)
    #include <avxintrin.h>
  #endif
  #if !defined(__AVX2__)
    #include <avx2intrin.h>
  #endif
  #if !defined(__VAES__)
    #include <vaesintrin.h>
  #endif
#endif  // __clang__ && _MSC_VER

#ifndef ATTRIB_VAES
  #define ATTRIB_VAES
#endif

#define VAES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_VAES \
AES_FUNC_START (name)

VAES_FUNC_START2 (AesCbc_Decode_HW_256)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i iv = *p;
  const __m128i *dataEnd;
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  p += 2;

  WIDE_LOOP_START_AVX(;)
  {
    const __m256i *w = keys + numRounds - 2;

    WOP (AVX_DECLARE_VAR)
    WOP (AVX_LOAD_data)
    AVX_WOP_KEY (AVX_AES_XOR, 1)

    do
    {
      AVX_WOP_KEY (AVX_AES_DEC, 0)
      w--;
    }
    while (w != keys);
    AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)

    AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]))
    WOP_M1 (AVX_XOR_data_M1)
    iv = data[NUM_WAYS * 2 - 1];
    WOP (AVX_STORE_data)
  }
  WIDE_LOOP_END_AVX(;)

  SINGLE_LOOP
  {
    const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
    __m128i m = _mm_xor_si128 (w[2], *data);
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1])
      MM_OP_m (_mm_aesdec_si128, w[0])
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128,     w[1])
    MM_OP_m (_mm_aesdeclast_si128, w[0])

    MM_XOR (m, iv)
    iv = *data;
    *data = m;
  }

  p[-2] = iv;
}


/*
SSE2: _mm_cvtsi32_si128 : movd
AVX:  _mm256_setr_m128i            : vinsertf128
AVX2: _mm256_add_epi64             : vpaddq ymm, ymm, ymm
      _mm256_extracti128_si256     : vextracti128
      _mm256_broadcastsi128_si256  : vbroadcasti128
*/

#define AVX_CTR_LOOP_START  \
    ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
    two = _mm256_setr_m128i(one, one); \
    two = _mm256_add_epi64(two, two); \

// two = _mm256_setr_epi64x(2, 0, 2, 0);

#define AVX_CTR_LOOP_ENC  \
    ctr = _mm256_extracti128_si256 (ctr2, 1); \

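/*
AVX_CTR_LOOP_START seeds ctr2 with the counter block pair (ctr - 1, ctr) and
sets `two` to 2 in the low 64-bit lane of each half, so every AVX_CTR_START
step advances both counter blocks by 2 and immediately XORs them with the first
round key. AVX_CTR_LOOP_ENC stores the high half back into the scalar ctr so
that the SINGLE_LOOP tail continues from the correct counter value.
*/
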
VAES_FUNC_START2 (AesCtr_Code_HW_256)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i ctr = *p;
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);
  __m256i ctr2, two;
  p += 2;

  WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START)
  {
    const __m256i *w = keys;
    UInt32 r = numRounds - 2;
    WOP (AVX_DECLARE_VAR)
    AVX_WOP_KEY (AVX_CTR_START, 0)

    w += 1;
    do
    {
      AVX_WOP_KEY (AVX_AES_ENC, 0)
      w += 1;
    }
    while (--r);
    AVX_WOP_KEY (AVX_AES_ENC_LAST, 0)

    WOP (AVX_CTR_END)
  }
  WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC)

  SINGLE_LOOP
  {
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one)
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128,     w[0])
    MM_OP_m (_mm_aesenclast_si128, w[1])
    MM_XOR (*data, m)
  }

  p[-2] = ctr;
}

#endif // USE_INTEL_VAES

#else // USE_INTEL_AES

/* no USE_INTEL_AES */

#if defined(Z7_USE_AES_HW_STUB)
// We can compile this file with another C compiler,
// or we can compile an asm version,
// so that real code is generated instead of this stub function.
// #if defined(_MSC_VER)
#pragma message("AES  HW_SW stub was used")
// #endif

#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte
#endif

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \

#define AES_COMPAT_STUB(name) \
    AES_FUNC_START(name); \
    AES_FUNC_START(name ## _HW) \
    { name(p, data, numBlocks); }

AES_COMPAT_STUB (AesCbc_Encode)
AES_COMPAT_STUB (AesCbc_Decode)
AES_COMPAT_STUB (AesCtr_Code)
#endif // Z7_USE_AES_HW_STUB
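// Note: AES_COMPAT_STUB makes each *_HW entry point simply forward to the
// portable C implementation (AesCbc_Encode / AesCbc_Decode / AesCtr_Code from
// Aes.c), so callers can always use the *_HW symbols regardless of whether
// hardware AES code was compiled in.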

#endif // USE_INTEL_AES


#ifndef USE_INTEL_VAES
#if defined(Z7_USE_VAES_HW_STUB)
// #if defined(_MSC_VER)
#pragma message("VAES HW_SW stub was used")
// #endif

#define VAES_COMPAT_STUB(name) \
    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
    { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }

VAES_COMPAT_STUB (AesCbc_Decode_HW)
VAES_COMPAT_STUB (AesCtr_Code_HW)
#endif
#endif // ! USE_INTEL_VAES




#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)

  #if   defined(__ARM_FEATURE_AES) \
     || defined(__ARM_FEATURE_CRYPTO)
    #define USE_HW_AES
  #else
    #if  defined(MY_CPU_ARM64) \
      || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
      || defined(Z7_MSC_VER_ORIGINAL)
    #if  defined(__ARM_FP) && \
          (   defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
           || defined(__GNUC__) && (__GNUC__ >= 6) \
          ) \
      || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
    #if  defined(MY_CPU_ARM64) \
      || !defined(Z7_CLANG_VERSION) \
      || defined(__ARM_NEON) && \
          (Z7_CLANG_VERSION < 170000 || \
           Z7_CLANG_VERSION > 170001)
      #define USE_HW_AES
    #endif
    #endif
    #endif
  #endif

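// Summary of the detection above: USE_HW_AES is set either when the compiler
// already advertises the AES/crypto extension (__ARM_FEATURE_AES /
// __ARM_FEATURE_CRYPTO), or when the compiler is recent enough that the
// per-function target attribute defined below can enable it; the extra clang
// version checks exclude configurations where these intrinsics are known to be
// broken (see the clang-17 note further down).
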
#ifdef USE_HW_AES

// #pragma message("=== AES HW === ")
// __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_AES

#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_AES) && \
    !defined(__ARM_FEATURE_CRYPTO)
  #ifdef MY_CPU_ARM64
#if defined(__clang__)
    #define ATTRIB_AES __attribute__((__target__("crypto")))
#else
    #define ATTRIB_AES __attribute__((__target__("+crypto")))
#endif
  #else
#if defined(__clang__)
    #define ATTRIB_AES __attribute__((__target__("armv8-a,aes")))
#else
    #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
  #endif
#endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif


#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
/*
  clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese
  clang
   3.8.1 : __ARM_NEON             :                    defined(__ARM_FEATURE_CRYPTO)
   7.0.1 : __ARM_NEON             : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
  11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
  13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES)
  16     : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8
*/
#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_AES) && \
    !defined(__ARM_FEATURE_CRYPTO)
//     #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
    Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
    #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
    #define __ARM_FEATURE_CRYPTO 1
// #else
    #define __ARM_FEATURE_AES 1
// #endif
    Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang

#if defined(__clang__)

#if defined(__ARM_ARCH) && __ARM_ARCH < 8
    Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
//    #pragma message("#define __ARM_ARCH 8")
    #undef  __ARM_ARCH
    #define __ARM_ARCH 8
    Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif

#endif // clang

#include <arm_neon.h>

#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
    defined(__ARM_FEATURE_CRYPTO) && \
    defined(__ARM_FEATURE_AES)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
    #undef __ARM_FEATURE_CRYPTO
    #undef __ARM_FEATURE_AES
    #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
//    #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif

#endif // Z7_MSC_VER_ORIGINAL

typedef uint8x16_t v128;

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
    // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

#define MM_OP(op, dest, src)  dest = op(dest, src);
#define MM_OP_m(op, src)      MM_OP(op, m, src)
#define MM_OP1_m(op)          m = op(m);

#define MM_XOR( dest, src)    MM_OP(veorq_u8, dest, src)
#define MM_XOR_m( src)        MM_XOR(m, src)

#define AES_E_m(k)     MM_OP_m (vaeseq_u8, k)
#define AES_E_MC_m(k)  AES_E_m (k)  MM_OP1_m(vaesmcq_u8)

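/*
ARMv8 AES intrinsics bundle the round steps differently than x86 AES-NI:
  vaeseq_u8 (AESE)   = AddRoundKey (XOR with k) + SubBytes + ShiftRows
  vaesmcq_u8 (AESMC) = MixColumns
So AES_E_MC_m(k) is one full middle round, and the last round is AES_E_m with
the next-to-last key followed by a plain XOR with the final round key
(see k_z1 / k_z0 in AesCbc_Encode_HW below). AesCbc_Encode_HW unrolls 9 such
rounds for AES-128 (numRounds2 == 5) and adds 2 or 4 more for AES-192 / AES-256.
*/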

AES_FUNC_START2 (AesCbc_Encode_HW)
{
  v128 * const p = (v128*)(void*)ivAes;
  v128 *data = (v128*)(void*)data8;
  v128 m = *p;
  const v128 k0 = p[2];
  const v128 k1 = p[3];
  const v128 k2 = p[4];
  const v128 k3 = p[5];
  const v128 k4 = p[6];
  const v128 k5 = p[7];
  const v128 k6 = p[8];
  const v128 k7 = p[9];
  const v128 k8 = p[10];
  const v128 k9 = p[11];
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  const v128 *w = p + ((size_t)numRounds2 * 2);
  const v128 k_z1 = w[1];
  const v128 k_z0 = w[2];
  for (; numBlocks != 0; numBlocks--, data++)
  {
    MM_XOR_m (*data)
    AES_E_MC_m (k0)
    AES_E_MC_m (k1)
    AES_E_MC_m (k2)
    AES_E_MC_m (k3)
    AES_E_MC_m (k4)
    AES_E_MC_m (k5)
    AES_E_MC_m (k6)
    AES_E_MC_m (k7)
    AES_E_MC_m (k8)
    if (numRounds2 >= 6)
    {
      AES_E_MC_m (k9)
      AES_E_MC_m (p[12])
      if (numRounds2 != 6)
      {
        AES_E_MC_m (p[13])
        AES_E_MC_m (p[14])
      }
    }
    AES_E_m  (k_z1)
    MM_XOR_m (k_z0)
    *data = m;
  }
  *p = m;
}

#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
#define WOP_5(op)   WOP_4 (op)  op (m4, 4)
#define WOP_6(op)   WOP_5 (op)  op (m5, 5)
#define WOP_7(op)   WOP_6 (op)  op (m6, 6)
#define WOP_8(op)   WOP_7 (op)  op (m7, 7)

  #define NUM_WAYS      8
  #define WOP_M1    WOP_8

#define WOP(op)  op (m0, 0)   WOP_M1(op)

#define DECLARE_VAR(reg, ii)  v128 reg;
#define LOAD_data(  reg, ii)  reg = data[ii];
#define STORE_data( reg, ii)  data[ii] = reg;
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii - 1])
#endif

#define MM_OP_key(op, reg)  MM_OP (op, reg, key)

#define AES_D_m(k)      MM_OP_m (vaesdq_u8, k)
#define AES_D_IMC_m(k)  AES_D_m (k)  MM_OP1_m (vaesimcq_u8)

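// Decryption is the mirror image: vaesdq_u8 (AESD) = AddRoundKey + inverse
// ShiftRows + inverse SubBytes, and vaesimcq_u8 (AESIMC) = inverse MixColumns,
// so AES_D_IMC_m / AES_D_IMC apply one full decryption round per key.
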
#define AES_XOR(   reg, ii)  MM_OP_key (veorq_u8,  reg)
#define AES_D(     reg, ii)  MM_OP_key (vaesdq_u8, reg)
#define AES_E(     reg, ii)  MM_OP_key (vaeseq_u8, reg)

#define AES_D_IMC( reg, ii)  AES_D (reg, ii)  reg = vaesimcq_u8(reg);
#define AES_E_MC(  reg, ii)  AES_E (reg, ii)  reg = vaesmcq_u8(reg);

#define CTR_START(reg, ii)  MM_OP (vaddq_u64, ctr, one)  reg = vreinterpretq_u8_u64(ctr);
#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)

#define WOP_KEY(op, n) { \
    const v128 key = w[n]; \
    WOP(op) }

#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \

#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }  \

#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)


AES_FUNC_START2 (AesCbc_Decode_HW)
{
  v128 *p = (v128*)(void*)ivAes;
  v128 *data = (v128*)(void*)data8;
  v128 iv = *p;
  const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = wStart;
    WOP (DECLARE_VAR)
    WOP (LOAD_data)
    WOP_KEY (AES_D_IMC, 2)
    do
    {
      WOP_KEY (AES_D_IMC, 1)
      WOP_KEY (AES_D_IMC, 0)
      w -= 2;
    }
    while (w != p);
    WOP_KEY (AES_D,   1)
    WOP_KEY (AES_XOR, 0)
    MM_XOR (m0, iv)
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];
    WOP (STORE_data)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = wStart;
    v128 m = *data;
    AES_D_IMC_m (w[2])
    do
    {
      AES_D_IMC_m (w[1])
      AES_D_IMC_m (w[0])
      w -= 2;
    }
    while (w != p);
    AES_D_m  (w[1])
    MM_XOR_m (w[0])
    MM_XOR_m (iv)
    iv = *data;
    *data = m;
  }

  p[-2] = iv;
}


AES_FUNC_START2 (AesCtr_Code_HW)
{
  v128 *p = (v128*)(void*)ivAes;
  v128 *data = (v128*)(void*)data8;
  uint64x2_t ctr = vreinterpretq_u64_u8(*p);
  const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  uint64x2_t one = vdupq_n_u64(0);

// a bug in clang:
// __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
#if defined(__clang__) && (__clang_major__ <= 9)
#pragma GCC diagnostic ignored "-Wvector-conversion"
#endif
  one = vsetq_lane_u64(1, one, 0);
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = p;
    WOP (DECLARE_VAR)
    WOP (CTR_START)
    do
    {
      WOP_KEY (AES_E_MC, 0)
      WOP_KEY (AES_E_MC, 1)
      w += 2;
    }
    while (w != wEnd);
    WOP_KEY (AES_E_MC, 0)
    WOP_KEY (AES_E,    1)
    WOP_KEY (AES_XOR,  2)
    WOP (CTR_END)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = p;
    v128 m;
    CTR_START (m, 0)
    do
    {
      AES_E_MC_m (w[0])
      AES_E_MC_m (w[1])
      w += 2;
    }
    while (w != wEnd);
    AES_E_MC_m (w[0])
    AES_E_m    (w[1])
    MM_XOR_m   (w[2])
    CTR_END (m, 0)
  }

  p[-2] = vreinterpretq_u8_u64(ctr);
}

#endif // USE_HW_AES

#endif // MY_CPU_ARM_OR_ARM64

#undef NUM_WAYS
#undef WOP_M1
#undef WOP
#undef DECLARE_VAR
#undef LOAD_data
#undef STORE_data
#undef USE_INTEL_AES
#undef USE_HW_AES