/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
2023-04-02 : Igor Pavlov : Public domain */

#include "Precomp.h"

#include "Aes.h"
#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64

  #if defined(__INTEL_COMPILER)
    #if (__INTEL_COMPILER >= 1110)
      #define USE_INTEL_AES
      #if (__INTEL_COMPILER >= 1900)
        #define USE_INTEL_VAES
      #endif
    #endif
  #elif defined(__clang__) && (__clang_major__ > 3 || __clang_major__ == 3 && __clang_minor__ >= 8) \
     || defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4)
      #define USE_INTEL_AES
      #if !defined(__AES__)
        #define ATTRIB_AES __attribute__((__target__("aes")))
      #endif
    #if defined(__clang__) && (__clang_major__ >= 8) \
        || defined(__GNUC__) && (__GNUC__ >= 8)
      #define USE_INTEL_VAES
      #if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__)
        #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2")))
      #endif
    #endif
  #elif defined(_MSC_VER)
    #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
      #define USE_INTEL_AES
      #if (_MSC_VER >= 1910)
        #define USE_INTEL_VAES
      #endif
    #endif
  #endif

#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif
#ifndef ATTRIB_VAES
  #define ATTRIB_VAES
#endif


#ifdef USE_INTEL_AES

#include <wmmintrin.h>

#ifndef USE_INTEL_VAES
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte
// #define AES_TYPE_keys __m128i
// #define AES_TYPE_data __m128i
#endif

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
    // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

#define MM_OP(op, dest, src)  dest = op(dest, src);
#define MM_OP_m(op, src)      MM_OP(op, m, src)

#define MM_XOR( dest, src)    MM_OP(_mm_xor_si128,    dest, src)
#define AVX_XOR(dest, src)    MM_OP(_mm256_xor_si256, dest, src)
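/* For illustration, these helper macros expand to plain intrinsic statements, e.g.
     MM_XOR (m, k0)                   ->  m = _mm_xor_si128(m, k0);
     MM_OP_m (_mm_aesenc_si128, k1)   ->  m = _mm_aesenc_si128(m, k1); */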


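/* Layout of the ivAes block, as read by the functions below:
     p[0]                        IV (CBC) or counter (CTR)
     *(const UInt32 *)(p + 1)    number of AES rounds / 2  (5, 6 or 7 for AES-128/192/256)
     p[2], p[3], ...             expanded round keys */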
AES_FUNC_START2 (AesCbc_Encode_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i m = *p;
  const __m128i k0 = p[2];
  const __m128i k1 = p[3];
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
  for (; numBlocks != 0; numBlocks--, data++)
  {
    UInt32 r = numRounds2;
    const __m128i *w = p + 4;
    __m128i temp = *data;
    MM_XOR (temp, k0)
    MM_XOR (m, temp)
    MM_OP_m (_mm_aesenc_si128, k1)
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--r);
    MM_OP_m (_mm_aesenclast_si128, w[0])
    *data = m;
  }
  *p = m;
}
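/* Note: CBC encryption is inherently sequential (each block is chained through m),
   so unlike CBC decryption and CTR mode below it cannot process several blocks in
   parallel. */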


#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
#ifdef MY_CPU_AMD64
#define WOP_5(op)   WOP_4 (op)  op (m4, 4)
#define WOP_6(op)   WOP_5 (op)  op (m5, 5)
#define WOP_7(op)   WOP_6 (op)  op (m6, 6)
#define WOP_8(op)   WOP_7 (op)  op (m7, 7)
#endif
/*
#define WOP_9(op)   WOP_8 (op)  op (m8, 8);
#define WOP_10(op)  WOP_9 (op)  op (m9, 9);
#define WOP_11(op)  WOP_10(op)  op (m10, 10);
#define WOP_12(op)  WOP_11(op)  op (m11, 11);
#define WOP_13(op)  WOP_12(op)  op (m12, 12);
#define WOP_14(op)  WOP_13(op)  op (m13, 13);
*/

#ifdef MY_CPU_AMD64
  #define NUM_WAYS      8
  #define WOP_M1    WOP_8
#else
  #define NUM_WAYS      4
  #define WOP_M1    WOP_4
#endif

#define WOP(op)  op (m0, 0)  WOP_M1(op)
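/* For illustration, with NUM_WAYS == 4:
     WOP (LOAD_data)   ->   m0 = data[0]; m1 = data[1]; m2 = data[2]; m3 = data[3];
   i.e. WOP(op) applies op to NUM_WAYS independent block registers, so the AES rounds
   of several blocks are interleaved to hide instruction latency. */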


#define DECLARE_VAR(reg, ii)  __m128i reg;
#define LOAD_data(  reg, ii)  reg = data[ii];
#define STORE_data( reg, ii)  data[ii] = reg;
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii - 1])
#endif

#define AVX_DECLARE_VAR(reg, ii)  __m256i reg;
#define AVX_LOAD_data(  reg, ii)  reg = ((const __m256i *)(const void *)data)[ii];
#define AVX_STORE_data( reg, ii)  ((__m256i *)(void *)data)[ii] = reg;
#define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))

#define MM_OP_key(op, reg)  MM_OP(op, reg, key);

#define AES_DEC(      reg, ii)   MM_OP_key (_mm_aesdec_si128,     reg)
#define AES_DEC_LAST( reg, ii)   MM_OP_key (_mm_aesdeclast_si128, reg)
#define AES_ENC(      reg, ii)   MM_OP_key (_mm_aesenc_si128,     reg)
#define AES_ENC_LAST( reg, ii)   MM_OP_key (_mm_aesenclast_si128, reg)
#define AES_XOR(      reg, ii)   MM_OP_key (_mm_xor_si128,        reg)


#define AVX_AES_DEC(      reg, ii)   MM_OP_key (_mm256_aesdec_epi128,     reg)
#define AVX_AES_DEC_LAST( reg, ii)   MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX_AES_ENC(      reg, ii)   MM_OP_key (_mm256_aesenc_epi128,     reg)
#define AVX_AES_ENC_LAST( reg, ii)   MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX_AES_XOR(      reg, ii)   MM_OP_key (_mm256_xor_si256,         reg)

#define CTR_START(reg, ii)  MM_OP (_mm_add_epi64, ctr, one)  reg = ctr;
#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)

#define AVX_CTR_START(reg, ii)  MM_OP (_mm256_add_epi64, ctr2, two)  reg = _mm256_xor_si256(ctr2, key);
#define AVX_CTR_END(  reg, ii)  AVX_XOR (((__m256i *)(void *)data)[ii], reg)
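/* CTR_START advances the 64-bit counter in the low lane of ctr and copies it into reg;
   CTR_END XORs the resulting keystream block into data[ii].  AVX_CTR_START does the
   same for a pair of counters held in ctr2 and also folds in the first round key (key). */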

#define WOP_KEY(op, n) { \
    const __m128i key = w[n]; \
    WOP(op); }

#define AVX_WOP_KEY(op, n) { \
    const __m256i key = w[n]; \
    WOP(op); }
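/* For illustration, with NUM_WAYS == 4:
     WOP_KEY (AES_ENC, 0)   expands (roughly) to
     { const __m128i key = w[0];
       m0 = _mm_aesenc_si128(m0, key);
       m1 = _mm_aesenc_si128(m1, key);
       m2 = _mm_aesenc_si128(m2, key);
       m3 = _mm_aesenc_si128(m3, key); } */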


#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \


#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }  \


#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)
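/* Usage of the loop macros: WIDE_LOOP_START / WIDE_LOOP_END process NUM_WAYS blocks
   per iteration while at least NUM_WAYS blocks remain; SINGLE_LOOP then handles the
   remaining 0 .. NUM_WAYS-1 blocks one at a time. */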


#define NUM_AES_KEYS_MAX 15

#define WIDE_LOOP_START_AVX(OP)  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS * 2)  \
    { __m256i keys[NUM_AES_KEYS_MAX]; \
    UInt32 ii; \
    OP \
    for (ii = 0; ii < numRounds; ii++) \
      keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \
    dataEnd -= NUM_WAYS * 2; do {  \


#define WIDE_LOOP_END_AVX(OP)  \
    data += NUM_WAYS * 2;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS * 2;  \
    OP  \
    _mm256_zeroupper();  \
    }  \

/* MSVC for x86: even if we don't call _mm256_zeroupper() and -arch:IA32 is not specified,
   MSVC can still insert a vzeroupper instruction on its own. */
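/* The *_AVX loop macros process NUM_WAYS * 2 blocks per iteration: each __m256i holds
   two 16-byte blocks, and every 128-bit round key is broadcast once into both lanes of
   an entry of keys[] before the wide loop starts. */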


AES_FUNC_START2 (AesCbc_Decode_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i iv = *p;
  const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
  const __m128i *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const __m128i *w = wStart;

    WOP (DECLARE_VAR)
    WOP (LOAD_data)
    WOP_KEY (AES_XOR, 1)

    do
    {
      WOP_KEY (AES_DEC, 0)
      w--;
    }
    while (w != p);
    WOP_KEY (AES_DEC_LAST, 0)

    MM_XOR (m0, iv)
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];
    WOP (STORE_data)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const __m128i *w = wStart - 1;
    __m128i m = _mm_xor_si128 (w[2], *data);
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1])
      MM_OP_m (_mm_aesdec_si128, w[0])
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128,     w[1])
    MM_OP_m (_mm_aesdeclast_si128, w[0])

    MM_XOR (m, iv)
    iv = *data;
    *data = m;
  }

  p[-2] = iv;
}


AES_FUNC_START2 (AesCtr_Code_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i ctr = *p;
  UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);

  p += 2;

  WIDE_LOOP_START
  {
    const __m128i *w = p;
    UInt32 r = numRoundsMinus2;
    WOP (DECLARE_VAR)
    WOP (CTR_START)
    WOP_KEY (AES_XOR, 0)
    w += 1;
    do
    {
      WOP_KEY (AES_ENC, 0)
      w += 1;
    }
    while (--r);
    WOP_KEY (AES_ENC_LAST, 0)

    WOP (CTR_END)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one)
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128,     w[0])
    MM_OP_m (_mm_aesenclast_si128, w[1])
    MM_XOR (*data, m)
  }

  p[-2] = ctr;
}



#ifdef USE_INTEL_VAES

/*
GCC before 2013-Jun:
  <immintrin.h>:
    #ifdef __AVX__
     #include <avxintrin.h>
    #endif
GCC after 2013-Jun:
  <immintrin.h>:
    #include <avxintrin.h>
CLANG 3.8+:
{
  <immintrin.h>:
    #if !defined(_MSC_VER) || defined(__AVX__)
      #include <avxintrin.h>
    #endif

  if (the compiler is clang for Windows and if global arch is not set for __AVX__)
    [ if (defined(_MSC_VER) && !defined(__AVX__)) ]
  {
    <immintrin.h> doesn't include <avxintrin.h>
    and we have 2 ways to fix it:
      1) we can define required __AVX__ before <immintrin.h>
      or
      2) we can include <avxintrin.h> after <immintrin.h>
  }
}

If we include <avxintrin.h> manually for GCC/CLANG,
<immintrin.h> must be included before <avxintrin.h>.
*/

/*
#if defined(__clang__) && defined(_MSC_VER)
#define __AVX__
#define __AVX2__
#define __VAES__
#endif
*/

#include <immintrin.h>
#if defined(__clang__) && defined(_MSC_VER)
  #if !defined(__AVX__)
    #include <avxintrin.h>
  #endif
  #if !defined(__AVX2__)
    #include <avx2intrin.h>
  #endif
  #if !defined(__VAES__)
    #include <vaesintrin.h>
  #endif
#endif  // __clang__ && _MSC_VER


#define VAES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_VAES \
AES_FUNC_START (name)

VAES_FUNC_START2 (AesCbc_Decode_HW_256)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i iv = *p;
  const __m128i *dataEnd;
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  p += 2;

  WIDE_LOOP_START_AVX(;)
  {
    const __m256i *w = keys + numRounds - 2;

    WOP (AVX_DECLARE_VAR)
    WOP (AVX_LOAD_data)
    AVX_WOP_KEY (AVX_AES_XOR, 1)

    do
    {
      AVX_WOP_KEY (AVX_AES_DEC, 0)
      w--;
    }
    while (w != keys);
    AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)

    AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]))
    WOP_M1 (AVX_XOR_data_M1)
    iv = data[NUM_WAYS * 2 - 1];
    WOP (AVX_STORE_data)
  }
  WIDE_LOOP_END_AVX(;)

  SINGLE_LOOP
  {
    const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
    __m128i m = _mm_xor_si128 (w[2], *data);
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1])
      MM_OP_m (_mm_aesdec_si128, w[0])
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128,     w[1])
    MM_OP_m (_mm_aesdeclast_si128, w[0])

    MM_XOR (m, iv)
    iv = *data;
    *data = m;
  }

  p[-2] = iv;
}


/*
SSE2: _mm_cvtsi32_si128 : movd
AVX:  _mm256_setr_m128i            : vinsertf128
AVX2: _mm256_add_epi64             : vpaddq ymm, ymm, ymm
      _mm256_extracti128_si256     : vextracti128
      _mm256_broadcastsi128_si256  : vbroadcasti128
*/

#define AVX_CTR_LOOP_START  \
    ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
    two = _mm256_setr_m128i(one, one); \
    two = _mm256_add_epi64(two, two); \

// two = _mm256_setr_epi64x(2, 0, 2, 0);

#define AVX_CTR_LOOP_ENC  \
    ctr = _mm256_extracti128_si256 (ctr2, 1); \

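/* AVX_CTR_LOOP_START seeds ctr2 with the counter pair { ctr - 1, ctr } and sets
   two = { 2, 0, 2, 0 } (four 64-bit lanes), so each AVX_CTR_START step advances both
   counters by 2; AVX_CTR_LOOP_ENC writes the high 128-bit lane of ctr2 back to ctr
   so that SINGLE_LOOP continues from the correct counter value. */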
VAES_FUNC_START2 (AesCtr_Code_HW_256)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i ctr = *p;
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);
  __m256i ctr2, two;
  p += 2;

  WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START)
  {
    const __m256i *w = keys;
    UInt32 r = numRounds - 2;
    WOP (AVX_DECLARE_VAR)
    AVX_WOP_KEY (AVX_CTR_START, 0)

    w += 1;
    do
    {
      AVX_WOP_KEY (AVX_AES_ENC, 0)
      w += 1;
    }
    while (--r);
    AVX_WOP_KEY (AVX_AES_ENC_LAST, 0)

    WOP (AVX_CTR_END)
  }
  WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC)

  SINGLE_LOOP
  {
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one)
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128,     w[0])
    MM_OP_m (_mm_aesenclast_si128, w[1])
    MM_XOR (*data, m)
  }

  p[-2] = ctr;
}

#endif // USE_INTEL_VAES

#else // USE_INTEL_AES

/* no USE_INTEL_AES */

#pragma message("AES  HW_SW stub was used")

#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \

#define AES_COMPAT_STUB(name) \
    AES_FUNC_START(name); \
    AES_FUNC_START(name ## _HW) \
    { name(p, data, numBlocks); }
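/* These stubs keep the *_HW entry points available when AES-NI code cannot be
   generated: each *_HW function simply forwards to the corresponding portable
   implementation with the same arguments. */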

AES_COMPAT_STUB (AesCbc_Encode)
AES_COMPAT_STUB (AesCbc_Decode)
AES_COMPAT_STUB (AesCtr_Code)

#endif // USE_INTEL_AES


#ifndef USE_INTEL_VAES

#pragma message("VAES HW_SW stub was used")

#define VAES_COMPAT_STUB(name) \
    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
    { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }
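/* Similar fallback for the wide variants: without VAES support the *_256 entry
   points forward to the (128-bit or stub) *_HW implementations above. */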

VAES_COMPAT_STUB (AesCbc_Decode_HW)
VAES_COMPAT_STUB (AesCtr_Code_HW)

#endif // ! USE_INTEL_VAES


#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)

  #if defined(__clang__)
    #if (__clang_major__ >= 8) // fix that check
      #define USE_HW_AES
    #endif
  #elif defined(__GNUC__)
    #if (__GNUC__ >= 6) // fix that check
      #define USE_HW_AES
    #endif
  #elif defined(_MSC_VER)
    #if _MSC_VER >= 1910
      #define USE_HW_AES
    #endif
  #endif

#ifdef USE_HW_AES

// #pragma message("=== AES HW === ")

#if defined(__clang__) || defined(__GNUC__)
  #ifdef MY_CPU_ARM64
    #define ATTRIB_AES __attribute__((__target__("+crypto")))
  #else
    #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
  #endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif

#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

typedef uint8x16_t v128;

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
    // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

#define MM_OP(op, dest, src)  dest = op(dest, src);
#define MM_OP_m(op, src)      MM_OP(op, m, src)
#define MM_OP1_m(op)          m = op(m);

#define MM_XOR( dest, src)    MM_OP(veorq_u8, dest, src)
#define MM_XOR_m( src)        MM_XOR(m, src)

#define AES_E_m(k)     MM_OP_m (vaeseq_u8, k)
#define AES_E_MC_m(k)  AES_E_m (k)  MM_OP1_m(vaesmcq_u8)
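/* vaeseq_u8 (AESE) performs AddRoundKey + SubBytes + ShiftRows and vaesmcq_u8 (AESMC)
   performs MixColumns, so AES_E_MC_m(k) is one full AES encryption round with round
   key k; AES_E_m is used for the final round, which has no MixColumns. */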


AES_FUNC_START2 (AesCbc_Encode_HW)
{
  v128 *p = (v128*)(void*)ivAes;
  v128 *data = (v128*)(void*)data8;
  v128 m = *p;
  const v128 k0 = p[2];
  const v128 k1 = p[3];
  const v128 k2 = p[4];
  const v128 k3 = p[5];
  const v128 k4 = p[6];
  const v128 k5 = p[7];
  const v128 k6 = p[8];
  const v128 k7 = p[9];
  const v128 k8 = p[10];
  const v128 k9 = p[11];
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  const v128 *w = p + ((size_t)numRounds2 * 2);
  const v128 k_z1 = w[1];
  const v128 k_z0 = w[2];
  for (; numBlocks != 0; numBlocks--, data++)
  {
    MM_XOR_m (*data);
    AES_E_MC_m (k0)
    AES_E_MC_m (k1)
    AES_E_MC_m (k2)
    AES_E_MC_m (k3)
    AES_E_MC_m (k4)
    AES_E_MC_m (k5)
    AES_E_MC_m (k6)
    AES_E_MC_m (k7)
    AES_E_MC_m (k8)
    if (numRounds2 >= 6)
    {
      AES_E_MC_m (k9)
      AES_E_MC_m (p[12])
      if (numRounds2 != 6)
      {
        AES_E_MC_m (p[13])
        AES_E_MC_m (p[14])
      }
    }
    AES_E_m  (k_z1)
    MM_XOR_m (k_z0);
    *data = m;
  }
  *p = m;
}


#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
#define WOP_5(op)   WOP_4 (op)  op (m4, 4)
#define WOP_6(op)   WOP_5 (op)  op (m5, 5)
#define WOP_7(op)   WOP_6 (op)  op (m6, 6)
#define WOP_8(op)   WOP_7 (op)  op (m7, 7)

  #define NUM_WAYS      8
  #define WOP_M1    WOP_8

#define WOP(op)  op (m0, 0)   WOP_M1(op)

#define DECLARE_VAR(reg, ii)  v128 reg;
#define LOAD_data(  reg, ii)  reg = data[ii];
#define STORE_data( reg, ii)  data[ii] = reg;
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii - 1])
#endif

#define MM_OP_key(op, reg)  MM_OP (op, reg, key)

#define AES_D_m(k)      MM_OP_m (vaesdq_u8, k)
#define AES_D_IMC_m(k)  AES_D_m (k)  MM_OP1_m (vaesimcq_u8)
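/* vaesdq_u8 (AESD) performs AddRoundKey + inverse SubBytes/ShiftRows and
   vaesimcq_u8 (AESIMC) performs inverse MixColumns; together (AES_D_IMC_m)
   they form one AES decryption round. */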

#define AES_XOR(   reg, ii)  MM_OP_key (veorq_u8,  reg)
#define AES_D(     reg, ii)  MM_OP_key (vaesdq_u8, reg)
#define AES_E(     reg, ii)  MM_OP_key (vaeseq_u8, reg)

#define AES_D_IMC( reg, ii)  AES_D (reg, ii)  reg = vaesimcq_u8(reg);
#define AES_E_MC(  reg, ii)  AES_E (reg, ii)  reg = vaesmcq_u8(reg);

#define CTR_START(reg, ii)  MM_OP (vaddq_u64, ctr, one)  reg = vreinterpretq_u8_u64(ctr);
#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)

#define WOP_KEY(op, n) { \
    const v128 key = w[n]; \
    WOP(op) }

#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \

#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }  \

#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)


AES_FUNC_START2 (AesCbc_Decode_HW)
{
  v128 *p = (v128*)(void*)ivAes;
  v128 *data = (v128*)(void*)data8;
  v128 iv = *p;
  const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = wStart;
    WOP (DECLARE_VAR)
    WOP (LOAD_data)
    WOP_KEY (AES_D_IMC, 2)
    do
    {
      WOP_KEY (AES_D_IMC, 1)
      WOP_KEY (AES_D_IMC, 0)
      w -= 2;
    }
    while (w != p);
    WOP_KEY (AES_D,   1)
    WOP_KEY (AES_XOR, 0)
    MM_XOR (m0, iv);
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];
    WOP (STORE_data)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = wStart;
    v128 m = *data;
    AES_D_IMC_m (w[2])
    do
    {
      AES_D_IMC_m (w[1]);
      AES_D_IMC_m (w[0]);
      w -= 2;
    }
    while (w != p);
    AES_D_m  (w[1]);
    MM_XOR_m (w[0]);
    MM_XOR_m (iv);
    iv = *data;
    *data = m;
  }

  p[-2] = iv;
}


AES_FUNC_START2 (AesCtr_Code_HW)
{
  v128 *p = (v128*)(void*)ivAes;
  v128 *data = (v128*)(void*)data8;
  uint64x2_t ctr = vreinterpretq_u64_u8(*p);
  const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  uint64x2_t one = vdupq_n_u64(0);
  one = vsetq_lane_u64(1, one, 0);
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = p;
    WOP (DECLARE_VAR)
    WOP (CTR_START)
    do
    {
      WOP_KEY (AES_E_MC, 0)
      WOP_KEY (AES_E_MC, 1)
      w += 2;
    }
    while (w != wEnd);
    WOP_KEY (AES_E_MC, 0)
    WOP_KEY (AES_E,    1)
    WOP_KEY (AES_XOR,  2)
    WOP (CTR_END)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = p;
    v128 m;
    CTR_START (m, 0);
    do
    {
      AES_E_MC_m (w[0]);
      AES_E_MC_m (w[1]);
      w += 2;
    }
    while (w != wEnd);
    AES_E_MC_m (w[0])
    AES_E_m    (w[1])
    MM_XOR_m   (w[2])
    CTR_END (m, 0)
  }

  p[-2] = vreinterpretq_u8_u64(ctr);
}

#endif // USE_HW_AES

#endif // MY_CPU_ARM_OR_ARM64

#undef NUM_WAYS
#undef WOP_M1
#undef WOP
#undef DECLARE_VAR
#undef LOAD_data
#undef STORE_data
#undef USE_INTEL_AES
#undef USE_HW_AES