1 /* AesOpt.c -- AES optimized code for x86 AES hardware instructions
2 2023-04-02 : Igor Pavlov : Public domain */
3
4 #include "Precomp.h"
5
6 #include "Aes.h"
7 #include "CpuArch.h"
8
#ifdef MY_CPU_X86_OR_AMD64

/* Compiler feature detection:
   USE_INTEL_AES  - the compiler supports the AES-NI intrinsics.
   USE_INTEL_VAES - the compiler also supports 256-bit VAES + AVX2 intrinsics.
   ATTRIB_AES / ATTRIB_VAES - per-function target attributes, defined only
   when the required ISA is NOT already enabled for the whole translation
   unit (so the intrinsics compile without global -maes / -mvaes flags). */
#if defined(__INTEL_COMPILER)
#if (__INTEL_COMPILER >= 1110)
#define USE_INTEL_AES
#if (__INTEL_COMPILER >= 1900)
#define USE_INTEL_VAES
#endif
#endif
#elif defined(__clang__) && (__clang_major__ > 3 || __clang_major__ == 3 && __clang_minor__ >= 8) \
   || defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4)
#define USE_INTEL_AES
#if !defined(__AES__)
#define ATTRIB_AES __attribute__((__target__("aes")))
#endif
#if defined(__clang__) && (__clang_major__ >= 8) \
   || defined(__GNUC__) && (__GNUC__ >= 8)
#define USE_INTEL_VAES
#if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__)
#define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2")))
#endif
#endif
#elif defined(_MSC_VER)
#if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
#define USE_INTEL_AES
#if (_MSC_VER >= 1910)
#define USE_INTEL_VAES
#endif
#endif
#endif

// Expand to nothing when the ISA is already enabled globally (or on MSVC,
// which needs no target attribute).
#ifndef ATTRIB_AES
#define ATTRIB_AES
#endif
#ifndef ATTRIB_VAES
#define ATTRIB_VAES
#endif
47
#ifdef USE_INTEL_AES

// AES-NI intrinsics (_mm_aesenc_si128, _mm_aesdec_si128, ...)
#include <wmmintrin.h>

#ifndef USE_INTEL_VAES
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte
// #define AES_TYPE_keys __m128i
// #define AES_TYPE_data __m128i
#endif

// Common prototype: ivAes points at the expanded-key context
// (IV / counter, round-count field, round keys); data8 at numBlocks
// 16-byte blocks that are transformed in place.
#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
// void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)

// Declare the function, apply the target attribute, then define it.
#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

// dest = op(dest, src): all round macros update their register in place.
// The *_m form operates on the local variable literally named 'm'.
#define MM_OP(op, dest, src) dest = op(dest, src);
#define MM_OP_m(op, src) MM_OP(op, m, src)

#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src)
#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src)
74
/* AES-CBC encrypt (AES-NI), one block at a time: CBC chaining is inherently
   serial, so there is no wide loop here.
   Context layout (as __m128i units): p[0] = IV / chaining value,
   the UInt32 at p+1 holds a half-round count (it is used below as a count
   of round PAIRS - presumably numRounds/2; confirm against Aes.c),
   p[2..] = encryption round keys. */
AES_FUNC_START2 (AesCbc_Encode_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i m = *p;            // running chaining value (starts as the IV)
  const __m128i k0 = p[2];   // first round key (initial AddRoundKey)
  const __m128i k1 = p[3];   // second round key
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
  for (; numBlocks != 0; numBlocks--, data++)
  {
    UInt32 r = numRounds2;
    const __m128i *w = p + 4;   // remaining round keys
    __m128i temp = *data;
    MM_XOR (temp, k0)           // plaintext ^ k0
    MM_XOR (m, temp)            // m = prev-cipher ^ plaintext ^ k0
    MM_OP_m (_mm_aesenc_si128, k1)
    do
    {
      // two AES rounds per iteration
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--r);
    MM_OP_m (_mm_aesenclast_si128, w[0])
    *data = m;   // ciphertext also becomes the next chaining value
  }
  *p = m;   // store the chaining value back into the context
}
103
104
/* Multi-way framework: WOP(op) expands op(mI, I) for I = 0 .. NUM_WAYS-1,
   applying one AES step to NUM_WAYS independent blocks so the hardware
   AES pipelines stay filled. */
#define WOP_1(op)
#define WOP_2(op) WOP_1 (op) op (m1, 1)
#define WOP_3(op) WOP_2 (op) op (m2, 2)
#define WOP_4(op) WOP_3 (op) op (m3, 3)
#ifdef MY_CPU_AMD64
#define WOP_5(op) WOP_4 (op) op (m4, 4)
#define WOP_6(op) WOP_5 (op) op (m5, 5)
#define WOP_7(op) WOP_6 (op) op (m6, 6)
#define WOP_8(op) WOP_7 (op) op (m7, 7)
#endif
/*
#define WOP_9(op) WOP_8 (op) op (m8, 8);
#define WOP_10(op) WOP_9 (op) op (m9, 9);
#define WOP_11(op) WOP_10(op) op (m10, 10);
#define WOP_12(op) WOP_11(op) op (m11, 11);
#define WOP_13(op) WOP_12(op) op (m12, 12);
#define WOP_14(op) WOP_13(op) op (m13, 13);
*/

// 8 ways on x64 (16 XMM registers), 4 ways on 32-bit x86 (8 XMM registers).
#ifdef MY_CPU_AMD64
#define NUM_WAYS 8
#define WOP_M1 WOP_8
#else
#define NUM_WAYS 4
#define WOP_M1 WOP_4
#endif

#define WOP(op) op (m0, 0) WOP_M1(op)

// Per-lane building blocks used through WOP / WOP_M1:
#define DECLARE_VAR(reg, ii) __m128i reg;
#define LOAD_data( reg, ii) reg = data[ii];
#define STORE_data( reg, ii) data[ii] = reg;
#if (NUM_WAYS > 1)
// CBC un-chaining: xor lane ii with the previous ciphertext block
#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1])
#endif

// 256-bit variants (two AES blocks per YMM register) of the same steps
#define AVX_DECLARE_VAR(reg, ii) __m256i reg;
#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))

// reg = op(reg, key): 'key' is the round-key local declared by WOP_KEY.
#define MM_OP_key(op, reg) MM_OP(op, reg, key);

#define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg)
#define AES_DEC_LAST( reg, ii) MM_OP_key (_mm_aesdeclast_si128, reg)
#define AES_ENC( reg, ii) MM_OP_key (_mm_aesenc_si128, reg)
#define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg)
#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg)


#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)

// CTR mode: advance the 128-bit counter (64-bit add in its low lane),
// take it as the next keystream input, then xor the result into the data.
#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr;
#define CTR_END( reg, ii) MM_XOR (data[ii], reg)

// AVX CTR: ctr2 holds two consecutive counters; the initial AddRoundKey
// ('key' = keys[0]) is merged into the counter setup.
#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key);
#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg)

// Load round key w[n] once, then apply 'op' to every lane.
#define WOP_KEY(op, n) { \
    const __m128i key = w[n]; \
    WOP(op); }

#define AVX_WOP_KEY(op, n) { \
    const __m256i key = w[n]; \
    WOP(op); }

// Wide loop runs while at least NUM_WAYS blocks remain;
// SINGLE_LOOP then finishes the tail one block at a time.
#define WIDE_LOOP_START \
    dataEnd = data + numBlocks; \
    if (numBlocks >= NUM_WAYS) \
    { dataEnd -= NUM_WAYS; do { \


#define WIDE_LOOP_END \
    data += NUM_WAYS; \
    } while (data <= dataEnd); \
    dataEnd += NUM_WAYS; } \


#define SINGLE_LOOP \
    for (; data < dataEnd; data++)


#define NUM_AES_KEYS_MAX 15

// AVX wide loop: additionally broadcasts each 128-bit round key into both
// halves of a YMM register up front. OP hooks extra setup / teardown code.
#define WIDE_LOOP_START_AVX(OP) \
    dataEnd = data + numBlocks; \
    if (numBlocks >= NUM_WAYS * 2) \
    { __m256i keys[NUM_AES_KEYS_MAX]; \
    UInt32 ii; \
    OP \
    for (ii = 0; ii < numRounds; ii++) \
    keys[ii] = _mm256_broadcastsi128_si256(p[ii]); \
    dataEnd -= NUM_WAYS * 2; do { \


#define WIDE_LOOP_END_AVX(OP) \
    data += NUM_WAYS * 2; \
    } while (data <= dataEnd); \
    dataEnd += NUM_WAYS * 2; \
    OP \
    _mm256_zeroupper(); \
    } \

/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
   MSVC still can insert vzeroupper instruction. */
216
217
/* AES-CBC decrypt (AES-NI): ciphertext blocks are independent, so decrypt
   NUM_WAYS blocks in parallel, then do CBC un-chaining with the saved
   previous ciphertexts. */
AES_FUNC_START2 (AesCbc_Decode_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i iv = *p;
  // wStart points near the end of the decryption key schedule (walked backwards)
  const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
  const __m128i *dataEnd;
  p += 2;   // p now points at the round-key array

  WIDE_LOOP_START
  {
    const __m128i *w = wStart;

    WOP (DECLARE_VAR)       // __m128i m0 .. m{NUM_WAYS-1};
    WOP (LOAD_data)         // load NUM_WAYS ciphertext blocks
    WOP_KEY (AES_XOR, 1)    // initial AddRoundKey

    do
    {
      WOP_KEY (AES_DEC, 0)
      w--;
    }
    while (w != p);
    WOP_KEY (AES_DEC_LAST, 0)

    // CBC un-chaining: xor each decrypted block with the previous ciphertext
    MM_XOR (m0, iv)
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];   // last ciphertext is the next group's IV
    WOP (STORE_data)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const __m128i *w = wStart - 1;
    __m128i m = _mm_xor_si128 (w[2], *data);   // initial AddRoundKey
    do
    {
      // two decryption rounds per iteration
      MM_OP_m (_mm_aesdec_si128, w[1])
      MM_OP_m (_mm_aesdec_si128, w[0])
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128, w[1])
    MM_OP_m (_mm_aesdeclast_si128, w[0])

    MM_XOR (m, iv)   // CBC un-chaining
    iv = *data;
    *data = m;
  }

  p[-2] = iv;   // store the updated IV back into the context
}
271
272
/* AES-CTR (AES-NI): keystream blocks are independent, so encrypt NUM_WAYS
   counters in parallel and xor the keystream into the data. The counter is
   advanced by a 64-bit add in its low lane; carry into the upper 64 bits is
   not propagated - assumed to match the software AesCtr_Code convention;
   confirm against Aes.c. */
AES_FUNC_START2 (AesCtr_Code_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i ctr = *p;
  UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);   // { 1, 0 } as 64-bit lanes

  p += 2;   // p now points at the round-key array

  WIDE_LOOP_START
  {
    const __m128i *w = p;
    UInt32 r = numRoundsMinus2;
    WOP (DECLARE_VAR)
    WOP (CTR_START)         // m0.. = next NUM_WAYS counter values
    WOP_KEY (AES_XOR, 0)    // initial AddRoundKey
    w += 1;
    do
    {
      WOP_KEY (AES_ENC, 0)
      w += 1;
    }
    while (--r);
    WOP_KEY (AES_ENC_LAST, 0)

    WOP (CTR_END)   // xor keystream into the data blocks
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    // (p - 2 + 1) re-reads the round-count field stored before the key array
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one)
    m = _mm_xor_si128 (ctr, p[0]);   // initial AddRoundKey
    w += 1;
    do
    {
      // two rounds per iteration
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128, w[0])
    MM_OP_m (_mm_aesenclast_si128, w[1])
    MM_XOR (*data, m)
  }

  p[-2] = ctr;   // store the updated counter back into the context
}
326
327
328
#ifdef USE_INTEL_VAES

/*
GCC before 2013-Jun:
  <immintrin.h>:
  #ifdef __AVX__
  #include <avxintrin.h>
  #endif
GCC after 2013-Jun:
  <immintrin.h>:
  #include <avxintrin.h>
CLANG 3.8+:
{
  <immintrin.h>:
  #if !defined(_MSC_VER) || defined(__AVX__)
    #include <avxintrin.h>
  #endif

  if (the compiler is clang for Windows and if global arch is not set for __AVX__)
    [ if (defined(_MSC_VER) && !defined(__AVX__)) ]
  {
    <immintrin.h> doesn't include <avxintrin.h>
    and we have 2 ways to fix it:
      1) we can define required __AVX__ before <immintrin.h>
      or
      2) we can include <avxintrin.h> after <immintrin.h>
  }
}

If we include <avxintrin.h> manually for GCC/CLANG, it's
required that <immintrin.h> must be included before <avxintrin.h>.
*/

/*
#if defined(__clang__) && defined(_MSC_VER)
#define __AVX__
#define __AVX2__
#define __VAES__
#endif
*/

#include <immintrin.h>
// clang-cl workaround (option 2 from the comment above): include the
// AVX / AVX2 / VAES intrinsic headers directly when immintrin.h skipped them.
#if defined(__clang__) && defined(_MSC_VER)
#if !defined(__AVX__)
#include <avxintrin.h>
#endif
#if !defined(__AVX2__)
#include <avx2intrin.h>
#endif
#if !defined(__VAES__)
#include <vaesintrin.h>
#endif
#endif // __clang__ && _MSC_VER


// Like AES_FUNC_START2, but with the wider vaes/avx2 target attribute.
#define VAES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_VAES \
AES_FUNC_START (name)
388
/* AES-CBC decrypt, 256-bit VAES path: each YMM register carries two blocks,
   so one wide-loop iteration decrypts NUM_WAYS * 2 blocks. */
VAES_FUNC_START2 (AesCbc_Decode_HW_256)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i iv = *p;
  const __m128i *dataEnd;
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;   // total number of round keys
  p += 2;   // p now points at the 128-bit round-key array

  WIDE_LOOP_START_AVX(;)
  {
    // keys[] holds each round key broadcast to both YMM lanes
    const __m256i *w = keys + numRounds - 2;

    WOP (AVX_DECLARE_VAR)
    WOP (AVX_LOAD_data)
    AVX_WOP_KEY (AVX_AES_XOR, 1)   // initial AddRoundKey

    do
    {
      AVX_WOP_KEY (AVX_AES_DEC, 0)
      w--;
    }
    while (w != keys);
    AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)

    // CBC un-chaining for m0: the low lane needs the carried-in iv,
    // the high lane needs data[0] (its preceding ciphertext block).
    AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]))
    WOP_M1 (AVX_XOR_data_M1)
    iv = data[NUM_WAYS * 2 - 1];   // last ciphertext is the next group's IV
    WOP (AVX_STORE_data)
  }
  WIDE_LOOP_END_AVX(;)

  SINGLE_LOOP
  {
    // (p + 1 - 2) re-reads the round-count field stored before the key array
    const __m128i *w = p + *(const UInt32 *)(p + 1 - 2) * 2 + 1 - 3;
    __m128i m = _mm_xor_si128 (w[2], *data);   // initial AddRoundKey
    do
    {
      // two decryption rounds per iteration
      MM_OP_m (_mm_aesdec_si128, w[1])
      MM_OP_m (_mm_aesdec_si128, w[0])
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128, w[1])
    MM_OP_m (_mm_aesdeclast_si128, w[0])

    MM_XOR (m, iv)   // CBC un-chaining
    iv = *data;
    *data = m;
  }

  p[-2] = iv;   // store the updated IV back into the context
}
442
443
/*
SSE2: _mm_cvtsi32_si128 : movd
AVX: _mm256_setr_m128i : vinsertf128
AVX2: _mm256_add_epi64 : vpaddq ymm, ymm, ymm
      _mm256_extracti128_si256 : vextracti128
      _mm256_broadcastsi128_si256 : vbroadcasti128
*/

// Loop setup: ctr2 = { ctr - 1, ctr } (two consecutive counter values in
// one YMM register) and two = { 2, 0, 2, 0 }, so each 256-bit add advances
// both lanes by 2 blocks.
#define AVX_CTR_LOOP_START \
    ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
    two = _mm256_setr_m128i(one, one); \
    two = _mm256_add_epi64(two, two); \

// two = _mm256_setr_epi64x(2, 0, 2, 0);

// Loop teardown: the high lane of ctr2 is the current 128-bit counter.
#define AVX_CTR_LOOP_ENC \
    ctr = _mm256_extracti128_si256 (ctr2, 1); \
461
/* AES-CTR, 256-bit VAES path: NUM_WAYS * 2 blocks per wide iteration.
   The counter is advanced by a 64-bit add in its low lane; carry into the
   upper 64 bits is not propagated - assumed to match the software
   AesCtr_Code convention; confirm against Aes.c. */
VAES_FUNC_START2 (AesCtr_Code_HW_256)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i ctr = *p;
  UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;   // total number of round keys
  const __m128i *dataEnd;
  __m128i one = _mm_cvtsi32_si128(1);   // { 1, 0 } as 64-bit lanes
  __m256i ctr2, two;                    // set up by AVX_CTR_LOOP_START
  p += 2;   // p now points at the 128-bit round-key array

  WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START)
  {
    const __m256i *w = keys;
    UInt32 r = numRounds - 2;
    WOP (AVX_DECLARE_VAR)
    // counter generation + initial AddRoundKey (key = keys[0]) in one step
    AVX_WOP_KEY (AVX_CTR_START, 0)

    w += 1;
    do
    {
      AVX_WOP_KEY (AVX_AES_ENC, 0)
      w += 1;
    }
    while (--r);
    AVX_WOP_KEY (AVX_AES_ENC_LAST, 0)

    WOP (AVX_CTR_END)   // xor keystream into the data blocks
  }
  WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC)

  SINGLE_LOOP
  {
    // (p - 2 + 1) re-reads the round-count field stored before the key array
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one)
    m = _mm_xor_si128 (ctr, p[0]);   // initial AddRoundKey
    w += 1;
    do
    {
      // two rounds per iteration
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128, w[0])
    MM_OP_m (_mm_aesenclast_si128, w[1])
    MM_XOR (*data, m)
  }

  p[-2] = ctr;   // store the updated counter back into the context
}
515
#endif // USE_INTEL_VAES

#else // USE_INTEL_AES

/* no USE_INTEL_AES */

// Compiler too old for AES-NI intrinsics: route the *_HW entry points to
// the portable implementations (AesCbc_Encode etc., defined elsewhere).
#pragma message("AES HW_SW stub was used")

#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \

// Declares the non-HW function and defines name_HW as a pass-through to it.
#define AES_COMPAT_STUB(name) \
    AES_FUNC_START(name); \
    AES_FUNC_START(name ## _HW) \
    { name(p, data, numBlocks); }

AES_COMPAT_STUB (AesCbc_Encode)
AES_COMPAT_STUB (AesCbc_Decode)
AES_COMPAT_STUB (AesCtr_Code)

#endif // USE_INTEL_AES


#ifndef USE_INTEL_VAES

// No VAES support: the 256-bit entry points fall back to the 128-bit
// (or software-stubbed) *_HW versions defined above.
#pragma message("VAES HW_SW stub was used")

#define VAES_COMPAT_STUB(name) \
    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
    { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }

VAES_COMPAT_STUB (AesCbc_Decode_HW)
VAES_COMPAT_STUB (AesCtr_Code_HW)

#endif // ! USE_INTEL_VAES
555
556
#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)

// Detect compiler support for the ARMv8 Crypto Extension intrinsics.
#if defined(__clang__)
#if (__clang_major__ >= 8) // fix that check
#define USE_HW_AES
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 6) // fix that check
#define USE_HW_AES
#endif
#elif defined(_MSC_VER)
#if _MSC_VER >= 1910
#define USE_HW_AES
#endif
#endif

#ifdef USE_HW_AES

// #pragma message("=== AES HW === ")

// Enable the crypto extension per-function when it is not enabled globally.
#if defined(__clang__) || defined(__GNUC__)
#ifdef MY_CPU_ARM64
#define ATTRIB_AES __attribute__((__target__("+crypto")))
#else
#define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
#else
// _MSC_VER
// for arm32
#define _ARM_USE_NEW_NEON_INTRINSICS
#endif

// Expand to nothing when no target attribute is needed (e.g. MSVC).
#ifndef ATTRIB_AES
#define ATTRIB_AES
#endif

#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
598
typedef uint8x16_t v128;   // one 128-bit AES block / round key

// Common prototype: ivAes points at the expanded-key context, data8 at
// numBlocks 16-byte blocks transformed in place.
#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
// void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks)

// Declare the function, apply the target attribute, then define it.
#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

// In-place update helpers; the *_m forms operate on the local variable 'm'.
#define MM_OP(op, dest, src) dest = op(dest, src);
#define MM_OP_m(op, src) MM_OP(op, m, src)
#define MM_OP1_m(op) m = op(m);

#define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src)
#define MM_XOR_m( src) MM_XOR(m, src)

// AESE (AddRoundKey + SubBytes + ShiftRows), optionally followed by
// AESMC (MixColumns) for a full middle encryption round.
#define AES_E_m(k) MM_OP_m (vaeseq_u8, k)
#define AES_E_MC_m(k) AES_E_m (k) MM_OP1_m(vaesmcq_u8)
619
620
/* AES-CBC encrypt (ARMv8 Crypto): CBC chaining is serial, so one block at
   a time. Context layout (as v128 units): p[0] = IV, the UInt32 at p+1
   holds a half-round count (the branch structure below suggests 5/6/7 for
   AES-128/192/256 - confirm against Aes.c), p[2..] = round keys. */
AES_FUNC_START2 (AesCbc_Encode_HW)
{
  v128 *p = (v128*)(void*)ivAes;
  v128 *data = (v128*)(void*)data8;
  v128 m = *p;   // chaining value, starts as the IV
  // Preload the first 10 round keys into locals (registers).
  const v128 k0 = p[2];
  const v128 k1 = p[3];
  const v128 k2 = p[4];
  const v128 k3 = p[5];
  const v128 k4 = p[6];
  const v128 k5 = p[7];
  const v128 k6 = p[8];
  const v128 k7 = p[9];
  const v128 k8 = p[10];
  const v128 k9 = p[11];
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  const v128 *w = p + ((size_t)numRounds2 * 2);
  const v128 k_z1 = w[1];   // key for the last AESE (no MixColumns)
  const v128 k_z0 = w[2];   // final AddRoundKey (plain XOR)
  for (; numBlocks != 0; numBlocks--, data++)
  {
    MM_XOR_m (*data);   // m = prev-cipher ^ plaintext
    AES_E_MC_m (k0)
    AES_E_MC_m (k1)
    AES_E_MC_m (k2)
    AES_E_MC_m (k3)
    AES_E_MC_m (k4)
    AES_E_MC_m (k5)
    AES_E_MC_m (k6)
    AES_E_MC_m (k7)
    AES_E_MC_m (k8)
    if (numRounds2 >= 6)
    {
      // extra rounds for the longer key sizes
      AES_E_MC_m (k9)
      AES_E_MC_m (p[12])
      if (numRounds2 != 6)
      {
        // longest key size only
        AES_E_MC_m (p[13])
        AES_E_MC_m (p[14])
      }
    }
    AES_E_m (k_z1)     // last round: no MixColumns
    MM_XOR_m (k_z0);   // final AddRoundKey
    *data = m;         // ciphertext is also the next chaining value
  }
  *p = m;   // store the chaining value back into the context
}
668
669
/* WOP(op) expands op(mI, I) for I = 0 .. 7: eight blocks in flight per
   wide-loop iteration to fill the AES pipelines. */
#define WOP_1(op)
#define WOP_2(op) WOP_1 (op) op (m1, 1)
#define WOP_3(op) WOP_2 (op) op (m2, 2)
#define WOP_4(op) WOP_3 (op) op (m3, 3)
#define WOP_5(op) WOP_4 (op) op (m4, 4)
#define WOP_6(op) WOP_5 (op) op (m5, 5)
#define WOP_7(op) WOP_6 (op) op (m6, 6)
#define WOP_8(op) WOP_7 (op) op (m7, 7)

#define NUM_WAYS 8
#define WOP_M1 WOP_8

#define WOP(op) op (m0, 0) WOP_M1(op)

// Per-lane building blocks used through WOP / WOP_M1:
#define DECLARE_VAR(reg, ii) v128 reg;
#define LOAD_data( reg, ii) reg = data[ii];
#define STORE_data( reg, ii) data[ii] = reg;
#if (NUM_WAYS > 1)
// CBC un-chaining: xor lane ii with the previous ciphertext block
#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1])
#endif

// reg = op(reg, key): 'key' is the round-key local declared by WOP_KEY
#define MM_OP_key(op, reg) MM_OP (op, reg, key)

// AESD (AddRoundKey + InvSubBytes + InvShiftRows), plus AESIMC
// (InvMixColumns) for a full middle decryption round.
#define AES_D_m(k) MM_OP_m (vaesdq_u8, k)
#define AES_D_IMC_m(k) AES_D_m (k) MM_OP1_m (vaesimcq_u8)

#define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg)
#define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg)
#define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg)

#define AES_D_IMC( reg, ii) AES_D (reg, ii) reg = vaesimcq_u8(reg);
#define AES_E_MC( reg, ii) AES_E (reg, ii) reg = vaesmcq_u8(reg);

// CTR: 64-bit add in the counter's low lane, then reinterpret as bytes.
#define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one) reg = vreinterpretq_u8_u64(ctr);
#define CTR_END( reg, ii) MM_XOR (data[ii], reg)

// Load round key w[n] once, then apply 'op' to every lane.
#define WOP_KEY(op, n) { \
    const v128 key = w[n]; \
    WOP(op) }

// Wide loop handles groups of NUM_WAYS blocks; SINGLE_LOOP does the tail.
#define WIDE_LOOP_START \
    dataEnd = data + numBlocks; \
    if (numBlocks >= NUM_WAYS) \
    { dataEnd -= NUM_WAYS; do { \

#define WIDE_LOOP_END \
    data += NUM_WAYS; \
    } while (data <= dataEnd); \
    dataEnd += NUM_WAYS; } \

#define SINGLE_LOOP \
    for (; data < dataEnd; data++)
723
/* AES-CBC decrypt (ARMv8 Crypto): ciphertext blocks are independent, so
   decrypt NUM_WAYS at a time, walking the key schedule backwards with
   AESD + AESIMC rounds. */
AES_FUNC_START2 (AesCbc_Decode_HW)
{
  v128 *p = (v128*)(void*)ivAes;
  v128 *data = (v128*)(void*)data8;
  v128 iv = *p;
  // wStart points near the end of the key array (walked backwards below)
  const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  p += 2;   // p now points at the round-key array

  WIDE_LOOP_START
  {
    const v128 *w = wStart;
    WOP (DECLARE_VAR)            // v128 m0 .. m7;
    WOP (LOAD_data)              // load NUM_WAYS ciphertext blocks
    WOP_KEY (AES_D_IMC, 2)       // first decryption round, with the last key
    do
    {
      WOP_KEY (AES_D_IMC, 1)
      WOP_KEY (AES_D_IMC, 0)
      w -= 2;
    }
    while (w != p);
    WOP_KEY (AES_D, 1)      // last round: no InvMixColumns
    WOP_KEY (AES_XOR, 0)    // final AddRoundKey
    // CBC un-chaining: xor each plaintext with the previous ciphertext
    MM_XOR (m0, iv);
    WOP_M1 (XOR_data_M1)
    iv = data[NUM_WAYS - 1];   // last ciphertext is the next group's IV
    WOP (STORE_data)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = wStart;
    v128 m = *data;
    AES_D_IMC_m (w[2])
    do
    {
      AES_D_IMC_m (w[1]);
      AES_D_IMC_m (w[0]);
      w -= 2;
    }
    while (w != p);
    AES_D_m (w[1]);    // last round: no InvMixColumns
    MM_XOR_m (w[0]);   // final AddRoundKey
    MM_XOR_m (iv);     // CBC un-chaining
    iv = *data;
    *data = m;
  }

  p[-2] = iv;   // store the updated IV back into the context
}
776
777
/* AES-CTR (ARMv8 Crypto): keystream blocks are independent, so encrypt
   NUM_WAYS counters in parallel and xor the keystream into the data.
   The 64-bit add only advances the counter's low lane (no carry into the
   upper lane) - assumed to match the software AesCtr_Code; confirm. */
AES_FUNC_START2 (AesCtr_Code_HW)
{
  v128 *p = (v128*)(void*)ivAes;
  v128 *data = (v128*)(void*)data8;
  uint64x2_t ctr = vreinterpretq_u64_u8(*p);
  const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
  const v128 *dataEnd;
  uint64x2_t one = vdupq_n_u64(0);
  one = vsetq_lane_u64(1, one, 0);   // one = { 1, 0 }
  p += 2;   // p now points at the round-key array

  WIDE_LOOP_START
  {
    const v128 *w = p;
    WOP (DECLARE_VAR)
    WOP (CTR_START)   // m0..m7 = next NUM_WAYS counter values
    do
    {
      // two full rounds per iteration
      WOP_KEY (AES_E_MC, 0)
      WOP_KEY (AES_E_MC, 1)
      w += 2;
    }
    while (w != wEnd);
    WOP_KEY (AES_E_MC, 0)
    WOP_KEY (AES_E, 1)     // last round: no MixColumns
    WOP_KEY (AES_XOR, 2)   // final AddRoundKey
    WOP (CTR_END)          // xor keystream into the data blocks
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = p;
    v128 m;
    CTR_START (m, 0);
    do
    {
      AES_E_MC_m (w[0]);
      AES_E_MC_m (w[1]);
      w += 2;
    }
    while (w != wEnd);
    AES_E_MC_m (w[0])
    AES_E_m (w[1])     // last round: no MixColumns
    MM_XOR_m (w[2])    // final AddRoundKey
    CTR_END (m, 0)
  }

  p[-2] = vreinterpretq_u8_u64(ctr);   // store the updated counter
}
828
#endif // USE_HW_AES

#endif // MY_CPU_ARM_OR_ARM64

// Undefine the internal helper macros so they cannot leak into other
// code that might be compiled together with this file.
#undef NUM_WAYS
#undef WOP_M1
#undef WOP
#undef DECLARE_VAR
#undef LOAD_data
#undef STORE_data
#undef USE_INTEL_AES
#undef USE_HW_AES