1 /* AesOpt.c -- Intel's AES
2 2017-06-08 : Igor Pavlov : Public domain */
3
4 #include "Precomp.h"
5
6 #include "CpuArch.h"
7
/* Compile the AES-NI code path only when targeting x86/x64 with a compiler
   that reports _MSC_VER above 1500 or _MSC_FULL_VER of at least 150030729
   (presumably the first toolsets that ship <wmmintrin.h> - confirm against
   the list of supported compilers). When USE_INTEL_AES stays undefined, the
   forwarding stubs in the #else branch further down are built instead. */
#if defined(MY_CPU_X86_OR_AMD64)
  #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
    #define USE_INTEL_AES
  #endif
#endif
13
14 #ifdef USE_INTEL_AES
15
16 #include <wmmintrin.h>
17
AesCbc_Encode_Intel(__m128i * p,__m128i * data,size_t numBlocks)18 void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
19 {
20 __m128i m = *p;
21 for (; numBlocks != 0; numBlocks--, data++)
22 {
23 UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
24 const __m128i *w = p + 3;
25 m = _mm_xor_si128(m, *data);
26 m = _mm_xor_si128(m, p[2]);
27 do
28 {
29 m = _mm_aesenc_si128(m, w[0]);
30 m = _mm_aesenc_si128(m, w[1]);
31 w += 2;
32 }
33 while (--numRounds2 != 0);
34 m = _mm_aesenc_si128(m, w[0]);
35 m = _mm_aesenclast_si128(m, w[1]);
36 *data = m;
37 }
38 *p = m;
39 }
40
/* Number of blocks handled per pass by the interleaved loops in this file. */
#define NUM_WAYS 3

/*
  Applies the two-operand operation `op` to each of the three lane states
  using round key w[n]. The expansion is a brace-enclosed block, so call
  sites are written without a trailing semicolon. The names m0, m1, m2 and w
  must be in scope where the macro is used.
*/
#define AES_OP_W(op, n) { \
    const __m128i roundKey = w[n]; \
    m0 = op(m0, roundKey); \
    m1 = op(m1, roundKey); \
    m2 = op(m2, roundKey); \
  }

/* Three-lane wrappers for the individual AES-NI round instructions. */
#define AES_DEC(n)      AES_OP_W(_mm_aesdec_si128, n)
#define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n)
#define AES_ENC(n)      AES_OP_W(_mm_aesenc_si128, n)
#define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n)
54
AesCbc_Decode_Intel(__m128i * p,__m128i * data,size_t numBlocks)55 void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
56 {
57 __m128i iv = *p;
58 for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
59 {
60 UInt32 numRounds2 = *(const UInt32 *)(p + 1);
61 const __m128i *w = p + numRounds2 * 2;
62 __m128i m0, m1, m2;
63 {
64 const __m128i t = w[2];
65 m0 = _mm_xor_si128(t, data[0]);
66 m1 = _mm_xor_si128(t, data[1]);
67 m2 = _mm_xor_si128(t, data[2]);
68 }
69 numRounds2--;
70 do
71 {
72 AES_DEC(1)
73 AES_DEC(0)
74 w -= 2;
75 }
76 while (--numRounds2 != 0);
77 AES_DEC(1)
78 AES_DEC_LAST(0)
79
80 {
81 __m128i t;
82 t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t;
83 t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t;
84 t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t;
85 }
86 }
87 for (; numBlocks != 0; numBlocks--, data++)
88 {
89 UInt32 numRounds2 = *(const UInt32 *)(p + 1);
90 const __m128i *w = p + numRounds2 * 2;
91 __m128i m = _mm_xor_si128(w[2], *data);
92 numRounds2--;
93 do
94 {
95 m = _mm_aesdec_si128(m, w[1]);
96 m = _mm_aesdec_si128(m, w[0]);
97 w -= 2;
98 }
99 while (--numRounds2 != 0);
100 m = _mm_aesdec_si128(m, w[1]);
101 m = _mm_aesdeclast_si128(m, w[0]);
102
103 m = _mm_xor_si128(m, iv);
104 iv = *data;
105 *data = m;
106 }
107 *p = iv;
108 }
109
AesCtr_Code_Intel(__m128i * p,__m128i * data,size_t numBlocks)110 void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks)
111 {
112 __m128i ctr = *p;
113 __m128i one;
114 one.m128i_u64[0] = 1;
115 one.m128i_u64[1] = 0;
116 for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
117 {
118 UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
119 const __m128i *w = p;
120 __m128i m0, m1, m2;
121 {
122 const __m128i t = w[2];
123 ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t);
124 ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t);
125 ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t);
126 }
127 w += 3;
128 do
129 {
130 AES_ENC(0)
131 AES_ENC(1)
132 w += 2;
133 }
134 while (--numRounds2 != 0);
135 AES_ENC(0)
136 AES_ENC_LAST(1)
137 data[0] = _mm_xor_si128(data[0], m0);
138 data[1] = _mm_xor_si128(data[1], m1);
139 data[2] = _mm_xor_si128(data[2], m2);
140 }
141 for (; numBlocks != 0; numBlocks--, data++)
142 {
143 UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
144 const __m128i *w = p;
145 __m128i m;
146 ctr = _mm_add_epi64(ctr, one);
147 m = _mm_xor_si128(ctr, p[2]);
148 w += 3;
149 do
150 {
151 m = _mm_aesenc_si128(m, w[0]);
152 m = _mm_aesenc_si128(m, w[1]);
153 w += 2;
154 }
155 while (--numRounds2 != 0);
156 m = _mm_aesenc_si128(m, w[0]);
157 m = _mm_aesenclast_si128(m, w[1]);
158 *data = _mm_xor_si128(*data, m);
159 }
160 *p = ctr;
161 }
162
163 #else
164
165 void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks);
166 void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks);
167 void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks);
168
AesCbc_Encode_Intel(UInt32 * p,Byte * data,size_t numBlocks)169 void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
170 {
171 AesCbc_Encode(p, data, numBlocks);
172 }
173
AesCbc_Decode_Intel(UInt32 * p,Byte * data,size_t numBlocks)174 void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
175 {
176 AesCbc_Decode(p, data, numBlocks);
177 }
178
AesCtr_Code_Intel(UInt32 * p,Byte * data,size_t numBlocks)179 void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks)
180 {
181 AesCtr_Code(p, data, numBlocks);
182 }
183
184 #endif
185