1 /*
2 * Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the OpenSSL license (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10 #include <openssl/crypto.h>
11 #include "modes_local.h"
12 #include <string.h>
13
14 #if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
15 typedef size_t size_t_aX __attribute((__aligned__(1)));
16 #else
17 typedef size_t size_t_aX;
18 #endif
19
20 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
21 /* redefine, because alignment is ensured */
22 # undef GETU32
23 # define GETU32(p) BSWAP4(*(const u32 *)(p))
24 # undef PUTU32
25 # define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
26 #endif
27
28 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
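/*
 * REDUCE1BIT implements the conditional-reduction step from the GCM
 * specification (NIST SP 800-38D): the 128-bit value V is shifted
 * right by one bit and, if the bit shifted out of the low end was set,
 * the reduction constant R = 11100001 || 0^120 (0xE1 in the top byte)
 * is XORed back into the high word.
 */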
29 #define REDUCE1BIT(V) do { \
30 if (sizeof(size_t)==8) { \
31 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
32 V.lo = (V.hi<<63)|(V.lo>>1); \
33 V.hi = (V.hi>>1 )^T; \
34 } \
35 else { \
36 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
37 V.lo = (V.hi<<63)|(V.lo>>1); \
38 V.hi = (V.hi>>1 )^((u64)T<<32); \
39 } \
40 } while(0)
41
/*-
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 selects the lookup-table-driven
 * implementations referred to as "Shoup's" in the GCM specification;
 * in other words OpenSSL does not cover the whole spectrum of possible
 * table-driven implementations. Why? In the non-"Shoup's" case the
 * memory access pattern is segmented in such a manner that it is
 * trivial to see that cache-timing information can reveal a fair
 * portion of the intermediate hash value. Given that the ciphertext is
 * always available to the attacker, they can attempt to deduce the
 * secret parameter H and, if successful, tamper with messages [which
 * is trivial in CTR mode]. In the "Shoup's" case it is not as easy,
 * but there is no reason to believe that it is resistant to
 * cache-timing attacks. As for the "8-bit" implementation, it consumes
 * 16 (sixteen) times more memory, 4KB per individual key + 1KB shared.
 * On the plus side it should be about twice as fast as the "4-bit"
 * version, and for gcc-generated x86[_64] code the "8-bit" version was
 * observed to run ~75% faster, closer to 100% with commercial
 * compilers... Yet the "4-bit" procedure is preferred, because it is
 * believed to provide a better security-performance balance and
 * adequate all-round performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling of short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example, on Windows a large enough free()
 *   results in VM working-set trimming, meaning that a subsequent
 *   malloc() would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world).
 *
 * A value of 1 is not appropriate, for performance reasons.
 */
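/*-
 * For reference, the sizes quoted above follow directly from the entry
 * counts (each u128 entry is 16 bytes):
 *
 * - TABLE_BITS==8: Htable[256], i.e. 256*16 = 4096 bytes per key, plus
 *   a 256-entry rem_8bit table of size_t shared by all keys;
 * - TABLE_BITS==4: Htable[16], i.e. 16*16 = 256 bytes per key, plus a
 *   16-entry rem_4bit table of size_t shared by all keys;
 * - TABLE_BITS==1: no table at all, H itself is consumed bit by bit.
 */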
76 #if TABLE_BITS==8
77
static void gcm_init_8bit(u128 Htable[256], u64 H[2])
79 {
80 int i, j;
81 u128 V;
82
83 Htable[0].hi = 0;
84 Htable[0].lo = 0;
85 V.hi = H[0];
86 V.lo = H[1];
87
88 for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
89 REDUCE1BIT(V);
90 Htable[i] = V;
91 }
92
93 for (i = 2; i < 256; i <<= 1) {
94 u128 *Hi = Htable + i, H0 = *Hi;
95 for (j = 1; j < i; ++j) {
96 Hi[j].hi = H0.hi ^ Htable[j].hi;
97 Hi[j].lo = H0.lo ^ Htable[j].lo;
98 }
99 }
100 }
101
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
103 {
104 u128 Z = { 0, 0 };
105 const u8 *xi = (const u8 *)Xi + 15;
106 size_t rem, n = *xi;
107 const union {
108 long one;
109 char little;
110 } is_endian = { 1 };
111 static const size_t rem_8bit[256] = {
112 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
113 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
114 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
115 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
116 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
117 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
118 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
119 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
120 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
121 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
122 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
123 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
124 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
125 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
126 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
127 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
128 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
129 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
130 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
131 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
132 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
133 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
134 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
135 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
136 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
137 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
138 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
139 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
140 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
141 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
142 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
143 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
144 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
145 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
146 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
147 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
148 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
149 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
150 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
151 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
152 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
153 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
154 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
155 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
156 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
157 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
158 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
159 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
160 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
161 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
162 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
163 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
164 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
165 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
166 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
167 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
168 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
169 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
170 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
171 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
172 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
173 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
174 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
175 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
176 };
177
178 while (1) {
179 Z.hi ^= Htable[n].hi;
180 Z.lo ^= Htable[n].lo;
181
182 if ((u8 *)Xi == xi)
183 break;
184
185 n = *(--xi);
186
187 rem = (size_t)Z.lo & 0xff;
188 Z.lo = (Z.hi << 56) | (Z.lo >> 8);
189 Z.hi = (Z.hi >> 8);
190 if (sizeof(size_t) == 8)
191 Z.hi ^= rem_8bit[rem];
192 else
193 Z.hi ^= (u64)rem_8bit[rem] << 32;
194 }
195
196 if (is_endian.little) {
197 # ifdef BSWAP8
198 Xi[0] = BSWAP8(Z.hi);
199 Xi[1] = BSWAP8(Z.lo);
200 # else
201 u8 *p = (u8 *)Xi;
202 u32 v;
203 v = (u32)(Z.hi >> 32);
204 PUTU32(p, v);
205 v = (u32)(Z.hi);
206 PUTU32(p + 4, v);
207 v = (u32)(Z.lo >> 32);
208 PUTU32(p + 8, v);
209 v = (u32)(Z.lo);
210 PUTU32(p + 12, v);
211 # endif
212 } else {
213 Xi[0] = Z.hi;
214 Xi[1] = Z.lo;
215 }
216 }
217
218 # define GCM_MUL(ctx) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
219
220 #elif TABLE_BITS==4
221
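/*
 * gcm_init_4bit seeds the single-bit entries: Htable[8] holds H, and
 * Htable[4], Htable[2], Htable[1] hold H reduced once, twice and three
 * times by REDUCE1BIT. Every other entry i is then the XOR of the
 * entries for the individual bits of i, which is the linearity that
 * lets gcm_gmult_4bit below process Xi one nibble at a time.
 */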
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
223 {
224 u128 V;
225 # if defined(OPENSSL_SMALL_FOOTPRINT)
226 int i;
227 # endif
228
229 Htable[0].hi = 0;
230 Htable[0].lo = 0;
231 V.hi = H[0];
232 V.lo = H[1];
233
234 # if defined(OPENSSL_SMALL_FOOTPRINT)
235 for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
236 REDUCE1BIT(V);
237 Htable[i] = V;
238 }
239
240 for (i = 2; i < 16; i <<= 1) {
241 u128 *Hi = Htable + i;
242 int j;
243 for (V = *Hi, j = 1; j < i; ++j) {
244 Hi[j].hi = V.hi ^ Htable[j].hi;
245 Hi[j].lo = V.lo ^ Htable[j].lo;
246 }
247 }
248 # else
249 Htable[8] = V;
250 REDUCE1BIT(V);
251 Htable[4] = V;
252 REDUCE1BIT(V);
253 Htable[2] = V;
254 REDUCE1BIT(V);
255 Htable[1] = V;
256 Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
257 V = Htable[4];
258 Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
259 Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
260 Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
261 V = Htable[8];
262 Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
263 Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
264 Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
265 Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
266 Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
267 Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
268 Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
269 # endif
270 # if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
271 /*
272 * ARM assembler expects specific dword order in Htable.
273 */
274 {
275 int j;
276 const union {
277 long one;
278 char little;
279 } is_endian = { 1 };
280
281 if (is_endian.little)
282 for (j = 0; j < 16; ++j) {
283 V = Htable[j];
284 Htable[j].hi = V.lo;
285 Htable[j].lo = V.hi;
286 } else
287 for (j = 0; j < 16; ++j) {
288 V = Htable[j];
289 Htable[j].hi = V.lo << 32 | V.lo >> 32;
290 Htable[j].lo = V.hi << 32 | V.hi >> 32;
291 }
292 }
293 # endif
294 }
295
296 # ifndef GHASH_ASM
297 static const size_t rem_4bit[16] = {
298 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
299 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
300 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
301 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
302 };
303
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
305 {
306 u128 Z;
307 int cnt = 15;
308 size_t rem, nlo, nhi;
309 const union {
310 long one;
311 char little;
312 } is_endian = { 1 };
313
314 nlo = ((const u8 *)Xi)[15];
315 nhi = nlo >> 4;
316 nlo &= 0xf;
317
318 Z.hi = Htable[nlo].hi;
319 Z.lo = Htable[nlo].lo;
320
321 while (1) {
322 rem = (size_t)Z.lo & 0xf;
323 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
324 Z.hi = (Z.hi >> 4);
325 if (sizeof(size_t) == 8)
326 Z.hi ^= rem_4bit[rem];
327 else
328 Z.hi ^= (u64)rem_4bit[rem] << 32;
329
330 Z.hi ^= Htable[nhi].hi;
331 Z.lo ^= Htable[nhi].lo;
332
333 if (--cnt < 0)
334 break;
335
336 nlo = ((const u8 *)Xi)[cnt];
337 nhi = nlo >> 4;
338 nlo &= 0xf;
339
340 rem = (size_t)Z.lo & 0xf;
341 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
342 Z.hi = (Z.hi >> 4);
343 if (sizeof(size_t) == 8)
344 Z.hi ^= rem_4bit[rem];
345 else
346 Z.hi ^= (u64)rem_4bit[rem] << 32;
347
348 Z.hi ^= Htable[nlo].hi;
349 Z.lo ^= Htable[nlo].lo;
350 }
351
352 if (is_endian.little) {
353 # ifdef BSWAP8
354 Xi[0] = BSWAP8(Z.hi);
355 Xi[1] = BSWAP8(Z.lo);
356 # else
357 u8 *p = (u8 *)Xi;
358 u32 v;
359 v = (u32)(Z.hi >> 32);
360 PUTU32(p, v);
361 v = (u32)(Z.hi);
362 PUTU32(p + 4, v);
363 v = (u32)(Z.lo >> 32);
364 PUTU32(p + 8, v);
365 v = (u32)(Z.lo);
366 PUTU32(p + 12, v);
367 # endif
368 } else {
369 Xi[0] = Z.hi;
370 Xi[1] = Z.lo;
371 }
372 }
373
374 # if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
 * for details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here mostly
 * as a reference and a placeholder for possible future non-trivial
 * optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
383 const u8 *inp, size_t len)
384 {
385 u128 Z;
386 int cnt;
387 size_t rem, nlo, nhi;
388 const union {
389 long one;
390 char little;
391 } is_endian = { 1 };
392
393 # if 1
394 do {
395 cnt = 15;
396 nlo = ((const u8 *)Xi)[15];
397 nlo ^= inp[15];
398 nhi = nlo >> 4;
399 nlo &= 0xf;
400
401 Z.hi = Htable[nlo].hi;
402 Z.lo = Htable[nlo].lo;
403
404 while (1) {
405 rem = (size_t)Z.lo & 0xf;
406 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
407 Z.hi = (Z.hi >> 4);
408 if (sizeof(size_t) == 8)
409 Z.hi ^= rem_4bit[rem];
410 else
411 Z.hi ^= (u64)rem_4bit[rem] << 32;
412
413 Z.hi ^= Htable[nhi].hi;
414 Z.lo ^= Htable[nhi].lo;
415
416 if (--cnt < 0)
417 break;
418
419 nlo = ((const u8 *)Xi)[cnt];
420 nlo ^= inp[cnt];
421 nhi = nlo >> 4;
422 nlo &= 0xf;
423
424 rem = (size_t)Z.lo & 0xf;
425 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
426 Z.hi = (Z.hi >> 4);
427 if (sizeof(size_t) == 8)
428 Z.hi ^= rem_4bit[rem];
429 else
430 Z.hi ^= (u64)rem_4bit[rem] << 32;
431
432 Z.hi ^= Htable[nlo].hi;
433 Z.lo ^= Htable[nlo].lo;
434 }
435 # else
/*
 * An extra 256+16 bytes per key plus 512 bytes of shared tables
 * [should] give a ~50% improvement... One could have PACK()-ed
 * rem_8bit even here, but the priority is to minimize the cache
 * footprint...
 */
442 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
443 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
444 static const unsigned short rem_8bit[256] = {
445 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
446 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
447 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
448 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
449 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
450 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
451 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
452 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
453 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
454 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
455 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
456 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
457 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
458 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
459 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
460 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
461 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
462 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
463 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
464 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
465 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
466 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
467 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
468 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
469 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
470 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
471 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
472 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
473 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
474 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
475 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
476 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
477 };
/*
 * This pre-processing phase slows the procedure down by approximately
 * the same amount of time as it makes each loop iteration faster. In
 * other words, single-block performance is approximately the same as
 * for the straightforward "4-bit" implementation, and beyond that it
 * only gets faster...
 */
484 for (cnt = 0; cnt < 16; ++cnt) {
485 Z.hi = Htable[cnt].hi;
486 Z.lo = Htable[cnt].lo;
487 Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
488 Hshr4[cnt].hi = (Z.hi >> 4);
489 Hshl4[cnt] = (u8)(Z.lo << 4);
490 }
491
492 do {
493 for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
494 nlo = ((const u8 *)Xi)[cnt];
495 nlo ^= inp[cnt];
496 nhi = nlo >> 4;
497 nlo &= 0xf;
498
499 Z.hi ^= Htable[nlo].hi;
500 Z.lo ^= Htable[nlo].lo;
501
502 rem = (size_t)Z.lo & 0xff;
503
504 Z.lo = (Z.hi << 56) | (Z.lo >> 8);
505 Z.hi = (Z.hi >> 8);
506
507 Z.hi ^= Hshr4[nhi].hi;
508 Z.lo ^= Hshr4[nhi].lo;
509 Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
510 }
511
512 nlo = ((const u8 *)Xi)[0];
513 nlo ^= inp[0];
514 nhi = nlo >> 4;
515 nlo &= 0xf;
516
517 Z.hi ^= Htable[nlo].hi;
518 Z.lo ^= Htable[nlo].lo;
519
520 rem = (size_t)Z.lo & 0xf;
521
522 Z.lo = (Z.hi << 60) | (Z.lo >> 4);
523 Z.hi = (Z.hi >> 4);
524
525 Z.hi ^= Htable[nhi].hi;
526 Z.lo ^= Htable[nhi].lo;
527 Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
528 # endif
529
530 if (is_endian.little) {
531 # ifdef BSWAP8
532 Xi[0] = BSWAP8(Z.hi);
533 Xi[1] = BSWAP8(Z.lo);
534 # else
535 u8 *p = (u8 *)Xi;
536 u32 v;
537 v = (u32)(Z.hi >> 32);
538 PUTU32(p, v);
539 v = (u32)(Z.hi);
540 PUTU32(p + 4, v);
541 v = (u32)(Z.lo >> 32);
542 PUTU32(p + 8, v);
543 v = (u32)(Z.lo);
544 PUTU32(p + 12, v);
545 # endif
546 } else {
547 Xi[0] = Z.hi;
548 Xi[1] = Z.lo;
549 }
550 } while (inp += 16, len -= 16);
551 }
552 # endif
553 # else
554 void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
555 void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
556 size_t len);
557 # endif
558
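/*
 * GCM_MUL(ctx) multiplies the running hash value Xi by H in GF(2^128);
 * GHASH(ctx,in,len) folds len bytes (a multiple of 16) into Xi, i.e.
 * Xi = (...((Xi ^ in[0..15]) * H ^ in[16..31]) * H ...) * H.
 */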
559 # define GCM_MUL(ctx) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
560 # if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
561 # define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" intended to mitigate the
 * cache-thrashing effect. In other words, the idea is to hash the data
 * while it is still in the L1 cache after the encryption pass...
 */
567 # define GHASH_CHUNK (3*1024)
568 # endif
569
570 #else /* TABLE_BITS */
571
572 static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
573 {
574 u128 V, Z = { 0, 0 };
575 long X;
576 int i, j;
577 const long *xi = (const long *)Xi;
578 const union {
579 long one;
580 char little;
581 } is_endian = { 1 };
582
583 V.hi = H[0]; /* H is in host byte order, no byte swapping */
584 V.lo = H[1];
585
586 for (j = 0; j < 16 / sizeof(long); ++j) {
587 if (is_endian.little) {
588 if (sizeof(long) == 8) {
589 # ifdef BSWAP8
590 X = (long)(BSWAP8(xi[j]));
591 # else
592 const u8 *p = (const u8 *)(xi + j);
593 X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
594 # endif
595 } else {
596 const u8 *p = (const u8 *)(xi + j);
597 X = (long)GETU32(p);
598 }
599 } else
600 X = xi[j];
601
602 for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
603 u64 M = (u64)(X >> (8 * sizeof(long) - 1));
604 Z.hi ^= V.hi & M;
605 Z.lo ^= V.lo & M;
606
607 REDUCE1BIT(V);
608 }
609 }
610
611 if (is_endian.little) {
612 # ifdef BSWAP8
613 Xi[0] = BSWAP8(Z.hi);
614 Xi[1] = BSWAP8(Z.lo);
615 # else
616 u8 *p = (u8 *)Xi;
617 u32 v;
618 v = (u32)(Z.hi >> 32);
619 PUTU32(p, v);
620 v = (u32)(Z.hi);
621 PUTU32(p + 4, v);
622 v = (u32)(Z.lo >> 32);
623 PUTU32(p + 8, v);
624 v = (u32)(Z.lo);
625 PUTU32(p + 12, v);
626 # endif
627 } else {
628 Xi[0] = Z.hi;
629 Xi[1] = Z.lo;
630 }
631 }
632
633 # define GCM_MUL(ctx) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
634
635 #endif
636
637 #if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
638 # if !defined(I386_ONLY) && \
639 (defined(__i386) || defined(__i386__) || \
640 defined(__x86_64) || defined(__x86_64__) || \
641 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
642 # define GHASH_ASM_X86_OR_64
643 # define GCM_FUNCREF_4BIT
644 extern unsigned int OPENSSL_ia32cap_P[];
645
646 void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
647 void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
648 void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
649 size_t len);
650
651 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
652 # define gcm_init_avx gcm_init_clmul
653 # define gcm_gmult_avx gcm_gmult_clmul
654 # define gcm_ghash_avx gcm_ghash_clmul
655 # else
656 void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
657 void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
658 void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
659 size_t len);
660 # endif
661
662 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
663 # define GHASH_ASM_X86
664 void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
665 void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
666 size_t len);
667
668 void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
669 void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
670 size_t len);
671 # endif
672 # elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
673 # include "arm_arch.h"
674 # if __ARM_MAX_ARCH__>=7
675 # define GHASH_ASM_ARM
676 # define GCM_FUNCREF_4BIT
677 # define PMULL_CAPABLE (OPENSSL_armcap_P & ARMV8_PMULL)
678 # if defined(__arm__) || defined(__arm)
679 # define NEON_CAPABLE (OPENSSL_armcap_P & ARMV7_NEON)
680 # endif
681 void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
682 void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
683 void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
684 size_t len);
685 void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
686 void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
687 void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
688 size_t len);
689 # endif
690 # elif defined(__sparc__) || defined(__sparc)
691 # include "sparc_arch.h"
692 # define GHASH_ASM_SPARC
693 # define GCM_FUNCREF_4BIT
694 extern unsigned int OPENSSL_sparcv9cap_P[];
695 void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
696 void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
697 void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
698 size_t len);
699 # elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
700 # include "ppc_arch.h"
701 # define GHASH_ASM_PPC
702 # define GCM_FUNCREF_4BIT
703 void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
704 void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
705 void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
706 size_t len);
707 # endif
708 #endif
709
710 #ifdef GCM_FUNCREF_4BIT
711 # undef GCM_MUL
712 # define GCM_MUL(ctx) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
713 # ifdef GHASH
714 # undef GHASH
715 # define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
716 # endif
717 #endif
718
719 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
720 {
721 const union {
722 long one;
723 char little;
724 } is_endian = { 1 };
725
726 memset(ctx, 0, sizeof(*ctx));
727 ctx->block = block;
728 ctx->key = key;
729
730 (*block) (ctx->H.c, ctx->H.c, key);
731
732 if (is_endian.little) {
733 /* H is stored in host byte order */
734 #ifdef BSWAP8
735 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
736 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
737 #else
738 u8 *p = ctx->H.c;
739 u64 hi, lo;
740 hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
741 lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
742 ctx->H.u[0] = hi;
743 ctx->H.u[1] = lo;
744 #endif
745 }
746 #if TABLE_BITS==8
747 gcm_init_8bit(ctx->Htable, ctx->H.u);
748 #elif TABLE_BITS==4
749 # if defined(GHASH)
750 # define CTX__GHASH(f) (ctx->ghash = (f))
751 # else
752 # define CTX__GHASH(f) (ctx->ghash = NULL)
753 # endif
754 # if defined(GHASH_ASM_X86_OR_64)
755 # if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
756 if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
757 if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
758 gcm_init_avx(ctx->Htable, ctx->H.u);
759 ctx->gmult = gcm_gmult_avx;
760 CTX__GHASH(gcm_ghash_avx);
761 } else {
762 gcm_init_clmul(ctx->Htable, ctx->H.u);
763 ctx->gmult = gcm_gmult_clmul;
764 CTX__GHASH(gcm_ghash_clmul);
765 }
766 return;
767 }
768 # endif
769 gcm_init_4bit(ctx->Htable, ctx->H.u);
770 # if defined(GHASH_ASM_X86) /* x86 only */
771 # if defined(OPENSSL_IA32_SSE2)
772 if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
773 # else
774 if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
775 # endif
776 ctx->gmult = gcm_gmult_4bit_mmx;
777 CTX__GHASH(gcm_ghash_4bit_mmx);
778 } else {
779 ctx->gmult = gcm_gmult_4bit_x86;
780 CTX__GHASH(gcm_ghash_4bit_x86);
781 }
782 # else
783 ctx->gmult = gcm_gmult_4bit;
784 CTX__GHASH(gcm_ghash_4bit);
785 # endif
786 # elif defined(GHASH_ASM_ARM)
787 # ifdef PMULL_CAPABLE
788 if (PMULL_CAPABLE) {
789 gcm_init_v8(ctx->Htable, ctx->H.u);
790 ctx->gmult = gcm_gmult_v8;
791 CTX__GHASH(gcm_ghash_v8);
792 } else
793 # endif
794 # ifdef NEON_CAPABLE
795 if (NEON_CAPABLE) {
796 gcm_init_neon(ctx->Htable, ctx->H.u);
797 ctx->gmult = gcm_gmult_neon;
798 CTX__GHASH(gcm_ghash_neon);
799 } else
800 # endif
801 {
802 gcm_init_4bit(ctx->Htable, ctx->H.u);
803 ctx->gmult = gcm_gmult_4bit;
804 CTX__GHASH(gcm_ghash_4bit);
805 }
806 # elif defined(GHASH_ASM_SPARC)
807 if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
808 gcm_init_vis3(ctx->Htable, ctx->H.u);
809 ctx->gmult = gcm_gmult_vis3;
810 CTX__GHASH(gcm_ghash_vis3);
811 } else {
812 gcm_init_4bit(ctx->Htable, ctx->H.u);
813 ctx->gmult = gcm_gmult_4bit;
814 CTX__GHASH(gcm_ghash_4bit);
815 }
816 # elif defined(GHASH_ASM_PPC)
817 if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
818 gcm_init_p8(ctx->Htable, ctx->H.u);
819 ctx->gmult = gcm_gmult_p8;
820 CTX__GHASH(gcm_ghash_p8);
821 } else {
822 gcm_init_4bit(ctx->Htable, ctx->H.u);
823 ctx->gmult = gcm_gmult_4bit;
824 CTX__GHASH(gcm_ghash_4bit);
825 }
826 # else
827 gcm_init_4bit(ctx->Htable, ctx->H.u);
828 # endif
829 # undef CTX__GHASH
830 #endif
831 }
832
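/*
 * CRYPTO_gcm128_setiv computes the pre-counter block Y0 as specified
 * in NIST SP 800-38D: a 96-bit IV is used directly with the 32-bit
 * counter set to 1, while an IV of any other length is folded through
 * GHASH together with its bit length.
 */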
833 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
834 size_t len)
835 {
836 const union {
837 long one;
838 char little;
839 } is_endian = { 1 };
840 unsigned int ctr;
841 #ifdef GCM_FUNCREF_4BIT
842 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
843 #endif
844
845 ctx->len.u[0] = 0; /* AAD length */
846 ctx->len.u[1] = 0; /* message length */
847 ctx->ares = 0;
848 ctx->mres = 0;
849
850 if (len == 12) {
851 memcpy(ctx->Yi.c, iv, 12);
852 ctx->Yi.c[12] = 0;
853 ctx->Yi.c[13] = 0;
854 ctx->Yi.c[14] = 0;
855 ctx->Yi.c[15] = 1;
856 ctr = 1;
857 } else {
858 size_t i;
859 u64 len0 = len;
860
861 /* Borrow ctx->Xi to calculate initial Yi */
862 ctx->Xi.u[0] = 0;
863 ctx->Xi.u[1] = 0;
864
865 while (len >= 16) {
866 for (i = 0; i < 16; ++i)
867 ctx->Xi.c[i] ^= iv[i];
868 GCM_MUL(ctx);
869 iv += 16;
870 len -= 16;
871 }
872 if (len) {
873 for (i = 0; i < len; ++i)
874 ctx->Xi.c[i] ^= iv[i];
875 GCM_MUL(ctx);
876 }
877 len0 <<= 3;
878 if (is_endian.little) {
879 #ifdef BSWAP8
880 ctx->Xi.u[1] ^= BSWAP8(len0);
881 #else
882 ctx->Xi.c[8] ^= (u8)(len0 >> 56);
883 ctx->Xi.c[9] ^= (u8)(len0 >> 48);
884 ctx->Xi.c[10] ^= (u8)(len0 >> 40);
885 ctx->Xi.c[11] ^= (u8)(len0 >> 32);
886 ctx->Xi.c[12] ^= (u8)(len0 >> 24);
887 ctx->Xi.c[13] ^= (u8)(len0 >> 16);
888 ctx->Xi.c[14] ^= (u8)(len0 >> 8);
889 ctx->Xi.c[15] ^= (u8)(len0);
890 #endif
891 } else {
892 ctx->Xi.u[1] ^= len0;
893 }
894
895 GCM_MUL(ctx);
896
897 if (is_endian.little)
898 #ifdef BSWAP4
899 ctr = BSWAP4(ctx->Xi.d[3]);
900 #else
901 ctr = GETU32(ctx->Xi.c + 12);
902 #endif
903 else
904 ctr = ctx->Xi.d[3];
905
906 /* Copy borrowed Xi to Yi */
907 ctx->Yi.u[0] = ctx->Xi.u[0];
908 ctx->Yi.u[1] = ctx->Xi.u[1];
909 }
910
911 ctx->Xi.u[0] = 0;
912 ctx->Xi.u[1] = 0;
913
914 (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
915 ++ctr;
916 if (is_endian.little)
917 #ifdef BSWAP4
918 ctx->Yi.d[3] = BSWAP4(ctr);
919 #else
920 PUTU32(ctx->Yi.c + 12, ctr);
921 #endif
922 else
923 ctx->Yi.d[3] = ctr;
924 }
925
926 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
927 size_t len)
928 {
929 size_t i;
930 unsigned int n;
931 u64 alen = ctx->len.u[0];
932 #ifdef GCM_FUNCREF_4BIT
933 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
934 # ifdef GHASH
935 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
936 const u8 *inp, size_t len) = ctx->ghash;
937 # endif
938 #endif
939
940 if (ctx->len.u[1])
941 return -2;
942
943 alen += len;
944 if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
945 return -1;
946 ctx->len.u[0] = alen;
947
948 n = ctx->ares;
949 if (n) {
950 while (n && len) {
951 ctx->Xi.c[n] ^= *(aad++);
952 --len;
953 n = (n + 1) % 16;
954 }
955 if (n == 0)
956 GCM_MUL(ctx);
957 else {
958 ctx->ares = n;
959 return 0;
960 }
961 }
962 #ifdef GHASH
963 if ((i = (len & (size_t)-16))) {
964 GHASH(ctx, aad, i);
965 aad += i;
966 len -= i;
967 }
968 #else
969 while (len >= 16) {
970 for (i = 0; i < 16; ++i)
971 ctx->Xi.c[i] ^= aad[i];
972 GCM_MUL(ctx);
973 aad += 16;
974 len -= 16;
975 }
976 #endif
977 if (len) {
978 n = (unsigned int)len;
979 for (i = 0; i < len; ++i)
980 ctx->Xi.c[i] ^= aad[i];
981 }
982
983 ctx->ares = n;
984 return 0;
985 }
986
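/*
 * Note on buffering: when a one-pass GHASH implementation is
 * available, partially processed input (and the pending GHASH(AAD)
 * state) is staged in ctx->Xn so that hashing can be done in larger
 * batches, instead of performing one GF(2^128) multiplication per
 * 16-byte block.
 */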
987 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
988 const unsigned char *in, unsigned char *out,
989 size_t len)
990 {
991 const union {
992 long one;
993 char little;
994 } is_endian = { 1 };
995 unsigned int n, ctr, mres;
996 size_t i;
997 u64 mlen = ctx->len.u[1];
998 block128_f block = ctx->block;
999 void *key = ctx->key;
1000 #ifdef GCM_FUNCREF_4BIT
1001 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1002 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1003 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1004 const u8 *inp, size_t len) = ctx->ghash;
1005 # endif
1006 #endif
1007
1008 mlen += len;
1009 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1010 return -1;
1011 ctx->len.u[1] = mlen;
1012
1013 mres = ctx->mres;
1014
1015 if (ctx->ares) {
1016 /* First call to encrypt finalizes GHASH(AAD) */
1017 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1018 if (len == 0) {
1019 GCM_MUL(ctx);
1020 ctx->ares = 0;
1021 return 0;
1022 }
1023 memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1024 ctx->Xi.u[0] = 0;
1025 ctx->Xi.u[1] = 0;
1026 mres = sizeof(ctx->Xi);
1027 #else
1028 GCM_MUL(ctx);
1029 #endif
1030 ctx->ares = 0;
1031 }
1032
1033 if (is_endian.little)
1034 #ifdef BSWAP4
1035 ctr = BSWAP4(ctx->Yi.d[3]);
1036 #else
1037 ctr = GETU32(ctx->Yi.c + 12);
1038 #endif
1039 else
1040 ctr = ctx->Yi.d[3];
1041
1042 n = mres % 16;
1043 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1044 if (16 % sizeof(size_t) == 0) { /* always true actually */
1045 do {
1046 if (n) {
1047 # if defined(GHASH)
1048 while (n && len) {
1049 ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
1050 --len;
1051 n = (n + 1) % 16;
1052 }
1053 if (n == 0) {
1054 GHASH(ctx, ctx->Xn, mres);
1055 mres = 0;
1056 } else {
1057 ctx->mres = mres;
1058 return 0;
1059 }
1060 # else
1061 while (n && len) {
1062 ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1063 --len;
1064 n = (n + 1) % 16;
1065 }
1066 if (n == 0) {
1067 GCM_MUL(ctx);
1068 mres = 0;
1069 } else {
1070 ctx->mres = n;
1071 return 0;
1072 }
1073 # endif
1074 }
1075 # if defined(STRICT_ALIGNMENT)
1076 if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1077 break;
1078 # endif
1079 # if defined(GHASH)
1080 if (len >= 16 && mres) {
1081 GHASH(ctx, ctx->Xn, mres);
1082 mres = 0;
1083 }
1084 # if defined(GHASH_CHUNK)
1085 while (len >= GHASH_CHUNK) {
1086 size_t j = GHASH_CHUNK;
1087
1088 while (j) {
1089 size_t_aX *out_t = (size_t_aX *)out;
1090 const size_t_aX *in_t = (const size_t_aX *)in;
1091
1092 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1093 ++ctr;
1094 if (is_endian.little)
1095 # ifdef BSWAP4
1096 ctx->Yi.d[3] = BSWAP4(ctr);
1097 # else
1098 PUTU32(ctx->Yi.c + 12, ctr);
1099 # endif
1100 else
1101 ctx->Yi.d[3] = ctr;
1102 for (i = 0; i < 16 / sizeof(size_t); ++i)
1103 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1104 out += 16;
1105 in += 16;
1106 j -= 16;
1107 }
1108 GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
1109 len -= GHASH_CHUNK;
1110 }
1111 # endif
1112 if ((i = (len & (size_t)-16))) {
1113 size_t j = i;
1114
1115 while (len >= 16) {
1116 size_t_aX *out_t = (size_t_aX *)out;
1117 const size_t_aX *in_t = (const size_t_aX *)in;
1118
1119 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1120 ++ctr;
1121 if (is_endian.little)
1122 # ifdef BSWAP4
1123 ctx->Yi.d[3] = BSWAP4(ctr);
1124 # else
1125 PUTU32(ctx->Yi.c + 12, ctr);
1126 # endif
1127 else
1128 ctx->Yi.d[3] = ctr;
1129 for (i = 0; i < 16 / sizeof(size_t); ++i)
1130 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1131 out += 16;
1132 in += 16;
1133 len -= 16;
1134 }
1135 GHASH(ctx, out - j, j);
1136 }
1137 # else
1138 while (len >= 16) {
1139 size_t *out_t = (size_t *)out;
1140 const size_t *in_t = (const size_t *)in;
1141
1142 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1143 ++ctr;
1144 if (is_endian.little)
1145 # ifdef BSWAP4
1146 ctx->Yi.d[3] = BSWAP4(ctr);
1147 # else
1148 PUTU32(ctx->Yi.c + 12, ctr);
1149 # endif
1150 else
1151 ctx->Yi.d[3] = ctr;
1152 for (i = 0; i < 16 / sizeof(size_t); ++i)
1153 ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1154 GCM_MUL(ctx);
1155 out += 16;
1156 in += 16;
1157 len -= 16;
1158 }
1159 # endif
1160 if (len) {
1161 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1162 ++ctr;
1163 if (is_endian.little)
1164 # ifdef BSWAP4
1165 ctx->Yi.d[3] = BSWAP4(ctr);
1166 # else
1167 PUTU32(ctx->Yi.c + 12, ctr);
1168 # endif
1169 else
1170 ctx->Yi.d[3] = ctr;
1171 # if defined(GHASH)
1172 while (len--) {
1173 ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
1174 ++n;
1175 }
1176 # else
1177 while (len--) {
1178 ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1179 ++n;
1180 }
1181 mres = n;
1182 # endif
1183 }
1184
1185 ctx->mres = mres;
1186 return 0;
1187 } while (0);
1188 }
1189 #endif
1190 for (i = 0; i < len; ++i) {
1191 if (n == 0) {
1192 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1193 ++ctr;
1194 if (is_endian.little)
1195 #ifdef BSWAP4
1196 ctx->Yi.d[3] = BSWAP4(ctr);
1197 #else
1198 PUTU32(ctx->Yi.c + 12, ctr);
1199 #endif
1200 else
1201 ctx->Yi.d[3] = ctr;
1202 }
1203 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1204 ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
1205 n = (n + 1) % 16;
1206 if (mres == sizeof(ctx->Xn)) {
1207 GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
1208 mres = 0;
1209 }
1210 #else
1211 ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
1212 mres = n = (n + 1) % 16;
1213 if (n == 0)
1214 GCM_MUL(ctx);
1215 #endif
1216 }
1217
1218 ctx->mres = mres;
1219 return 0;
1220 }
1221
1222 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1223 const unsigned char *in, unsigned char *out,
1224 size_t len)
1225 {
1226 const union {
1227 long one;
1228 char little;
1229 } is_endian = { 1 };
1230 unsigned int n, ctr, mres;
1231 size_t i;
1232 u64 mlen = ctx->len.u[1];
1233 block128_f block = ctx->block;
1234 void *key = ctx->key;
1235 #ifdef GCM_FUNCREF_4BIT
1236 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1237 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1238 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1239 const u8 *inp, size_t len) = ctx->ghash;
1240 # endif
1241 #endif
1242
1243 mlen += len;
1244 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1245 return -1;
1246 ctx->len.u[1] = mlen;
1247
1248 mres = ctx->mres;
1249
1250 if (ctx->ares) {
1251 /* First call to decrypt finalizes GHASH(AAD) */
1252 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1253 if (len == 0) {
1254 GCM_MUL(ctx);
1255 ctx->ares = 0;
1256 return 0;
1257 }
1258 memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1259 ctx->Xi.u[0] = 0;
1260 ctx->Xi.u[1] = 0;
1261 mres = sizeof(ctx->Xi);
1262 #else
1263 GCM_MUL(ctx);
1264 #endif
1265 ctx->ares = 0;
1266 }
1267
1268 if (is_endian.little)
1269 #ifdef BSWAP4
1270 ctr = BSWAP4(ctx->Yi.d[3]);
1271 #else
1272 ctr = GETU32(ctx->Yi.c + 12);
1273 #endif
1274 else
1275 ctr = ctx->Yi.d[3];
1276
1277 n = mres % 16;
1278 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1279 if (16 % sizeof(size_t) == 0) { /* always true actually */
1280 do {
1281 if (n) {
1282 # if defined(GHASH)
1283 while (n && len) {
1284 *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
1285 --len;
1286 n = (n + 1) % 16;
1287 }
1288 if (n == 0) {
1289 GHASH(ctx, ctx->Xn, mres);
1290 mres = 0;
1291 } else {
1292 ctx->mres = mres;
1293 return 0;
1294 }
1295 # else
1296 while (n && len) {
1297 u8 c = *(in++);
1298 *(out++) = c ^ ctx->EKi.c[n];
1299 ctx->Xi.c[n] ^= c;
1300 --len;
1301 n = (n + 1) % 16;
1302 }
1303 if (n == 0) {
1304 GCM_MUL(ctx);
1305 mres = 0;
1306 } else {
1307 ctx->mres = n;
1308 return 0;
1309 }
1310 # endif
1311 }
1312 # if defined(STRICT_ALIGNMENT)
1313 if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1314 break;
1315 # endif
1316 # if defined(GHASH)
1317 if (len >= 16 && mres) {
1318 GHASH(ctx, ctx->Xn, mres);
1319 mres = 0;
1320 }
1321 # if defined(GHASH_CHUNK)
1322 while (len >= GHASH_CHUNK) {
1323 size_t j = GHASH_CHUNK;
1324
1325 GHASH(ctx, in, GHASH_CHUNK);
1326 while (j) {
1327 size_t_aX *out_t = (size_t_aX *)out;
1328 const size_t_aX *in_t = (const size_t_aX *)in;
1329
1330 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1331 ++ctr;
1332 if (is_endian.little)
1333 # ifdef BSWAP4
1334 ctx->Yi.d[3] = BSWAP4(ctr);
1335 # else
1336 PUTU32(ctx->Yi.c + 12, ctr);
1337 # endif
1338 else
1339 ctx->Yi.d[3] = ctr;
1340 for (i = 0; i < 16 / sizeof(size_t); ++i)
1341 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1342 out += 16;
1343 in += 16;
1344 j -= 16;
1345 }
1346 len -= GHASH_CHUNK;
1347 }
1348 # endif
1349 if ((i = (len & (size_t)-16))) {
1350 GHASH(ctx, in, i);
1351 while (len >= 16) {
1352 size_t_aX *out_t = (size_t_aX *)out;
1353 const size_t_aX *in_t = (const size_t_aX *)in;
1354
1355 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1356 ++ctr;
1357 if (is_endian.little)
1358 # ifdef BSWAP4
1359 ctx->Yi.d[3] = BSWAP4(ctr);
1360 # else
1361 PUTU32(ctx->Yi.c + 12, ctr);
1362 # endif
1363 else
1364 ctx->Yi.d[3] = ctr;
1365 for (i = 0; i < 16 / sizeof(size_t); ++i)
1366 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1367 out += 16;
1368 in += 16;
1369 len -= 16;
1370 }
1371 }
1372 # else
1373 while (len >= 16) {
1374 size_t *out_t = (size_t *)out;
1375 const size_t *in_t = (const size_t *)in;
1376
1377 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1378 ++ctr;
1379 if (is_endian.little)
1380 # ifdef BSWAP4
1381 ctx->Yi.d[3] = BSWAP4(ctr);
1382 # else
1383 PUTU32(ctx->Yi.c + 12, ctr);
1384 # endif
1385 else
1386 ctx->Yi.d[3] = ctr;
1387 for (i = 0; i < 16 / sizeof(size_t); ++i) {
1388 size_t c = in_t[i];
1389 out_t[i] = c ^ ctx->EKi.t[i];
1390 ctx->Xi.t[i] ^= c;
1391 }
1392 GCM_MUL(ctx);
1393 out += 16;
1394 in += 16;
1395 len -= 16;
1396 }
1397 # endif
1398 if (len) {
1399 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1400 ++ctr;
1401 if (is_endian.little)
1402 # ifdef BSWAP4
1403 ctx->Yi.d[3] = BSWAP4(ctr);
1404 # else
1405 PUTU32(ctx->Yi.c + 12, ctr);
1406 # endif
1407 else
1408 ctx->Yi.d[3] = ctr;
1409 # if defined(GHASH)
1410 while (len--) {
1411 out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1412 ++n;
1413 }
1414 # else
1415 while (len--) {
1416 u8 c = in[n];
1417 ctx->Xi.c[n] ^= c;
1418 out[n] = c ^ ctx->EKi.c[n];
1419 ++n;
1420 }
1421 mres = n;
1422 # endif
1423 }
1424
1425 ctx->mres = mres;
1426 return 0;
1427 } while (0);
1428 }
1429 #endif
1430 for (i = 0; i < len; ++i) {
1431 u8 c;
1432 if (n == 0) {
1433 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1434 ++ctr;
1435 if (is_endian.little)
1436 #ifdef BSWAP4
1437 ctx->Yi.d[3] = BSWAP4(ctr);
1438 #else
1439 PUTU32(ctx->Yi.c + 12, ctr);
1440 #endif
1441 else
1442 ctx->Yi.d[3] = ctr;
1443 }
1444 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1445 out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
1446 n = (n + 1) % 16;
1447 if (mres == sizeof(ctx->Xn)) {
1448 GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
1449 mres = 0;
1450 }
1451 #else
1452 c = in[i];
1453 out[i] = c ^ ctx->EKi.c[n];
1454 ctx->Xi.c[n] ^= c;
1455 mres = n = (n + 1) % 16;
1456 if (n == 0)
1457 GCM_MUL(ctx);
1458 #endif
1459 }
1460
1461 ctx->mres = mres;
1462 return 0;
1463 }
1464
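/*
 * The *_ctr32 entry points take a ctr128_f routine that encrypts a run
 * of whole 16-byte blocks in CTR mode with a 32-bit big-endian counter
 * (for example an AES-NI counter-mode kernel), so the bulk of the data
 * can bypass the one-block-at-a-time path above.
 */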
1465 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1466 const unsigned char *in, unsigned char *out,
1467 size_t len, ctr128_f stream)
1468 {
1469 #if defined(OPENSSL_SMALL_FOOTPRINT)
1470 return CRYPTO_gcm128_encrypt(ctx, in, out, len);
1471 #else
1472 const union {
1473 long one;
1474 char little;
1475 } is_endian = { 1 };
1476 unsigned int n, ctr, mres;
1477 size_t i;
1478 u64 mlen = ctx->len.u[1];
1479 void *key = ctx->key;
1480 # ifdef GCM_FUNCREF_4BIT
1481 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1482 # ifdef GHASH
1483 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1484 const u8 *inp, size_t len) = ctx->ghash;
1485 # endif
1486 # endif
1487
1488 mlen += len;
1489 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1490 return -1;
1491 ctx->len.u[1] = mlen;
1492
1493 mres = ctx->mres;
1494
1495 if (ctx->ares) {
1496 /* First call to encrypt finalizes GHASH(AAD) */
1497 #if defined(GHASH)
1498 if (len == 0) {
1499 GCM_MUL(ctx);
1500 ctx->ares = 0;
1501 return 0;
1502 }
1503 memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1504 ctx->Xi.u[0] = 0;
1505 ctx->Xi.u[1] = 0;
1506 mres = sizeof(ctx->Xi);
1507 #else
1508 GCM_MUL(ctx);
1509 #endif
1510 ctx->ares = 0;
1511 }
1512
1513 if (is_endian.little)
1514 # ifdef BSWAP4
1515 ctr = BSWAP4(ctx->Yi.d[3]);
1516 # else
1517 ctr = GETU32(ctx->Yi.c + 12);
1518 # endif
1519 else
1520 ctr = ctx->Yi.d[3];
1521
1522 n = mres % 16;
1523 if (n) {
1524 # if defined(GHASH)
1525 while (n && len) {
1526 ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
1527 --len;
1528 n = (n + 1) % 16;
1529 }
1530 if (n == 0) {
1531 GHASH(ctx, ctx->Xn, mres);
1532 mres = 0;
1533 } else {
1534 ctx->mres = mres;
1535 return 0;
1536 }
1537 # else
1538 while (n && len) {
1539 ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1540 --len;
1541 n = (n + 1) % 16;
1542 }
1543 if (n == 0) {
1544 GCM_MUL(ctx);
1545 mres = 0;
1546 } else {
1547 ctx->mres = n;
1548 return 0;
1549 }
1550 # endif
1551 }
1552 # if defined(GHASH)
1553 if (len >= 16 && mres) {
1554 GHASH(ctx, ctx->Xn, mres);
1555 mres = 0;
1556 }
1557 # if defined(GHASH_CHUNK)
1558 while (len >= GHASH_CHUNK) {
1559 (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1560 ctr += GHASH_CHUNK / 16;
1561 if (is_endian.little)
1562 # ifdef BSWAP4
1563 ctx->Yi.d[3] = BSWAP4(ctr);
1564 # else
1565 PUTU32(ctx->Yi.c + 12, ctr);
1566 # endif
1567 else
1568 ctx->Yi.d[3] = ctr;
1569 GHASH(ctx, out, GHASH_CHUNK);
1570 out += GHASH_CHUNK;
1571 in += GHASH_CHUNK;
1572 len -= GHASH_CHUNK;
1573 }
1574 # endif
1575 # endif
1576 if ((i = (len & (size_t)-16))) {
1577 size_t j = i / 16;
1578
1579 (*stream) (in, out, j, key, ctx->Yi.c);
1580 ctr += (unsigned int)j;
1581 if (is_endian.little)
1582 # ifdef BSWAP4
1583 ctx->Yi.d[3] = BSWAP4(ctr);
1584 # else
1585 PUTU32(ctx->Yi.c + 12, ctr);
1586 # endif
1587 else
1588 ctx->Yi.d[3] = ctr;
1589 in += i;
1590 len -= i;
1591 # if defined(GHASH)
1592 GHASH(ctx, out, i);
1593 out += i;
1594 # else
1595 while (j--) {
1596 for (i = 0; i < 16; ++i)
1597 ctx->Xi.c[i] ^= out[i];
1598 GCM_MUL(ctx);
1599 out += 16;
1600 }
1601 # endif
1602 }
1603 if (len) {
1604 (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1605 ++ctr;
1606 if (is_endian.little)
1607 # ifdef BSWAP4
1608 ctx->Yi.d[3] = BSWAP4(ctr);
1609 # else
1610 PUTU32(ctx->Yi.c + 12, ctr);
1611 # endif
1612 else
1613 ctx->Yi.d[3] = ctr;
1614 while (len--) {
1615 # if defined(GHASH)
1616 ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
1617 # else
1618 ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1619 # endif
1620 ++n;
1621 }
1622 }
1623
1624 ctx->mres = mres;
1625 return 0;
1626 #endif
1627 }
1628
1629 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1630 const unsigned char *in, unsigned char *out,
1631 size_t len, ctr128_f stream)
1632 {
1633 #if defined(OPENSSL_SMALL_FOOTPRINT)
1634 return CRYPTO_gcm128_decrypt(ctx, in, out, len);
1635 #else
1636 const union {
1637 long one;
1638 char little;
1639 } is_endian = { 1 };
1640 unsigned int n, ctr, mres;
1641 size_t i;
1642 u64 mlen = ctx->len.u[1];
1643 void *key = ctx->key;
1644 # ifdef GCM_FUNCREF_4BIT
1645 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1646 # ifdef GHASH
1647 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1648 const u8 *inp, size_t len) = ctx->ghash;
1649 # endif
1650 # endif
1651
1652 mlen += len;
1653 if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1654 return -1;
1655 ctx->len.u[1] = mlen;
1656
1657 mres = ctx->mres;
1658
1659 if (ctx->ares) {
1660 /* First call to decrypt finalizes GHASH(AAD) */
1661 # if defined(GHASH)
1662 if (len == 0) {
1663 GCM_MUL(ctx);
1664 ctx->ares = 0;
1665 return 0;
1666 }
1667 memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1668 ctx->Xi.u[0] = 0;
1669 ctx->Xi.u[1] = 0;
1670 mres = sizeof(ctx->Xi);
1671 # else
1672 GCM_MUL(ctx);
1673 # endif
1674 ctx->ares = 0;
1675 }
1676
1677 if (is_endian.little)
1678 # ifdef BSWAP4
1679 ctr = BSWAP4(ctx->Yi.d[3]);
1680 # else
1681 ctr = GETU32(ctx->Yi.c + 12);
1682 # endif
1683 else
1684 ctr = ctx->Yi.d[3];
1685
1686 n = mres % 16;
1687 if (n) {
1688 # if defined(GHASH)
1689 while (n && len) {
1690 *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
1691 --len;
1692 n = (n + 1) % 16;
1693 }
1694 if (n == 0) {
1695 GHASH(ctx, ctx->Xn, mres);
1696 mres = 0;
1697 } else {
1698 ctx->mres = mres;
1699 return 0;
1700 }
1701 # else
1702 while (n && len) {
1703 u8 c = *(in++);
1704 *(out++) = c ^ ctx->EKi.c[n];
1705 ctx->Xi.c[n] ^= c;
1706 --len;
1707 n = (n + 1) % 16;
1708 }
1709 if (n == 0) {
1710 GCM_MUL(ctx);
1711 mres = 0;
1712 } else {
1713 ctx->mres = n;
1714 return 0;
1715 }
1716 # endif
1717 }
1718 # if defined(GHASH)
1719 if (len >= 16 && mres) {
1720 GHASH(ctx, ctx->Xn, mres);
1721 mres = 0;
1722 }
1723 # if defined(GHASH_CHUNK)
1724 while (len >= GHASH_CHUNK) {
1725 GHASH(ctx, in, GHASH_CHUNK);
1726 (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1727 ctr += GHASH_CHUNK / 16;
1728 if (is_endian.little)
1729 # ifdef BSWAP4
1730 ctx->Yi.d[3] = BSWAP4(ctr);
1731 # else
1732 PUTU32(ctx->Yi.c + 12, ctr);
1733 # endif
1734 else
1735 ctx->Yi.d[3] = ctr;
1736 out += GHASH_CHUNK;
1737 in += GHASH_CHUNK;
1738 len -= GHASH_CHUNK;
1739 }
1740 # endif
1741 # endif
1742 if ((i = (len & (size_t)-16))) {
1743 size_t j = i / 16;
1744
1745 # if defined(GHASH)
1746 GHASH(ctx, in, i);
1747 # else
1748 while (j--) {
1749 size_t k;
1750 for (k = 0; k < 16; ++k)
1751 ctx->Xi.c[k] ^= in[k];
1752 GCM_MUL(ctx);
1753 in += 16;
1754 }
1755 j = i / 16;
1756 in -= i;
1757 # endif
1758 (*stream) (in, out, j, key, ctx->Yi.c);
1759 ctr += (unsigned int)j;
1760 if (is_endian.little)
1761 # ifdef BSWAP4
1762 ctx->Yi.d[3] = BSWAP4(ctr);
1763 # else
1764 PUTU32(ctx->Yi.c + 12, ctr);
1765 # endif
1766 else
1767 ctx->Yi.d[3] = ctr;
1768 out += i;
1769 in += i;
1770 len -= i;
1771 }
1772 if (len) {
1773 (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1774 ++ctr;
1775 if (is_endian.little)
1776 # ifdef BSWAP4
1777 ctx->Yi.d[3] = BSWAP4(ctr);
1778 # else
1779 PUTU32(ctx->Yi.c + 12, ctr);
1780 # endif
1781 else
1782 ctx->Yi.d[3] = ctr;
1783 while (len--) {
1784 # if defined(GHASH)
1785 out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1786 # else
1787 u8 c = in[n];
1788 ctx->Xi.c[mres++] ^= c;
1789 out[n] = c ^ ctx->EKi.c[n];
1790 # endif
1791 ++n;
1792 }
1793 }
1794
1795 ctx->mres = mres;
1796 return 0;
1797 #endif
1798 }
1799
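/*
 * CRYPTO_gcm128_finish completes GHASH with the bit lengths of the AAD
 * and the ciphertext, XORs in EK0 (the encrypted initial counter
 * block) to form the tag, and, if a tag is supplied, compares it in
 * constant time with CRYPTO_memcmp.
 */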
1800 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1801 size_t len)
1802 {
1803 const union {
1804 long one;
1805 char little;
1806 } is_endian = { 1 };
1807 u64 alen = ctx->len.u[0] << 3;
1808 u64 clen = ctx->len.u[1] << 3;
1809 #ifdef GCM_FUNCREF_4BIT
1810 void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1811 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1812 void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1813 const u8 *inp, size_t len) = ctx->ghash;
1814 # endif
1815 #endif
1816
1817 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1818 u128 bitlen;
1819 unsigned int mres = ctx->mres;
1820
1821 if (mres) {
1822 unsigned blocks = (mres + 15) & -16;
1823
1824 memset(ctx->Xn + mres, 0, blocks - mres);
1825 mres = blocks;
1826 if (mres == sizeof(ctx->Xn)) {
1827 GHASH(ctx, ctx->Xn, mres);
1828 mres = 0;
1829 }
1830 } else if (ctx->ares) {
1831 GCM_MUL(ctx);
1832 }
1833 #else
1834 if (ctx->mres || ctx->ares)
1835 GCM_MUL(ctx);
1836 #endif
1837
1838 if (is_endian.little) {
1839 #ifdef BSWAP8
1840 alen = BSWAP8(alen);
1841 clen = BSWAP8(clen);
1842 #else
1843 u8 *p = ctx->len.c;
1844
1845 ctx->len.u[0] = alen;
1846 ctx->len.u[1] = clen;
1847
1848 alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
1849 clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
1850 #endif
1851 }
1852
1853 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1854 bitlen.hi = alen;
1855 bitlen.lo = clen;
1856 memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
1857 mres += sizeof(bitlen);
1858 GHASH(ctx, ctx->Xn, mres);
1859 #else
1860 ctx->Xi.u[0] ^= alen;
1861 ctx->Xi.u[1] ^= clen;
1862 GCM_MUL(ctx);
1863 #endif
1864
1865 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1866 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1867
1868 if (tag && len <= sizeof(ctx->Xi))
1869 return CRYPTO_memcmp(ctx->Xi.c, tag, len);
1870 else
1871 return -1;
1872 }
1873
1874 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1875 {
1876 CRYPTO_gcm128_finish(ctx, NULL, 0);
1877 memcpy(tag, ctx->Xi.c,
1878 len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1879 }
1880
1881 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1882 {
1883 GCM128_CONTEXT *ret;
1884
1885 if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
1886 CRYPTO_gcm128_init(ret, key, block);
1887
1888 return ret;
1889 }
1890
1891 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1892 {
1893 OPENSSL_clear_free(ctx, sizeof(*ctx));
1894 }
1895
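/*-
 * A minimal usage sketch of the low-level GCM128 API (illustrative
 * only, kept out of the build with #if 0): the context is driven with
 * a raw block cipher, here AES through the legacy AES_* interface, and
 * the usual call order is new/init -> setiv -> aad -> encrypt -> tag.
 */
#if 0
# include <openssl/aes.h>
# include <openssl/modes.h>

static int gcm_seal_example(const unsigned char key[16],
                            const unsigned char iv[12],
                            const unsigned char *aad, size_t aad_len,
                            const unsigned char *pt, size_t pt_len,
                            unsigned char *ct, unsigned char tag[16])
{
    AES_KEY aes;
    GCM128_CONTEXT *gcm;
    int ok = 0;

    if (AES_set_encrypt_key(key, 128, &aes) != 0)
        return 0;
    if ((gcm = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt)) == NULL)
        return 0;

    CRYPTO_gcm128_setiv(gcm, iv, 12);
    if (CRYPTO_gcm128_aad(gcm, aad, aad_len) == 0
        && CRYPTO_gcm128_encrypt(gcm, pt, ct, pt_len) == 0) {
        CRYPTO_gcm128_tag(gcm, tag, 16);        /* emit a 16-byte tag */
        ok = 1;
    }

    CRYPTO_gcm128_release(gcm);
    return ok;
}
#endif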