/*
 * Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the OpenSSL license (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */
9 
10 #include <openssl/crypto.h>
11 #include "modes_local.h"
12 #include <string.h>
13 
/*
 * size_t_aX is a size_t declared with 1-byte alignment on GCC-compatible
 * compilers, unless STRICT_ALIGNMENT is defined; in every other
 * configuration it is plain size_t.
 */
#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute__((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif
19 
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/*
 * Redefine GETU32/PUTU32 as one 32-bit access combined with BSWAP4,
 * because alignment is ensured in this configuration.
 */
# undef  GETU32
# define GETU32(p)       BSWAP4(*(const u32 *)(p))
# undef  PUTU32
# define PUTU32(p,v)     (*(u32 *)(p) = BSWAP4(v))
#endif
27 
/* Place a 16-bit constant in the 16 most significant bits of a size_t. */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))

/*
 * Shift the 128-bit value V (64-bit fields .hi and .lo) right by one bit.
 * When the bit shifted out of V.lo was set, the constant 0xe1 is XORed
 * into the most significant byte of V.hi.  The two branches compute the
 * same result; the second one keeps the mask in a 32-bit temporary for
 * targets where size_t is not 64 bits wide.
 * V is expanded several times, so it must be a side-effect-free lvalue.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^((u64)T<<32); \
        } \
} while(0)
41 
/*-
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
 * "Shoup's" in the GCM specification. In other words, OpenSSL does not cover
 * the whole spectrum of possible table-driven implementations. Why? In the
 * non-"Shoup's" case the memory access pattern is segmented in such a manner
 * that it's trivial to see that cache timing information can reveal a
 * fair portion of the intermediate hash value. Given that the ciphertext is
 * always available to an attacker, it's possible for them to attempt to
 * deduce the secret parameter H and, if successful, tamper with messages
 * [which is nothing but trivial in CTR mode]. In the "Shoup's" case it's
 * not as trivial, but there is no reason to believe that it's resistant
 * to cache-timing attacks. And the thing about the "8-bit" implementation is
 * that it consumes 16 (sixteen) times more memory, 4KB per individual
 * key + 1KB shared. On the plus side, it should be twice as fast as the
 * "4-bit" version. And for gcc-generated x86[_64] code, the "8-bit" version
 * was observed to run ~75% faster, closer to 100% for commercial
 * compilers... Yet the "4-bit" procedure is preferred, because it's
 * believed to provide a better security-performance balance and adequate
 * all-round performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example, on Windows a large enough free
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even from the same
 *   thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
76 #if     TABLE_BITS==8
77 
/*
 * Build the 256-entry table consumed by gcm_gmult_8bit().
 *
 * Htable: output, 256 entries, every one of which is written.
 * H:      input, H[0] is taken as the high and H[1] as the low 64 bits.
 */
static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
    int i, j;
    u128 V;

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

    /*
     * Power-of-two slots: Htable[128] holds H itself and each halving of
     * the index applies one more REDUCE1BIT step.
     */
    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    /* Remaining slots: Htable[i + j] = Htable[i] ^ Htable[j] for 0 < j < i. */
    for (i = 2; i < 256; i <<= 1) {
        u128 *Hi = Htable + i, H0 = *Hi;
        for (j = 1; j < i; ++j) {
            Hi[j].hi = H0.hi ^ Htable[j].hi;
            Hi[j].lo = H0.lo ^ Htable[j].lo;
        }
    }
}
101 
/*
 * Update Xi in place using the table built by gcm_init_8bit().
 *
 * Xi is consumed as 16 bytes, starting with byte 15 and ending with
 * byte 0.  Each byte selects one Htable entry that is XORed into the
 * accumulator Z; between bytes Z is shifted right by 8 bits and the byte
 * that fell out of Z.lo selects a rem_8bit correction for the top of Z.hi.
 * The result is written back to Xi; on little-endian hosts it is
 * byte-swapped first (BSWAP8, or PUTU32 when BSWAP8 is unavailable).
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
    u128 Z = { 0, 0 };
    const u8 *xi = (const u8 *)Xi + 15;
    size_t rem, n = *xi;
    /* Run-time endianness probe: .little reads 1 on little-endian hosts. */
    const union {
        long one;
        char little;
    } is_endian = { 1 };
    /* 16-bit corrections, stored in the top 16 bits of a size_t by PACK(). */
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
    };

    while (1) {
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;

        /* Byte 0 has just been folded in: done. */
        if ((u8 *)Xi == xi)
            break;

        n = *(--xi);

        rem = (size_t)Z.lo & 0xff;
        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
        Z.hi = (Z.hi >> 8);
        /* PACK() left-aligns within size_t; widen when size_t is 32 bits. */
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_8bit[rem];
        else
            Z.hi ^= (u64)rem_8bit[rem] << 32;
    }

    if (is_endian.little) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
# endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

/* Multiply the context's Xi by H using the 8-bit table. */
# define GCM_MUL(ctx)      gcm_gmult_8bit((ctx)->Xi.u,(ctx)->Htable)
219 
220 #elif   TABLE_BITS==4
221 
/*
 * Build the 16-entry table consumed by gcm_gmult_4bit()/gcm_ghash_4bit().
 *
 * Htable: output, 16 entries, every one of which is written.
 * H:      input, H[0] is taken as the high and H[1] as the low 64 bits.
 *
 * With OPENSSL_SMALL_FOOTPRINT the table is filled by two loops; otherwise
 * the same assignments are written out one by one.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
    u128 V;
# if defined(OPENSSL_SMALL_FOOTPRINT)
    int i;
# endif

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

# if defined(OPENSSL_SMALL_FOOTPRINT)
    /* Power-of-two slots: Htable[8] = H, one REDUCE1BIT per halving. */
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    /* Remaining slots: Htable[i + j] = Htable[i] ^ Htable[j] for 0 < j < i. */
    for (i = 2; i < 16; i <<= 1) {
        u128 *Hi = Htable + i;
        int j;
        for (V = *Hi, j = 1; j < i; ++j) {
            Hi[j].hi = V.hi ^ Htable[j].hi;
            Hi[j].lo = V.lo ^ Htable[j].lo;
        }
    }
# else
    Htable[8] = V;
    REDUCE1BIT(V);
    Htable[4] = V;
    REDUCE1BIT(V);
    Htable[2] = V;
    REDUCE1BIT(V);
    Htable[1] = V;
    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
    V = Htable[4];
    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
    V = Htable[8];
    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
# endif
# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /*
     * ARM assembler expects specific dword order in Htable.
     */
    {
        int j;
        const union {
            long one;
            char little;
        } is_endian = { 1 };

        if (is_endian.little) {
            /* Little-endian: exchange the two 64-bit halves. */
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo;
                Htable[j].lo = V.hi;
            }
        } else {
            /* Big-endian: exchange the halves and rotate each by 32 bits. */
            for (j = 0; j < 16; ++j) {
                V = Htable[j];
                Htable[j].hi = V.lo << 32 | V.lo >> 32;
                Htable[j].lo = V.hi << 32 | V.hi >> 32;
            }
        }
    }
# endif
}
295 
296 # ifndef GHASH_ASM
297 static const size_t rem_4bit[16] = {
298     PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
299     PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
300     PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
301     PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
302 };
303 
/*
 * Update Xi in place using the table built by gcm_init_4bit().
 *
 * Xi is consumed as 16 bytes from byte 15 down to byte 0, low nibble
 * before high nibble.  Each nibble selects an Htable entry that is XORed
 * into the accumulator Z; before every nibble after the first, Z is
 * shifted right by 4 bits and the bits that fell out select a rem_4bit
 * correction.  The result is written back to Xi; on little-endian hosts it
 * is byte-swapped first (BSWAP8, or PUTU32 when BSWAP8 is unavailable).
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
    u128 Z;
    int cnt = 15;
    size_t rem, nlo, nhi;
    /* Run-time endianness probe: .little reads 1 on little-endian hosts. */
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    nlo = ((const u8 *)Xi)[15];
    nhi = nlo >> 4;
    nlo &= 0xf;

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

    while (1) {
        /* High nibble of the current byte. */
        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        /* PACK() left-aligns within size_t; widen when size_t is 32 bits. */
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        /* Low nibble of the next byte. */
        nlo = ((const u8 *)Xi)[cnt];
        nhi = nlo >> 4;
        nlo &= 0xf;

        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    if (is_endian.little) {
#  ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
#  else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi >> 32);
        PUTU32(p, v);
        v = (u32)(Z.hi);
        PUTU32(p + 4, v);
        v = (u32)(Z.lo >> 32);
        PUTU32(p + 8, v);
        v = (u32)(Z.lo);
        PUTU32(p + 12, v);
#  endif
    } else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}
373 
374 #  if !defined(OPENSSL_SMALL_FOOTPRINT)
375 /*
376  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
377  * details... Compiler-generated code doesn't seem to give any
378  * performance improvement, at least not on x86[_64]. It's here
379  * mostly as reference and a placeholder for possible future
380  * non-trivial optimization[s]...
381  */
gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 * inp,size_t len)382 static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
383                            const u8 *inp, size_t len)
384 {
385     u128 Z;
386     int cnt;
387     size_t rem, nlo, nhi;
388     const union {
389         long one;
390         char little;
391     } is_endian = { 1 };
392 
393 #   if 1
394     do {
395         cnt = 15;
396         nlo = ((const u8 *)Xi)[15];
397         nlo ^= inp[15];
398         nhi = nlo >> 4;
399         nlo &= 0xf;
400 
401         Z.hi = Htable[nlo].hi;
402         Z.lo = Htable[nlo].lo;
403 
404         while (1) {
405             rem = (size_t)Z.lo & 0xf;
406             Z.lo = (Z.hi << 60) | (Z.lo >> 4);
407             Z.hi = (Z.hi >> 4);
408             if (sizeof(size_t) == 8)
409                 Z.hi ^= rem_4bit[rem];
410             else
411                 Z.hi ^= (u64)rem_4bit[rem] << 32;
412 
413             Z.hi ^= Htable[nhi].hi;
414             Z.lo ^= Htable[nhi].lo;
415 
416             if (--cnt < 0)
417                 break;
418 
419             nlo = ((const u8 *)Xi)[cnt];
420             nlo ^= inp[cnt];
421             nhi = nlo >> 4;
422             nlo &= 0xf;
423 
424             rem = (size_t)Z.lo & 0xf;
425             Z.lo = (Z.hi << 60) | (Z.lo >> 4);
426             Z.hi = (Z.hi >> 4);
427             if (sizeof(size_t) == 8)
428                 Z.hi ^= rem_4bit[rem];
429             else
430                 Z.hi ^= (u64)rem_4bit[rem] << 32;
431 
432             Z.hi ^= Htable[nlo].hi;
433             Z.lo ^= Htable[nlo].lo;
434         }
435 #   else
436     /*
437      * Extra 256+16 bytes per-key plus 512 bytes shared tables
438      * [should] give ~50% improvement... One could have PACK()-ed
439      * the rem_8bit even here, but the priority is to minimize
440      * cache footprint...
441      */
442     u128 Hshr4[16];             /* Htable shifted right by 4 bits */
443     u8 Hshl4[16];               /* Htable shifted left by 4 bits */
444     static const unsigned short rem_8bit[256] = {
445         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
446         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
447         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
448         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
449         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
450         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
451         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
452         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
453         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
454         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
455         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
456         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
457         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
458         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
459         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
460         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
461         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
462         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
463         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
464         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
465         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
466         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
467         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
468         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
469         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
470         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
471         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
472         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
473         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
474         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
475         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
476         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
477     };
478     /*
479      * This pre-processing phase slows down procedure by approximately
480      * same time as it makes each loop spin faster. In other words
481      * single block performance is approximately same as straightforward
482      * "4-bit" implementation, and then it goes only faster...
483      */
484     for (cnt = 0; cnt < 16; ++cnt) {
485         Z.hi = Htable[cnt].hi;
486         Z.lo = Htable[cnt].lo;
487         Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
488         Hshr4[cnt].hi = (Z.hi >> 4);
489         Hshl4[cnt] = (u8)(Z.lo << 4);
490     }
491 
492     do {
493         for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
494             nlo = ((const u8 *)Xi)[cnt];
495             nlo ^= inp[cnt];
496             nhi = nlo >> 4;
497             nlo &= 0xf;
498 
499             Z.hi ^= Htable[nlo].hi;
500             Z.lo ^= Htable[nlo].lo;
501 
502             rem = (size_t)Z.lo & 0xff;
503 
504             Z.lo = (Z.hi << 56) | (Z.lo >> 8);
505             Z.hi = (Z.hi >> 8);
506 
507             Z.hi ^= Hshr4[nhi].hi;
508             Z.lo ^= Hshr4[nhi].lo;
509             Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
510         }
511 
512         nlo = ((const u8 *)Xi)[0];
513         nlo ^= inp[0];
514         nhi = nlo >> 4;
515         nlo &= 0xf;
516 
517         Z.hi ^= Htable[nlo].hi;
518         Z.lo ^= Htable[nlo].lo;
519 
520         rem = (size_t)Z.lo & 0xf;
521 
522         Z.lo = (Z.hi << 60) | (Z.lo >> 4);
523         Z.hi = (Z.hi >> 4);
524 
525         Z.hi ^= Htable[nhi].hi;
526         Z.lo ^= Htable[nhi].lo;
527         Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
528 #   endif
529 
530         if (is_endian.little) {
531 #   ifdef BSWAP8
532             Xi[0] = BSWAP8(Z.hi);
533             Xi[1] = BSWAP8(Z.lo);
534 #   else
535             u8 *p = (u8 *)Xi;
536             u32 v;
537             v = (u32)(Z.hi >> 32);
538             PUTU32(p, v);
539             v = (u32)(Z.hi);
540             PUTU32(p + 4, v);
541             v = (u32)(Z.lo >> 32);
542             PUTU32(p + 8, v);
543             v = (u32)(Z.lo);
544             PUTU32(p + 12, v);
545 #   endif
546         } else {
547             Xi[0] = Z.hi;
548             Xi[1] = Z.lo;
549         }
550     } while (inp += 16, len -= 16);
551 }
552 #  endif
553 # else
554 void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
555 void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
556                     size_t len);
557 # endif
558 
/* Multiply the context's Xi by H using the 4-bit table. */
# define GCM_MUL(ctx)      gcm_gmult_4bit((ctx)->Xi.u,(ctx)->Htable)
# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
/* Hash len bytes at in into the context's Xi (streamed variant). */
#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" intended to mitigate the cache
 * thrashing effect. In other words, the idea is to hash data while it's
 * still in L1 cache after the encryption pass...
 */
#  define GHASH_CHUNK       (3*1024)
# endif
569 
570 #else                           /* TABLE_BITS */
571 
572 static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
573 {
574     u128 V, Z = { 0, 0 };
575     long X;
576     int i, j;
577     const long *xi = (const long *)Xi;
578     const union {
579         long one;
580         char little;
581     } is_endian = { 1 };
582 
583     V.hi = H[0];                /* H is in host byte order, no byte swapping */
584     V.lo = H[1];
585 
586     for (j = 0; j < 16 / sizeof(long); ++j) {
587         if (is_endian.little) {
588             if (sizeof(long) == 8) {
589 # ifdef BSWAP8
590                 X = (long)(BSWAP8(xi[j]));
591 # else
592                 const u8 *p = (const u8 *)(xi + j);
593                 X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
594 # endif
595             } else {
596                 const u8 *p = (const u8 *)(xi + j);
597                 X = (long)GETU32(p);
598             }
599         } else
600             X = xi[j];
601 
602         for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
603             u64 M = (u64)(X >> (8 * sizeof(long) - 1));
604             Z.hi ^= V.hi & M;
605             Z.lo ^= V.lo & M;
606 
607             REDUCE1BIT(V);
608         }
609     }
610 
611     if (is_endian.little) {
612 # ifdef BSWAP8
613         Xi[0] = BSWAP8(Z.hi);
614         Xi[1] = BSWAP8(Z.lo);
615 # else
616         u8 *p = (u8 *)Xi;
617         u32 v;
618         v = (u32)(Z.hi >> 32);
619         PUTU32(p, v);
620         v = (u32)(Z.hi);
621         PUTU32(p + 4, v);
622         v = (u32)(Z.lo >> 32);
623         PUTU32(p + 8, v);
624         v = (u32)(Z.lo);
625         PUTU32(p + 12, v);
626 # endif
627     } else {
628         Xi[0] = Z.hi;
629         Xi[1] = Z.lo;
630     }
631 }
632 
633 # define GCM_MUL(ctx)      gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
634 
635 #endif
636 
/*
 * Declarations for the platform-specific GHASH back ends.  Each branch
 * defines a GHASH_ASM_* marker that CRYPTO_gcm128_init() tests, and
 * GCM_FUNCREF_4BIT, which makes GCM_MUL/GHASH go through function pointers.
 */
#if     TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if    !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[];

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                     size_t len);

/* On 32-bit x86 the AVX names are aliases for the CLMUL routines. */
#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul
#  else
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                   size_t len);
#  endif

#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);
#  endif
# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
/* NEON_CAPABLE is only defined for 32-bit ARM targets. */
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
#   endif
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
# endif
#endif
709 
/*
 * When a back end was selected at run time, route GCM_MUL (and GHASH, if
 * it was defined) through the local function pointers gcm_gmult_p and
 * gcm_ghash_p, which the using function must have in scope.
 */
#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx)           (*gcm_gmult_p)((ctx)->Xi.u,(ctx)->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)     (*gcm_ghash_p)((ctx)->Xi.u,(ctx)->Htable,in,len)
# endif
#endif
718 
719 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
720 {
721     const union {
722         long one;
723         char little;
724     } is_endian = { 1 };
725 
726     memset(ctx, 0, sizeof(*ctx));
727     ctx->block = block;
728     ctx->key = key;
729 
730     (*block) (ctx->H.c, ctx->H.c, key);
731 
732     if (is_endian.little) {
733         /* H is stored in host byte order */
734 #ifdef BSWAP8
735         ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
736         ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
737 #else
738         u8 *p = ctx->H.c;
739         u64 hi, lo;
740         hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
741         lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
742         ctx->H.u[0] = hi;
743         ctx->H.u[1] = lo;
744 #endif
745     }
746 #if     TABLE_BITS==8
747     gcm_init_8bit(ctx->Htable, ctx->H.u);
748 #elif   TABLE_BITS==4
749 # if    defined(GHASH)
750 #  define CTX__GHASH(f) (ctx->ghash = (f))
751 # else
752 #  define CTX__GHASH(f) (ctx->ghash = NULL)
753 # endif
754 # if    defined(GHASH_ASM_X86_OR_64)
755 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
756     if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
757         if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
758             gcm_init_avx(ctx->Htable, ctx->H.u);
759             ctx->gmult = gcm_gmult_avx;
760             CTX__GHASH(gcm_ghash_avx);
761         } else {
762             gcm_init_clmul(ctx->Htable, ctx->H.u);
763             ctx->gmult = gcm_gmult_clmul;
764             CTX__GHASH(gcm_ghash_clmul);
765         }
766         return;
767     }
768 #  endif
769     gcm_init_4bit(ctx->Htable, ctx->H.u);
770 #  if   defined(GHASH_ASM_X86)  /* x86 only */
771 #   if  defined(OPENSSL_IA32_SSE2)
772     if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
773 #   else
774     if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
775 #   endif
776         ctx->gmult = gcm_gmult_4bit_mmx;
777         CTX__GHASH(gcm_ghash_4bit_mmx);
778     } else {
779         ctx->gmult = gcm_gmult_4bit_x86;
780         CTX__GHASH(gcm_ghash_4bit_x86);
781     }
782 #  else
783     ctx->gmult = gcm_gmult_4bit;
784     CTX__GHASH(gcm_ghash_4bit);
785 #  endif
786 # elif  defined(GHASH_ASM_ARM)
787 #  ifdef PMULL_CAPABLE
788     if (PMULL_CAPABLE) {
789         gcm_init_v8(ctx->Htable, ctx->H.u);
790         ctx->gmult = gcm_gmult_v8;
791         CTX__GHASH(gcm_ghash_v8);
792     } else
793 #  endif
794 #  ifdef NEON_CAPABLE
795     if (NEON_CAPABLE) {
796         gcm_init_neon(ctx->Htable, ctx->H.u);
797         ctx->gmult = gcm_gmult_neon;
798         CTX__GHASH(gcm_ghash_neon);
799     } else
800 #  endif
801     {
802         gcm_init_4bit(ctx->Htable, ctx->H.u);
803         ctx->gmult = gcm_gmult_4bit;
804         CTX__GHASH(gcm_ghash_4bit);
805     }
806 # elif  defined(GHASH_ASM_SPARC)
807     if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
808         gcm_init_vis3(ctx->Htable, ctx->H.u);
809         ctx->gmult = gcm_gmult_vis3;
810         CTX__GHASH(gcm_ghash_vis3);
811     } else {
812         gcm_init_4bit(ctx->Htable, ctx->H.u);
813         ctx->gmult = gcm_gmult_4bit;
814         CTX__GHASH(gcm_ghash_4bit);
815     }
816 # elif  defined(GHASH_ASM_PPC)
817     if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
818         gcm_init_p8(ctx->Htable, ctx->H.u);
819         ctx->gmult = gcm_gmult_p8;
820         CTX__GHASH(gcm_ghash_p8);
821     } else {
822         gcm_init_4bit(ctx->Htable, ctx->H.u);
823         ctx->gmult = gcm_gmult_4bit;
824         CTX__GHASH(gcm_ghash_4bit);
825     }
826 # else
827     gcm_init_4bit(ctx->Htable, ctx->H.u);
828 # endif
829 # undef CTX__GHASH
830 #endif
831 }
832 
833 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
834                          size_t len)
835 {
836     const union {
837         long one;
838         char little;
839     } is_endian = { 1 };
840     unsigned int ctr;
841 #ifdef GCM_FUNCREF_4BIT
842     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
843 #endif
844 
845     ctx->len.u[0] = 0;          /* AAD length */
846     ctx->len.u[1] = 0;          /* message length */
847     ctx->ares = 0;
848     ctx->mres = 0;
849 
850     if (len == 12) {
851         memcpy(ctx->Yi.c, iv, 12);
852         ctx->Yi.c[12] = 0;
853         ctx->Yi.c[13] = 0;
854         ctx->Yi.c[14] = 0;
855         ctx->Yi.c[15] = 1;
856         ctr = 1;
857     } else {
858         size_t i;
859         u64 len0 = len;
860 
861         /* Borrow ctx->Xi to calculate initial Yi */
862         ctx->Xi.u[0] = 0;
863         ctx->Xi.u[1] = 0;
864 
865         while (len >= 16) {
866             for (i = 0; i < 16; ++i)
867                 ctx->Xi.c[i] ^= iv[i];
868             GCM_MUL(ctx);
869             iv += 16;
870             len -= 16;
871         }
872         if (len) {
873             for (i = 0; i < len; ++i)
874                 ctx->Xi.c[i] ^= iv[i];
875             GCM_MUL(ctx);
876         }
877         len0 <<= 3;
878         if (is_endian.little) {
879 #ifdef BSWAP8
880             ctx->Xi.u[1] ^= BSWAP8(len0);
881 #else
882             ctx->Xi.c[8] ^= (u8)(len0 >> 56);
883             ctx->Xi.c[9] ^= (u8)(len0 >> 48);
884             ctx->Xi.c[10] ^= (u8)(len0 >> 40);
885             ctx->Xi.c[11] ^= (u8)(len0 >> 32);
886             ctx->Xi.c[12] ^= (u8)(len0 >> 24);
887             ctx->Xi.c[13] ^= (u8)(len0 >> 16);
888             ctx->Xi.c[14] ^= (u8)(len0 >> 8);
889             ctx->Xi.c[15] ^= (u8)(len0);
890 #endif
891         } else {
892             ctx->Xi.u[1] ^= len0;
893         }
894 
895         GCM_MUL(ctx);
896 
897         if (is_endian.little)
898 #ifdef BSWAP4
899             ctr = BSWAP4(ctx->Xi.d[3]);
900 #else
901             ctr = GETU32(ctx->Xi.c + 12);
902 #endif
903         else
904             ctr = ctx->Xi.d[3];
905 
906         /* Copy borrowed Xi to Yi */
907         ctx->Yi.u[0] = ctx->Xi.u[0];
908         ctx->Yi.u[1] = ctx->Xi.u[1];
909     }
910 
911     ctx->Xi.u[0] = 0;
912     ctx->Xi.u[1] = 0;
913 
914     (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
915     ++ctr;
916     if (is_endian.little)
917 #ifdef BSWAP4
918         ctx->Yi.d[3] = BSWAP4(ctr);
919 #else
920         PUTU32(ctx->Yi.c + 12, ctr);
921 #endif
922     else
923         ctx->Yi.d[3] = ctr;
924 }
925 
926 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
927                       size_t len)
928 {
929     size_t i;
930     unsigned int n;
931     u64 alen = ctx->len.u[0];
932 #ifdef GCM_FUNCREF_4BIT
933     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
934 # ifdef GHASH
935     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
936                          const u8 *inp, size_t len) = ctx->ghash;
937 # endif
938 #endif
939 
940     if (ctx->len.u[1])
941         return -2;
942 
943     alen += len;
944     if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
945         return -1;
946     ctx->len.u[0] = alen;
947 
948     n = ctx->ares;
949     if (n) {
950         while (n && len) {
951             ctx->Xi.c[n] ^= *(aad++);
952             --len;
953             n = (n + 1) % 16;
954         }
955         if (n == 0)
956             GCM_MUL(ctx);
957         else {
958             ctx->ares = n;
959             return 0;
960         }
961     }
962 #ifdef GHASH
963     if ((i = (len & (size_t)-16))) {
964         GHASH(ctx, aad, i);
965         aad += i;
966         len -= i;
967     }
968 #else
969     while (len >= 16) {
970         for (i = 0; i < 16; ++i)
971             ctx->Xi.c[i] ^= aad[i];
972         GCM_MUL(ctx);
973         aad += 16;
974         len -= 16;
975     }
976 #endif
977     if (len) {
978         n = (unsigned int)len;
979         for (i = 0; i < len; ++i)
980             ctx->Xi.c[i] ^= aad[i];
981     }
982 
983     ctx->ares = n;
984     return 0;
985 }
986 
987 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
988                           const unsigned char *in, unsigned char *out,
989                           size_t len)
990 {
991     const union {
992         long one;
993         char little;
994     } is_endian = { 1 };
995     unsigned int n, ctr, mres;
996     size_t i;
997     u64 mlen = ctx->len.u[1];
998     block128_f block = ctx->block;
999     void *key = ctx->key;
1000 #ifdef GCM_FUNCREF_4BIT
1001     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1002 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1003     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1004                          const u8 *inp, size_t len) = ctx->ghash;
1005 # endif
1006 #endif
1007 
1008     mlen += len;
1009     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1010         return -1;
1011     ctx->len.u[1] = mlen;
1012 
1013     mres = ctx->mres;
1014 
1015     if (ctx->ares) {
1016         /* First call to encrypt finalizes GHASH(AAD) */
1017 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1018         if (len == 0) {
1019             GCM_MUL(ctx);
1020             ctx->ares = 0;
1021             return 0;
1022         }
1023         memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1024         ctx->Xi.u[0] = 0;
1025         ctx->Xi.u[1] = 0;
1026         mres = sizeof(ctx->Xi);
1027 #else
1028         GCM_MUL(ctx);
1029 #endif
1030         ctx->ares = 0;
1031     }
1032 
1033     if (is_endian.little)
1034 #ifdef BSWAP4
1035         ctr = BSWAP4(ctx->Yi.d[3]);
1036 #else
1037         ctr = GETU32(ctx->Yi.c + 12);
1038 #endif
1039     else
1040         ctr = ctx->Yi.d[3];
1041 
1042     n = mres % 16;
1043 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1044     if (16 % sizeof(size_t) == 0) { /* always true actually */
1045         do {
1046             if (n) {
1047 # if defined(GHASH)
1048                 while (n && len) {
1049                     ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
1050                     --len;
1051                     n = (n + 1) % 16;
1052                 }
1053                 if (n == 0) {
1054                     GHASH(ctx, ctx->Xn, mres);
1055                     mres = 0;
1056                 } else {
1057                     ctx->mres = mres;
1058                     return 0;
1059                 }
1060 # else
1061                 while (n && len) {
1062                     ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1063                     --len;
1064                     n = (n + 1) % 16;
1065                 }
1066                 if (n == 0) {
1067                     GCM_MUL(ctx);
1068                     mres = 0;
1069                 } else {
1070                     ctx->mres = n;
1071                     return 0;
1072                 }
1073 # endif
1074             }
1075 # if defined(STRICT_ALIGNMENT)
1076             if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1077                 break;
1078 # endif
1079 # if defined(GHASH)
1080             if (len >= 16 && mres) {
1081                 GHASH(ctx, ctx->Xn, mres);
1082                 mres = 0;
1083             }
1084 #  if defined(GHASH_CHUNK)
1085             while (len >= GHASH_CHUNK) {
1086                 size_t j = GHASH_CHUNK;
1087 
1088                 while (j) {
1089                     size_t_aX *out_t = (size_t_aX *)out;
1090                     const size_t_aX *in_t = (const size_t_aX *)in;
1091 
1092                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1093                     ++ctr;
1094                     if (is_endian.little)
1095 #   ifdef BSWAP4
1096                         ctx->Yi.d[3] = BSWAP4(ctr);
1097 #   else
1098                         PUTU32(ctx->Yi.c + 12, ctr);
1099 #   endif
1100                     else
1101                         ctx->Yi.d[3] = ctr;
1102                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1103                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1104                     out += 16;
1105                     in += 16;
1106                     j -= 16;
1107                 }
1108                 GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
1109                 len -= GHASH_CHUNK;
1110             }
1111 #  endif
1112             if ((i = (len & (size_t)-16))) {
1113                 size_t j = i;
1114 
1115                 while (len >= 16) {
1116                     size_t_aX *out_t = (size_t_aX *)out;
1117                     const size_t_aX *in_t = (const size_t_aX *)in;
1118 
1119                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1120                     ++ctr;
1121                     if (is_endian.little)
1122 #  ifdef BSWAP4
1123                         ctx->Yi.d[3] = BSWAP4(ctr);
1124 #  else
1125                         PUTU32(ctx->Yi.c + 12, ctr);
1126 #  endif
1127                     else
1128                         ctx->Yi.d[3] = ctr;
1129                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1130                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1131                     out += 16;
1132                     in += 16;
1133                     len -= 16;
1134                 }
1135                 GHASH(ctx, out - j, j);
1136             }
1137 # else
1138             while (len >= 16) {
1139                 size_t *out_t = (size_t *)out;
1140                 const size_t *in_t = (const size_t *)in;
1141 
1142                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1143                 ++ctr;
1144                 if (is_endian.little)
1145 #  ifdef BSWAP4
1146                     ctx->Yi.d[3] = BSWAP4(ctr);
1147 #  else
1148                     PUTU32(ctx->Yi.c + 12, ctr);
1149 #  endif
1150                 else
1151                     ctx->Yi.d[3] = ctr;
1152                 for (i = 0; i < 16 / sizeof(size_t); ++i)
1153                     ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1154                 GCM_MUL(ctx);
1155                 out += 16;
1156                 in += 16;
1157                 len -= 16;
1158             }
1159 # endif
1160             if (len) {
1161                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1162                 ++ctr;
1163                 if (is_endian.little)
1164 # ifdef BSWAP4
1165                     ctx->Yi.d[3] = BSWAP4(ctr);
1166 # else
1167                     PUTU32(ctx->Yi.c + 12, ctr);
1168 # endif
1169                 else
1170                     ctx->Yi.d[3] = ctr;
1171 # if defined(GHASH)
1172                 while (len--) {
1173                     ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
1174                     ++n;
1175                 }
1176 # else
1177                 while (len--) {
1178                     ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1179                     ++n;
1180                 }
1181                 mres = n;
1182 # endif
1183             }
1184 
1185             ctx->mres = mres;
1186             return 0;
1187         } while (0);
1188     }
1189 #endif
1190     for (i = 0; i < len; ++i) {
1191         if (n == 0) {
1192             (*block) (ctx->Yi.c, ctx->EKi.c, key);
1193             ++ctr;
1194             if (is_endian.little)
1195 #ifdef BSWAP4
1196                 ctx->Yi.d[3] = BSWAP4(ctr);
1197 #else
1198                 PUTU32(ctx->Yi.c + 12, ctr);
1199 #endif
1200             else
1201                 ctx->Yi.d[3] = ctr;
1202         }
1203 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1204         ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
1205         n = (n + 1) % 16;
1206         if (mres == sizeof(ctx->Xn)) {
1207             GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
1208             mres = 0;
1209         }
1210 #else
1211         ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
1212         mres = n = (n + 1) % 16;
1213         if (n == 0)
1214             GCM_MUL(ctx);
1215 #endif
1216     }
1217 
1218     ctx->mres = mres;
1219     return 0;
1220 }
1221 
1222 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1223                           const unsigned char *in, unsigned char *out,
1224                           size_t len)
1225 {
1226     const union {
1227         long one;
1228         char little;
1229     } is_endian = { 1 };
1230     unsigned int n, ctr, mres;
1231     size_t i;
1232     u64 mlen = ctx->len.u[1];
1233     block128_f block = ctx->block;
1234     void *key = ctx->key;
1235 #ifdef GCM_FUNCREF_4BIT
1236     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1237 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1238     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1239                          const u8 *inp, size_t len) = ctx->ghash;
1240 # endif
1241 #endif
1242 
1243     mlen += len;
1244     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1245         return -1;
1246     ctx->len.u[1] = mlen;
1247 
1248     mres = ctx->mres;
1249 
1250     if (ctx->ares) {
1251         /* First call to decrypt finalizes GHASH(AAD) */
1252 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1253         if (len == 0) {
1254             GCM_MUL(ctx);
1255             ctx->ares = 0;
1256             return 0;
1257         }
1258         memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1259         ctx->Xi.u[0] = 0;
1260         ctx->Xi.u[1] = 0;
1261         mres = sizeof(ctx->Xi);
1262 #else
1263         GCM_MUL(ctx);
1264 #endif
1265         ctx->ares = 0;
1266     }
1267 
1268     if (is_endian.little)
1269 #ifdef BSWAP4
1270         ctr = BSWAP4(ctx->Yi.d[3]);
1271 #else
1272         ctr = GETU32(ctx->Yi.c + 12);
1273 #endif
1274     else
1275         ctr = ctx->Yi.d[3];
1276 
1277     n = mres % 16;
1278 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1279     if (16 % sizeof(size_t) == 0) { /* always true actually */
1280         do {
1281             if (n) {
1282 # if defined(GHASH)
1283                 while (n && len) {
1284                     *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
1285                     --len;
1286                     n = (n + 1) % 16;
1287                 }
1288                 if (n == 0) {
1289                     GHASH(ctx, ctx->Xn, mres);
1290                     mres = 0;
1291                 } else {
1292                     ctx->mres = mres;
1293                     return 0;
1294                 }
1295 # else
1296                 while (n && len) {
1297                     u8 c = *(in++);
1298                     *(out++) = c ^ ctx->EKi.c[n];
1299                     ctx->Xi.c[n] ^= c;
1300                     --len;
1301                     n = (n + 1) % 16;
1302                 }
1303                 if (n == 0) {
1304                     GCM_MUL(ctx);
1305                     mres = 0;
1306                 } else {
1307                     ctx->mres = n;
1308                     return 0;
1309                 }
1310 # endif
1311             }
1312 # if defined(STRICT_ALIGNMENT)
1313             if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1314                 break;
1315 # endif
1316 # if defined(GHASH)
1317             if (len >= 16 && mres) {
1318                 GHASH(ctx, ctx->Xn, mres);
1319                 mres = 0;
1320             }
1321 #  if defined(GHASH_CHUNK)
1322             while (len >= GHASH_CHUNK) {
1323                 size_t j = GHASH_CHUNK;
1324 
1325                 GHASH(ctx, in, GHASH_CHUNK);
1326                 while (j) {
1327                     size_t_aX *out_t = (size_t_aX *)out;
1328                     const size_t_aX *in_t = (const size_t_aX *)in;
1329 
1330                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1331                     ++ctr;
1332                     if (is_endian.little)
1333 #   ifdef BSWAP4
1334                         ctx->Yi.d[3] = BSWAP4(ctr);
1335 #   else
1336                         PUTU32(ctx->Yi.c + 12, ctr);
1337 #   endif
1338                     else
1339                         ctx->Yi.d[3] = ctr;
1340                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1341                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1342                     out += 16;
1343                     in += 16;
1344                     j -= 16;
1345                 }
1346                 len -= GHASH_CHUNK;
1347             }
1348 #  endif
1349             if ((i = (len & (size_t)-16))) {
1350                 GHASH(ctx, in, i);
1351                 while (len >= 16) {
1352                     size_t_aX *out_t = (size_t_aX *)out;
1353                     const size_t_aX *in_t = (const size_t_aX *)in;
1354 
1355                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1356                     ++ctr;
1357                     if (is_endian.little)
1358 #  ifdef BSWAP4
1359                         ctx->Yi.d[3] = BSWAP4(ctr);
1360 #  else
1361                         PUTU32(ctx->Yi.c + 12, ctr);
1362 #  endif
1363                     else
1364                         ctx->Yi.d[3] = ctr;
1365                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1366                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1367                     out += 16;
1368                     in += 16;
1369                     len -= 16;
1370                 }
1371             }
1372 # else
1373             while (len >= 16) {
1374                 size_t *out_t = (size_t *)out;
1375                 const size_t *in_t = (const size_t *)in;
1376 
1377                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1378                 ++ctr;
1379                 if (is_endian.little)
1380 #  ifdef BSWAP4
1381                     ctx->Yi.d[3] = BSWAP4(ctr);
1382 #  else
1383                     PUTU32(ctx->Yi.c + 12, ctr);
1384 #  endif
1385                 else
1386                     ctx->Yi.d[3] = ctr;
1387                 for (i = 0; i < 16 / sizeof(size_t); ++i) {
1388                     size_t c = in_t[i];
1389                     out_t[i] = c ^ ctx->EKi.t[i];
1390                     ctx->Xi.t[i] ^= c;
1391                 }
1392                 GCM_MUL(ctx);
1393                 out += 16;
1394                 in += 16;
1395                 len -= 16;
1396             }
1397 # endif
1398             if (len) {
1399                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1400                 ++ctr;
1401                 if (is_endian.little)
1402 # ifdef BSWAP4
1403                     ctx->Yi.d[3] = BSWAP4(ctr);
1404 # else
1405                     PUTU32(ctx->Yi.c + 12, ctr);
1406 # endif
1407                 else
1408                     ctx->Yi.d[3] = ctr;
1409 # if defined(GHASH)
1410                 while (len--) {
1411                     out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1412                     ++n;
1413                 }
1414 # else
1415                 while (len--) {
1416                     u8 c = in[n];
1417                     ctx->Xi.c[n] ^= c;
1418                     out[n] = c ^ ctx->EKi.c[n];
1419                     ++n;
1420                 }
1421                 mres = n;
1422 # endif
1423             }
1424 
1425             ctx->mres = mres;
1426             return 0;
1427         } while (0);
1428     }
1429 #endif
1430     for (i = 0; i < len; ++i) {
1431         u8 c;
1432         if (n == 0) {
1433             (*block) (ctx->Yi.c, ctx->EKi.c, key);
1434             ++ctr;
1435             if (is_endian.little)
1436 #ifdef BSWAP4
1437                 ctx->Yi.d[3] = BSWAP4(ctr);
1438 #else
1439                 PUTU32(ctx->Yi.c + 12, ctr);
1440 #endif
1441             else
1442                 ctx->Yi.d[3] = ctr;
1443         }
1444 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1445         out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
1446         n = (n + 1) % 16;
1447         if (mres == sizeof(ctx->Xn)) {
1448             GHASH(ctx,ctx->Xn,sizeof(ctx->Xn));
1449             mres = 0;
1450         }
1451 #else
1452         c = in[i];
1453         out[i] = c ^ ctx->EKi.c[n];
1454         ctx->Xi.c[n] ^= c;
1455         mres = n = (n + 1) % 16;
1456         if (n == 0)
1457             GCM_MUL(ctx);
1458 #endif
1459     }
1460 
1461     ctx->mres = mres;
1462     return 0;
1463 }
1464 
1465 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1466                                 const unsigned char *in, unsigned char *out,
1467                                 size_t len, ctr128_f stream)
1468 {
1469 #if defined(OPENSSL_SMALL_FOOTPRINT)
1470     return CRYPTO_gcm128_encrypt(ctx, in, out, len);
1471 #else
1472     const union {
1473         long one;
1474         char little;
1475     } is_endian = { 1 };
1476     unsigned int n, ctr, mres;
1477     size_t i;
1478     u64 mlen = ctx->len.u[1];
1479     void *key = ctx->key;
1480 # ifdef GCM_FUNCREF_4BIT
1481     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1482 #  ifdef GHASH
1483     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1484                          const u8 *inp, size_t len) = ctx->ghash;
1485 #  endif
1486 # endif
1487 
1488     mlen += len;
1489     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1490         return -1;
1491     ctx->len.u[1] = mlen;
1492 
1493     mres = ctx->mres;
1494 
1495     if (ctx->ares) {
1496         /* First call to encrypt finalizes GHASH(AAD) */
1497 #if defined(GHASH)
1498         if (len == 0) {
1499             GCM_MUL(ctx);
1500             ctx->ares = 0;
1501             return 0;
1502         }
1503         memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1504         ctx->Xi.u[0] = 0;
1505         ctx->Xi.u[1] = 0;
1506         mres = sizeof(ctx->Xi);
1507 #else
1508         GCM_MUL(ctx);
1509 #endif
1510         ctx->ares = 0;
1511     }
1512 
1513     if (is_endian.little)
1514 # ifdef BSWAP4
1515         ctr = BSWAP4(ctx->Yi.d[3]);
1516 # else
1517         ctr = GETU32(ctx->Yi.c + 12);
1518 # endif
1519     else
1520         ctr = ctx->Yi.d[3];
1521 
1522     n = mres % 16;
1523     if (n) {
1524 # if defined(GHASH)
1525         while (n && len) {
1526             ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
1527             --len;
1528             n = (n + 1) % 16;
1529         }
1530         if (n == 0) {
1531             GHASH(ctx, ctx->Xn, mres);
1532             mres = 0;
1533         } else {
1534             ctx->mres = mres;
1535             return 0;
1536         }
1537 # else
1538         while (n && len) {
1539             ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1540             --len;
1541             n = (n + 1) % 16;
1542         }
1543         if (n == 0) {
1544             GCM_MUL(ctx);
1545             mres = 0;
1546         } else {
1547             ctx->mres = n;
1548             return 0;
1549         }
1550 # endif
1551     }
1552 # if defined(GHASH)
1553         if (len >= 16 && mres) {
1554             GHASH(ctx, ctx->Xn, mres);
1555             mres = 0;
1556         }
1557 #  if defined(GHASH_CHUNK)
1558     while (len >= GHASH_CHUNK) {
1559         (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1560         ctr += GHASH_CHUNK / 16;
1561         if (is_endian.little)
1562 #   ifdef BSWAP4
1563             ctx->Yi.d[3] = BSWAP4(ctr);
1564 #   else
1565             PUTU32(ctx->Yi.c + 12, ctr);
1566 #   endif
1567         else
1568             ctx->Yi.d[3] = ctr;
1569         GHASH(ctx, out, GHASH_CHUNK);
1570         out += GHASH_CHUNK;
1571         in += GHASH_CHUNK;
1572         len -= GHASH_CHUNK;
1573     }
1574 #  endif
1575 # endif
1576     if ((i = (len & (size_t)-16))) {
1577         size_t j = i / 16;
1578 
1579         (*stream) (in, out, j, key, ctx->Yi.c);
1580         ctr += (unsigned int)j;
1581         if (is_endian.little)
1582 # ifdef BSWAP4
1583             ctx->Yi.d[3] = BSWAP4(ctr);
1584 # else
1585             PUTU32(ctx->Yi.c + 12, ctr);
1586 # endif
1587         else
1588             ctx->Yi.d[3] = ctr;
1589         in += i;
1590         len -= i;
1591 # if defined(GHASH)
1592         GHASH(ctx, out, i);
1593         out += i;
1594 # else
1595         while (j--) {
1596             for (i = 0; i < 16; ++i)
1597                 ctx->Xi.c[i] ^= out[i];
1598             GCM_MUL(ctx);
1599             out += 16;
1600         }
1601 # endif
1602     }
1603     if (len) {
1604         (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1605         ++ctr;
1606         if (is_endian.little)
1607 # ifdef BSWAP4
1608             ctx->Yi.d[3] = BSWAP4(ctr);
1609 # else
1610             PUTU32(ctx->Yi.c + 12, ctr);
1611 # endif
1612         else
1613             ctx->Yi.d[3] = ctr;
1614         while (len--) {
1615 # if defined(GHASH)
1616             ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
1617 # else
1618             ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1619 # endif
1620             ++n;
1621         }
1622     }
1623 
1624     ctx->mres = mres;
1625     return 0;
1626 #endif
1627 }
1628 
1629 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1630                                 const unsigned char *in, unsigned char *out,
1631                                 size_t len, ctr128_f stream)
1632 {
1633 #if defined(OPENSSL_SMALL_FOOTPRINT)
1634     return CRYPTO_gcm128_decrypt(ctx, in, out, len);
1635 #else
1636     const union {
1637         long one;
1638         char little;
1639     } is_endian = { 1 };
1640     unsigned int n, ctr, mres;
1641     size_t i;
1642     u64 mlen = ctx->len.u[1];
1643     void *key = ctx->key;
1644 # ifdef GCM_FUNCREF_4BIT
1645     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1646 #  ifdef GHASH
1647     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1648                          const u8 *inp, size_t len) = ctx->ghash;
1649 #  endif
1650 # endif
1651 
1652     mlen += len;
1653     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1654         return -1;
1655     ctx->len.u[1] = mlen;
1656 
1657     mres = ctx->mres;
1658 
1659     if (ctx->ares) {
1660         /* First call to decrypt finalizes GHASH(AAD) */
1661 # if defined(GHASH)
1662         if (len == 0) {
1663             GCM_MUL(ctx);
1664             ctx->ares = 0;
1665             return 0;
1666         }
1667         memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
1668         ctx->Xi.u[0] = 0;
1669         ctx->Xi.u[1] = 0;
1670         mres = sizeof(ctx->Xi);
1671 # else
1672         GCM_MUL(ctx);
1673 # endif
1674         ctx->ares = 0;
1675     }
1676 
1677     if (is_endian.little)
1678 # ifdef BSWAP4
1679         ctr = BSWAP4(ctx->Yi.d[3]);
1680 # else
1681         ctr = GETU32(ctx->Yi.c + 12);
1682 # endif
1683     else
1684         ctr = ctx->Yi.d[3];
1685 
1686     n = mres % 16;
1687     if (n) {
1688 # if defined(GHASH)
1689         while (n && len) {
1690             *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
1691             --len;
1692             n = (n + 1) % 16;
1693         }
1694         if (n == 0) {
1695             GHASH(ctx, ctx->Xn, mres);
1696             mres = 0;
1697         } else {
1698             ctx->mres = mres;
1699             return 0;
1700         }
1701 # else
1702         while (n && len) {
1703             u8 c = *(in++);
1704             *(out++) = c ^ ctx->EKi.c[n];
1705             ctx->Xi.c[n] ^= c;
1706             --len;
1707             n = (n + 1) % 16;
1708         }
1709         if (n == 0) {
1710             GCM_MUL(ctx);
1711             mres = 0;
1712         } else {
1713             ctx->mres = n;
1714             return 0;
1715         }
1716 # endif
1717     }
1718 # if defined(GHASH)
1719     if (len >= 16 && mres) {
1720         GHASH(ctx, ctx->Xn, mres);
1721         mres = 0;
1722     }
1723 #  if defined(GHASH_CHUNK)
1724     while (len >= GHASH_CHUNK) {
1725         GHASH(ctx, in, GHASH_CHUNK);
1726         (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1727         ctr += GHASH_CHUNK / 16;
1728         if (is_endian.little)
1729 #   ifdef BSWAP4
1730             ctx->Yi.d[3] = BSWAP4(ctr);
1731 #   else
1732             PUTU32(ctx->Yi.c + 12, ctr);
1733 #   endif
1734         else
1735             ctx->Yi.d[3] = ctr;
1736         out += GHASH_CHUNK;
1737         in += GHASH_CHUNK;
1738         len -= GHASH_CHUNK;
1739     }
1740 #  endif
1741 # endif
1742     if ((i = (len & (size_t)-16))) {
1743         size_t j = i / 16;
1744 
1745 # if defined(GHASH)
1746         GHASH(ctx, in, i);
1747 # else
1748         while (j--) {
1749             size_t k;
1750             for (k = 0; k < 16; ++k)
1751                 ctx->Xi.c[k] ^= in[k];
1752             GCM_MUL(ctx);
1753             in += 16;
1754         }
1755         j = i / 16;
1756         in -= i;
1757 # endif
1758         (*stream) (in, out, j, key, ctx->Yi.c);
1759         ctr += (unsigned int)j;
1760         if (is_endian.little)
1761 # ifdef BSWAP4
1762             ctx->Yi.d[3] = BSWAP4(ctr);
1763 # else
1764             PUTU32(ctx->Yi.c + 12, ctr);
1765 # endif
1766         else
1767             ctx->Yi.d[3] = ctr;
1768         out += i;
1769         in += i;
1770         len -= i;
1771     }
1772     if (len) {
1773         (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1774         ++ctr;
1775         if (is_endian.little)
1776 # ifdef BSWAP4
1777             ctx->Yi.d[3] = BSWAP4(ctr);
1778 # else
1779             PUTU32(ctx->Yi.c + 12, ctr);
1780 # endif
1781         else
1782             ctx->Yi.d[3] = ctr;
1783         while (len--) {
1784 # if defined(GHASH)
1785             out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
1786 # else
1787             u8 c = in[n];
1788             ctx->Xi.c[mres++] ^= c;
1789             out[n] = c ^ ctx->EKi.c[n];
1790 # endif
1791             ++n;
1792         }
1793     }
1794 
1795     ctx->mres = mres;
1796     return 0;
1797 #endif
1798 }
1799 
1800 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1801                          size_t len)
1802 {
1803     const union {
1804         long one;
1805         char little;
1806     } is_endian = { 1 };
1807     u64 alen = ctx->len.u[0] << 3;
1808     u64 clen = ctx->len.u[1] << 3;
1809 #ifdef GCM_FUNCREF_4BIT
1810     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1811 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1812     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1813                          const u8 *inp, size_t len) = ctx->ghash;
1814 # endif
1815 #endif
1816 
1817 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1818     u128 bitlen;
1819     unsigned int mres = ctx->mres;
1820 
1821     if (mres) {
1822         unsigned blocks = (mres + 15) & -16;
1823 
1824         memset(ctx->Xn + mres, 0, blocks - mres);
1825         mres = blocks;
1826         if (mres == sizeof(ctx->Xn)) {
1827             GHASH(ctx, ctx->Xn, mres);
1828             mres = 0;
1829         }
1830     } else if (ctx->ares) {
1831         GCM_MUL(ctx);
1832     }
1833 #else
1834     if (ctx->mres || ctx->ares)
1835         GCM_MUL(ctx);
1836 #endif
1837 
1838     if (is_endian.little) {
1839 #ifdef BSWAP8
1840         alen = BSWAP8(alen);
1841         clen = BSWAP8(clen);
1842 #else
1843         u8 *p = ctx->len.c;
1844 
1845         ctx->len.u[0] = alen;
1846         ctx->len.u[1] = clen;
1847 
1848         alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
1849         clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
1850 #endif
1851     }
1852 
1853 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1854     bitlen.hi = alen;
1855     bitlen.lo = clen;
1856     memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
1857     mres += sizeof(bitlen);
1858     GHASH(ctx, ctx->Xn, mres);
1859 #else
1860     ctx->Xi.u[0] ^= alen;
1861     ctx->Xi.u[1] ^= clen;
1862     GCM_MUL(ctx);
1863 #endif
1864 
1865     ctx->Xi.u[0] ^= ctx->EK0.u[0];
1866     ctx->Xi.u[1] ^= ctx->EK0.u[1];
1867 
1868     if (tag && len <= sizeof(ctx->Xi))
1869         return CRYPTO_memcmp(ctx->Xi.c, tag, len);
1870     else
1871         return -1;
1872 }
1873 
1874 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1875 {
1876     CRYPTO_gcm128_finish(ctx, NULL, 0);
1877     memcpy(tag, ctx->Xi.c,
1878            len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1879 }
1880 
1881 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1882 {
1883     GCM128_CONTEXT *ret;
1884 
1885     if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
1886         CRYPTO_gcm128_init(ret, key, block);
1887 
1888     return ret;
1889 }
1890 
1891 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1892 {
1893     OPENSSL_clear_free(ctx, sizeof(*ctx));
1894 }
1895