• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49 
50 #define OPENSSL_FIPSAPI
51 
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55 
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62 
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef	GETU32
66 #define	GETU32(p)	BSWAP4(*(const u32 *)(p))
67 #undef	PUTU32
68 #define	PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
69 #endif
70 
/* Place a 16-bit constant in the 16 most significant bits of a size_t. */
#define	PACK(s)		((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * Shift the 128-bit value V (an lvalue with u64 fields .hi and .lo)
 * right by one bit and, when the bit shifted out of V.lo was set, XOR
 * the constant 0xe1 into the top byte of V.hi. The mask is derived
 * arithmetically from that bit, so no data-dependent branch is taken.
 * Two variants are provided so that 32-bit targets build the constant
 * from a 32-bit quantity.
 */
#define REDUCE1BIT(V)	do { \
	if (sizeof(size_t)==8) { \
		u64 fold_ = U64(0xe100000000000000) & (0-((V).lo&1)); \
		(V).lo  = ((V).hi<<63)|((V).lo>>1); \
		(V).hi  = ((V).hi>>1 )^fold_; \
	} \
	else { \
		u32 fold_ = 0xe1000000U & (0-(u32)((V).lo&1)); \
		(V).lo  = ((V).hi<<63)|((V).lo>>1); \
		(V).hi  = ((V).hi>>1 )^((u64)fold_<<32); \
	} \
} while(0)
84 
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows large enough free
111  *   results in VM working set trimming, meaning that consequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if	TABLE_BITS==8
120 
/*
 * Fill the 256-entry lookup table used by gcm_gmult_8bit().
 *
 * Htable[0] is zero, Htable[128] is H itself and every lower power-of-two
 * slot holds the previous one after one more REDUCE1BIT step. All other
 * slots are the XOR of the power-of-two slot for their top set bit with
 * the slot for the remaining bits.
 *
 * H is read as two u64 words (H[0] = high half, H[1] = low half).
 */
static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
	u128 V;
	int  bit, base, k;

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

	/* power-of-two slots: 128, 64, 32, ..., 1 */
	Htable[128] = V;
	for (bit=64; bit>0; bit>>=1) {
		REDUCE1BIT(V);
		Htable[bit] = V;
	}

	/* remaining slots are sums of already computed ones */
	for (base=2; base<256; base<<=1) {
		const u128 top = Htable[base];

		for (k=1; k<base; ++k) {
			Htable[base+k].hi = top.hi^Htable[k].hi;
			Htable[base+k].lo = top.lo^Htable[k].lo;
		}
	}
}
144 
/*
 * 8-bit table-driven multiplication step.
 *
 * Walks the 16 bytes of Xi from the last byte to the first. For every
 * byte the matching Htable entry is XORed into the accumulator Z; between
 * bytes Z is shifted right by 8 bits and the byte that fell off the low
 * end selects a correction from rem_8bit that is XORed into the top of
 * Z.hi. The result overwrites Xi and is laid out most significant byte
 * first in memory on both little- and big-endian hosts.
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *bytes = (const u8 *)Xi;
	size_t spill;
	int idx;
	const union { long one; char little; } is_endian = {1};
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	/* last byte first: no shift precedes it */
	Z.hi ^= Htable[bytes[15]].hi;
	Z.lo ^= Htable[bytes[15]].lo;

	for (idx=14; idx>=0; --idx) {
		/* shift Z right by one byte and fold the lost byte back in */
		spill = (size_t)Z.lo&0xff;
		Z.lo  = (Z.hi<<56)|(Z.lo>>8);
		Z.hi  = (Z.hi>>8);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_8bit[spill];
		else
			Z.hi ^= (u64)rem_8bit[spill]<<32;

		Z.hi ^= Htable[bytes[idx]].hi;
		Z.lo ^= Htable[bytes[idx]].lo;
	}

	if (!is_endian.little) {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
	else {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
}
252 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253 
254 #elif	TABLE_BITS==4
255 
/*
 * Fill the 16-entry lookup table used by the 4-bit multiplication
 * routines.
 *
 * Htable[0] is zero, Htable[8] is H, and Htable[4], [2], [1] are obtained
 * by successive REDUCE1BIT steps. Every other slot is the XOR of the
 * power-of-two slot for its top set bit with the slot for the remaining
 * bits. With OPENSSL_SMALL_FOOTPRINT the table is built by loops,
 * otherwise the same assignments are written out one by one.
 *
 * When the ARM assembler back end is compiled in, the two 64-bit halves
 * of each entry are additionally exchanged (and, on big-endian hosts,
 * their 32-bit halves swapped) to match the order that code expects.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
	u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
	int  bit, base, k;
#endif

	Htable[0].hi = 0;
	Htable[0].lo = 0;
	V.hi = H[0];
	V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
	/* power-of-two slots: 8, 4, 2, 1 */
	Htable[8] = V;
	for (bit=4; bit>0; bit>>=1) {
		REDUCE1BIT(V);
		Htable[bit] = V;
	}

	/* remaining slots are sums of already computed ones */
	for (base=2; base<16; base<<=1) {
		V = Htable[base];
		for (k=1; k<base; ++k) {
			Htable[base+k].hi = V.hi^Htable[k].hi;
			Htable[base+k].lo = V.lo^Htable[k].lo;
		}
	}
#else
	Htable[8] = V;
	REDUCE1BIT(V);
	Htable[4] = V;
	REDUCE1BIT(V);
	Htable[2] = V;
	REDUCE1BIT(V);
	Htable[1] = V;

	/* V == Htable[1] here */
	Htable[3].hi  = V.hi^Htable[2].hi;
	Htable[3].lo  = V.lo^Htable[2].lo;

	V = Htable[4];
	Htable[5].hi  = V.hi^Htable[1].hi;
	Htable[5].lo  = V.lo^Htable[1].lo;
	Htable[6].hi  = V.hi^Htable[2].hi;
	Htable[6].lo  = V.lo^Htable[2].lo;
	Htable[7].hi  = V.hi^Htable[3].hi;
	Htable[7].lo  = V.lo^Htable[3].lo;

	V = Htable[8];
	Htable[9].hi  = V.hi^Htable[1].hi;
	Htable[9].lo  = V.lo^Htable[1].lo;
	Htable[10].hi = V.hi^Htable[2].hi;
	Htable[10].lo = V.lo^Htable[2].lo;
	Htable[11].hi = V.hi^Htable[3].hi;
	Htable[11].lo = V.lo^Htable[3].lo;
	Htable[12].hi = V.hi^Htable[4].hi;
	Htable[12].lo = V.lo^Htable[4].lo;
	Htable[13].hi = V.hi^Htable[5].hi;
	Htable[13].lo = V.lo^Htable[5].lo;
	Htable[14].hi = V.hi^Htable[6].hi;
	Htable[14].lo = V.lo^Htable[6].lo;
	Htable[15].hi = V.hi^Htable[7].hi;
	Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
	/*
	 * ARM assembler expects specific dword order in Htable.
	 */
	{
	int slot;
	const union { long one; char little; } is_endian = {1};

	for (slot=0; slot<16; ++slot) {
		V = Htable[slot];
		if (is_endian.little) {
			Htable[slot].hi = V.lo;
			Htable[slot].lo = V.hi;
		}
		else {
			Htable[slot].hi = V.lo<<32|V.lo>>32;
			Htable[slot].lo = V.hi<<32|V.hi>>32;
		}
	}
	}
#endif
}
327 
328 #ifndef GHASH_ASM
329 static const size_t rem_4bit[16] = {
330 	PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
331 	PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
332 	PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
333 	PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
334 
/*
 * 4-bit table-driven multiplication step.
 *
 * The 16 bytes of Xi are consumed from the last byte to the first, low
 * nibble before high nibble. Before every nibble except the very first,
 * the accumulator Z is shifted right by four bits and the nibble that
 * fell off selects a correction from rem_4bit; then the Htable entry for
 * the current nibble is XORed in. The result overwrites Xi and is laid
 * out most significant byte first in memory on any host.
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
	u128 Z;
	const u8 *bytes = (const u8 *)Xi;
	int idx = 15;
	size_t spill, byte;
	const union { long one; char little; } is_endian = {1};

	byte = bytes[15];
	Z.hi = Htable[byte&0xf].hi;
	Z.lo = Htable[byte&0xf].lo;

	for (;;) {
		/* high nibble of the current byte */
		spill = (size_t)Z.lo&0xf;
		Z.lo  = (Z.hi<<60)|(Z.lo>>4);
		Z.hi  = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[spill];
		else
			Z.hi ^= (u64)rem_4bit[spill]<<32;

		Z.hi ^= Htable[byte>>4].hi;
		Z.lo ^= Htable[byte>>4].lo;

		if (idx==0)
			break;
		byte = bytes[--idx];

		/* low nibble of the next byte */
		spill = (size_t)Z.lo&0xf;
		Z.lo  = (Z.hi<<60)|(Z.lo>>4);
		Z.hi  = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[spill];
		else
			Z.hi ^= (u64)rem_4bit[spill]<<32;

		Z.hi ^= Htable[byte&0xf].hi;
		Z.lo ^= Htable[byte&0xf].lo;
	}

	if (!is_endian.little) {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}
	else {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
}
397 
398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
/*
 * Fold a run of 16-byte blocks into the hash state.
 *
 * For each complete block at inp, the block is XORed byte-wise with the
 * current Xi and the result is multiplied through the 4-bit Htable in
 * exactly the way gcm_gmult_4bit() does; the product is written back to
 * Xi (most significant byte first in memory) before the next block.
 *
 * Xi      in/out hash state, 16 bytes.
 * Htable  table prepared by gcm_init_4bit().
 * inp     input data.
 * len     number of input bytes. Only complete 16-byte blocks are
 *         consumed; if len is 0 nothing is read and Xi is left untouched.
 *         Callers in this file always pass a non-zero multiple of 16.
 *         The loop is guarded at the top so that any other value cannot
 *         wrap len around and run off the end of the input.
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    while (len>=16) {
	cnt  = 15;
	nlo  = ((const u8 *)Xi)[15];
	nlo ^= inp[15];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi = Htable[nlo].hi;
	Z.lo = Htable[nlo].lo;

	while (1) {
		/* shift by one nibble, fold the lost nibble back in */
		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nhi].hi;
		Z.lo ^= Htable[nhi].lo;

		if (--cnt<0)		break;

		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		rem  = (size_t)Z.lo&0xf;
		Z.lo = (Z.hi<<60)|(Z.lo>>4);
		Z.hi = (Z.hi>>4);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_4bit[rem];
		else
			Z.hi ^= (u64)rem_4bit[rem]<<32;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;
	}
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];	/* Htable shifted right by 4 bits */
    u8   Hshl4[16];	/* Htable shifted left  by 4 bits */
    static const unsigned short rem_8bit[256] = {
	0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
	0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
	0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
	0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
	0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
	0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
	0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
	0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
	0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
	0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
	0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
	0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
	0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
	0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
	0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
	0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
	0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
	0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
	0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
	0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
	0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
	0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
	0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
	0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
	0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
	0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
	0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
	0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
	0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
	0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
	0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
	0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
	Z.hi = Htable[cnt].hi;
	Z.lo = Htable[cnt].lo;
	Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
	Hshr4[cnt].hi = (Z.hi>>4);
	Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    while (len>=16) {
	for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
		nlo  = ((const u8 *)Xi)[cnt];
		nlo ^= inp[cnt];
		nhi  = nlo>>4;
		nlo &= 0xf;

		Z.hi ^= Htable[nlo].hi;
		Z.lo ^= Htable[nlo].lo;

		rem = (size_t)Z.lo&0xff;

		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);

		Z.hi ^= Hshr4[nhi].hi;
		Z.lo ^= Hshr4[nhi].lo;
		Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
	}

	nlo  = ((const u8 *)Xi)[0];
	nlo ^= inp[0];
	nhi  = nlo>>4;
	nlo &= 0xf;

	Z.hi ^= Htable[nlo].hi;
	Z.lo ^= Htable[nlo].lo;

	rem = (size_t)Z.lo&0xf;

	Z.lo = (Z.hi<<60)|(Z.lo>>4);
	Z.hi = (Z.hi>>4);

	Z.hi ^= Htable[nhi].hi;
	Z.lo ^= Htable[nhi].lo;
	Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

	/* write the product back so the next block is XORed against it */
	if (is_endian.little) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}
	else {
		Xi[0] = Z.hi;
		Xi[1] = Z.lo;
	}

	inp += 16;
	len -= 16;
    }
}
568 #endif
569 #else
570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
572 #endif
573 
574 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
575 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
576 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
 * thrashing effect. In other words, the idea is to hash data while
 * it is still in the L1 cache after the encryption pass... */
580 #define GHASH_CHUNK       (3*1024)
581 #endif
582 
583 #else	/* TABLE_BITS */
584 
585 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
586 {
587 	u128 V,Z = { 0,0 };
588 	long X;
589 	int  i,j;
590 	const long *xi = (const long *)Xi;
591 	const union { long one; char little; } is_endian = {1};
592 
593 	V.hi = H[0];	/* H is in host byte order, no byte swapping */
594 	V.lo = H[1];
595 
596 	for (j=0; j<16/sizeof(long); ++j) {
597 		if (is_endian.little) {
598 			if (sizeof(long)==8) {
599 #ifdef BSWAP8
600 				X = (long)(BSWAP8(xi[j]));
601 #else
602 				const u8 *p = (const u8 *)(xi+j);
603 				X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
604 #endif
605 			}
606 			else {
607 				const u8 *p = (const u8 *)(xi+j);
608 				X = (long)GETU32(p);
609 			}
610 		}
611 		else
612 			X = xi[j];
613 
614 		for (i=0; i<8*sizeof(long); ++i, X<<=1) {
615 			u64 M = (u64)(X>>(8*sizeof(long)-1));
616 			Z.hi ^= V.hi&M;
617 			Z.lo ^= V.lo&M;
618 
619 			REDUCE1BIT(V);
620 		}
621 	}
622 
623 	if (is_endian.little) {
624 #ifdef BSWAP8
625 		Xi[0] = BSWAP8(Z.hi);
626 		Xi[1] = BSWAP8(Z.lo);
627 #else
628 		u8 *p = (u8 *)Xi;
629 		u32 v;
630 		v = (u32)(Z.hi>>32);	PUTU32(p,v);
631 		v = (u32)(Z.hi);	PUTU32(p+4,v);
632 		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
633 		v = (u32)(Z.lo);	PUTU32(p+12,v);
634 #endif
635 	}
636 	else {
637 		Xi[0] = Z.hi;
638 		Xi[1] = Z.lo;
639 	}
640 }
641 #define GCM_MUL(ctx,Xi)	  gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
642 
643 #endif
644 
645 #if	TABLE_BITS==4 && defined(GHASH_ASM)
646 # if	!defined(I386_ONLY) && \
647 	(defined(__i386)	|| defined(__i386__)	|| \
648 	 defined(__x86_64)	|| defined(__x86_64__)	|| \
649 	 defined(_M_IX86)	|| defined(_M_AMD64)	|| defined(_M_X64))
650 #  define GHASH_ASM_X86_OR_64
651 #  define GCM_FUNCREF_4BIT
652 extern unsigned int OPENSSL_ia32cap_P[2];
653 
654 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
655 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
656 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
657 
658 #  if	defined(__i386) || defined(__i386__) || defined(_M_IX86)
659 #   define GHASH_ASM_X86
660 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
661 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
662 
663 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
664 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
665 #  endif
666 # elif defined(__arm__) || defined(__arm)
667 #  include "arm_arch.h"
668 #  if __ARM_ARCH__>=7
669 #   define GHASH_ASM_ARM
670 #   define GCM_FUNCREF_4BIT
671 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
672 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
673 #  endif
674 # endif
675 #endif
676 
677 #ifdef GCM_FUNCREF_4BIT
678 # undef  GCM_MUL
679 # define GCM_MUL(ctx,Xi)	(*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
680 # ifdef GHASH
681 #  undef  GHASH
682 #  define GHASH(ctx,in,len)	(*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
683 # endif
684 #endif
685 
686 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
687 {
688 	const union { long one; char little; } is_endian = {1};
689 
690 	memset(ctx,0,sizeof(*ctx));
691 	ctx->block = block;
692 	ctx->key   = key;
693 
694 	(*block)(ctx->H.c,ctx->H.c,key);
695 
696 	if (is_endian.little) {
697 		/* H is stored in host byte order */
698 #ifdef BSWAP8
699 		ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
700 		ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
701 #else
702 		u8 *p = ctx->H.c;
703 		u64 hi,lo;
704 		hi = (u64)GETU32(p)  <<32|GETU32(p+4);
705 		lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
706 		ctx->H.u[0] = hi;
707 		ctx->H.u[1] = lo;
708 #endif
709 	}
710 
711 #if	TABLE_BITS==8
712 	gcm_init_8bit(ctx->Htable,ctx->H.u);
713 #elif	TABLE_BITS==4
714 # if	defined(GHASH_ASM_X86_OR_64)
715 #  if	!defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
716 	if (OPENSSL_ia32cap_P[0]&(1<<24) &&	/* check FXSR bit */
717 	    OPENSSL_ia32cap_P[1]&(1<<1) ) {	/* check PCLMULQDQ bit */
718 		gcm_init_clmul(ctx->Htable,ctx->H.u);
719 		ctx->gmult = gcm_gmult_clmul;
720 		ctx->ghash = gcm_ghash_clmul;
721 		return;
722 	}
723 #  endif
724 	gcm_init_4bit(ctx->Htable,ctx->H.u);
725 #  if	defined(GHASH_ASM_X86)			/* x86 only */
726 #   if	defined(OPENSSL_IA32_SSE2)
727 	if (OPENSSL_ia32cap_P[0]&(1<<25)) {	/* check SSE bit */
728 #   else
729 	if (OPENSSL_ia32cap_P[0]&(1<<23)) {	/* check MMX bit */
730 #   endif
731 		ctx->gmult = gcm_gmult_4bit_mmx;
732 		ctx->ghash = gcm_ghash_4bit_mmx;
733 	} else {
734 		ctx->gmult = gcm_gmult_4bit_x86;
735 		ctx->ghash = gcm_ghash_4bit_x86;
736 	}
737 #  else
738 	ctx->gmult = gcm_gmult_4bit;
739 	ctx->ghash = gcm_ghash_4bit;
740 #  endif
741 # elif	defined(GHASH_ASM_ARM)
742 	if (OPENSSL_armcap_P & ARMV7_NEON) {
743 		ctx->gmult = gcm_gmult_neon;
744 		ctx->ghash = gcm_ghash_neon;
745 	} else {
746 		gcm_init_4bit(ctx->Htable,ctx->H.u);
747 		ctx->gmult = gcm_gmult_4bit;
748 		ctx->ghash = gcm_ghash_4bit;
749 	}
750 # else
751 	gcm_init_4bit(ctx->Htable,ctx->H.u);
752 # endif
753 #endif
754 }
755 
756 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
757 {
758 	const union { long one; char little; } is_endian = {1};
759 	unsigned int ctr;
760 #ifdef GCM_FUNCREF_4BIT
761 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
762 #endif
763 
764 	ctx->Yi.u[0]  = 0;
765 	ctx->Yi.u[1]  = 0;
766 	ctx->Xi.u[0]  = 0;
767 	ctx->Xi.u[1]  = 0;
768 	ctx->len.u[0] = 0;	/* AAD length */
769 	ctx->len.u[1] = 0;	/* message length */
770 	ctx->ares = 0;
771 	ctx->mres = 0;
772 
773 	if (len==12) {
774 		memcpy(ctx->Yi.c,iv,12);
775 		ctx->Yi.c[15]=1;
776 		ctr=1;
777 	}
778 	else {
779 		size_t i;
780 		u64 len0 = len;
781 
782 		while (len>=16) {
783 			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
784 			GCM_MUL(ctx,Yi);
785 			iv += 16;
786 			len -= 16;
787 		}
788 		if (len) {
789 			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
790 			GCM_MUL(ctx,Yi);
791 		}
792 		len0 <<= 3;
793 		if (is_endian.little) {
794 #ifdef BSWAP8
795 			ctx->Yi.u[1]  ^= BSWAP8(len0);
796 #else
797 			ctx->Yi.c[8]  ^= (u8)(len0>>56);
798 			ctx->Yi.c[9]  ^= (u8)(len0>>48);
799 			ctx->Yi.c[10] ^= (u8)(len0>>40);
800 			ctx->Yi.c[11] ^= (u8)(len0>>32);
801 			ctx->Yi.c[12] ^= (u8)(len0>>24);
802 			ctx->Yi.c[13] ^= (u8)(len0>>16);
803 			ctx->Yi.c[14] ^= (u8)(len0>>8);
804 			ctx->Yi.c[15] ^= (u8)(len0);
805 #endif
806 		}
807 		else
808 			ctx->Yi.u[1]  ^= len0;
809 
810 		GCM_MUL(ctx,Yi);
811 
812 		if (is_endian.little)
813 			ctr = GETU32(ctx->Yi.c+12);
814 		else
815 			ctr = ctx->Yi.d[3];
816 	}
817 
818 	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
819 	++ctr;
820 	if (is_endian.little)
821 		PUTU32(ctx->Yi.c+12,ctr);
822 	else
823 		ctx->Yi.d[3] = ctr;
824 }
825 
826 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
827 {
828 	size_t i;
829 	unsigned int n;
830 	u64 alen = ctx->len.u[0];
831 #ifdef GCM_FUNCREF_4BIT
832 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
833 # ifdef GHASH
834 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
835 				const u8 *inp,size_t len)	= ctx->ghash;
836 # endif
837 #endif
838 
839 	if (ctx->len.u[1]) return -2;
840 
841 	alen += len;
842 	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
843 		return -1;
844 	ctx->len.u[0] = alen;
845 
846 	n = ctx->ares;
847 	if (n) {
848 		while (n && len) {
849 			ctx->Xi.c[n] ^= *(aad++);
850 			--len;
851 			n = (n+1)%16;
852 		}
853 		if (n==0) GCM_MUL(ctx,Xi);
854 		else {
855 			ctx->ares = n;
856 			return 0;
857 		}
858 	}
859 
860 #ifdef GHASH
861 	if ((i = (len&(size_t)-16))) {
862 		GHASH(ctx,aad,i);
863 		aad += i;
864 		len -= i;
865 	}
866 #else
867 	while (len>=16) {
868 		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
869 		GCM_MUL(ctx,Xi);
870 		aad += 16;
871 		len -= 16;
872 	}
873 #endif
874 	if (len) {
875 		n = (unsigned int)len;
876 		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
877 	}
878 
879 	ctx->ares = n;
880 	return 0;
881 }
882 
883 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
884 		const unsigned char *in, unsigned char *out,
885 		size_t len)
886 {
887 	const union { long one; char little; } is_endian = {1};
888 	unsigned int n, ctr;
889 	size_t i;
890 	u64        mlen  = ctx->len.u[1];
891 	block128_f block = ctx->block;
892 	void      *key   = ctx->key;
893 #ifdef GCM_FUNCREF_4BIT
894 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
895 # ifdef GHASH
896 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
897 				const u8 *inp,size_t len)	= ctx->ghash;
898 # endif
899 #endif
900 
901 #if 0
902 	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
903 #endif
904 	mlen += len;
905 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
906 		return -1;
907 	ctx->len.u[1] = mlen;
908 
909 	if (ctx->ares) {
910 		/* First call to encrypt finalizes GHASH(AAD) */
911 		GCM_MUL(ctx,Xi);
912 		ctx->ares = 0;
913 	}
914 
915 	if (is_endian.little)
916 		ctr = GETU32(ctx->Yi.c+12);
917 	else
918 		ctr = ctx->Yi.d[3];
919 
920 	n = ctx->mres;
921 #if !defined(OPENSSL_SMALL_FOOTPRINT)
922 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
923 		if (n) {
924 			while (n && len) {
925 				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
926 				--len;
927 				n = (n+1)%16;
928 			}
929 			if (n==0) GCM_MUL(ctx,Xi);
930 			else {
931 				ctx->mres = n;
932 				return 0;
933 			}
934 		}
935 #if defined(STRICT_ALIGNMENT)
936 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
937 			break;
938 #endif
939 #if defined(GHASH) && defined(GHASH_CHUNK)
940 		while (len>=GHASH_CHUNK) {
941 		    size_t j=GHASH_CHUNK;
942 
943 		    while (j) {
944 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
945 			++ctr;
946 			if (is_endian.little)
947 				PUTU32(ctx->Yi.c+12,ctr);
948 			else
949 				ctx->Yi.d[3] = ctr;
950 			for (i=0; i<16; i+=sizeof(size_t))
951 				*(size_t *)(out+i) =
952 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
953 			out += 16;
954 			in  += 16;
955 			j   -= 16;
956 		    }
957 		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
958 		    len -= GHASH_CHUNK;
959 		}
960 		if ((i = (len&(size_t)-16))) {
961 		    size_t j=i;
962 
963 		    while (len>=16) {
964 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
965 			++ctr;
966 			if (is_endian.little)
967 				PUTU32(ctx->Yi.c+12,ctr);
968 			else
969 				ctx->Yi.d[3] = ctr;
970 			for (i=0; i<16; i+=sizeof(size_t))
971 				*(size_t *)(out+i) =
972 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
973 			out += 16;
974 			in  += 16;
975 			len -= 16;
976 		    }
977 		    GHASH(ctx,out-j,j);
978 		}
979 #else
980 		while (len>=16) {
981 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
982 			++ctr;
983 			if (is_endian.little)
984 				PUTU32(ctx->Yi.c+12,ctr);
985 			else
986 				ctx->Yi.d[3] = ctr;
987 			for (i=0; i<16; i+=sizeof(size_t))
988 				*(size_t *)(ctx->Xi.c+i) ^=
989 				*(size_t *)(out+i) =
990 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
991 			GCM_MUL(ctx,Xi);
992 			out += 16;
993 			in  += 16;
994 			len -= 16;
995 		}
996 #endif
997 		if (len) {
998 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
999 			++ctr;
1000 			if (is_endian.little)
1001 				PUTU32(ctx->Yi.c+12,ctr);
1002 			else
1003 				ctx->Yi.d[3] = ctr;
1004 			while (len--) {
1005 				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1006 				++n;
1007 			}
1008 		}
1009 
1010 		ctx->mres = n;
1011 		return 0;
1012 	} while(0);
1013 #endif
1014 	for (i=0;i<len;++i) {
1015 		if (n==0) {
1016 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1017 			++ctr;
1018 			if (is_endian.little)
1019 				PUTU32(ctx->Yi.c+12,ctr);
1020 			else
1021 				ctx->Yi.d[3] = ctr;
1022 		}
1023 		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1024 		n = (n+1)%16;
1025 		if (n==0)
1026 			GCM_MUL(ctx,Xi);
1027 	}
1028 
1029 	ctx->mres = n;
1030 	return 0;
1031 }
1032 
1033 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1034 		const unsigned char *in, unsigned char *out,
1035 		size_t len)
1036 {
1037 	const union { long one; char little; } is_endian = {1};
1038 	unsigned int n, ctr;
1039 	size_t i;
1040 	u64        mlen  = ctx->len.u[1];
1041 	block128_f block = ctx->block;
1042 	void      *key   = ctx->key;
1043 #ifdef GCM_FUNCREF_4BIT
1044 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1045 # ifdef GHASH
1046 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1047 				const u8 *inp,size_t len)	= ctx->ghash;
1048 # endif
1049 #endif
1050 
1051 	mlen += len;
1052 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1053 		return -1;
1054 	ctx->len.u[1] = mlen;
1055 
1056 	if (ctx->ares) {
1057 		/* First call to decrypt finalizes GHASH(AAD) */
1058 		GCM_MUL(ctx,Xi);
1059 		ctx->ares = 0;
1060 	}
1061 
1062 	if (is_endian.little)
1063 		ctr = GETU32(ctx->Yi.c+12);
1064 	else
1065 		ctr = ctx->Yi.d[3];
1066 
1067 	n = ctx->mres;
1068 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1069 	if (16%sizeof(size_t) == 0) do {	/* always true actually */
1070 		if (n) {
1071 			while (n && len) {
1072 				u8 c = *(in++);
1073 				*(out++) = c^ctx->EKi.c[n];
1074 				ctx->Xi.c[n] ^= c;
1075 				--len;
1076 				n = (n+1)%16;
1077 			}
1078 			if (n==0) GCM_MUL (ctx,Xi);
1079 			else {
1080 				ctx->mres = n;
1081 				return 0;
1082 			}
1083 		}
1084 #if defined(STRICT_ALIGNMENT)
1085 		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1086 			break;
1087 #endif
1088 #if defined(GHASH) && defined(GHASH_CHUNK)
1089 		while (len>=GHASH_CHUNK) {
1090 		    size_t j=GHASH_CHUNK;
1091 
1092 		    GHASH(ctx,in,GHASH_CHUNK);
1093 		    while (j) {
1094 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1095 			++ctr;
1096 			if (is_endian.little)
1097 				PUTU32(ctx->Yi.c+12,ctr);
1098 			else
1099 				ctx->Yi.d[3] = ctr;
1100 			for (i=0; i<16; i+=sizeof(size_t))
1101 				*(size_t *)(out+i) =
1102 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1103 			out += 16;
1104 			in  += 16;
1105 			j   -= 16;
1106 		    }
1107 		    len -= GHASH_CHUNK;
1108 		}
1109 		if ((i = (len&(size_t)-16))) {
1110 		    GHASH(ctx,in,i);
1111 		    while (len>=16) {
1112 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1113 			++ctr;
1114 			if (is_endian.little)
1115 				PUTU32(ctx->Yi.c+12,ctr);
1116 			else
1117 				ctx->Yi.d[3] = ctr;
1118 			for (i=0; i<16; i+=sizeof(size_t))
1119 				*(size_t *)(out+i) =
1120 				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1121 			out += 16;
1122 			in  += 16;
1123 			len -= 16;
1124 		    }
1125 		}
1126 #else
1127 		while (len>=16) {
1128 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1129 			++ctr;
1130 			if (is_endian.little)
1131 				PUTU32(ctx->Yi.c+12,ctr);
1132 			else
1133 				ctx->Yi.d[3] = ctr;
1134 			for (i=0; i<16; i+=sizeof(size_t)) {
1135 				size_t c = *(size_t *)(in+i);
1136 				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1137 				*(size_t *)(ctx->Xi.c+i) ^= c;
1138 			}
1139 			GCM_MUL(ctx,Xi);
1140 			out += 16;
1141 			in  += 16;
1142 			len -= 16;
1143 		}
1144 #endif
1145 		if (len) {
1146 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1147 			++ctr;
1148 			if (is_endian.little)
1149 				PUTU32(ctx->Yi.c+12,ctr);
1150 			else
1151 				ctx->Yi.d[3] = ctr;
1152 			while (len--) {
1153 				u8 c = in[n];
1154 				ctx->Xi.c[n] ^= c;
1155 				out[n] = c^ctx->EKi.c[n];
1156 				++n;
1157 			}
1158 		}
1159 
1160 		ctx->mres = n;
1161 		return 0;
1162 	} while(0);
1163 #endif
1164 	for (i=0;i<len;++i) {
1165 		u8 c;
1166 		if (n==0) {
1167 			(*block)(ctx->Yi.c,ctx->EKi.c,key);
1168 			++ctr;
1169 			if (is_endian.little)
1170 				PUTU32(ctx->Yi.c+12,ctr);
1171 			else
1172 				ctx->Yi.d[3] = ctr;
1173 		}
1174 		c = in[i];
1175 		out[i] = c^ctx->EKi.c[n];
1176 		ctx->Xi.c[n] ^= c;
1177 		n = (n+1)%16;
1178 		if (n==0)
1179 			GCM_MUL(ctx,Xi);
1180 	}
1181 
1182 	ctx->mres = n;
1183 	return 0;
1184 }
1185 
1186 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1187 		const unsigned char *in, unsigned char *out,
1188 		size_t len, ctr128_f stream)
1189 {
1190 	const union { long one; char little; } is_endian = {1};
1191 	unsigned int n, ctr;
1192 	size_t i;
1193 	u64   mlen = ctx->len.u[1];
1194 	void *key  = ctx->key;
1195 #ifdef GCM_FUNCREF_4BIT
1196 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1197 # ifdef GHASH
1198 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1199 				const u8 *inp,size_t len)	= ctx->ghash;
1200 # endif
1201 #endif
1202 
1203 	mlen += len;
1204 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1205 		return -1;
1206 	ctx->len.u[1] = mlen;
1207 
1208 	if (ctx->ares) {
1209 		/* First call to encrypt finalizes GHASH(AAD) */
1210 		GCM_MUL(ctx,Xi);
1211 		ctx->ares = 0;
1212 	}
1213 
1214 	if (is_endian.little)
1215 		ctr = GETU32(ctx->Yi.c+12);
1216 	else
1217 		ctr = ctx->Yi.d[3];
1218 
1219 	n = ctx->mres;
1220 	if (n) {
1221 		while (n && len) {
1222 			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1223 			--len;
1224 			n = (n+1)%16;
1225 		}
1226 		if (n==0) GCM_MUL(ctx,Xi);
1227 		else {
1228 			ctx->mres = n;
1229 			return 0;
1230 		}
1231 	}
1232 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1233 	while (len>=GHASH_CHUNK) {
1234 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1235 		ctr += GHASH_CHUNK/16;
1236 		if (is_endian.little)
1237 			PUTU32(ctx->Yi.c+12,ctr);
1238 		else
1239 			ctx->Yi.d[3] = ctr;
1240 		GHASH(ctx,out,GHASH_CHUNK);
1241 		out += GHASH_CHUNK;
1242 		in  += GHASH_CHUNK;
1243 		len -= GHASH_CHUNK;
1244 	}
1245 #endif
1246 	if ((i = (len&(size_t)-16))) {
1247 		size_t j=i/16;
1248 
1249 		(*stream)(in,out,j,key,ctx->Yi.c);
1250 		ctr += (unsigned int)j;
1251 		if (is_endian.little)
1252 			PUTU32(ctx->Yi.c+12,ctr);
1253 		else
1254 			ctx->Yi.d[3] = ctr;
1255 		in  += i;
1256 		len -= i;
1257 #if defined(GHASH)
1258 		GHASH(ctx,out,i);
1259 		out += i;
1260 #else
1261 		while (j--) {
1262 			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1263 			GCM_MUL(ctx,Xi);
1264 			out += 16;
1265 		}
1266 #endif
1267 	}
1268 	if (len) {
1269 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1270 		++ctr;
1271 		if (is_endian.little)
1272 			PUTU32(ctx->Yi.c+12,ctr);
1273 		else
1274 			ctx->Yi.d[3] = ctr;
1275 		while (len--) {
1276 			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1277 			++n;
1278 		}
1279 	}
1280 
1281 	ctx->mres = n;
1282 	return 0;
1283 }
1284 
1285 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1286 		const unsigned char *in, unsigned char *out,
1287 		size_t len,ctr128_f stream)
1288 {
1289 	const union { long one; char little; } is_endian = {1};
1290 	unsigned int n, ctr;
1291 	size_t i;
1292 	u64   mlen = ctx->len.u[1];
1293 	void *key  = ctx->key;
1294 #ifdef GCM_FUNCREF_4BIT
1295 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1296 # ifdef GHASH
1297 	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1298 				const u8 *inp,size_t len)	= ctx->ghash;
1299 # endif
1300 #endif
1301 
1302 	mlen += len;
1303 	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1304 		return -1;
1305 	ctx->len.u[1] = mlen;
1306 
1307 	if (ctx->ares) {
1308 		/* First call to decrypt finalizes GHASH(AAD) */
1309 		GCM_MUL(ctx,Xi);
1310 		ctx->ares = 0;
1311 	}
1312 
1313 	if (is_endian.little)
1314 		ctr = GETU32(ctx->Yi.c+12);
1315 	else
1316 		ctr = ctx->Yi.d[3];
1317 
1318 	n = ctx->mres;
1319 	if (n) {
1320 		while (n && len) {
1321 			u8 c = *(in++);
1322 			*(out++) = c^ctx->EKi.c[n];
1323 			ctx->Xi.c[n] ^= c;
1324 			--len;
1325 			n = (n+1)%16;
1326 		}
1327 		if (n==0) GCM_MUL (ctx,Xi);
1328 		else {
1329 			ctx->mres = n;
1330 			return 0;
1331 		}
1332 	}
1333 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1334 	while (len>=GHASH_CHUNK) {
1335 		GHASH(ctx,in,GHASH_CHUNK);
1336 		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1337 		ctr += GHASH_CHUNK/16;
1338 		if (is_endian.little)
1339 			PUTU32(ctx->Yi.c+12,ctr);
1340 		else
1341 			ctx->Yi.d[3] = ctr;
1342 		out += GHASH_CHUNK;
1343 		in  += GHASH_CHUNK;
1344 		len -= GHASH_CHUNK;
1345 	}
1346 #endif
1347 	if ((i = (len&(size_t)-16))) {
1348 		size_t j=i/16;
1349 
1350 #if defined(GHASH)
1351 		GHASH(ctx,in,i);
1352 #else
1353 		while (j--) {
1354 			size_t k;
1355 			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1356 			GCM_MUL(ctx,Xi);
1357 			in += 16;
1358 		}
1359 		j   = i/16;
1360 		in -= i;
1361 #endif
1362 		(*stream)(in,out,j,key,ctx->Yi.c);
1363 		ctr += (unsigned int)j;
1364 		if (is_endian.little)
1365 			PUTU32(ctx->Yi.c+12,ctr);
1366 		else
1367 			ctx->Yi.d[3] = ctr;
1368 		out += i;
1369 		in  += i;
1370 		len -= i;
1371 	}
1372 	if (len) {
1373 		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1374 		++ctr;
1375 		if (is_endian.little)
1376 			PUTU32(ctx->Yi.c+12,ctr);
1377 		else
1378 			ctx->Yi.d[3] = ctr;
1379 		while (len--) {
1380 			u8 c = in[n];
1381 			ctx->Xi.c[n] ^= c;
1382 			out[n] = c^ctx->EKi.c[n];
1383 			++n;
1384 		}
1385 	}
1386 
1387 	ctx->mres = n;
1388 	return 0;
1389 }
1390 
1391 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1392 			size_t len)
1393 {
1394 	const union { long one; char little; } is_endian = {1};
1395 	u64 alen = ctx->len.u[0]<<3;
1396 	u64 clen = ctx->len.u[1]<<3;
1397 #ifdef GCM_FUNCREF_4BIT
1398 	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
1399 #endif
1400 
1401 	if (ctx->mres || ctx->ares)
1402 		GCM_MUL(ctx,Xi);
1403 
1404 	if (is_endian.little) {
1405 #ifdef BSWAP8
1406 		alen = BSWAP8(alen);
1407 		clen = BSWAP8(clen);
1408 #else
1409 		u8 *p = ctx->len.c;
1410 
1411 		ctx->len.u[0] = alen;
1412 		ctx->len.u[1] = clen;
1413 
1414 		alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1415 		clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1416 #endif
1417 	}
1418 
1419 	ctx->Xi.u[0] ^= alen;
1420 	ctx->Xi.u[1] ^= clen;
1421 	GCM_MUL(ctx,Xi);
1422 
1423 	ctx->Xi.u[0] ^= ctx->EK0.u[0];
1424 	ctx->Xi.u[1] ^= ctx->EK0.u[1];
1425 
1426 	if (tag && len<=sizeof(ctx->Xi))
1427 		return memcmp(ctx->Xi.c,tag,len);
1428 	else
1429 		return -1;
1430 }
1431 
1432 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1433 {
1434 	CRYPTO_gcm128_finish(ctx, NULL, 0);
1435 	memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1436 }
1437 
1438 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1439 {
1440 	GCM128_CONTEXT *ret;
1441 
1442 	if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1443 		CRYPTO_gcm128_init(ret,key,block);
1444 
1445 	return ret;
1446 }
1447 
1448 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1449 {
1450 	if (ctx) {
1451 		OPENSSL_cleanse(ctx,sizeof(*ctx));
1452 		OPENSSL_free(ctx);
1453 	}
1454 }
1455 
1456 #if defined(SELFTEST)
1457 #include <stdio.h>
1458 #include <openssl/aes.h>
1459 
1460 /* Test Case 1 */
1461 static const u8	K1[16],
1462 		*P1=NULL,
1463 		*A1=NULL,
1464 		IV1[12],
1465 		*C1=NULL,
1466 		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1467 
1468 /* Test Case 2 */
1469 #define K2 K1
1470 #define A2 A1
1471 #define IV2 IV1
1472 static const u8	P2[16],
1473 		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1474 		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1475 
1476 /* Test Case 3 */
1477 #define A3 A2
1478 static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1479 		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1480 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1481 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1482 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1483 		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1484 		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1485 			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1486 			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1487 			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1488 		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1489 
1490 /* Test Case 4 */
1491 #define K4 K3
1492 #define IV4 IV3
1493 static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1494 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1495 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1496 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1497 		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1498 			0xab,0xad,0xda,0xd2},
1499 		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1500 			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1501 			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1502 			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1503 		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1504 
1505 /* Test Case 5 */
1506 #define K5 K4
1507 #define P5 P4
1508 #define A5 A4
1509 static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1510 		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1511 			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1512 			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1513 			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1514 		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1515 
1516 /* Test Case 6 */
1517 #define K6 K5
1518 #define P6 P5
1519 #define A6 A5
1520 static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1521 			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1522 			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1523 			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1524 		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1525 			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1526 			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1527 			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1528 		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1529 
1530 /* Test Case 7 */
1531 static const u8 K7[24],
1532 		*P7=NULL,
1533 		*A7=NULL,
1534 		IV7[12],
1535 		*C7=NULL,
1536 		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1537 
1538 /* Test Case 8 */
1539 #define K8 K7
1540 #define IV8 IV7
1541 #define A8 A7
1542 static const u8	P8[16],
1543 		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1544 		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1545 
1546 /* Test Case 9 */
1547 #define A9 A8
1548 static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1549 			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1550 		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1551 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1552 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1553 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1554 		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1555 		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1556 			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1557 			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1558 			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1559 		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1560 
1561 /* Test Case 10 */
1562 #define K10 K9
1563 #define IV10 IV9
1564 static const u8	P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1565 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1566 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1567 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1568 		A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1569 			0xab,0xad,0xda,0xd2},
1570 		C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1571 			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1572 			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1573 			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1574 		T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1575 
1576 /* Test Case 11 */
1577 #define K11 K10
1578 #define P11 P10
1579 #define A11 A10
1580 static const u8	IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1581 		C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1582 			0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1583 			0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1584 			0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1585 		T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1586 
1587 /* Test Case 12 */
1588 #define K12 K11
1589 #define P12 P11
1590 #define A12 A11
1591 static const u8	IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1592 			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1593 			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1594 			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1595 		C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1596 			0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1597 			0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1598 			0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1599 		T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1600 
1601 /* Test Case 13 */
1602 static const u8	K13[32],
1603 		*P13=NULL,
1604 		*A13=NULL,
1605 		IV13[12],
1606 		*C13=NULL,
1607 		T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1608 
1609 /* Test Case 14 */
1610 #define K14 K13
1611 #define A14 A13
1612 static const u8	P14[16],
1613 		IV14[12],
1614 		C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1615 		T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1616 
1617 /* Test Case 15 */
1618 #define A15 A14
1619 static const u8	K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1620 			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1621 		P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1622 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1623 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1624 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1625 		IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1626 		C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1627 			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1628 			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1629 			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1630 		T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1631 
1632 /* Test Case 16 */
1633 #define K16 K15
1634 #define IV16 IV15
1635 static const u8	P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1636 			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1637 			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1638 			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1639 		A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1640 			0xab,0xad,0xda,0xd2},
1641 		C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1642 			0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1643 			0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1644 			0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1645 		T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1646 
1647 /* Test Case 17 */
1648 #define K17 K16
1649 #define P17 P16
1650 #define A17 A16
1651 static const u8	IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1652 		C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1653 			0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1654 			0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1655 			0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1656 		T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1657 
1658 /* Test Case 18 */
1659 #define K18 K17
1660 #define P18 P17
1661 #define A18 A17
1662 static const u8	IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1663 			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1664 			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1665 			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1666 		C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1667 			0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1668 			0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1669 			0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1670 		T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1671 
/*
 * TEST_CASE(n) - run known-answer test number n in both directions.
 *
 * Expects K<n>, IV<n>, A<n>, P<n>, C<n> and T<n> to be defined, and the
 * variables ctx, key and ret to exist in the invoking scope.  The key
 * schedule is set up once; then the vector is encrypted and checked
 * against C<n>/T<n>, and after resetting the IV it is decrypted and
 * checked against P<n>/T<n>.  Each failing direction increments ret
 * and prints a message.
 */
#define TEST_CASE(n)	do {						\
	u8 scratch[sizeof(P##n)];					\
	int failed;							\
									\
	AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);			\
	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);		\
									\
	/* encrypt direction */						\
	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));			\
	memset(scratch,0,sizeof(scratch));				\
	if (A##n) {							\
		CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));		\
	}								\
	if (P##n) {							\
		CRYPTO_gcm128_encrypt(&ctx,P##n,scratch,sizeof(scratch));\
	}								\
	failed = CRYPTO_gcm128_finish(&ctx,T##n,16) ||			\
		 (C##n && memcmp(scratch,C##n,sizeof(scratch)));	\
	if (failed) {							\
		ret++;							\
		printf ("encrypt test#%d failed.\n",n);			\
	}								\
									\
	/* decrypt direction */						\
	CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));			\
	memset(scratch,0,sizeof(scratch));				\
	if (A##n) {							\
		CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));		\
	}								\
	if (C##n) {							\
		CRYPTO_gcm128_decrypt(&ctx,C##n,scratch,sizeof(scratch));\
	}								\
	failed = CRYPTO_gcm128_finish(&ctx,T##n,16) ||			\
		 (P##n && memcmp(scratch,P##n,sizeof(scratch)));	\
	if (failed) {							\
		ret++;							\
		printf ("decrypt test#%d failed.\n",n);			\
	}								\
	} while(0)
1691 
1692 int main()
1693 {
1694 	GCM128_CONTEXT ctx;
1695 	AES_KEY key;
1696 	int ret=0;
1697 
1698 	TEST_CASE(1);
1699 	TEST_CASE(2);
1700 	TEST_CASE(3);
1701 	TEST_CASE(4);
1702 	TEST_CASE(5);
1703 	TEST_CASE(6);
1704 	TEST_CASE(7);
1705 	TEST_CASE(8);
1706 	TEST_CASE(9);
1707 	TEST_CASE(10);
1708 	TEST_CASE(11);
1709 	TEST_CASE(12);
1710 	TEST_CASE(13);
1711 	TEST_CASE(14);
1712 	TEST_CASE(15);
1713 	TEST_CASE(16);
1714 	TEST_CASE(17);
1715 	TEST_CASE(18);
1716 
1717 #ifdef OPENSSL_CPUID_OBJ
1718 	{
1719 	size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1720 	union { u64 u; u8 c[1024]; } buf;
1721 	int i;
1722 
1723 	AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1724 	CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1725 	CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1726 
1727 	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1728 	start = OPENSSL_rdtsc();
1729 	CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1730 	gcm_t = OPENSSL_rdtsc() - start;
1731 
1732 	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1733 			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1734 			(block128_f)AES_encrypt);
1735 	start = OPENSSL_rdtsc();
1736 	CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1737 			&key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1738 			(block128_f)AES_encrypt);
1739 	ctr_t = OPENSSL_rdtsc() - start;
1740 
1741 	printf("%.2f-%.2f=%.2f\n",
1742 			gcm_t/(double)sizeof(buf),
1743 			ctr_t/(double)sizeof(buf),
1744 			(gcm_t-ctr_t)/(double)sizeof(buf));
1745 #ifdef GHASH
1746 	GHASH(&ctx,buf.c,sizeof(buf));
1747 	start = OPENSSL_rdtsc();
1748 	for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1749 	gcm_t = OPENSSL_rdtsc() - start;
1750 	printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1751 #endif
1752 	}
1753 #endif
1754 
1755 	return ret;
1756 }
1757 #endif
1758