#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# The OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components, in
# this context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on a combination of Intel submissions,
# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp., who verified that it reduces shuffle
# pressure with a notable relative improvement, achieving 1.0 cycle per
# byte processed with a 128-bit key on Haswell, 0.74 on Broadwell, and
# 0.63 on Skylake... [The quoted results are raw profiled measurements
# taken at a favourable packet size, one divisible by 96. Applications
# using the EVP interface will observe a few percent worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
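# The first argument selects the perlasm output flavour (elf, macosx,
# nasm, mingw64, ...); a single argument containing a dot is treated as
# the output path with no flavour. A typical invocation, assuming the
# usual BoringSSL generation setup, would be:
#
#   perl aesni-gcm-x86_64.pl elf aesni-gcm-x86_64.S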

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will
# be computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# The upstream code uses the condition |$avx>1| even though no AVX2
# instructions are used, because it assumes MOVBE is supported by the assembler
# if and only if AVX2 is also supported by the assembler; see
# https://marc.info/?l=openssl-dev&m=146567589526984&w=2.
$avx = 2;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# See the comment above regarding why the condition is ($avx>1) when there are
# no AVX2 instructions being used.
if ($avx>1) {{{

# On Windows, only four parameters are passed in registers. The last two
# parameters will be manually loaded into %rdi and %rsi.
my ($inp, $out, $len, $key, $ivp, $Htable) =
    $win64 ? ("%rcx", "%rdx", "%r8", "%r9", "%rdi", "%rsi") :
             ("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9");

# The offset from %rbp to the Xip parameter. On Windows, all parameters have
# corresponding stack positions, not just ones passed on the stack.
# (0x40 = 6*8 + 0x10)
#
# Xip only needs to be accessed at the beginning and end of the function, and
# this function is short on registers, so we make it the last parameter for
# convenience.
my $Xip_offset = $win64 ? 0x40 : 0x10;
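# For reference, once %rbp is established below (push %rbp; mov %rsp,
# %rbp), the caller's frame looks like this:
#
#   0x00(%rbp)  saved %rbp
#   0x08(%rbp)  return address
#   0x10(%rbp)  Win64: home space for the four register parameters
#               SysV: 7th parameter (Xip)
#   0x30(%rbp)  Win64: 5th parameter (ivp)
#   0x38(%rbp)  Win64: 6th parameter (Htable)
#   0x40(%rbp)  Win64: 7th parameter (Xip)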

($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

($counter,$rounds,$const,$in0,$end0)=("%ebx","%r10d","%r11","%r14","%r15");

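# A note on the GHASH arithmetic used throughout: vpclmulqdq's immediate
# picks one 64-bit half of each source operand (bit 0 for one, bit 4 for
# the other), so immediates 0x00 and 0x11 form the low and high 128-bit
# partial products and 0x01/0x10 form the two cross terms of the
# schoolbook multiply in GF(2^128). The main loop folds six input blocks
# against $Hkey^1..$Hkey^6 and performs a single reduction modulo the
# GHASH polynomial (.Lpoly) per iteration, i.e. aggregated reduction.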
$code=<<___;
.text

.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_ghash_6x:
.cfi_startproc
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	sub		\$6,$len
	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0
	vmovdqu		0x00-0x80($key),$rndkey
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpaddb		$T2,$inout2,$inout3
	vpaddb		$T2,$inout3,$inout4
	vpaddb		$T2,$inout4,$inout5
	vpxor		$rndkey,$T1,$inout0
	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0
	jmp		.Loop6x

.align	32
.Loop6x:
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32		# discard $inout[1-5]?
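	# |$counter| holds the low 32 bits of the block counter in
	# big-endian byte order, so the counter's least significant byte
	# sits in the register's top byte. The add of 6<<24 above carries
	# out exactly when that byte would wrap during the next six
	# increments, in which case the byte-wise vpaddb updates below
	# would drop a carry; .Lhandle_ctr32 instead redoes the increments
	# as byte-swapped 32-bit vpaddd additions.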
	vmovdqu		0x00-0x20($Htable),$Hkey	# $Hkey^1
	  vpaddb	$T2,$inout5,$T1		# next counter value
	  vpxor		$rndkey,$inout1,$inout1
	  vpxor		$rndkey,$inout2,$inout2

.Lresume_ctr32:
	vmovdqu		$T1,($ivp)		# save next counter value
	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
	  vpxor		$rndkey,$inout3,$inout3
	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2

	# At this point, the current block of 96 (0x60) bytes has already been
	# loaded into registers. Concurrently with processing it, we want to
	# load the next 96 bytes of input for the next round. Obviously, we can
	# only do this if there are at least 96 more bytes of input beyond the
	# input we're currently processing, or else we'd read past the end of
	# the input buffer. Here, we set |%r12| to 96 if there are at least 96
	# bytes of input beyond the 96 bytes we're already processing, and we
	# set |%r12| to 0 otherwise. In the case where we set |%r12| to 96,
	# we'll read in the next block so that it is in registers for the next
	# loop iteration. In the case where we set |%r12| to 0, we'll re-read
	# the current block and then ignore what we re-read.
	#
	# At this point, |$in0| points to the current (already read into
	# registers) block, and |$end0| points to 2*96 bytes before the end of
	# the input. Thus, |$in0| > |$end0| means that we do not have the next
	# 96-byte block to read in, and |$in0| <= |$end0| means we do.
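	#
	# In C-like pseudocode, the branchless selection below computes
	#
	#   %r12 = (|$in0| <= |$end0|) ? 0x60 : 0;  // xor, cmp, setnc, neg, and
	#   |$in0| += %r12;                         // lea ($in0,%r12),$in0
	#
	# so the load pointer only advances when a full next block exists.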
	xor		%r12,%r12
	cmp		$in0,$end0

	  vaesenc	$T2,$inout0,$inout0
	vmovdqu		0x30+8(%rsp),$Ii	# I[4]
	  vpxor		$rndkey,$inout4,$inout4
	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
	  vaesenc	$T2,$inout1,$inout1
	  vpxor		$rndkey,$inout5,$inout5
	setnc		%r12b
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	  vaesenc	$T2,$inout2,$inout2
	vmovdqu		0x10-0x20($Htable),$Hkey	# $Hkey^2
	neg		%r12
	  vaesenc	$T2,$inout3,$inout3
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
	  vaesenc	$T2,$inout4,$inout4
	 vpxor		$Z1,$T1,$Z0
	and		\$0x60,%r12
	  vmovups	0x20-0x80($key),$rndkey
	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
	  vaesenc	$T2,$inout5,$inout5

	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
	lea		($in0,%r12),$in0
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
	 vmovdqu	0x40+8(%rsp),$Ii	# I[3]
	  vaesenc	$rndkey,$inout1,$inout1
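	# The interleaved movbe/mov pairs below byte-swap the next 96-byte
	# input block eight bytes at a time and stage it in the stack slots
	# that the next iteration reads back as I[0] through I[5]. When
	# %r12 was set to 0 above, this merely re-stages the current block,
	# matching the re-read-and-ignore case described earlier.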
	movbe		0x58($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x50($in0),%r12
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x20+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x28+8(%rsp)
	vmovdqu		0x30-0x20($Htable),$Z1	# borrow $Z1 for $Hkey^3
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x30-0x80($key),$rndkey
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
	  vaesenc	$rndkey,$inout1,$inout1
	 vpxor		$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout2,$inout2
	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
	 vmovdqu	0x50+8(%rsp),$Ii	# I[2]
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	 vpxor		$T1,$Z0,$Z0
	vmovdqu		0x40-0x20($Htable),$T1	# borrow $T1 for $Hkey^4
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x40-0x80($key),$rndkey
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x00,$T1,$Ii,$T2
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x48($in0),%r13
	 vpxor		$Z1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x40($in0),%r12
	vpclmulqdq	\$0x11,$T1,$Ii,$T1
	 vmovdqu	0x60+8(%rsp),$Ii	# I[1]
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x30+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x38+8(%rsp)
	 vpxor		$T2,$Z0,$Z0
	vmovdqu		0x60-0x20($Htable),$T2	# borrow $T2 for $Hkey^5
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x50-0x80($key),$rndkey
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x38($in0),%r13
	 vpxor		$T1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T2,$Ii,$T1
	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x30($in0),%r12
	vpclmulqdq	\$0x11,$T2,$Ii,$T2
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x40+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x48+8(%rsp)
	 vpxor		$Hkey,$Z0,$Z0
	 vmovdqu	0x70-0x20($Htable),$Hkey	# $Hkey^6
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x60-0x80($key),$rndkey
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x28($in0),%r13
	 vpxor		$T2,$Z3,$Z3
	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x20($in0),%r12
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x50+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x58+8(%rsp)
	vpxor		$Z1,$Z2,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	vpxor		$T1,$Z2,$Z2

	  vmovups	0x70-0x80($key),$rndkey
	vpslldq		\$8,$Z2,$Z1
	vpxor		$T2,$Z0,$Z0
	vmovdqu		0x10($const),$Hkey	# .Lpoly

	  vaesenc	$rndkey,$inout0,$inout0
	vpxor		$Xi,$Z3,$Z3
	  vaesenc	$rndkey,$inout1,$inout1
	vpxor		$Z1,$Z0,$Z0
	movbe		0x18($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x10($in0),%r12
	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	mov		%r13,0x60+8(%rsp)
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r12,0x68+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vmovups	0x90-0x80($key),$rndkey
	  vaesenc	$T1,$inout1,$inout1
	vpsrldq		\$8,$Z2,$Z2
	  vaesenc	$T1,$inout2,$inout2
	vpxor		$Z2,$Z3,$Z3
	  vaesenc	$T1,$inout3,$inout3
	vpxor		$Ii,$Z0,$Z0
	movbe		0x08($in0),%r13
	  vaesenc	$T1,$inout4,$inout4
	movbe		0x00($in0),%r12
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xa0-0x80($key),$T1
	  cmp		\$11,$rounds
	  jb		.Lenc_tail		# 128-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xb0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xc0-0x80($key),$T1
	  # 192-bit key support was removed.

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xd0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xe0-0x80($key),$T1
	  jmp		.Lenc_tail		# 256-bit key

.align	32
.Lhandle_ctr32:
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter
	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
	  vpaddd	$Z1,$Z2,$inout2
	vmovdqu		0x00-0x20($Htable),$Hkey	# $Hkey^1
	  vpaddd	$Z1,$inout1,$inout3
	  vpshufb	$Ii,$inout1,$inout1
	  vpaddd	$Z1,$inout2,$inout4
	  vpshufb	$Ii,$inout2,$inout2
	  vpxor		$rndkey,$inout1,$inout1
	  vpaddd	$Z1,$inout3,$inout5
	  vpshufb	$Ii,$inout3,$inout3
	  vpxor		$rndkey,$inout2,$inout2
	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
	  vpshufb	$Ii,$inout4,$inout4
	  vpshufb	$Ii,$inout5,$inout5
	  vpshufb	$Ii,$T1,$T1		# next counter value
	jmp		.Lresume_ctr32

.align	32
.Lenc_tail:
	  vaesenc	$rndkey,$inout0,$inout0
	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase
	  vaesenc	$rndkey,$inout1,$inout1
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	  vpxor		0x00($inp),$T1,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	  vpxor		0x10($inp),$T1,$Ii
	  vaesenc	$rndkey,$inout3,$inout3
	  vpxor		0x20($inp),$T1,$Z1
	  vaesenc	$rndkey,$inout4,$inout4
	  vpxor		0x30($inp),$T1,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	  vpxor		0x40($inp),$T1,$Z3
	  vpxor		0x50($inp),$T1,$Hkey
	  vmovdqu	($ivp),$T1		# load next counter value

	  vaesenclast	$T2,$inout0,$inout0
	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
	  vaesenclast	$Ii,$inout1,$inout1
	 vpaddb		$T2,$T1,$Ii
	mov		%r13,0x70+8(%rsp)
	lea		0x60($inp),$inp
	# These two prefetches were added in BoringSSL. See the change that
	# added them.
	 prefetcht0	512($inp)		# we process 96-byte blocks, so prefetch two cache lines (128 bytes)
	 prefetcht0	576($inp)
	  vaesenclast	$Z1,$inout2,$inout2
	 vpaddb		$T2,$Ii,$Z1
	mov		%r12,0x78+8(%rsp)
	lea		0x60($out),$out
	  vmovdqu	0x00-0x80($key),$rndkey
	  vaesenclast	$Z2,$inout3,$inout3
	 vpaddb		$T2,$Z1,$Z2
	  vaesenclast	$Z3,$inout4,$inout4
	 vpaddb		$T2,$Z2,$Z3
	  vaesenclast	$Hkey,$inout5,$inout5
	 vpaddb		$T2,$Z3,$Hkey

	add		\$0x60,%rax
	sub		\$0x6,$len
	jc		.L6x_done

	  vmovups	$inout0,-0x60($out)	# save output
	 vpxor		$rndkey,$T1,$inout0
	  vmovups	$inout1,-0x50($out)
	 vmovdqa	$Ii,$inout1		# 0 latency
	  vmovups	$inout2,-0x40($out)
	 vmovdqa	$Z1,$inout2		# 0 latency
	  vmovups	$inout3,-0x30($out)
	 vmovdqa	$Z2,$inout3		# 0 latency
	  vmovups	$inout4,-0x20($out)
	 vmovdqa	$Z3,$inout4		# 0 latency
	  vmovups	$inout5,-0x10($out)
	 vmovdqa	$Hkey,$inout5		# 0 latency
	vmovdqu		0x20+8(%rsp),$Z3	# I[5]
	jmp		.Loop6x

.L6x_done:
	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled
	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled

	ret
.cfi_endproc
.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
######################################################################
#
# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
#		const AES_KEY *key, unsigned char iv[16], const u128 Htbl[9],
#		u128 *Xip);
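# Both functions return, in %rax, the number of input bytes actually
# consumed, always a multiple of 96; the caller is expected to handle
# the remaining tail. Inputs shorter than the minimum accepted length
# are rejected with a return value of zero.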
$code.=<<___;
.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@abi-omnipotent
.align	32
aesni_gcm_decrypt:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
	xor	%rax,%rax

	# We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
	# bytes of input.
	cmp	\$0x60,$len			# minimal accepted length
	jb	.Lgcm_dec_abort

	push	%rbp
.cfi_push	%rbp
.seh_pushreg	%rbp
	mov	%rsp, %rbp			# save stack pointer
.cfi_def_cfa_register	%rbp
	push	%rbx
.cfi_push	%rbx
.seh_pushreg	%rbx
	push	%r12
.cfi_push	%r12
.seh_pushreg	%r12
	push	%r13
.cfi_push	%r13
.seh_pushreg	%r13
	push	%r14
.cfi_push	%r14
.seh_pushreg	%r14
	push	%r15
.cfi_push	%r15
.seh_pushreg	%r15
___
if ($win64) {
$code.=<<___
	lea	-0xa8(%rsp),%rsp		# 8 extra bytes to align the stack
.seh_allocstack	0xa8
.seh_setframe	%rbp, 0xa8+5*8
	# Load the last two parameters. These go into %rdi and %rsi, which are
	# non-volatile on Windows, so stash them in the parameter stack area
	# first.
	mov	%rdi, 0x10(%rbp)
.seh_savereg	%rdi, 0xa8+5*8+0x10
	mov	%rsi, 0x18(%rbp)
.seh_savereg	%rsi, 0xa8+5*8+0x18
	mov	0x30(%rbp), $ivp
	mov	0x38(%rbp), $Htable
	# Save non-volatile XMM registers.
	movaps	%xmm6,-0xd0(%rbp)
.seh_savexmm128	%xmm6, 0xa8+5*8-0xd0
	movaps	%xmm7,-0xc0(%rbp)
.seh_savexmm128	%xmm7, 0xa8+5*8-0xc0
	movaps	%xmm8,-0xb0(%rbp)
.seh_savexmm128	%xmm8, 0xa8+5*8-0xb0
	movaps	%xmm9,-0xa0(%rbp)
.seh_savexmm128	%xmm9, 0xa8+5*8-0xa0
	movaps	%xmm10,-0x90(%rbp)
.seh_savexmm128	%xmm10, 0xa8+5*8-0x90
	movaps	%xmm11,-0x80(%rbp)
.seh_savexmm128	%xmm11, 0xa8+5*8-0x80
	movaps	%xmm12,-0x70(%rbp)
.seh_savexmm128	%xmm12, 0xa8+5*8-0x70
	movaps	%xmm13,-0x60(%rbp)
.seh_savexmm128	%xmm13, 0xa8+5*8-0x60
	movaps	%xmm14,-0x50(%rbp)
.seh_savexmm128	%xmm14, 0xa8+5*8-0x50
	movaps	%xmm15,-0x40(%rbp)
.seh_savexmm128	%xmm15, 0xa8+5*8-0x40
___
}
$code.=<<___;
	vzeroupper

	mov		$Xip_offset(%rbp), %r12
	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	vmovdqu		(%r12),$Xi		# load Xi
	and		\$-128,%rsp		# ensure stack alignment
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	lea		0x80($key),$key		# size optimization
	lea		0x20($Htable),$Htable	# size optimization
	mov		0xf0-0x80($key),$rounds
	vpshufb		$Ii,$Xi,$Xi

	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Ldec_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Ldec_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Ldec_no_key_aliasing:
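	# The block above extracts bits 7-11 of the key schedule's address
	# and of %rsp, i.e. their 128-byte-granular offsets within a 4KB
	# page. If %rsp's page offset is 0 to 767 bytes above the key's,
	# %rsp is lowered by the difference so that round-key loads and
	# scratch-area stores cannot land in the same cache sets and suffer
	# 4KB-aliasing stalls.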

	vmovdqu		0x50($inp),$Z3		# I[5]
	mov		$inp,$in0
	vmovdqu		0x40($inp),$Z0

	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
	# bytes before the end of the input. Note, in particular, that this is
	# correct even if |$len| is not an even multiple of 96 or 16. XXX: This
	# seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
	# not be near the very beginning of the address space when |$len| < 2*96
	# (0xc0).
	lea		-0xc0($inp,$len),$end0

	vmovdqu		0x30($inp),$Z1
	shr		\$4,$len
	xor		%rax,%rax
	vmovdqu		0x20($inp),$Z2
	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		0x10($inp),$T2
	 vpshufb	$Ii,$Z0,$Z0
	vmovdqu		($inp),$Hkey
	 vpshufb	$Ii,$Z1,$Z1
	vmovdqu		$Z0,0x30(%rsp)
	 vpshufb	$Ii,$Z2,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	 vpshufb	$Ii,$T2,$T2
	vmovdqu		$Z2,0x50(%rsp)
	 vpshufb	$Ii,$Hkey,$Hkey
	vmovdqu		$T2,0x60(%rsp)
	vmovdqu		$Hkey,0x70(%rsp)

	call		_aesni_ctr32_ghash_6x

	mov		$Xip_offset(%rbp), %r12
	vmovups		$inout0,-0x60($out)	# save output
	vmovups		$inout1,-0x50($out)
	vmovups		$inout2,-0x40($out)
	vmovups		$inout3,-0x30($out)
	vmovups		$inout4,-0x20($out)
	vmovups		$inout5,-0x10($out)

	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,(%r12)		# output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd0(%rbp),%xmm6
	movaps	-0xc0(%rbp),%xmm7
	movaps	-0xb0(%rbp),%xmm8
	movaps	-0xa0(%rbp),%xmm9
	movaps	-0x90(%rbp),%xmm10
	movaps	-0x80(%rbp),%xmm11
	movaps	-0x70(%rbp),%xmm12
	movaps	-0x60(%rbp),%xmm13
	movaps	-0x50(%rbp),%xmm14
	movaps	-0x40(%rbp),%xmm15
	mov	0x10(%rbp),%rdi
	mov	0x18(%rbp),%rsi
___
$code.=<<___;
	lea	-0x28(%rbp), %rsp	# restore %rsp to fixed allocation
.cfi_def_cfa	%rsp, 0x38
	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbx
.cfi_pop	%rbx
	pop	%rbp
.cfi_pop	%rbp
.Lgcm_dec_abort:
	ret
.seh_endproc
.cfi_endproc
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___

$code.=<<___;
.type	_aesni_ctr32_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_6x:
.cfi_startproc
	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	lea		-1($rounds),%r13
	vmovups		0x10-0x80($key),$rndkey
	lea		0x20-0x80($key),%r12
	vpxor		$Z0,$T1,$inout0
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32_2
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddb		$T2,$inout2,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddb		$T2,$inout3,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpaddb		$T2,$inout4,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpaddb		$T2,$inout5,$T1
	vpxor		$Z0,$inout5,$inout5
	jmp		.Loop_ctr32

.align	16
.Loop_ctr32:
	vaesenc		$rndkey,$inout0,$inout0
	vaesenc		$rndkey,$inout1,$inout1
	vaesenc		$rndkey,$inout2,$inout2
	vaesenc		$rndkey,$inout3,$inout3
	vaesenc		$rndkey,$inout4,$inout4
	vaesenc		$rndkey,$inout5,$inout5
	vmovups		(%r12),$rndkey
	lea		0x10(%r12),%r12
	dec		%r13d
	jnz		.Loop_ctr32

	vmovdqu		(%r12),$Hkey		# last round key
	vaesenc		$rndkey,$inout0,$inout0
	vpxor		0x00($inp),$Hkey,$Z0
	vaesenc		$rndkey,$inout1,$inout1
	vpxor		0x10($inp),$Hkey,$Z1
	vaesenc		$rndkey,$inout2,$inout2
	vpxor		0x20($inp),$Hkey,$Z2
	vaesenc		$rndkey,$inout3,$inout3
	vpxor		0x30($inp),$Hkey,$Xi
	vaesenc		$rndkey,$inout4,$inout4
	vpxor		0x40($inp),$Hkey,$T2
	vaesenc		$rndkey,$inout5,$inout5
	vpxor		0x50($inp),$Hkey,$Hkey
	lea		0x60($inp),$inp

	vaesenclast	$Z0,$inout0,$inout0
	vaesenclast	$Z1,$inout1,$inout1
	vaesenclast	$Z2,$inout2,$inout2
	vaesenclast	$Xi,$inout3,$inout3
	vaesenclast	$T2,$inout4,$inout4
	vaesenclast	$Hkey,$inout5,$inout5
	vmovups		$inout0,0x00($out)
	vmovups		$inout1,0x10($out)
	vmovups		$inout2,0x20($out)
	vmovups		$inout3,0x30($out)
	vmovups		$inout4,0x40($out)
	vmovups		$inout5,0x50($out)
	lea		0x60($out),$out

	ret
.align	32
.Lhandle_ctr32_2:
	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
	vpaddd		$Z1,$Z2,$inout2
	vpaddd		$Z1,$inout1,$inout3
	vpshufb		$Ii,$inout1,$inout1
	vpaddd		$Z1,$inout2,$inout4
	vpshufb		$Ii,$inout2,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddd		$Z1,$inout3,$inout5
	vpshufb		$Ii,$inout3,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
	vpshufb		$Ii,$inout4,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpshufb		$Ii,$inout5,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpshufb		$Ii,$T1,$T1		# next counter value
	vpxor		$Z0,$inout5,$inout5
	jmp	.Loop_ctr32
.cfi_endproc
.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x

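# _aesni_ctr32_6x above performs plain 6-way CTR encryption with no
# GHASH. aesni_gcm_encrypt below runs it twice to prime the pipeline:
# _aesni_ctr32_ghash_6x always hashes ciphertext produced earlier, and
# the final two 96-byte chunks are folded into Xi by the tail code after
# it returns.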
.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
.align	32
aesni_gcm_encrypt:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
#ifdef BORINGSSL_DISPATCH_TEST
.extern	BORINGSSL_function_hit
	movb \$1,BORINGSSL_function_hit+2(%rip)
#endif
	xor	%rax,%rax

	# We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
	# input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
	# least 96 more bytes of input.
	cmp	\$0x60*3,$len			# minimal accepted length
	jb	.Lgcm_enc_abort

	push	%rbp
.cfi_push	%rbp
.seh_pushreg	%rbp
	mov	%rsp, %rbp			# save stack pointer
.cfi_def_cfa_register	%rbp
	push	%rbx
.cfi_push	%rbx
.seh_pushreg	%rbx
	push	%r12
.cfi_push	%r12
.seh_pushreg	%r12
	push	%r13
.cfi_push	%r13
.seh_pushreg	%r13
	push	%r14
.cfi_push	%r14
.seh_pushreg	%r14
	push	%r15
.cfi_push	%r15
.seh_pushreg	%r15
___
if ($win64) {
$code.=<<___
	lea	-0xa8(%rsp),%rsp		# 8 extra bytes to align the stack
.seh_allocstack	0xa8
.seh_setframe	%rbp, 0xa8+5*8
	# Load the last two parameters. These go into %rdi and %rsi, which are
	# non-volatile on Windows, so stash them in the parameter stack area
	# first.
	mov	%rdi, 0x10(%rbp)
.seh_savereg	%rdi, 0xa8+5*8+0x10
	mov	%rsi, 0x18(%rbp)
.seh_savereg	%rsi, 0xa8+5*8+0x18
	mov	0x30(%rbp), $ivp
	mov	0x38(%rbp), $Htable
	# Save non-volatile XMM registers.
	movaps	%xmm6,-0xd0(%rbp)
.seh_savexmm128	%xmm6, 0xa8+5*8-0xd0
	movaps	%xmm7,-0xc0(%rbp)
.seh_savexmm128	%xmm7, 0xa8+5*8-0xc0
	movaps	%xmm8,-0xb0(%rbp)
.seh_savexmm128	%xmm8, 0xa8+5*8-0xb0
	movaps	%xmm9,-0xa0(%rbp)
.seh_savexmm128	%xmm9, 0xa8+5*8-0xa0
	movaps	%xmm10,-0x90(%rbp)
.seh_savexmm128	%xmm10, 0xa8+5*8-0x90
	movaps	%xmm11,-0x80(%rbp)
.seh_savexmm128	%xmm11, 0xa8+5*8-0x80
	movaps	%xmm12,-0x70(%rbp)
.seh_savexmm128	%xmm12, 0xa8+5*8-0x70
	movaps	%xmm13,-0x60(%rbp)
.seh_savexmm128	%xmm13, 0xa8+5*8-0x60
	movaps	%xmm14,-0x50(%rbp)
.seh_savexmm128	%xmm14, 0xa8+5*8-0x50
	movaps	%xmm15,-0x40(%rbp)
.seh_savexmm128	%xmm15, 0xa8+5*8-0x40
___
}
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	lea		0x80($key),$key		# size optimization
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	and		\$-128,%rsp		# ensure stack alignment
	mov		0xf0-0x80($key),$rounds

	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Lenc_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Lenc_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Lenc_no_key_aliasing:
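	# Same 4KB-aliasing avoidance as in aesni_gcm_decrypt; see the
	# comment after .Ldec_no_key_aliasing above.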

	mov		$out,$in0

	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
	# bytes before the end of the input. Note, in particular, that this is
	# correct even if |$len| is not an even multiple of 96 or 16. Unlike in
	# the decryption case, there's no caveat that |$out| must not be near
	# the very beginning of the address space, because we know that
	# |$len| >= 3*96 from the check above, and so we know
	# |$out| + |$len| >= 2*96 (0xc0).
	lea		-0xc0($out,$len),$end0

	shr		\$4,$len

	call		_aesni_ctr32_6x
	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack
	vpshufb		$Ii,$inout1,$T2
	vmovdqu		$Xi,0x70(%rsp)
	vpshufb		$Ii,$inout2,$Z0
	vmovdqu		$T2,0x60(%rsp)
	vpshufb		$Ii,$inout3,$Z1
	vmovdqu		$Z0,0x50(%rsp)
	vpshufb		$Ii,$inout4,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		$Z2,0x30(%rsp)

	call		_aesni_ctr32_6x

	mov		$Xip_offset(%rbp), %r12
	lea		0x20($Htable),$Htable	# size optimization
	vmovdqu		(%r12),$Xi		# load Xi
	sub		\$12,$len
	mov		\$0x60*2,%rax
	vpshufb		$Ii,$Xi,$Xi

	call		_aesni_ctr32_ghash_6x
	vmovdqu		0x20(%rsp),$Z3		# I[5]
	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
	vmovdqu		0x00-0x20($Htable),$Hkey	# $Hkey^1
	vpunpckhqdq	$Z3,$Z3,$T1
	vmovdqu		0x20-0x20($Htable),$rndkey	# borrow $rndkey for $HK
	 vmovups	$inout0,-0x60($out)	# save output
	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
	vpxor		$Z3,$T1,$T1
	 vmovups	$inout1,-0x50($out)
	 vpshufb	$Ii,$inout1,$inout1
	 vmovups	$inout2,-0x40($out)
	 vpshufb	$Ii,$inout2,$inout2
	 vmovups	$inout3,-0x30($out)
	 vpshufb	$Ii,$inout3,$inout3
	 vmovups	$inout4,-0x20($out)
	 vpshufb	$Ii,$inout4,$inout4
	 vmovups	$inout5,-0x10($out)
	 vpshufb	$Ii,$inout5,$inout5
	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0
___
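# The tail below folds the twelve remaining ciphertext blocks (six saved
# on the stack, six still in $inout0..$inout5) into Xi: two passes of
# six Karatsuba-style multiplications against $Hkey^1..$Hkey^6, with the
# two-phase reduction by .Lpoly interleaved to keep the PCLMULQDQ unit
# busy.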
{ my ($HK,$T3)=($rndkey,$inout0);

$code.=<<___;
	 vmovdqu	0x30(%rsp),$Z2		# I[4]
	 vmovdqu	0x10-0x20($Htable),$Ii	# borrow $Ii for $Hkey^2
	 vpunpckhqdq	$Z2,$Z2,$T2
	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
	 vpxor		$Z2,$T2,$T2
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x00,$HK,$T1,$T1

	 vmovdqu	0x40(%rsp),$T3		# I[3]
	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
	 vmovdqu	0x30-0x20($Htable),$Hkey	# $Hkey^3
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$T3,$T3,$Z1
	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
	 vpxor		$T3,$Z1,$Z1
	vpxor		$Z3,$Z2,$Z2
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Htable),$HK
	vpxor		$T1,$T2,$T2

	 vmovdqu	0x50(%rsp),$T1		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
	 vmovdqu	0x40-0x20($Htable),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z0,$Z3,$Z3
	 vpunpckhqdq	$T1,$T1,$Z0
	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
	 vpxor		$T1,$Z0,$Z0
	vpxor		$Z2,$T3,$T3
	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
	vpxor		$T2,$Z1,$Z1

	 vmovdqu	0x60(%rsp),$T2		# I[1]
	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
	 vmovdqu	0x60-0x20($Htable),$Hkey	# $Hkey^5
	vpxor		$Z3,$Z2,$Z2
	 vpunpckhqdq	$T2,$T2,$Z3
	vpclmulqdq	\$0x11,$Ii,$T1,$T1
	 vpxor		$T2,$Z3,$Z3
	vpxor		$T3,$T1,$T1
	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
	 vmovdqu	0x80-0x20($Htable),$HK
	vpxor		$Z1,$Z0,$Z0

	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0]
	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
	 vmovdqu	0x70-0x20($Htable),$Ii	# borrow $Ii for $Hkey^6
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpxor		$Z2,$Z1,$Z1
	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
	 vpxor		$Xi,$T3,$T3
	vpxor		$T1,$T2,$T2
	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
	vpxor		$Z0,$Z3,$Z0

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
	 vmovdqu	0x00-0x20($Htable),$Hkey	# $Hkey^1
	 vpunpckhqdq	$inout5,$inout5,$T1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
	 vpxor		$inout5,$T1,$T1
	vpxor		$Z1,$Z2,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$T3
	 vmovdqu	0x20-0x20($Htable),$HK
	vpxor		$T2,$Xi,$Z3
	vpxor		$Z0,$T3,$Z2

	 vmovdqu	0x10-0x20($Htable),$Ii	# borrow $Ii for $Hkey^2
	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
	  vpxor		$T3,$Z2,$Z2
	 vpunpckhqdq	$inout4,$inout4,$T2
	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
	 vpxor		$inout4,$T2,$T2
	  vpslldq	\$8,$Z2,$T3
	vpclmulqdq	\$0x00,$HK,$T1,$T1
	  vpxor		$T3,$Z1,$Xi
	  vpsrldq	\$8,$Z2,$Z2
	  vpxor		$Z2,$Z3,$Z3

	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
	 vmovdqu	0x30-0x20($Htable),$Hkey	# $Hkey^3
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout3,$inout3,$T3
	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
	 vpxor		$inout3,$T3,$T3
	vpxor		$inout5,$inout4,$inout4
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Htable),$HK
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
	 vmovdqu	0x40-0x20($Htable),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$inout2,$inout2,$T1
	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
	 vpxor		$inout2,$T1,$T1
	vpxor		$inout4,$inout3,$inout3
	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
	vpclmulqdq	\$0x00,$HK,$T3,$T3
	vpxor		$T2,$T3,$T3

	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
	 vmovdqu	0x60-0x20($Htable),$Hkey	# $Hkey^5
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout1,$inout1,$T2
	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
	 vpxor		$inout1,$T2,$T2
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
	vpxor		$inout3,$inout2,$inout2
	vpclmulqdq	\$0x10,$HK,$T1,$T1
	 vmovdqu	0x80-0x20($Htable),$HK
	vpxor		$T3,$T1,$T1

	  vxorps	$Z3,$inout5,$inout5
	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
	 vmovdqu	0x70-0x20($Htable),$Ii	# borrow $Ii for $Hkey^6
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
	 vpxor		$Xi,$T3,$T3
	vpxor		$inout2,$inout1,$inout1
	vpclmulqdq	\$0x00,$HK,$T2,$T2
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
	vpxor		$Z0,$Z1,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$Z2
	vpxor		$inout1,$Z3,$Z3
	vpxor		$T2,$Z2,$Z2

	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
	vpxor		$Z0,$Z2,$Z2
	vpslldq		\$8,$Z2,$T1
	vmovdqu		0x10($const),$Hkey	# .Lpoly
	vpsrldq		\$8,$Z2,$Z2
	vpxor		$T1,$Z1,$Xi
	vpxor		$Z2,$Z3,$Z3

	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$Z3,$T2,$T2
	vpxor		$T2,$Xi,$Xi
___
}
$code.=<<___;
	mov		$Xip_offset(%rbp), %r12
	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,(%r12)		# output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd0(%rbp),%xmm6
	movaps	-0xc0(%rbp),%xmm7
	movaps	-0xb0(%rbp),%xmm8
	movaps	-0xa0(%rbp),%xmm9
	movaps	-0x90(%rbp),%xmm10
	movaps	-0x80(%rbp),%xmm11
	movaps	-0x70(%rbp),%xmm12
	movaps	-0x60(%rbp),%xmm13
	movaps	-0x50(%rbp),%xmm14
	movaps	-0x40(%rbp),%xmm15
	mov	0x10(%rbp),%rdi
	mov	0x18(%rbp),%rsi
___
$code.=<<___;
	lea	-0x28(%rbp), %rsp	# restore %rsp to fixed allocation
.cfi_def_cfa	%rsp, 0x38
	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbx
.cfi_pop	%rbx
	pop	%rbp
.cfi_pop	%rbp
.Lgcm_enc_abort:
	ret
.seh_endproc
.cfi_endproc
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
___

$code.=<<___;
.section .rodata
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.text
___
}}} else {{{
$code=<<___;	# assembler is too old
.text

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
	_CET_ENDBR
	xor	%eax,%eax
	ret
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
	_CET_ENDBR
	xor	%eax,%eax
	ret
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
}}}

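# Backtick-quoted expressions embedded in $code (such as the 6<<24
# counter increments) are evaluated by Perl and replaced with their
# numeric values before the result is emitted through the xlate pipe.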
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";
