#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# The OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components,
# in this context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitched implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on a combination of Intel submissions,
# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp., who verified that it reduces shuffle
# pressure with a notable relative improvement, achieving 1.0 cycle per
# byte processed with a 128-bit key on Haswell, 0.74 on Broadwell and
# 0.63 on Skylake... [Mentioned results are raw profiled measurements
# for a favourable packet size, one divisible by 96. Applications using
# the EVP interface will observe a few percent worse performance.]
#
# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# |$avx| in ghash-x86_64.pl must be set to at least 1; otherwise tags will
# be computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# The upstream code uses the condition |$avx>1| even though no AVX2
# instructions are used, because it assumes MOVBE is supported by the assembler
# if and only if AVX2 is also supported by the assembler; see
# https://marc.info/?l=openssl-dev&m=146567589526984&w=2.
$avx = 2;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# See the comment above for why the condition is ($avx>1) even though no
# AVX2 instructions are used.
if ($avx>1) {{{

# On Windows, only four parameters are passed in registers. The last two
# parameters will be manually loaded into %rdi and %rsi.
my ($inp, $out, $len, $key, $ivp, $Htable) =
    $win64 ? ("%rcx", "%rdx", "%r8", "%r9", "%rdi", "%rsi") :
             ("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9");

# The offset from %rbp to the Xip parameter. On Windows, all parameters have
# corresponding stack positions, not just ones passed on the stack.
# (0x40 = 6*8 + 0x10)
#
# Xip only needs to be accessed at the beginning and end of the function, and
# this function is short on registers, so we make it the last parameter for
# convenience.
my $Xip_offset = $win64 ? 0x40 : 0x10;
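#
# To see where 0x40 and 0x10 come from: after the "push %rbp; mov %rsp, %rbp"
# sequence below, %rbp+0x08 holds the return address, so %rbp+0x10 is the
# first stack slot above it. On SysV, six parameters arrive in registers and
# Xip, the seventh, is the first stack parameter, at %rbp+0x10. On Windows,
# %rbp+0x10 through %rbp+0x28 are the four register-parameter home slots,
# %rbp+0x30 and %rbp+0x38 hold the fifth and sixth parameters, and Xip lands
# at %rbp+0x40 = 6*8 + 0x10.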

($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

($counter,$rounds,$const,$in0,$end0)=("%ebx","%r10d","%r11","%r14","%r15");

$code=<<___;
.text

.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_ghash_6x:
.cfi_startproc
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	sub		\$6,$len
	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0
	vmovdqu		0x00-0x80($key),$rndkey
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpaddb		$T2,$inout2,$inout3
	vpaddb		$T2,$inout3,$inout4
	vpaddb		$T2,$inout4,$inout5
	vpxor		$rndkey,$T1,$inout0
	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0
	jmp		.Loop6x

.align	32
.Loop6x:
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32		# discard $inout[1-5]?
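	# $counter holds the low word of the counter block exactly as loaded,
	# i.e. still big-endian, so the counter's least significant byte sits
	# in bits 24-31 of the register. Adding 6<<24 therefore advances the
	# 32-bit counter by 6, and a carry out of bit 31 means the low byte
	# wrapped, in which case the byte-wise vpaddb increments below would
	# be wrong and we take the .Lhandle_ctr32 slow path instead.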
	vmovdqu		0x00-0x20($Htable),$Hkey	# $Hkey^1
	  vpaddb	$T2,$inout5,$T1		# next counter value
	  vpxor		$rndkey,$inout1,$inout1
	  vpxor		$rndkey,$inout2,$inout2

.Lresume_ctr32:
	vmovdqu		$T1,($ivp)		# save next counter value
	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
	  vpxor		$rndkey,$inout3,$inout3
	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2

	# At this point, the current block of 96 (0x60) bytes has already been
	# loaded into registers. Concurrently with processing it, we want to
	# load the next 96 bytes of input for the next round. Obviously, we can
	# only do this if there are at least 96 more bytes of input beyond the
	# input we're currently processing, or else we'd read past the end of
	# the input buffer. Here, we set |%r12| to 96 if there are at least 96
	# bytes of input beyond the 96 bytes we're already processing, and we
	# set |%r12| to 0 otherwise. In the case where we set |%r12| to 96,
	# we'll read in the next block so that it is in registers for the next
	# loop iteration. In the case where we set |%r12| to 0, we'll re-read
	# the current block and then ignore what we re-read.
	#
	# At this point, |$in0| points to the current (already read into
	# registers) block, and |$end0| points to 2*96 bytes before the end of
	# the input. Thus, |$in0| > |$end0| means that we do not have the next
	# 96-byte block to read in, and |$in0| <= |$end0| means we do.
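	#
	# The comparison is done branchlessly with the setnc/neg/and triplet
	# interleaved below:
	#   cmp  $in0,$end0   # CF=1 iff $end0 < $in0 (unsigned)
	#   setnc %r12b       # %r12 = ($in0 <= $end0) ? 1 : 0
	#   neg  %r12         # %r12 = 0 or -1 (all ones)
	#   and  \$0x60,%r12  # %r12 = 0 or 96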
	xor		%r12,%r12
	cmp		$in0,$end0

	  vaesenc	$T2,$inout0,$inout0
	vmovdqu		0x30+8(%rsp),$Ii	# I[4]
	  vpxor		$rndkey,$inout4,$inout4
	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
	  vaesenc	$T2,$inout1,$inout1
	  vpxor		$rndkey,$inout5,$inout5
	setnc		%r12b
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	  vaesenc	$T2,$inout2,$inout2
	vmovdqu		0x10-0x20($Htable),$Hkey	# $Hkey^2
	neg		%r12
	  vaesenc	$T2,$inout3,$inout3
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
	  vaesenc	$T2,$inout4,$inout4
	 vpxor		$Z1,$T1,$Z0
	and		\$0x60,%r12
	  vmovups	0x20-0x80($key),$rndkey
	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
	  vaesenc	$T2,$inout5,$inout5

	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
	lea		($in0,%r12),$in0
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
	 vmovdqu	0x40+8(%rsp),$Ii	# I[3]
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x58($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x50($in0),%r12
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x20+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x28+8(%rsp)
	vmovdqu		0x30-0x20($Htable),$Z1	# borrow $Z1 for $Hkey^3
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x30-0x80($key),$rndkey
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
	  vaesenc	$rndkey,$inout1,$inout1
	 vpxor		$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout2,$inout2
	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
	 vmovdqu	0x50+8(%rsp),$Ii	# I[2]
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	 vpxor		$T1,$Z0,$Z0
	vmovdqu		0x40-0x20($Htable),$T1	# borrow $T1 for $Hkey^4
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x40-0x80($key),$rndkey
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x00,$T1,$Ii,$T2
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x48($in0),%r13
	 vpxor		$Z1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x40($in0),%r12
	vpclmulqdq	\$0x11,$T1,$Ii,$T1
	 vmovdqu	0x60+8(%rsp),$Ii	# I[1]
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x30+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x38+8(%rsp)
	 vpxor		$T2,$Z0,$Z0
	vmovdqu		0x60-0x20($Htable),$T2	# borrow $T2 for $Hkey^5
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x50-0x80($key),$rndkey
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x38($in0),%r13
	 vpxor		$T1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T2,$Ii,$T1
	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x30($in0),%r12
	vpclmulqdq	\$0x11,$T2,$Ii,$T2
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x40+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x48+8(%rsp)
	 vpxor		$Hkey,$Z0,$Z0
	 vmovdqu	0x70-0x20($Htable),$Hkey	# $Hkey^6
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x60-0x80($key),$rndkey
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x28($in0),%r13
	 vpxor		$T2,$Z3,$Z3
	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x20($in0),%r12
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x50+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x58+8(%rsp)
	vpxor		$Z1,$Z2,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	vpxor		$T1,$Z2,$Z2

	  vmovups	0x70-0x80($key),$rndkey
	vpslldq		\$8,$Z2,$Z1
	vpxor		$T2,$Z0,$Z0
	vmovdqu		0x10($const),$Hkey	# .Lpoly

	  vaesenc	$rndkey,$inout0,$inout0
	vpxor		$Xi,$Z3,$Z3
	  vaesenc	$rndkey,$inout1,$inout1
	vpxor		$Z1,$Z0,$Z0
	movbe		0x18($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x10($in0),%r12
	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	mov		%r13,0x60+8(%rsp)
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r12,0x68+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vmovups	0x90-0x80($key),$rndkey
	  vaesenc	$T1,$inout1,$inout1
	vpsrldq		\$8,$Z2,$Z2
	  vaesenc	$T1,$inout2,$inout2
	vpxor		$Z2,$Z3,$Z3
	  vaesenc	$T1,$inout3,$inout3
	vpxor		$Ii,$Z0,$Z0
	movbe		0x08($in0),%r13
	  vaesenc	$T1,$inout4,$inout4
	movbe		0x00($in0),%r12
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xa0-0x80($key),$T1
	  cmp		\$11,$rounds
	  jb		.Lenc_tail		# 128-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xb0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xc0-0x80($key),$T1
	  je		.Lenc_tail		# 192-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xd0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xe0-0x80($key),$T1
	  jmp		.Lenc_tail		# 256-bit key

.align	32
.Lhandle_ctr32:
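	# The byte-wise vpaddb increments in .Loop6x cannot carry between
	# bytes, so when the counter's low byte is about to wrap we land here:
	# byte-swap the counter to little-endian, perform full 32-bit adds
	# with .Lone_lsb/.Ltwo_lsb, and byte-swap each block counter back.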
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter
	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
	  vpaddd	$Z1,$Z2,$inout2
	vmovdqu		0x00-0x20($Htable),$Hkey	# $Hkey^1
	  vpaddd	$Z1,$inout1,$inout3
	  vpshufb	$Ii,$inout1,$inout1
	  vpaddd	$Z1,$inout2,$inout4
	  vpshufb	$Ii,$inout2,$inout2
	  vpxor		$rndkey,$inout1,$inout1
	  vpaddd	$Z1,$inout3,$inout5
	  vpshufb	$Ii,$inout3,$inout3
	  vpxor		$rndkey,$inout2,$inout2
	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
	  vpshufb	$Ii,$inout4,$inout4
	  vpshufb	$Ii,$inout5,$inout5
	  vpshufb	$Ii,$T1,$T1		# next counter value
	jmp		.Lresume_ctr32

.align	32
.Lenc_tail:
	  vaesenc	$rndkey,$inout0,$inout0
	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase
	  vaesenc	$rndkey,$inout1,$inout1
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	  vpxor		0x00($inp),$T1,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	  vpxor		0x10($inp),$T1,$Ii
	  vaesenc	$rndkey,$inout3,$inout3
	  vpxor		0x20($inp),$T1,$Z1
	  vaesenc	$rndkey,$inout4,$inout4
	  vpxor		0x30($inp),$T1,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	  vpxor		0x40($inp),$T1,$Z3
	  vpxor		0x50($inp),$T1,$Hkey
	  vmovdqu	($ivp),$T1		# load next counter value

	  vaesenclast	$T2,$inout0,$inout0
	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
	  vaesenclast	$Ii,$inout1,$inout1
	 vpaddb		$T2,$T1,$Ii
	mov		%r13,0x70+8(%rsp)
	lea		0x60($inp),$inp
	# These two prefetches were added in BoringSSL. See the change that
	# added them.
	 prefetcht0	512($inp)		# We use 96-byte blocks, so prefetch 2 cache lines (128 bytes).
	 prefetcht0	576($inp)
	  vaesenclast	$Z1,$inout2,$inout2
	 vpaddb		$T2,$Ii,$Z1
	mov		%r12,0x78+8(%rsp)
	lea		0x60($out),$out
	  vmovdqu	0x00-0x80($key),$rndkey
	  vaesenclast	$Z2,$inout3,$inout3
	 vpaddb		$T2,$Z1,$Z2
	  vaesenclast	$Z3,$inout4,$inout4
	 vpaddb		$T2,$Z2,$Z3
	  vaesenclast	$Hkey,$inout5,$inout5
	 vpaddb		$T2,$Z3,$Hkey

	add		\$0x60,%rax
	sub		\$0x6,$len
	jc		.L6x_done

	  vmovups	$inout0,-0x60($out)	# save output
	 vpxor		$rndkey,$T1,$inout0
	  vmovups	$inout1,-0x50($out)
	 vmovdqa	$Ii,$inout1		# 0 latency
	  vmovups	$inout2,-0x40($out)
	 vmovdqa	$Z1,$inout2		# 0 latency
	  vmovups	$inout3,-0x30($out)
	 vmovdqa	$Z2,$inout3		# 0 latency
	  vmovups	$inout4,-0x20($out)
	 vmovdqa	$Z3,$inout4		# 0 latency
	  vmovups	$inout5,-0x10($out)
	 vmovdqa	$Hkey,$inout5		# 0 latency
	vmovdqu		0x20+8(%rsp),$Z3	# I[5]
	jmp		.Loop6x

.L6x_done:
	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled
	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled

	ret
.cfi_endproc
.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
######################################################################
#
# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
#		const AES_KEY *key, unsigned char iv[16], const u128 *Htbl[9],
#		u128 *Xip);
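#
# The return value (in %rax) is the number of bytes processed;
# |_aesni_ctr32_ghash_6x| adds 0x60 per 96-byte iteration, and the encrypt
# path seeds %rax with 2*0x60 for its two |_aesni_ctr32_6x| calls. Any tail
# shorter than 96 bytes is left to the caller.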
$code.=<<___;
.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@abi-omnipotent
.align	32
aesni_gcm_decrypt:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
	xor	%rax,%rax

	# We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
	# bytes of input.
	cmp	\$0x60,$len			# minimal accepted length
	jb	.Lgcm_dec_abort

	push	%rbp
.cfi_push	%rbp
.seh_pushreg	%rbp
	mov	%rsp, %rbp			# save stack pointer
.cfi_def_cfa_register	%rbp
	push	%rbx
.cfi_push	%rbx
.seh_pushreg	%rbx
	push	%r12
.cfi_push	%r12
.seh_pushreg	%r12
	push	%r13
.cfi_push	%r13
.seh_pushreg	%r13
	push	%r14
.cfi_push	%r14
.seh_pushreg	%r14
	push	%r15
.cfi_push	%r15
.seh_pushreg	%r15
___
if ($win64) {
$code.=<<___
	lea	-0xa8(%rsp),%rsp		# 8 extra bytes to align the stack
.seh_stackalloc	0xa8
.seh_setframe	%rbp, 0xa8+5*8
	# Load the last two parameters. These go into %rdi and %rsi, which are
	# non-volatile on Windows, so stash them in the parameter stack area
	# first.
	mov	%rdi, 0x10(%rbp)
.seh_savereg	%rdi, 0xa8+5*8+0x10
	mov	%rsi, 0x18(%rbp)
.seh_savereg	%rsi, 0xa8+5*8+0x18
	mov	0x30(%rbp), $ivp
	mov	0x38(%rbp), $Htable
	# Save non-volatile XMM registers.
	movaps	%xmm6,-0xd0(%rbp)
.seh_savexmm	%xmm6, 0xa8+5*8-0xd0
	movaps	%xmm7,-0xc0(%rbp)
.seh_savexmm	%xmm7, 0xa8+5*8-0xc0
	movaps	%xmm8,-0xb0(%rbp)
.seh_savexmm	%xmm8, 0xa8+5*8-0xb0
	movaps	%xmm9,-0xa0(%rbp)
.seh_savexmm	%xmm9, 0xa8+5*8-0xa0
	movaps	%xmm10,-0x90(%rbp)
.seh_savexmm	%xmm10, 0xa8+5*8-0x90
	movaps	%xmm11,-0x80(%rbp)
.seh_savexmm	%xmm11, 0xa8+5*8-0x80
	movaps	%xmm12,-0x70(%rbp)
.seh_savexmm	%xmm12, 0xa8+5*8-0x70
	movaps	%xmm13,-0x60(%rbp)
.seh_savexmm	%xmm13, 0xa8+5*8-0x60
	movaps	%xmm14,-0x50(%rbp)
.seh_savexmm	%xmm14, 0xa8+5*8-0x50
	movaps	%xmm15,-0x40(%rbp)
.seh_savexmm	%xmm15, 0xa8+5*8-0x40
.seh_endprologue
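	# After the six pushes above, %rsp sat at %rbp-0x28, so the 0xa8
	# allocation drops it to %rbp-0xd0, the bottom of the XMM save area,
	# and restores 16-byte stack alignment.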
___
}
$code.=<<___;
	vzeroupper

	mov		$Xip_offset(%rbp), %r12
	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	vmovdqu		(%r12),$Xi		# load Xi
	and		\$-128,%rsp		# ensure stack alignment
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	lea		0x80($key),$key		# size optimization
	lea		0x20($Htable),$Htable	# size optimization
	mov		0xf0-0x80($key),$rounds
	vpshufb		$Ii,$Xi,$Xi

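	# The six instructions below guard against 4K aliasing between the
	# expanded key and the stack scratch area: the 0xf80 mask extracts the
	# in-page offset (at 128-byte granularity) of the key and of %rsp, and
	# when %rsp sits less than 768 bytes above the key within a page, %rsp
	# is lowered so that stores to the scratch area cannot alias loads
	# from the key schedule.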
	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Ldec_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Ldec_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Ldec_no_key_aliasing:

	vmovdqu		0x50($inp),$Z3		# I[5]
	mov		$inp,$in0
	vmovdqu		0x40($inp),$Z0

	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
	# bytes before the end of the input. Note, in particular, that this is
	# correct even if |$len| is not an even multiple of 96 or 16. XXX: This
	# seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
	# not be near the very beginning of the address space when |$len| < 2*96
	# (0xc0).
	lea		-0xc0($inp,$len),$end0

	vmovdqu		0x30($inp),$Z1
	shr		\$4,$len
	xor		%rax,%rax
	vmovdqu		0x20($inp),$Z2
	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		0x10($inp),$T2
	 vpshufb	$Ii,$Z0,$Z0
	vmovdqu		($inp),$Hkey
	 vpshufb	$Ii,$Z1,$Z1
	vmovdqu		$Z0,0x30(%rsp)
	 vpshufb	$Ii,$Z2,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	 vpshufb	$Ii,$T2,$T2
	vmovdqu		$Z2,0x50(%rsp)
	 vpshufb	$Ii,$Hkey,$Hkey
	vmovdqu		$T2,0x60(%rsp)
	vmovdqu		$Hkey,0x70(%rsp)

	call		_aesni_ctr32_ghash_6x

	mov		$Xip_offset(%rbp), %r12
	vmovups		$inout0,-0x60($out)	# save output
	vmovups		$inout1,-0x50($out)
	vmovups		$inout2,-0x40($out)
	vmovups		$inout3,-0x30($out)
	vmovups		$inout4,-0x20($out)
	vmovups		$inout5,-0x10($out)

	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,(%r12)		# output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd0(%rbp),%xmm6
	movaps	-0xc0(%rbp),%xmm7
	movaps	-0xb0(%rbp),%xmm8
	movaps	-0xa0(%rbp),%xmm9
	movaps	-0x90(%rbp),%xmm10
	movaps	-0x80(%rbp),%xmm11
	movaps	-0x70(%rbp),%xmm12
	movaps	-0x60(%rbp),%xmm13
	movaps	-0x50(%rbp),%xmm14
	movaps	-0x40(%rbp),%xmm15
	mov	0x10(%rbp),%rdi
	mov	0x18(%rbp),%rsi
___
$code.=<<___;
	lea	-0x28(%rbp), %rsp	# restore %rsp to fixed allocation
.cfi_def_cfa	%rsp, 0x38
	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbx
.cfi_pop	%rbx
	pop	%rbp
.cfi_pop	%rbp
.Lgcm_dec_abort:
	ret
.seh_endproc
.cfi_endproc
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___

$code.=<<___;
.type	_aesni_ctr32_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_6x:
.cfi_startproc
	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	lea		-1($rounds),%r13
	vmovups		0x10-0x80($key),$rndkey
	lea		0x20-0x80($key),%r12
	vpxor		$Z0,$T1,$inout0
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32_2
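	# Same counter-wrap check as in |_aesni_ctr32_ghash_6x|: a carry out
	# of the counter's low byte means the vpaddb increments below would be
	# wrong, so .Lhandle_ctr32_2 redoes them with 32-bit adds.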
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddb		$T2,$inout2,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddb		$T2,$inout3,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpaddb		$T2,$inout4,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpaddb		$T2,$inout5,$T1
	vpxor		$Z0,$inout5,$inout5
	jmp		.Loop_ctr32

.align	16
.Loop_ctr32:
	vaesenc		$rndkey,$inout0,$inout0
	vaesenc		$rndkey,$inout1,$inout1
	vaesenc		$rndkey,$inout2,$inout2
	vaesenc		$rndkey,$inout3,$inout3
	vaesenc		$rndkey,$inout4,$inout4
	vaesenc		$rndkey,$inout5,$inout5
	vmovups		(%r12),$rndkey
	lea		0x10(%r12),%r12
	dec		%r13d
	jnz		.Loop_ctr32

	vmovdqu		(%r12),$Hkey		# last round key
	vaesenc		$rndkey,$inout0,$inout0
	vpxor		0x00($inp),$Hkey,$Z0
	vaesenc		$rndkey,$inout1,$inout1
	vpxor		0x10($inp),$Hkey,$Z1
	vaesenc		$rndkey,$inout2,$inout2
	vpxor		0x20($inp),$Hkey,$Z2
	vaesenc		$rndkey,$inout3,$inout3
	vpxor		0x30($inp),$Hkey,$Xi
	vaesenc		$rndkey,$inout4,$inout4
	vpxor		0x40($inp),$Hkey,$T2
	vaesenc		$rndkey,$inout5,$inout5
	vpxor		0x50($inp),$Hkey,$Hkey
	lea		0x60($inp),$inp

	vaesenclast	$Z0,$inout0,$inout0
	vaesenclast	$Z1,$inout1,$inout1
	vaesenclast	$Z2,$inout2,$inout2
	vaesenclast	$Xi,$inout3,$inout3
	vaesenclast	$T2,$inout4,$inout4
	vaesenclast	$Hkey,$inout5,$inout5
	vmovups		$inout0,0x00($out)
	vmovups		$inout1,0x10($out)
	vmovups		$inout2,0x20($out)
	vmovups		$inout3,0x30($out)
	vmovups		$inout4,0x40($out)
	vmovups		$inout5,0x50($out)
	lea		0x60($out),$out

	ret
.align	32
.Lhandle_ctr32_2:
	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
	vpaddd		$Z1,$Z2,$inout2
	vpaddd		$Z1,$inout1,$inout3
	vpshufb		$Ii,$inout1,$inout1
	vpaddd		$Z1,$inout2,$inout4
	vpshufb		$Ii,$inout2,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddd		$Z1,$inout3,$inout5
	vpshufb		$Ii,$inout3,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
	vpshufb		$Ii,$inout4,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpshufb		$Ii,$inout5,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpshufb		$Ii,$T1,$T1		# next counter value
	vpxor		$Z0,$inout5,$inout5
	jmp	.Loop_ctr32
.cfi_endproc
.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
.align	32
aesni_gcm_encrypt:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
#ifdef BORINGSSL_DISPATCH_TEST
.extern	BORINGSSL_function_hit
	movb \$1,BORINGSSL_function_hit+2(%rip)
#endif
	xor	%rax,%rax

	# We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
	# input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
	# least 96 more bytes of input.
	cmp	\$0x60*3,$len			# minimal accepted length
	jb	.Lgcm_enc_abort

	push	%rbp
.cfi_push	%rbp
.seh_pushreg	%rbp
	mov	%rsp, %rbp			# save stack pointer
.cfi_def_cfa_register	%rbp
	push	%rbx
.cfi_push	%rbx
.seh_pushreg	%rbx
	push	%r12
.cfi_push	%r12
.seh_pushreg	%r12
	push	%r13
.cfi_push	%r13
.seh_pushreg	%r13
	push	%r14
.cfi_push	%r14
.seh_pushreg	%r14
	push	%r15
.cfi_push	%r15
.seh_pushreg	%r15
___
if ($win64) {
$code.=<<___
	lea	-0xa8(%rsp),%rsp		# 8 extra bytes to align the stack
.seh_stackalloc	0xa8
.seh_setframe	%rbp, 0xa8+5*8
	# Load the last two parameters. These go into %rdi and %rsi, which are
	# non-volatile on Windows, so stash them in the parameter stack area
	# first.
	mov	%rdi, 0x10(%rbp)
.seh_savereg	%rdi, 0xa8+5*8+0x10
	mov	%rsi, 0x18(%rbp)
.seh_savereg	%rsi, 0xa8+5*8+0x18
	mov	0x30(%rbp), $ivp
	mov	0x38(%rbp), $Htable
	# Save non-volatile XMM registers.
	movaps	%xmm6,-0xd0(%rbp)
.seh_savexmm	%xmm6, 0xa8+5*8-0xd0
	movaps	%xmm7,-0xc0(%rbp)
.seh_savexmm	%xmm7, 0xa8+5*8-0xc0
	movaps	%xmm8,-0xb0(%rbp)
.seh_savexmm	%xmm8, 0xa8+5*8-0xb0
	movaps	%xmm9,-0xa0(%rbp)
.seh_savexmm	%xmm9, 0xa8+5*8-0xa0
	movaps	%xmm10,-0x90(%rbp)
.seh_savexmm	%xmm10, 0xa8+5*8-0x90
	movaps	%xmm11,-0x80(%rbp)
.seh_savexmm	%xmm11, 0xa8+5*8-0x80
	movaps	%xmm12,-0x70(%rbp)
.seh_savexmm	%xmm12, 0xa8+5*8-0x70
	movaps	%xmm13,-0x60(%rbp)
.seh_savexmm	%xmm13, 0xa8+5*8-0x60
	movaps	%xmm14,-0x50(%rbp)
.seh_savexmm	%xmm14, 0xa8+5*8-0x50
	movaps	%xmm15,-0x40(%rbp)
.seh_savexmm	%xmm15, 0xa8+5*8-0x40
.seh_endprologue
___
}
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	lea		0x80($key),$key		# size optimization
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	and		\$-128,%rsp		# ensure stack alignment
	mov		0xf0-0x80($key),$rounds

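	# Same 4K-aliasing avoidance as in aesni_gcm_decrypt above.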
	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Lenc_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Lenc_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Lenc_no_key_aliasing:

	mov		$out,$in0

	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
	# bytes before the end of the input. Note, in particular, that this is
	# correct even if |$len| is not an even multiple of 96 or 16. Unlike in
	# the decryption case, there's no caveat that |$out| must not be near
	# the very beginning of the address space, because we know that
	# |$len| >= 3*96 from the check above, and so we know
	# |$out| + |$len| >= 2*96 (0xc0).
	lea		-0xc0($out,$len),$end0

	shr		\$4,$len

	call		_aesni_ctr32_6x
	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack
	vpshufb		$Ii,$inout1,$T2
	vmovdqu		$Xi,0x70(%rsp)
	vpshufb		$Ii,$inout2,$Z0
	vmovdqu		$T2,0x60(%rsp)
	vpshufb		$Ii,$inout3,$Z1
	vmovdqu		$Z0,0x50(%rsp)
	vpshufb		$Ii,$inout4,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		$Z2,0x30(%rsp)

	call		_aesni_ctr32_6x

	mov		$Xip_offset(%rbp), %r12
	lea		0x20($Htable),$Htable	# size optimization
	vmovdqu		(%r12),$Xi		# load Xi
	sub		\$12,$len
	mov		\$0x60*2,%rax
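	# $len was converted to 16-byte blocks by the shr above; the two
	# |_aesni_ctr32_6x| calls consumed 12 blocks (2*0x60 bytes), which is
	# also the initial byte count seeded into %rax.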
	vpshufb		$Ii,$Xi,$Xi

	call		_aesni_ctr32_ghash_6x
	vmovdqu		0x20(%rsp),$Z3		# I[5]
	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
	vmovdqu		0x00-0x20($Htable),$Hkey	# $Hkey^1
	vpunpckhqdq	$Z3,$Z3,$T1
	vmovdqu		0x20-0x20($Htable),$rndkey	# borrow $rndkey for $HK
	 vmovups	$inout0,-0x60($out)	# save output
	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
	vpxor		$Z3,$T1,$T1
	 vmovups	$inout1,-0x50($out)
	 vpshufb	$Ii,$inout1,$inout1
	 vmovups	$inout2,-0x40($out)
	 vpshufb	$Ii,$inout2,$inout2
	 vmovups	$inout3,-0x30($out)
	 vpshufb	$Ii,$inout3,$inout3
	 vmovups	$inout4,-0x20($out)
	 vpshufb	$Ii,$inout4,$inout4
	 vmovups	$inout5,-0x10($out)
	 vpshufb	$Ii,$inout5,$inout5
	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0
___
{ my ($HK,$T3)=($rndkey,$inout0);

$code.=<<___;
	 vmovdqu	0x30(%rsp),$Z2		# I[4]
	 vmovdqu	0x10-0x20($Htable),$Ii	# borrow $Ii for $Hkey^2
	 vpunpckhqdq	$Z2,$Z2,$T2
	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
	 vpxor		$Z2,$T2,$T2
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x00,$HK,$T1,$T1

	 vmovdqu	0x40(%rsp),$T3		# I[3]
	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
	 vmovdqu	0x30-0x20($Htable),$Hkey	# $Hkey^3
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$T3,$T3,$Z1
	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
	 vpxor		$T3,$Z1,$Z1
	vpxor		$Z3,$Z2,$Z2
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Htable),$HK
	vpxor		$T1,$T2,$T2

	 vmovdqu	0x50(%rsp),$T1		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
	 vmovdqu	0x40-0x20($Htable),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z0,$Z3,$Z3
	 vpunpckhqdq	$T1,$T1,$Z0
	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
	 vpxor		$T1,$Z0,$Z0
	vpxor		$Z2,$T3,$T3
	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
	vpxor		$T2,$Z1,$Z1

	 vmovdqu	0x60(%rsp),$T2		# I[1]
	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
	 vmovdqu	0x60-0x20($Htable),$Hkey	# $Hkey^5
	vpxor		$Z3,$Z2,$Z2
	 vpunpckhqdq	$T2,$T2,$Z3
	vpclmulqdq	\$0x11,$Ii,$T1,$T1
	 vpxor		$T2,$Z3,$Z3
	vpxor		$T3,$T1,$T1
	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
	 vmovdqu	0x80-0x20($Htable),$HK
	vpxor		$Z1,$Z0,$Z0

	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0]
	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
	 vmovdqu	0x70-0x20($Htable),$Ii	# borrow $Ii for $Hkey^6
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpxor		$Z2,$Z1,$Z1
	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
	 vpxor		$Xi,$T3,$T3
	vpxor		$T1,$T2,$T2
	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
	vpxor		$Z0,$Z3,$Z0

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
	 vmovdqu	0x00-0x20($Htable),$Hkey	# $Hkey^1
	 vpunpckhqdq	$inout5,$inout5,$T1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
	 vpxor		$inout5,$T1,$T1
	vpxor		$Z1,$Z2,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$T3
	 vmovdqu	0x20-0x20($Htable),$HK
	vpxor		$T2,$Xi,$Z3
	vpxor		$Z0,$T3,$Z2

	 vmovdqu	0x10-0x20($Htable),$Ii	# borrow $Ii for $Hkey^2
	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
	  vpxor		$T3,$Z2,$Z2
	 vpunpckhqdq	$inout4,$inout4,$T2
	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
	 vpxor		$inout4,$T2,$T2
	  vpslldq	\$8,$Z2,$T3
	vpclmulqdq	\$0x00,$HK,$T1,$T1
	  vpxor		$T3,$Z1,$Xi
	  vpsrldq	\$8,$Z2,$Z2
	  vpxor		$Z2,$Z3,$Z3

	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
	 vmovdqu	0x30-0x20($Htable),$Hkey	# $Hkey^3
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout3,$inout3,$T3
	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
	 vpxor		$inout3,$T3,$T3
	vpxor		$inout5,$inout4,$inout4
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Htable),$HK
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
	 vmovdqu	0x40-0x20($Htable),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$inout2,$inout2,$T1
	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
	 vpxor		$inout2,$T1,$T1
	vpxor		$inout4,$inout3,$inout3
	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
	vpclmulqdq	\$0x00,$HK,$T3,$T3
	vpxor		$T2,$T3,$T3

	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
	 vmovdqu	0x60-0x20($Htable),$Hkey	# $Hkey^5
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout1,$inout1,$T2
	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
	 vpxor		$inout1,$T2,$T2
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
	vpxor		$inout3,$inout2,$inout2
	vpclmulqdq	\$0x10,$HK,$T1,$T1
	 vmovdqu	0x80-0x20($Htable),$HK
	vpxor		$T3,$T1,$T1

	  vxorps	$Z3,$inout5,$inout5
	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
	 vmovdqu	0x70-0x20($Htable),$Ii	# borrow $Ii for $Hkey^6
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
	 vpxor		$Xi,$T3,$T3
	vpxor		$inout2,$inout1,$inout1
	vpclmulqdq	\$0x00,$HK,$T2,$T2
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
	vpxor		$Z0,$Z1,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$Z2
	vpxor		$inout1,$Z3,$Z3
	vpxor		$T2,$Z2,$Z2

	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
	vpxor		$Z0,$Z2,$Z2
	vpslldq		\$8,$Z2,$T1
	vmovdqu		0x10($const),$Hkey	# .Lpoly
	vpsrldq		\$8,$Z2,$Z2
	vpxor		$T1,$Z1,$Xi
	vpxor		$Z2,$Z3,$Z3

	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$Z3,$T2,$T2
	vpxor		$T2,$Xi,$Xi
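	# The two vpalignr/vpclmulqdq pairs above fold the 256-bit GHASH
	# product down to 128 bits: .Lpoly keeps the reduction constant (0xc2
	# in its most significant byte, derived from the bit-reflected GCM
	# polynomial x^128+x^7+x^2+x+1), and each phase folds 64 bits of the
	# high half into the low half.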
___
}
$code.=<<___;
	mov		$Xip_offset(%rbp), %r12
	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,(%r12)		# output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd0(%rbp),%xmm6
	movaps	-0xc0(%rbp),%xmm7
	movaps	-0xb0(%rbp),%xmm8
	movaps	-0xa0(%rbp),%xmm9
	movaps	-0x90(%rbp),%xmm10
	movaps	-0x80(%rbp),%xmm11
	movaps	-0x70(%rbp),%xmm12
	movaps	-0x60(%rbp),%xmm13
	movaps	-0x50(%rbp),%xmm14
	movaps	-0x40(%rbp),%xmm15
	mov	0x10(%rbp),%rdi
	mov	0x18(%rbp),%rsi
___
$code.=<<___;
	lea	-0x28(%rbp), %rsp	# restore %rsp to fixed allocation
.cfi_def_cfa	%rsp, 0x38
	pop	%r15
.cfi_pop	%r15
	pop	%r14
.cfi_pop	%r14
	pop	%r13
.cfi_pop	%r13
	pop	%r12
.cfi_pop	%r12
	pop	%rbx
.cfi_pop	%rbx
	pop	%rbp
.cfi_pop	%rbp
.Lgcm_enc_abort:
	ret
.seh_endproc
.cfi_endproc
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
___

$code.=<<___;
.section .rodata
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
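# .Lbswap_mask reverses the byte order of a 128-bit lane. .Lpoly holds the
# GHASH reduction constant (see the reduction phases above). .Lone_msb
# increments the big-endian counter's low byte via vpaddb, while .Lone_lsb
# and .Ltwo_lsb are the little-endian increments applied to byte-swapped
# counters in the counter-wrap paths.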
.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.text
___
}}} else {{{
$code=<<___;	# assembler is too old
.text

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
	_CET_ENDBR
	xor	%eax,%eax
	ret
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
	_CET_ENDBR
	xor	%eax,%eax
	ret
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
}}}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";