#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that
# it uses a 256-byte per-key table [+128 bytes shared table]. The GHASH
# function features a so-called "528B" variant utilizing an additional
# 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because C results are
#	for vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same as
#	for Opteron;

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter, ghash-x86.pl discusses why it makes less sense to
# increase the aggregate factor. Then why increase it here? The critical
# path consists of 3 independent pclmulqdq instructions, Karatsuba post-
# processing and reduction. "On top" of this we lay down aggregated
# multiplication operations, triplets of independent pclmulqdq's. As
# the issue rate for pclmulqdq is limited, it makes little sense to
# aggregate more multiplications than it takes to perform the remaining
# non-multiplication operations. 2x is a near-optimal coefficient for
# contemporary Intel CPUs (hence the modest improvement), but not for
# Bulldozer. The latter is because logical SIMD operations there are
# twice as slow as on Intel, so the critical path is longer. A CPU with
# a higher pclmulqdq issue rate would also benefit from a higher
# aggregate factor...
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%) (if system doesn't support AVX)
# Broadwell	0.45(+110%)(if system doesn't support AVX)
# Skylake	0.44(+110%)(if system doesn't support AVX)
# Bulldozer	1.49(+27%)
# Silvermont	2.88(+13%)
# Knights L	2.12(-)    (if system doesn't support AVX)
# Goldmont	1.08(+24%)

# March 2013
#
# ... the 8x aggregate factor AVX code path uses the reduction algorithm
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# suboptimally in comparison to the above-mentioned version. But thanks
# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
# it performs at 0.41 cycles per byte on a Haswell processor, at
# 0.29 on Broadwell, and at 0.36 on Skylake.
#
# Knights Landing achieves 1.09 cpb.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.

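# The assembler below computes GHASH, i.e. repeated multiplication by the
# hash key H in GF(2^128) with GCM's bit ordering.  The helper that follows
# is a deliberately slow, bit-at-a-time reference for that multiplication
# (Algorithm 1 of NIST SP 800-38D), handy when cross-checking test vectors
# against the generated code.  It is a sketch only: it is never called
# during code generation, and the name gf128_mul_ref is local to this file.
sub gf128_mul_ref {
	require Math::BigInt;
	my ($x, $y) = @_;			# Math::BigInt, 128-bit blocks;
						# integer bit 127 = GCM bit 0
	my $R = Math::BigInt->from_hex("E1")->blsft(120);	# reduction constant
	my $z = Math::BigInt->bzero();
	my $v = $y->copy();
	for my $i (0 .. 127) {
		# if bit i of $x (counting from the most significant end) is
		# set, accumulate the current multiple of $y
		$z->bxor($v) if $x->copy()->brsft(127 - $i)->band(1)->is_one();
		# multiply $v by x, i.e. shift towards the LSB in this bit
		# ordering, folding the dropped bit back in via the polynomial
		if ($v->copy()->band(1)->is_one()) {
			$v->brsft(1)->bxor($R);
		} else {
			$v->brsft(1);
		}
	}
	return $z;
}
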
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be
# computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 1;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$do4xaggr=1;

$code=<<___;
.text
.extern	OPENSSL_ia32cap_P
___


######################################################################
# PCLMULQDQ version.

@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

sub clmul64x64_T2 {	# minimal register pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pshufd		\$0b01001110,$Hkey,$T2
	pxor		$Xi,$T1			#
	pxor		$Hkey,$T2
___
} else {
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1			#
___
}
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor		$Xi,$T1			#
	pxor		$Xhi,$T1		#

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
}

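# clmul64x64_T2 above relies on the carry-less Karatsuba identity: with
# Xi = x1*2^64 ^ x0 and Hkey = h1*2^64 ^ h0,
#
#	Xi*Hkey = x1h1*2^128 ^ ((x1^x0)*(h1^h0) ^ x1h1 ^ x0h0)*2^64 ^ x0h0
#
# so three pclmulqdq's (x1*h1, x0*h0 and (x1^x0)*(h1^h0)) replace four.
# The sketch below models this at the value level with a slow bit-by-bit
# stand-in for pclmulqdq; it is illustrative only, never called during
# code generation, and the *_ref names are local to this file.
sub clmul64_ref {				# carry-less 64x64->128 multiply
	require Math::BigInt;
	my ($a, $b) = @_;			# Math::BigInt, each below 2^64
	my $r = Math::BigInt->bzero();
	for my $i (0 .. 63) {
		$r->bxor($b->copy()->blsft($i))
			if $a->copy()->brsft($i)->band(1)->is_one();
	}
	return $r;
}

sub clmul128_karatsuba_ref {			# value-level model of clmul64x64_T2
	my ($x1, $x0, $h1, $h0) = @_;		# 64-bit halves as Math::BigInt
	my $hi  = clmul64_ref($x1, $h1);
	my $lo  = clmul64_ref($x0, $h0);
	my $mid = clmul64_ref($x1->copy()->bxor($x0), $h1->copy()->bxor($h0));
	$mid->bxor($hi)->bxor($lo);		# Karatsuba post-processing
	return $hi->blsft(128)->bxor($mid->blsft(64))->bxor($lo);
}
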
sub reduction_alg9 {	# 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa		$Xi,$T2			#
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#

	# 2nd phase
	movdqa		$Xi,$T2
	psrlq		\$1,$Xi
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
___
}

{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.cfi_startproc
.L_init_clmul:
___
$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	movdqu		($Xip),$Hkey
	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa		$Hkey,$T1
	psllq		\$1,$Hkey
	pxor		$T3,$T3			#
	psrlq		\$63,$T1
	pcmpgtd		$T2,$T3			# broadcast carry bit
	pslldq		\$8,$T1
	por		$T1,$Hkey		# H<<=1

	# magic reduction
	pand		.L0x1c2_polynomial(%rip),$T3
	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd		\$0b01001110,$Hkey,$HK
	movdqa		$Hkey,$Xi
	pxor		$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$Hkey,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$Hkey,$T1		# Karatsuba pre-processing
	movdqu		$Hkey,0x00($Htbl)	# save H
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x10($Htbl)		# save H^2
	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa		$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$T3,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$T3,$T1			# Karatsuba pre-processing
	movdqu		$T3,0x30($Htbl)		# save H^3
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x40($Htbl)		# save H^4
	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_clmul:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_init_clmul,.-gcm_init_clmul
___
}

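# For reference, the Htable layout written by gcm_init_clmul above and
# consumed by gcm_gmult_clmul/gcm_ghash_clmul below (per the stores above):
#
#	0x00	H			0x30	H^3
#	0x10	H^2			0x40	H^4
#	0x20	Karatsuba "salt" for H and H^2 (low qword H.lo^H.hi,
#		high qword H^2.lo^H^2.hi), 0x50 likewise for H^3 and H^4
#
# The 0x30..0x50 entries exist only when $do4xaggr is set.
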
{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.cfi_startproc
.L_gmult_clmul:
	movdqu		($Xip),$Xi
	movdqa		.Lbswap_mask(%rip),$T3
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$T2
	pshufb		$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
	# experimental alternative. the special thing about it is that there
	# is no dependency between the two multiplications...
	mov		\$`0xE1<<1`,%eax
	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov		\$0x07,%r11d
	movq		%rax,$T1
	movq		%r10,$T2
	movq		%r11,$T3		# borrow $T3
	pand		$Xi,$T3
	pshufb		$T3,$T2			# ($Xi&7)·0xE0
	movq		%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
	pxor		$Xi,$T2
	pslldq		\$15,$T2
	paddd		$T2,$T2			# <<(64+56+1)
	pxor		$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq		\$1,$T1
	pxor		$T1,$Xhi
	pslldq		\$7,$Xi
	pxor		$Xhi,$Xi
___
$code.=<<___;
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
	ret
.cfi_endproc
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}

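# gcm_ghash_clmul hashes $len bytes at $inp into the state at ($Xip) using
# the table at ($Htbl).  With $do4xaggr, inputs of at least four blocks go
# through the 4x-aggregated .Lmod4_loop (unless the OPENSSL_ia32cap_P check
# below detects MOVBE without XSAVE, in which case the 4x path is skipped);
# otherwise blocks are consumed two at a time in .Lmod_loop, with
# .Leven_tail/.Lodd_tail handling the remainder.
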
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));

$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	32
gcm_ghash_clmul:
.cfi_startproc
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	movdqa		.Lbswap_mask(%rip),$T3

	movdqu		($Xip),$Xi
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$HK
	pshufb		$T3,$Xi

	sub		\$0x10,$len
	jz		.Lodd_tail

	movdqu		0x10($Htbl),$Hkey2
___
if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));

$code.=<<___;
	leaq		OPENSSL_ia32cap_P(%rip),%rax
	mov		4(%rax),%eax
	cmp		\$0x30,$len
	jb		.Lskip4x

	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
	je		.Lskip4x

	sub		\$0x30,$len
	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
	movdqu		0x30($Htbl),$Hkey3
	movdqu		0x40($Htbl),$Hkey4

	#######
	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
	#
	movdqu		0x30($inp),$Xln
	 movdqu		0x20($inp),$Xl
	pshufb		$T3,$Xln
	 pshufb		$T3,$Xl
	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	pclmulqdq	\$0x10,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	xorps		$Xm,$Xmn

	movdqu		0x10($inp),$Xl
	 movdqu		0($inp),$T1
	pshufb		$T3,$Xl
	 pshufb		$T3,$T1
	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	 pxor		$T1,$Xi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	 movdqa		$Xi,$Xhi
	 pshufd		\$0b01001110,$Xi,$T1
	 pxor		$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jc	.Ltail4x

	jmp	.Lmod4_loop
.align	32
.Lmod4_loop:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps		$Xm,$Xmn
	 movdqu		0x30($inp),$Xl
	 pshufb		$T3,$Xl
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps		$Xln,$Xi
	 movdqu		0x20($inp),$Xln
	 movdqa		$Xl,$Xh
	pclmulqdq	\$0x10,$HK,$T1
	 pshufd		\$0b01001110,$Xl,$Xm
	xorps		$Xhn,$Xhi
	 pxor		$Xl,$Xm
	 pshufb		$T3,$Xln
	movups		0x20($Htbl),$HK
	xorps		$Xmn,$T1
	 pclmulqdq	\$0x00,$Hkey,$Xl
	 pshufd		\$0b01001110,$Xln,$Xmn

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	 movdqa		$Xln,$Xhn
	pxor		$Xhi,$T1		#
	 pxor		$Xln,$Xmn
	movdqa		$T1,$T2			#
	 pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq		\$8,$T1
	psrldq		\$8,$T2			#
	pxor		$T1,$Xi
	movdqa		.L7_mask(%rip),$T1
	pxor		$T2,$Xhi		#
	movq		%rax,$T2

	pand		$Xi,$T1			# 1st phase
	pshufb		$T1,$T2			#
	pxor		$Xi,$T2			#
	 pclmulqdq	\$0x00,$HK,$Xm
	psllq		\$57,$T2		#
	movdqa		$T2,$T1			#
	pslldq		\$8,$T2
	 pclmulqdq	\$0x00,$Hkey2,$Xln
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#
	movdqu		0($inp),$T1

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	 pclmulqdq	\$0x11,$Hkey2,$Xhn
	 xorps		$Xl,$Xln
	 movdqu		0x10($inp),$Xl
	 pshufb		$T3,$Xl
	 pclmulqdq	\$0x10,$HK,$Xmn
	 xorps		$Xh,$Xhn
	 movups		0x50($Htbl),$HK
	pshufb		$T3,$T1
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi

	 movdqa		$Xl,$Xh
	 pxor		$Xm,$Xmn
	 pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T2,$Xi			#
	pxor		$T1,$Xhi
	 pxor		$Xl,$Xm
	 pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
	movdqa		$Xi,$Xhi
	 pclmulqdq	\$0x11,$Hkey3,$Xh
	 xorps		$Xl,$Xln
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	 pclmulqdq	\$0x00,$HK,$Xm
	 xorps		$Xh,$Xhn

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jnc	.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	pclmulqdq	\$0x10,$HK,$T1
	xorps		$Xm,$Xmn
	xorps		$Xln,$Xi
	xorps		$Xhn,$Xhi
	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor		$Xmn,$T1

	pxor		$Xhi,$T1		#
	pxor		$Xi,$Xhi

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
	&reduction_alg9($Xhi,$Xi);
$code.=<<___;
	add	\$0x40,$len
	jz	.Ldone
	movdqu	0x20($Htbl),$HK
	sub	\$0x10,$len
	jz	.Lodd_tail
.Lskip4x:
___
}
$code.=<<___;
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
	movdqu		($inp),$T1		# Ii
	movdqu		16($inp),$Xln		# Ii+1
	pshufb		$T3,$T1
	pshufb		$T3,$Xln
	pxor		$T1,$Xi			# Ii+Xi

	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	lea		32($inp),$inp		# i+=2
	nop
	sub		\$0x20,$len
	jbe		.Leven_tail
	nop
	jmp		.Lmod_loop

.align	32
.Lmod_loop:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	  movdqu	($inp),$T2		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	  pshufb	$T3,$T2
	  movdqu	16($inp),$Xln		# Ii+1

	pxor		$Xhi,$T1
	  pxor		$T2,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$Xmn
	 pshufb		$T3,$Xln
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#

	movdqa		$Xln,$Xhn		#

	  movdqa	$Xi,$T2			# 1st phase
	  movdqa	$Xi,$T1
	  psllq		\$5,$Xi
	  pxor		$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	  psllq		\$1,$Xi
	  pxor		$T1,$Xi			#
	  psllq		\$57,$Xi		#
	  movdqa	$Xi,$T1			#
	  pslldq	\$8,$Xi
	  psrldq	\$8,$T1			#
	  pxor		$T2,$Xi
	pshufd		\$0b01001110,$Xhn,$Xmn
	  pxor		$T1,$Xhi		#
	pxor		$Xhn,$Xmn		#

	  movdqa	$Xi,$T2			# 2nd phase
	  psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	  pxor		$T2,$Xhi		#
	  pxor		$Xi,$T2
	  psrlq		\$5,$Xi
	  pxor		$T2,$Xi			#
	lea		32($inp),$inp
	  psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	  pxor		$Xhi,$Xi		#

	sub		\$0x20,$len
	ja		.Lmod_loop

.Leven_tail:
	 movdqa		$Xi,$Xhi
	 movdqa		$Xmn,$T1
	 pshufd		\$0b01001110,$Xi,$Xmn	#
	 pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	pxor		$Xi,$T1
	pxor		$Xhi,$T1
	pxor		$T1,$Xmn
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	test		$len,$len
	jnz		.Ldone

.Lodd_tail:
	movdqu		($inp),$T1		# Ii
	pshufb		$T3,$T1
	pxor		$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
.Ldone:
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_clmul:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}

$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
.cfi_startproc
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Hkey
	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq		\$63,$Hkey,$T1
	vpsllq		\$1,$Hkey,$Hkey
	vpxor		$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq		\$8,$T1,$T1
	vpor		$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa		$Hkey,$Xi
	vpxor		$Hkey,$HK,$HK
	mov		\$4,%r10		# up to H^8
	jmp		.Linit_start_avx
___

sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
}

sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
}

$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
	vpshufd		\$0b01001110,$T3,$T1
	vpshufd		\$0b01001110,$Xi,$T2
	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea		0x30($Htbl),$Htbl
	sub		\$1,%r10
	jnz		.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu		$T3,-0x10($Htbl)

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_avx:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}

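# gcm_ghash_avx below consumes the table produced by gcm_init_avx above:
# four 0x30-byte groups, each holding an odd power of H at +0x00, the next
# even power at +0x10 and their Karatsuba "salt" at +0x20, i.e. H^1..H^8
# with interleaved salts (the very last salt is stored flipped).  This lets
# the 8x-aggregated .Loop8x_avx fold eight input blocks per iteration.
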
$code.=<<___;
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,\@abi-omnipotent
.align	32
gcm_ghash_avx:
.cfi_startproc
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Xi		# load $Xi
	lea		.L0x1c2_polynomial(%rip),%r10
	lea		0x40($Htbl),$Htbl	# size optimization
	vmovdqu		.Lbswap_mask(%rip),$bswap
	vpshufb		$bswap,$Xi,$Xi
	cmp		\$0x80,$len
	jb		.Lshort_avx
	sub		\$0x80,$len

	vmovdqu		0x70($inp),$Ii		# I[7]
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb		$bswap,$Ii,$Ii
	vmovdqu		0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	 vmovdqu	0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Ii,$T2,$T2
	 vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vmovdqu	0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	 vpxor		$Ij,$T1,$T1

	 vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	 vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	 vpxor		$Ii,$T2,$T2
	 vmovdqu	0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	 vmovdqu	0x50-0x40($Htbl),$HK

	 vpshufb	$bswap,$Ij,$Ij
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	 vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	 vpxor		$Ij,$T1,$T1

	 vmovdqu	0x30($inp),$Ii		# I[3]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	 vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	 vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor		$Zmi,$Xmi,$Xmi
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	 vmovdqu	0x80-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2

	 vmovdqu	0x20($inp),$Ij		# I[2]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	 vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor		$Xmi,$Zmi,$Zmi
	 vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	 vpxor		$Ij,$T1,$T1

	 vmovdqu	0x10($inp),$Ii		# I[1]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	 vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	 vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor		$Zmi,$Xmi,$Xmi
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	 vmovdqu	0xb0-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2

	 vmovdqu	($inp),$Ij		# I[0]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	 vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea		0x80($inp),$inp
	cmp		\$0x80,$len
	jb		.Ltail_avx

	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
	sub		\$0x80,$len
	jmp		.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	 vmovdqu	0x70($inp),$Ii		# I[7]
	vpxor		$Xlo,$Zlo,$Zlo
	vpxor		$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	 vpshufb	$bswap,$Ii,$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	 vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	 vmovdqu	0x20-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2

	  vmovdqu	0x60($inp),$Ij		# I[6]
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Zlo,$Xi,$Xi		# collect result
	  vpshufb	$bswap,$Ij,$Ij
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps		$Zhi,$Xo,$Xo
	  vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
	vpxor		$Zmi,$Tred,$Tred
	 vxorps		$Ij,$T1,$T1

	  vmovdqu	0x50($inp),$Ii		# I[5]
	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Xo,$Tred,$Tred
	vpslldq		\$8,$Tred,$T2
	 vpxor		$Xlo,$Zlo,$Zlo
	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq		\$8,$Tred,$Tred
	vpxor		$T2, $Xi, $Xi
	  vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	  vpshufb	$bswap,$Ii,$Ii
	vxorps		$Tred,$Xo, $Xo
	 vpxor		$Xhi,$Zhi,$Zhi
	 vpunpckhqdq	$Ii,$Ii,$T2
	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
	  vmovdqu	0x50-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2
	 vpxor		$Xmi,$Zmi,$Zmi

	  vmovdqu	0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	  vpshufb	$bswap,$Ij,$Ij
	 vpxor		$Zlo,$Xlo,$Xlo
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	  vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vpxor		$Zhi,$Xhi,$Xhi
	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
	 vxorps		$Ij,$T1,$T1
	 vpxor		$Zmi,$Xmi,$Xmi

	  vmovdqu	0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	  vpshufb	$bswap,$Ii,$Ii
	 vpxor		$Xlo,$Zlo,$Zlo
	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	  vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	 vpunpckhqdq	$Ii,$Ii,$T2
	 vpxor		$Xhi,$Zhi,$Zhi
	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
	  vmovdqu	0x80-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2
	 vpxor		$Xmi,$Zmi,$Zmi

	  vmovdqu	0x20($inp),$Ij		# I[2]
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	  vpshufb	$bswap,$Ij,$Ij
	 vpxor		$Zlo,$Xlo,$Xlo
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	  vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vpxor		$Zhi,$Xhi,$Xhi
	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
	 vpxor		$Ij,$T1,$T1
	 vpxor		$Zmi,$Xmi,$Xmi
	vxorps		$Tred,$Xi,$Xi

	  vmovdqu	0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	  vpshufb	$bswap,$Ii,$Ii
	 vpxor		$Xlo,$Zlo,$Zlo
	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	  vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps		$Xo,$Tred,$Tred
	 vpunpckhqdq	$Ii,$Ii,$T2
	 vpxor		$Xhi,$Zhi,$Zhi
	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
	  vmovdqu	0xb0-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2
	 vpxor		$Xmi,$Zmi,$Zmi

	  vmovdqu	($inp),$Ij		# I[0]
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	  vpshufb	$bswap,$Ij,$Ij
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	  vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Tred,$Ij,$Ij
	 vpclmulqdq	\$0x10,$HK,  $T2,$Xmi
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi

	lea		0x80($inp),$inp
	sub		\$0x80,$len
	jnc		.Loop8x_avx

	add		\$0x80,$len
	jmp		.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu		-0x10($inp,$len),$Ii	# very last word
	lea		($inp,$len),$inp
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu		0x20-0x40($Htbl),$HK
	vpshufb		$bswap,$Ii,$Ij

	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa		$Xhi,$Zhi		# $Zhi and
	vmovdqa		$Xmi,$Zmi		# $Zmi
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x20($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x30($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x50-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x40($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x50($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x80-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x60($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x70($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq		0xb8-0x40($Htbl),$HK
	sub		\$0x10,$len
	jmp		.Ltail_avx

.align	32
.Ltail_avx:
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu		(%r10),$Tred

	vpxor		$Xlo,$Zlo,$Xi
	vpxor		$Xhi,$Zhi,$Xo
	vpxor		$Xmi,$Zmi,$Zmi

	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor		$Xo, $Zmi,$Zmi
	vpslldq		\$8, $Zmi,$T2
	vpsrldq		\$8, $Zmi,$Zmi
	vpxor		$T2, $Xi, $Xi
	vpxor		$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$Xo,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	cmp		\$0,$len
	jne		.Lshort_avx

	vpshufb		$bswap,$Xi,$Xi
	vmovdqu		$Xi,($Xip)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_avx:
___
$code.=<<___;
	ret
.cfi_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
$code.=<<___;
	jmp	.L_ghash_clmul
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}

$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.align	64

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

if ($win64) {
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_gcm_init_clmul
	.rva	.LSEH_end_gcm_init_clmul
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_clmul
	.rva	.LSEH_end_gcm_ghash_clmul
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___	if ($avx);
	.rva	.LSEH_begin_gcm_init_avx
	.rva	.LSEH_end_gcm_init_avx
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_avx
	.rva	.LSEH_end_gcm_ghash_avx
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_gcm_init_clmul:
	.byte	0x01,0x08,0x03,0x00
	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
	.byte	0x04,0x22,0x00,0x00	#sub	rsp,0x18
.LSEH_info_gcm_ghash_clmul:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT";