1#! /usr/bin/env perl
2# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# March, June 2010
18#
19# The module implements the "4-bit" GCM GHASH function and the underlying
20# single multiplication operation in GF(2^128). "4-bit" means that it
21# uses a 256-byte per-key table [+128 bytes shared table]. The GHASH
22# function features a so-called "528B" variant utilizing an additional
23# 256+16 bytes of per-key storage [+512 bytes shared table].
24# Performance results are for this streamed GHASH subroutine and are
25# expressed in cycles per processed byte; lower is better:
26#
27#		gcc 3.4.x(*)	assembler
28#
29# P4		28.6		14.0		+100%
30# Opteron	19.3		7.7		+150%
31# Core2		17.8		8.1(**)		+120%
32# Atom		31.6		16.8		+88%
33# VIA Nano	21.8		10.1		+115%
34#
35# (*)	the comparison is not completely fair, because the C results are
36#	for the vanilla "256B" implementation, while the assembler results
37#	are for "528B";-)
38# (**)	it's a mystery [to me] why the Core2 result is not the same as
39#	Opteron's;
40
41# May 2010
42#
43# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
44# See ghash-x86.pl for background information and details about coding
45# techniques.
46#
47# Special thanks to David Woodhouse for providing access to a
48# Westmere-based system on behalf of Intel Open Source Technology Centre.
49
50# December 2012
51#
52# Overhaul: aggregate Karatsuba post-processing, improve ILP in
53# reduction_alg9, increase reduction aggregate factor to 4x. As for
54# the latter: ghash-x86.pl argues that it makes little sense to
55# increase the aggregate factor. Then why increase it here? The
56# critical path consists of 3 independent pclmulqdq instructions,
57# Karatsuba post-processing and reduction. "On top" of this we lay
58# down aggregated multiplication operations, triplets of independent
59# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes
60# little sense to aggregate more multiplications than it takes to
61# cover the remaining non-multiplication operations. 2x is a
62# near-optimal factor for contemporary Intel CPUs (hence the modest
63# improvement), but not for Bulldozer, whose logical SIMD operations
64# are twice as slow as Intel's, making the critical path longer. A
65# CPU with a higher pclmulqdq issue rate would also benefit from a
66# higher aggregate factor...
67#
68# Westmere	1.78(+13%)
69# Sandy Bridge	1.80(+8%)
70# Ivy Bridge	1.80(+7%)
71# Haswell	0.55(+93%) (if system doesn't support AVX)
72# Broadwell	0.45(+110%)(if system doesn't support AVX)
73# Skylake	0.44(+110%)(if system doesn't support AVX)
74# Bulldozer	1.49(+27%)
75# Silvermont	2.88(+13%)
76# Knights L	2.12(-)    (if system doesn't support AVX)
77# Goldmont	1.08(+24%)
78
79# March 2013
80#
81# ... the 8x aggregate factor AVX code path uses the reduction algorithm
82# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
83# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
84# sub-optimally in comparison to the above-mentioned version. But thanks
85# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we know that
86# it performs at 0.41 cycles per byte on a Haswell processor, at
87# 0.29 on Broadwell, and at 0.36 on Skylake.
88#
89# Knights Landing achieves 1.09 cpb.
90#
91# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
92
93# This file was patched in BoringSSL to remove the variable-time 4-bit
94# implementation.
95
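# The pclmulqdq code below implements the GF(2^128) multiplication described
# in the header. As a point of reference, the bit-by-bit multiplication from
# NIST SP 800-38D is sketched here (illustrative only, never called by this
# script); operands are Math::BigInt values holding 128-bit blocks read
# big-endian, so the leftmost bit of a block is the x^0 coefficient.

use Math::BigInt;

sub gf128_mul_ref {
	my ($x, $y) = @_;			# 128-bit Math::BigInt operands
	my $R = Math::BigInt->from_hex("e1000000000000000000000000000000");
	my $z = Math::BigInt->bzero();
	my $v = $x->copy();
	for my $i (0 .. 127) {
		# bit $i of $y, counted from the most significant (leftmost) bit
		$z->bxor($v) if $y->copy()->brsft(127 - $i)->band(1)->is_one();
		if ($v->copy()->band(1)->is_zero()) {
			$v->brsft(1);			# V >>= 1
		} else {
			$v->brsft(1)->bxor($R);		# V = (V>>1) ^ R
		}
	}
	return $z;			# X*Y mod (x^128 + x^7 + x^2 + x + 1)
}
# e.g. one streamed GHASH step is $Xi = gf128_mul_ref($Xi->bxor($block), $H);
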
96$flavour = shift;
97$output  = shift;
98if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
99
100$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
101
102$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
103( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
104( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
105die "can't locate x86_64-xlate.pl";
106
107# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be
108# computed incorrectly.
109#
110# In upstream, this is controlled by shelling out to the compiler to check
111# versions, but BoringSSL is intended to be used with pre-generated perlasm
112# output, so this isn't useful anyway.
113$avx = 1;
114
115open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
116*STDOUT=*OUT;
117
118$do4xaggr=1;
119
120
121$code=<<___;
122.text
123.extern	OPENSSL_ia32cap_P
124___
125
126
127######################################################################
128# PCLMULQDQ version.
129
130@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
131		("%rdi","%rsi","%rdx","%rcx");	# Unix order
132
133($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
134($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
135
136sub clmul64x64_T2 {	# minimal register pressure
137my ($Xhi,$Xi,$Hkey,$HK)=@_;
138
139if (!defined($HK)) {	$HK = $T2;
140$code.=<<___;
141	movdqa		$Xi,$Xhi		#
142	pshufd		\$0b01001110,$Xi,$T1
143	pshufd		\$0b01001110,$Hkey,$T2
144	pxor		$Xi,$T1			#
145	pxor		$Hkey,$T2
146___
147} else {
148$code.=<<___;
149	movdqa		$Xi,$Xhi		#
150	pshufd		\$0b01001110,$Xi,$T1
151	pxor		$Xi,$T1			#
152___
153}
154$code.=<<___;
155	pclmulqdq	\$0x00,$Hkey,$Xi	#######
156	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
157	pclmulqdq	\$0x00,$HK,$T1		#######
158	pxor		$Xi,$T1			#
159	pxor		$Xhi,$T1		#
160
161	movdqa		$T1,$T2			#
162	psrldq		\$8,$T1
163	pslldq		\$8,$T2			#
164	pxor		$T1,$Xhi
165	pxor		$T2,$Xi			#
166___
167}
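
# clmul64x64_T2 above forms a 128x128-bit carry-less product with three
# pclmulqdq's, using the Karatsuba identity: for X = Xh*2^64 ^ Xl and
# H = Hh*2^64 ^ Hl,
#	X*H = (Xh*Hh)<<128 ^ ((Xl^Xh)*(Hl^Hh) ^ Xl*Hl ^ Xh*Hh)<<64 ^ Xl*Hl.
# A reference restatement (illustrative only, never called by this script;
# operands and results are Math::BigInt values):

use Math::BigInt;

sub clmul64_ref {			# 64x64 -> 128-bit carry-less product
	my ($a, $b) = @_;
	my $r = Math::BigInt->bzero();
	for my $i (0 .. 63) {
		$r->bxor($a->copy()->blsft($i))
			if $b->copy()->brsft($i)->band(1)->is_one();
	}
	return $r;
}

sub clmul128_karatsuba_ref {		# 128x128 -> 256-bit carry-less product
	my ($x, $h) = @_;
	my $m64 = Math::BigInt->from_hex("ffffffffffffffff");
	my ($xl, $xh) = ($x->copy()->band($m64), $x->copy()->brsft(64));
	my ($hl, $hh) = ($h->copy()->band($m64), $h->copy()->brsft(64));
	my $lo  = clmul64_ref($xl, $hl);
	my $hi  = clmul64_ref($xh, $hh);
	my $mid = clmul64_ref($xl->copy()->bxor($xh), $hl->copy()->bxor($hh));
	$mid->bxor($lo)->bxor($hi);		# Karatsuba post-processing
	return $hi->copy()->blsft(128)->bxor($mid->copy()->blsft(64))->bxor($lo);
}
# The assembly keeps the product split across $Xhi:$Xi and distributes the
# middle term with pslldq/psrldq rather than the wide shifts used here.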
168
169sub reduction_alg9 {	# 17/11 times faster than Intel version
170my ($Xhi,$Xi) = @_;
171
172$code.=<<___;
173	# 1st phase
174	movdqa		$Xi,$T2			#
175	movdqa		$Xi,$T1
176	psllq		\$5,$Xi
177	pxor		$Xi,$T1			#
178	psllq		\$1,$Xi
179	pxor		$T1,$Xi			#
180	psllq		\$57,$Xi		#
181	movdqa		$Xi,$T1			#
182	pslldq		\$8,$Xi
183	psrldq		\$8,$T1			#
184	pxor		$T2,$Xi
185	pxor		$T1,$Xhi		#
186
187	# 2nd phase
188	movdqa		$Xi,$T2
189	psrlq		\$1,$Xi
190	pxor		$T2,$Xhi		#
191	pxor		$Xi,$T2
192	psrlq		\$5,$Xi
193	pxor		$T2,$Xi			#
194	psrlq		\$1,$Xi			#
195	pxor		$Xhi,$Xi		#
196___
197}
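
# reduction_alg9 above folds the 256-bit carry-less product $Xhi:$Xi back to
# 128 bits modulo the GHASH polynomial g(x) = x^128 + x^7 + x^2 + x + 1. Its
# two phases of shift/xor (by 1, 5 and 57) are tuned to the operand layout
# the pclmulqdq path uses; the plain-polynomial long division below performs
# the equivalent folding (illustrative only, never called by this script):

use Math::BigInt;

sub gf128_reduce_ref {
	my ($p) = @_;			# carry-less product, degree <= 254
	my $g = Math::BigInt->from_hex("1" . ("0" x 30) . "87");	# g(x)
	for my $i (reverse 128 .. 255) {
		# clear bit $i by xoring in g(x) shifted up by $i-128
		$p->bxor($g->copy()->blsft($i - 128))
			if $p->copy()->brsft($i)->band(1)->is_one();
	}
	return $p;			# remainder, degree <= 127
}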
198
199{ my ($Htbl,$Xip)=@_4args;
200  my $HK="%xmm6";
201
202$code.=<<___;
203.globl	gcm_init_clmul
204.type	gcm_init_clmul,\@abi-omnipotent
205.align	16
206gcm_init_clmul:
207.cfi_startproc
208.seh_startproc
209	_CET_ENDBR
210.L_init_clmul:
211___
212$code.=<<___ if ($win64);
213	sub	\$0x18,%rsp
214.seh_allocstack	0x18
215	movaps	%xmm6,(%rsp)
216.seh_savexmm128	%xmm6, 0
217___
218$code.=<<___;
219	movdqu		($Xip),$Hkey
220	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap
221
222	# <<1 twist
223	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
224	movdqa		$Hkey,$T1
225	psllq		\$1,$Hkey
226	pxor		$T3,$T3			#
227	psrlq		\$63,$T1
228	pcmpgtd		$T2,$T3			# broadcast carry bit
229	pslldq		\$8,$T1
230	por		$T1,$Hkey		# H<<=1
231
232	# magic reduction
233	pand		.L0x1c2_polynomial(%rip),$T3
234	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial
235
236	# calculate H^2
237	pshufd		\$0b01001110,$Hkey,$HK
238	movdqa		$Hkey,$Xi
239	pxor		$Hkey,$HK
240___
241	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
242	&reduction_alg9	($Xhi,$Xi);
243$code.=<<___;
244	pshufd		\$0b01001110,$Hkey,$T1
245	pshufd		\$0b01001110,$Xi,$T2
246	pxor		$Hkey,$T1		# Karatsuba pre-processing
247	movdqu		$Hkey,0x00($Htbl)	# save H
248	pxor		$Xi,$T2			# Karatsuba pre-processing
249	movdqu		$Xi,0x10($Htbl)		# save H^2
250	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
251	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
252___
253if ($do4xaggr) {
254	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
255	&reduction_alg9	($Xhi,$Xi);
256$code.=<<___;
257	movdqa		$Xi,$T3
258___
259	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
260	&reduction_alg9	($Xhi,$Xi);
261$code.=<<___;
262	pshufd		\$0b01001110,$T3,$T1
263	pshufd		\$0b01001110,$Xi,$T2
264	pxor		$T3,$T1			# Karatsuba pre-processing
265	movdqu		$T3,0x30($Htbl)		# save H^3
266	pxor		$Xi,$T2			# Karatsuba pre-processing
267	movdqu		$Xi,0x40($Htbl)		# save H^4
268	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
269	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
270___
271}
272$code.=<<___ if ($win64);
273	movaps	(%rsp),%xmm6
274	lea	0x18(%rsp),%rsp
275___
276$code.=<<___;
277	ret
278.cfi_endproc
279.seh_endproc
280.size	gcm_init_clmul,.-gcm_init_clmul
281___
282}
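
# gcm_init_clmul above dword-swaps the loaded H, shifts it left by one bit
# (folding the 0x1c2 reduction constant back in when a carry falls out), and
# then multiplies repeatedly by H to fill Htbl: per its own comments, H at
# 0x00, H^2 at 0x10, their Karatsuba "salt" (xor of 64-bit halves) at 0x20,
# and with $do4xaggr also H^3 at 0x30, H^4 at 0x40 and the matching salt at
# 0x50. The powers themselves, in terms of gf128_mul_ref defined near the
# top of this file (illustrative only, never called by this script):

sub ghash_powers_ref {
	my ($h, $n) = @_;		# H as Math::BigInt, number of powers
	my @pow = ($h->copy());
	push @pow, gf128_mul_ref($pow[-1], $h) for 2 .. $n;
	return @pow;			# (H, H^2, ..., H^n)
}
# (The stored table entries are additionally reordered and <<1-twisted as
# above for pclmulqdq's benefit; the sketch only tracks the field elements.)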
283
284{ my ($Xip,$Htbl)=@_4args;
285
286$code.=<<___;
287.globl	gcm_gmult_clmul
288.type	gcm_gmult_clmul,\@abi-omnipotent
289.align	16
290gcm_gmult_clmul:
291.cfi_startproc
292	_CET_ENDBR
293.L_gmult_clmul:
294	movdqu		($Xip),$Xi
295	movdqa		.Lbswap_mask(%rip),$T3
296	movdqu		($Htbl),$Hkey
297	movdqu		0x20($Htbl),$T2
298	pshufb		$T3,$Xi
299___
300	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
301$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
302	# experimental alternative. the special thing about it is that there
303	# is no dependency between the two multiplications...
304	mov		\$`0xE1<<1`,%eax
305	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
306	mov		\$0x07,%r11d
307	movq		%rax,$T1
308	movq		%r10,$T2
309	movq		%r11,$T3		# borrow $T3
310	pand		$Xi,$T3
311	pshufb		$T3,$T2			# ($Xi&7)·0xE0
312	movq		%rax,$T3
313	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
314	pxor		$Xi,$T2
315	pslldq		\$15,$T2
316	paddd		$T2,$T2			# <<(64+56+1)
317	pxor		$T2,$Xi
318	pclmulqdq	\$0x01,$T3,$Xi
319	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
320	psrldq		\$1,$T1
321	pxor		$T1,$Xhi
322	pslldq		\$7,$Xi
323	pxor		$Xhi,$Xi
324___
325$code.=<<___;
326	pshufb		$T3,$Xi
327	movdqu		$Xi,($Xip)
328	ret
329.cfi_endproc
330.size	gcm_gmult_clmul,.-gcm_gmult_clmul
331___
332}
333
334{ my ($Xip,$Htbl,$inp,$len)=@_4args;
335  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
336  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
337
338$code.=<<___;
339.globl	gcm_ghash_clmul
340.type	gcm_ghash_clmul,\@abi-omnipotent
341.align	32
342gcm_ghash_clmul:
343.cfi_startproc
344.seh_startproc
345	_CET_ENDBR
346.L_ghash_clmul:
347___
348$code.=<<___ if ($win64);
349	lea	-0x88(%rsp),%rax
350	lea	-0x20(%rax),%rsp
351.seh_allocstack	0x20+0x88
352	movaps	%xmm6,-0x20(%rax)
353.seh_savexmm128	%xmm6, 0x20-0x20
354	movaps	%xmm7,-0x10(%rax)
355.seh_savexmm128	%xmm7, 0x20-0x10
356	movaps	%xmm8,0(%rax)
357.seh_savexmm128	%xmm8, 0x20+0
358	movaps	%xmm9,0x10(%rax)
359.seh_savexmm128	%xmm9, 0x20+0x10
360	movaps	%xmm10,0x20(%rax)
361.seh_savexmm128	%xmm10, 0x20+0x20
362	movaps	%xmm11,0x30(%rax)
363.seh_savexmm128	%xmm11, 0x20+0x30
364	movaps	%xmm12,0x40(%rax)
365.seh_savexmm128	%xmm12, 0x20+0x40
366	movaps	%xmm13,0x50(%rax)
367.seh_savexmm128	%xmm13, 0x20+0x50
368	movaps	%xmm14,0x60(%rax)
369.seh_savexmm128	%xmm14, 0x20+0x60
370	movaps	%xmm15,0x70(%rax)
371.seh_savexmm128	%xmm15, 0x20+0x70
372___
373$code.=<<___;
374	movdqa		.Lbswap_mask(%rip),$T3
375
376	movdqu		($Xip),$Xi
377	movdqu		($Htbl),$Hkey
378	movdqu		0x20($Htbl),$HK
379	pshufb		$T3,$Xi
380
381	sub		\$0x10,$len
382	jz		.Lodd_tail
383
384	movdqu		0x10($Htbl),$Hkey2
385___
386if ($do4xaggr) {
387my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
388
389$code.=<<___;
390	leaq		OPENSSL_ia32cap_P(%rip),%rax
391	mov		4(%rax),%eax
392	cmp		\$0x30,$len
393	jb		.Lskip4x
394
395	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
396	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
397	je		.Lskip4x
398
399	sub		\$0x30,$len
400	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
401	movdqu		0x30($Htbl),$Hkey3
402	movdqu		0x40($Htbl),$Hkey4
403
404	#######
405	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
406	#
407	movdqu		0x30($inp),$Xln
408	 movdqu		0x20($inp),$Xl
409	pshufb		$T3,$Xln
410	 pshufb		$T3,$Xl
411	movdqa		$Xln,$Xhn
412	pshufd		\$0b01001110,$Xln,$Xmn
413	pxor		$Xln,$Xmn
414	pclmulqdq	\$0x00,$Hkey,$Xln
415	pclmulqdq	\$0x11,$Hkey,$Xhn
416	pclmulqdq	\$0x00,$HK,$Xmn
417
418	movdqa		$Xl,$Xh
419	pshufd		\$0b01001110,$Xl,$Xm
420	pxor		$Xl,$Xm
421	pclmulqdq	\$0x00,$Hkey2,$Xl
422	pclmulqdq	\$0x11,$Hkey2,$Xh
423	pclmulqdq	\$0x10,$HK,$Xm
424	xorps		$Xl,$Xln
425	xorps		$Xh,$Xhn
426	movups		0x50($Htbl),$HK
427	xorps		$Xm,$Xmn
428
429	movdqu		0x10($inp),$Xl
430	 movdqu		0($inp),$T1
431	pshufb		$T3,$Xl
432	 pshufb		$T3,$T1
433	movdqa		$Xl,$Xh
434	pshufd		\$0b01001110,$Xl,$Xm
435	 pxor		$T1,$Xi
436	pxor		$Xl,$Xm
437	pclmulqdq	\$0x00,$Hkey3,$Xl
438	 movdqa		$Xi,$Xhi
439	 pshufd		\$0b01001110,$Xi,$T1
440	 pxor		$Xi,$T1
441	pclmulqdq	\$0x11,$Hkey3,$Xh
442	pclmulqdq	\$0x00,$HK,$Xm
443	xorps		$Xl,$Xln
444	xorps		$Xh,$Xhn
445
446	lea	0x40($inp),$inp
447	sub	\$0x40,$len
448	jc	.Ltail4x
449
450	jmp	.Lmod4_loop
451.align	32
452.Lmod4_loop:
453	pclmulqdq	\$0x00,$Hkey4,$Xi
454	xorps		$Xm,$Xmn
455	 movdqu		0x30($inp),$Xl
456	 pshufb		$T3,$Xl
457	pclmulqdq	\$0x11,$Hkey4,$Xhi
458	xorps		$Xln,$Xi
459	 movdqu		0x20($inp),$Xln
460	 movdqa		$Xl,$Xh
461	pclmulqdq	\$0x10,$HK,$T1
462	 pshufd		\$0b01001110,$Xl,$Xm
463	xorps		$Xhn,$Xhi
464	 pxor		$Xl,$Xm
465	 pshufb		$T3,$Xln
466	movups		0x20($Htbl),$HK
467	xorps		$Xmn,$T1
468	 pclmulqdq	\$0x00,$Hkey,$Xl
469	 pshufd		\$0b01001110,$Xln,$Xmn
470
471	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
472	 movdqa		$Xln,$Xhn
473	pxor		$Xhi,$T1		#
474	 pxor		$Xln,$Xmn
475	movdqa		$T1,$T2			#
476	 pclmulqdq	\$0x11,$Hkey,$Xh
477	pslldq		\$8,$T1
478	psrldq		\$8,$T2			#
479	pxor		$T1,$Xi
480	movdqa		.L7_mask(%rip),$T1
481	pxor		$T2,$Xhi		#
482	movq		%rax,$T2
483
484	pand		$Xi,$T1			# 1st phase
485	pshufb		$T1,$T2			#
486	pxor		$Xi,$T2			#
487	 pclmulqdq	\$0x00,$HK,$Xm
488	psllq		\$57,$T2		#
489	movdqa		$T2,$T1			#
490	pslldq		\$8,$T2
491	 pclmulqdq	\$0x00,$Hkey2,$Xln
492	psrldq		\$8,$T1			#
493	pxor		$T2,$Xi
494	pxor		$T1,$Xhi		#
495	movdqu		0($inp),$T1
496
497	movdqa		$Xi,$T2			# 2nd phase
498	psrlq		\$1,$Xi
499	 pclmulqdq	\$0x11,$Hkey2,$Xhn
500	 xorps		$Xl,$Xln
501	 movdqu		0x10($inp),$Xl
502	 pshufb		$T3,$Xl
503	 pclmulqdq	\$0x10,$HK,$Xmn
504	 xorps		$Xh,$Xhn
505	 movups		0x50($Htbl),$HK
506	pshufb		$T3,$T1
507	pxor		$T2,$Xhi		#
508	pxor		$Xi,$T2
509	psrlq		\$5,$Xi
510
511	 movdqa		$Xl,$Xh
512	 pxor		$Xm,$Xmn
513	 pshufd		\$0b01001110,$Xl,$Xm
514	pxor		$T2,$Xi			#
515	pxor		$T1,$Xhi
516	 pxor		$Xl,$Xm
517	 pclmulqdq	\$0x00,$Hkey3,$Xl
518	psrlq		\$1,$Xi			#
519	pxor		$Xhi,$Xi		#
520	movdqa		$Xi,$Xhi
521	 pclmulqdq	\$0x11,$Hkey3,$Xh
522	 xorps		$Xl,$Xln
523	pshufd		\$0b01001110,$Xi,$T1
524	pxor		$Xi,$T1
525
526	 pclmulqdq	\$0x00,$HK,$Xm
527	 xorps		$Xh,$Xhn
528
529	lea	0x40($inp),$inp
530	sub	\$0x40,$len
531	jnc	.Lmod4_loop
532
533.Ltail4x:
534	pclmulqdq	\$0x00,$Hkey4,$Xi
535	pclmulqdq	\$0x11,$Hkey4,$Xhi
536	pclmulqdq	\$0x10,$HK,$T1
537	xorps		$Xm,$Xmn
538	xorps		$Xln,$Xi
539	xorps		$Xhn,$Xhi
540	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
541	pxor		$Xmn,$T1
542
543	pxor		$Xhi,$T1		#
544	pxor		$Xi,$Xhi
545
546	movdqa		$T1,$T2			#
547	psrldq		\$8,$T1
548	pslldq		\$8,$T2			#
549	pxor		$T1,$Xhi
550	pxor		$T2,$Xi			#
551___
552	&reduction_alg9($Xhi,$Xi);
553$code.=<<___;
554	add	\$0x40,$len
555	jz	.Ldone
556	movdqu	0x20($Htbl),$HK
557	sub	\$0x10,$len
558	jz	.Lodd_tail
559.Lskip4x:
560___
561}
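
# The 4x-aggregated path above evaluates, as its in-line comment states,
#	Xi+4 = [ H*I3 + H^2*I2 + H^3*I1 + H^4*(I0 + Xi) ] mod P
# over four consecutive input blocks I0..I3, so the serial reduction is paid
# once per four multiplications. A restatement in terms of the reference
# helpers defined earlier in this file (illustrative only, never called by
# this script; reducing after every product gives the same result because
# reduction is linear over xor):

sub ghash_update4_ref {
	my ($xi, $hpow, @blk) = @_;	# state, [H, H^2, H^3, H^4], 4 blocks
	my $acc = gf128_mul_ref($blk[0]->copy()->bxor($xi), $hpow->[3]);
	$acc->bxor(gf128_mul_ref($blk[1], $hpow->[2]));
	$acc->bxor(gf128_mul_ref($blk[2], $hpow->[1]));
	$acc->bxor(gf128_mul_ref($blk[3], $hpow->[0]));
	return $acc;			# new Xi
}
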
562$code.=<<___;
563	#######
564	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
565	#	[(H*Ii+1) + (H*Xi+1)] mod P =
566	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
567	#
568	movdqu		($inp),$T1		# Ii
569	movdqu		16($inp),$Xln		# Ii+1
570	pshufb		$T3,$T1
571	pshufb		$T3,$Xln
572	pxor		$T1,$Xi			# Ii+Xi
573
574	movdqa		$Xln,$Xhn
575	pshufd		\$0b01001110,$Xln,$Xmn
576	pxor		$Xln,$Xmn
577	pclmulqdq	\$0x00,$Hkey,$Xln
578	pclmulqdq	\$0x11,$Hkey,$Xhn
579	pclmulqdq	\$0x00,$HK,$Xmn
580
581	lea		32($inp),$inp		# i+=2
582	nop
583	sub		\$0x20,$len
584	jbe		.Leven_tail
585	nop
586	jmp		.Lmod_loop
587
588.align	32
589.Lmod_loop:
590	movdqa		$Xi,$Xhi
591	movdqa		$Xmn,$T1
592	pshufd		\$0b01001110,$Xi,$Xmn	#
593	pxor		$Xi,$Xmn		#
594
595	pclmulqdq	\$0x00,$Hkey2,$Xi
596	pclmulqdq	\$0x11,$Hkey2,$Xhi
597	pclmulqdq	\$0x10,$HK,$Xmn
598
599	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
600	pxor		$Xhn,$Xhi
601	  movdqu	($inp),$T2		# Ii
602	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
603	  pshufb	$T3,$T2
604	  movdqu	16($inp),$Xln		# Ii+1
605
606	pxor		$Xhi,$T1
607	  pxor		$T2,$Xhi		# "Ii+Xi", consume early
608	pxor		$T1,$Xmn
609	 pshufb		$T3,$Xln
610	movdqa		$Xmn,$T1		#
611	psrldq		\$8,$T1
612	pslldq		\$8,$Xmn		#
613	pxor		$T1,$Xhi
614	pxor		$Xmn,$Xi		#
615
616	movdqa		$Xln,$Xhn		#
617
618	  movdqa	$Xi,$T2			# 1st phase
619	  movdqa	$Xi,$T1
620	  psllq		\$5,$Xi
621	  pxor		$Xi,$T1			#
622	pclmulqdq	\$0x00,$Hkey,$Xln	#######
623	  psllq		\$1,$Xi
624	  pxor		$T1,$Xi			#
625	  psllq		\$57,$Xi		#
626	  movdqa	$Xi,$T1			#
627	  pslldq	\$8,$Xi
628	  psrldq	\$8,$T1			#
629	  pxor		$T2,$Xi
630	pshufd		\$0b01001110,$Xhn,$Xmn
631	  pxor		$T1,$Xhi		#
632	pxor		$Xhn,$Xmn		#
633
634	  movdqa	$Xi,$T2			# 2nd phase
635	  psrlq		\$1,$Xi
636	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
637	  pxor		$T2,$Xhi		#
638	  pxor		$Xi,$T2
639	  psrlq		\$5,$Xi
640	  pxor		$T2,$Xi			#
641	lea		32($inp),$inp
642	  psrlq		\$1,$Xi			#
643	pclmulqdq	\$0x00,$HK,$Xmn		#######
644	  pxor		$Xhi,$Xi		#
645
646	sub		\$0x20,$len
647	ja		.Lmod_loop
648
649.Leven_tail:
650	 movdqa		$Xi,$Xhi
651	 movdqa		$Xmn,$T1
652	 pshufd		\$0b01001110,$Xi,$Xmn	#
653	 pxor		$Xi,$Xmn		#
654
655	pclmulqdq	\$0x00,$Hkey2,$Xi
656	pclmulqdq	\$0x11,$Hkey2,$Xhi
657	pclmulqdq	\$0x10,$HK,$Xmn
658
659	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
660	pxor		$Xhn,$Xhi
661	pxor		$Xi,$T1
662	pxor		$Xhi,$T1
663	pxor		$T1,$Xmn
664	movdqa		$Xmn,$T1		#
665	psrldq		\$8,$T1
666	pslldq		\$8,$Xmn		#
667	pxor		$T1,$Xhi
668	pxor		$Xmn,$Xi		#
669___
670	&reduction_alg9	($Xhi,$Xi);
671$code.=<<___;
672	test		$len,$len
673	jnz		.Ldone
674
675.Lodd_tail:
676	movdqu		($inp),$T1		# Ii
677	pshufb		$T3,$T1
678	pxor		$T1,$Xi			# Ii+Xi
679___
680	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
681	&reduction_alg9	($Xhi,$Xi);
682$code.=<<___;
683.Ldone:
684	pshufb		$T3,$Xi
685	movdqu		$Xi,($Xip)
686___
687$code.=<<___ if ($win64);
688	movaps	(%rsp),%xmm6
689	movaps	0x10(%rsp),%xmm7
690	movaps	0x20(%rsp),%xmm8
691	movaps	0x30(%rsp),%xmm9
692	movaps	0x40(%rsp),%xmm10
693	movaps	0x50(%rsp),%xmm11
694	movaps	0x60(%rsp),%xmm12
695	movaps	0x70(%rsp),%xmm13
696	movaps	0x80(%rsp),%xmm14
697	movaps	0x90(%rsp),%xmm15
698	lea	0xa8(%rsp),%rsp
699___
700$code.=<<___;
701	ret
702.cfi_endproc
703.seh_endproc
704.size	gcm_ghash_clmul,.-gcm_ghash_clmul
705___
706}
707
708$code.=<<___;
709.globl	gcm_init_avx
710.type	gcm_init_avx,\@abi-omnipotent
711.align	32
712gcm_init_avx:
713.cfi_startproc
714	_CET_ENDBR
715___
716if ($avx) {
717my ($Htbl,$Xip)=@_4args;
718my $HK="%xmm6";
719
720$code.=<<___ if ($win64);
721.seh_startproc
722	sub	\$0x18,%rsp
723.seh_allocstack	0x18
724	movaps	%xmm6,(%rsp)
725.seh_savexmm128	%xmm6, 0
726___
727$code.=<<___;
728	vzeroupper
729
730	vmovdqu		($Xip),$Hkey
731	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap
732
733	# <<1 twist
734	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
735	vpsrlq		\$63,$Hkey,$T1
736	vpsllq		\$1,$Hkey,$Hkey
737	vpxor		$T3,$T3,$T3		#
738	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
739	vpslldq		\$8,$T1,$T1
740	vpor		$T1,$Hkey,$Hkey		# H<<=1
741
742	# magic reduction
743	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
744	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial
745
746	vpunpckhqdq	$Hkey,$Hkey,$HK
747	vmovdqa		$Hkey,$Xi
748	vpxor		$Hkey,$HK,$HK
749	mov		\$4,%r10		# up to H^8
750	jmp		.Linit_start_avx
751___
752
753sub clmul64x64_avx {
754my ($Xhi,$Xi,$Hkey,$HK)=@_;
755
756if (!defined($HK)) {	$HK = $T2;
757$code.=<<___;
758	vpunpckhqdq	$Xi,$Xi,$T1
759	vpunpckhqdq	$Hkey,$Hkey,$T2
760	vpxor		$Xi,$T1,$T1		#
761	vpxor		$Hkey,$T2,$T2
762___
763} else {
764$code.=<<___;
765	vpunpckhqdq	$Xi,$Xi,$T1
766	vpxor		$Xi,$T1,$T1		#
767___
768}
769$code.=<<___;
770	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
771	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
772	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
773	vpxor		$Xi,$Xhi,$T2		#
774	vpxor		$T2,$T1,$T1		#
775
776	vpslldq		\$8,$T1,$T2		#
777	vpsrldq		\$8,$T1,$T1
778	vpxor		$T2,$Xi,$Xi		#
779	vpxor		$T1,$Xhi,$Xhi
780___
781}
782
783sub reduction_avx {
784my ($Xhi,$Xi) = @_;
785
786$code.=<<___;
787	vpsllq		\$57,$Xi,$T1		# 1st phase
788	vpsllq		\$62,$Xi,$T2
789	vpxor		$T1,$T2,$T2		#
790	vpsllq		\$63,$Xi,$T1
791	vpxor		$T1,$T2,$T2		#
792	vpslldq		\$8,$T2,$T1		#
793	vpsrldq		\$8,$T2,$T2
794	vpxor		$T1,$Xi,$Xi		#
795	vpxor		$T2,$Xhi,$Xhi
796
797	vpsrlq		\$1,$Xi,$T2		# 2nd phase
798	vpxor		$Xi,$Xhi,$Xhi
799	vpxor		$T2,$Xi,$Xi		#
800	vpsrlq		\$5,$T2,$T2
801	vpxor		$T2,$Xi,$Xi		#
802	vpsrlq		\$1,$Xi,$Xi		#
803	vpxor		$Xhi,$Xi,$Xi		#
804___
805}
806
807$code.=<<___;
808.align	32
809.Linit_loop_avx:
810	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
811	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
812___
813	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
814	&reduction_avx	($Xhi,$Xi);
815$code.=<<___;
816.Linit_start_avx:
817	vmovdqa		$Xi,$T3
818___
819	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
820	&reduction_avx	($Xhi,$Xi);
821$code.=<<___;
822	vpshufd		\$0b01001110,$T3,$T1
823	vpshufd		\$0b01001110,$Xi,$T2
824	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
825	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
826	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
827	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
828	lea		0x30($Htbl),$Htbl
829	sub		\$1,%r10
830	jnz		.Linit_loop_avx
831
832	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
833	vmovdqu		$T3,-0x10($Htbl)
834
835	vzeroupper
836___
837$code.=<<___ if ($win64);
838	movaps	(%rsp),%xmm6
839	lea	0x18(%rsp),%rsp
840___
841$code.=<<___;
842	ret
843.seh_endproc
844.cfi_endproc
845.size	gcm_init_avx,.-gcm_init_avx
846___
847} else {
848$code.=<<___;
849	jmp	.L_init_clmul
850.size	gcm_init_avx,.-gcm_init_avx
851___
852}
853
854$code.=<<___;
855.globl	gcm_ghash_avx
856.type	gcm_ghash_avx,\@abi-omnipotent
857.align	32
858gcm_ghash_avx:
859.cfi_startproc
860	_CET_ENDBR
861___
862if ($avx) {
863my ($Xip,$Htbl,$inp,$len)=@_4args;
864my ($Xlo,$Xhi,$Xmi,
865    $Zlo,$Zhi,$Zmi,
866    $Hkey,$HK,$T1,$T2,
867    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
868
869$code.=<<___ if ($win64);
870.seh_startproc
871	lea	-0x88(%rsp),%rax
872	lea	-0x20(%rax),%rsp
873.seh_allocstack	0x20+0x88
874	movaps	%xmm6,-0x20(%rax)
875.seh_savexmm128	%xmm6, 0x20-0x20
876	movaps	%xmm7,-0x10(%rax)
877.seh_savexmm128	%xmm7, 0x20-0x10
878	movaps	%xmm8,0(%rax)
879.seh_savexmm128	%xmm8, 0x20+0
880	movaps	%xmm9,0x10(%rax)
881.seh_savexmm128	%xmm9, 0x20+0x10
882	movaps	%xmm10,0x20(%rax)
883.seh_savexmm128	%xmm10, 0x20+0x20
884	movaps	%xmm11,0x30(%rax)
885.seh_savexmm128	%xmm11, 0x20+0x30
886	movaps	%xmm12,0x40(%rax)
887.seh_savexmm128	%xmm12, 0x20+0x40
888	movaps	%xmm13,0x50(%rax)
889.seh_savexmm128	%xmm13, 0x20+0x50
890	movaps	%xmm14,0x60(%rax)
891.seh_savexmm128	%xmm14, 0x20+0x60
892	movaps	%xmm15,0x70(%rax)
893.seh_savexmm128	%xmm15, 0x20+0x70
894___
895$code.=<<___;
896	vzeroupper
897
898	vmovdqu		($Xip),$Xi		# load $Xi
899	lea		.L0x1c2_polynomial(%rip),%r10
900	lea		0x40($Htbl),$Htbl	# size optimization
901	vmovdqu		.Lbswap_mask(%rip),$bswap
902	vpshufb		$bswap,$Xi,$Xi
903	cmp		\$0x80,$len
904	jb		.Lshort_avx
905	sub		\$0x80,$len
906
907	vmovdqu		0x70($inp),$Ii		# I[7]
908	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
909	vpshufb		$bswap,$Ii,$Ii
910	vmovdqu		0x20-0x40($Htbl),$HK
911
912	vpunpckhqdq	$Ii,$Ii,$T2
913	 vmovdqu	0x60($inp),$Ij		# I[6]
914	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
915	vpxor		$Ii,$T2,$T2
916	 vpshufb	$bswap,$Ij,$Ij
917	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
918	 vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
919	 vpunpckhqdq	$Ij,$Ij,$T1
920	 vmovdqu	0x50($inp),$Ii		# I[5]
921	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
922	 vpxor		$Ij,$T1,$T1
923
924	 vpshufb	$bswap,$Ii,$Ii
925	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
926	 vpunpckhqdq	$Ii,$Ii,$T2
927	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
928	 vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
929	 vpxor		$Ii,$T2,$T2
930	 vmovdqu	0x40($inp),$Ij		# I[4]
931	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
932	 vmovdqu	0x50-0x40($Htbl),$HK
933
934	 vpshufb	$bswap,$Ij,$Ij
935	vpxor		$Xlo,$Zlo,$Zlo
936	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
937	vpxor		$Xhi,$Zhi,$Zhi
938	 vpunpckhqdq	$Ij,$Ij,$T1
939	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
940	 vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
941	vpxor		$Xmi,$Zmi,$Zmi
942	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
943	 vpxor		$Ij,$T1,$T1
944
945	 vmovdqu	0x30($inp),$Ii		# I[3]
946	vpxor		$Zlo,$Xlo,$Xlo
947	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
948	vpxor		$Zhi,$Xhi,$Xhi
949	 vpshufb	$bswap,$Ii,$Ii
950	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
951	 vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
952	vpxor		$Zmi,$Xmi,$Xmi
953	 vpunpckhqdq	$Ii,$Ii,$T2
954	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
955	 vmovdqu	0x80-0x40($Htbl),$HK
956	 vpxor		$Ii,$T2,$T2
957
958	 vmovdqu	0x20($inp),$Ij		# I[2]
959	vpxor		$Xlo,$Zlo,$Zlo
960	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
961	vpxor		$Xhi,$Zhi,$Zhi
962	 vpshufb	$bswap,$Ij,$Ij
963	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
964	 vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
965	vpxor		$Xmi,$Zmi,$Zmi
966	 vpunpckhqdq	$Ij,$Ij,$T1
967	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
968	 vpxor		$Ij,$T1,$T1
969
970	 vmovdqu	0x10($inp),$Ii		# I[1]
971	vpxor		$Zlo,$Xlo,$Xlo
972	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
973	vpxor		$Zhi,$Xhi,$Xhi
974	 vpshufb	$bswap,$Ii,$Ii
975	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
976	 vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
977	vpxor		$Zmi,$Xmi,$Xmi
978	 vpunpckhqdq	$Ii,$Ii,$T2
979	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
980	 vmovdqu	0xb0-0x40($Htbl),$HK
981	 vpxor		$Ii,$T2,$T2
982
983	 vmovdqu	($inp),$Ij		# I[0]
984	vpxor		$Xlo,$Zlo,$Zlo
985	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
986	vpxor		$Xhi,$Zhi,$Zhi
987	 vpshufb	$bswap,$Ij,$Ij
988	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
989	 vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
990	vpxor		$Xmi,$Zmi,$Zmi
991	vpclmulqdq	\$0x10,$HK,$T2,$Xmi
992
993	lea		0x80($inp),$inp
994	cmp		\$0x80,$len
995	jb		.Ltail_avx
996
997	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
998	sub		\$0x80,$len
999	jmp		.Loop8x_avx
1000
1001.align	32
1002.Loop8x_avx:
1003	vpunpckhqdq	$Ij,$Ij,$T1
1004	 vmovdqu	0x70($inp),$Ii		# I[7]
1005	vpxor		$Xlo,$Zlo,$Zlo
1006	vpxor		$Ij,$T1,$T1
1007	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
1008	 vpshufb	$bswap,$Ii,$Ii
1009	vpxor		$Xhi,$Zhi,$Zhi
1010	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
1011	 vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
1012	 vpunpckhqdq	$Ii,$Ii,$T2
1013	vpxor		$Xmi,$Zmi,$Zmi
1014	vpclmulqdq	\$0x00,$HK,$T1,$Tred
1015	 vmovdqu	0x20-0x40($Htbl),$HK
1016	 vpxor		$Ii,$T2,$T2
1017
1018	  vmovdqu	0x60($inp),$Ij		# I[6]
1019	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
1020	vpxor		$Zlo,$Xi,$Xi		# collect result
1021	  vpshufb	$bswap,$Ij,$Ij
1022	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
1023	vxorps		$Zhi,$Xo,$Xo
1024	  vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
1025	 vpunpckhqdq	$Ij,$Ij,$T1
1026	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
1027	vpxor		$Zmi,$Tred,$Tred
1028	 vxorps		$Ij,$T1,$T1
1029
1030	  vmovdqu	0x50($inp),$Ii		# I[5]
1031	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
1032	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
1033	vpxor		$Xo,$Tred,$Tred
1034	vpslldq		\$8,$Tred,$T2
1035	 vpxor		$Xlo,$Zlo,$Zlo
1036	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
1037	vpsrldq		\$8,$Tred,$Tred
1038	vpxor		$T2, $Xi, $Xi
1039	  vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
1040	  vpshufb	$bswap,$Ii,$Ii
1041	vxorps		$Tred,$Xo, $Xo
1042	 vpxor		$Xhi,$Zhi,$Zhi
1043	 vpunpckhqdq	$Ii,$Ii,$T2
1044	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
1045	  vmovdqu	0x50-0x40($Htbl),$HK
1046	 vpxor		$Ii,$T2,$T2
1047	 vpxor		$Xmi,$Zmi,$Zmi
1048
1049	  vmovdqu	0x40($inp),$Ij		# I[4]
1050	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
1051	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
1052	  vpshufb	$bswap,$Ij,$Ij
1053	 vpxor		$Zlo,$Xlo,$Xlo
1054	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
1055	  vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
1056	 vpunpckhqdq	$Ij,$Ij,$T1
1057	 vpxor		$Zhi,$Xhi,$Xhi
1058	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
1059	 vxorps		$Ij,$T1,$T1
1060	 vpxor		$Zmi,$Xmi,$Xmi
1061
1062	  vmovdqu	0x30($inp),$Ii		# I[3]
1063	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
1064	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
1065	  vpshufb	$bswap,$Ii,$Ii
1066	 vpxor		$Xlo,$Zlo,$Zlo
1067	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
1068	  vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
1069	 vpunpckhqdq	$Ii,$Ii,$T2
1070	 vpxor		$Xhi,$Zhi,$Zhi
1071	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
1072	  vmovdqu	0x80-0x40($Htbl),$HK
1073	 vpxor		$Ii,$T2,$T2
1074	 vpxor		$Xmi,$Zmi,$Zmi
1075
1076	  vmovdqu	0x20($inp),$Ij		# I[2]
1077	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
1078	  vpshufb	$bswap,$Ij,$Ij
1079	 vpxor		$Zlo,$Xlo,$Xlo
1080	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
1081	  vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
1082	 vpunpckhqdq	$Ij,$Ij,$T1
1083	 vpxor		$Zhi,$Xhi,$Xhi
1084	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
1085	 vpxor		$Ij,$T1,$T1
1086	 vpxor		$Zmi,$Xmi,$Xmi
1087	vxorps		$Tred,$Xi,$Xi
1088
1089	  vmovdqu	0x10($inp),$Ii		# I[1]
1090	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
1091	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
1092	  vpshufb	$bswap,$Ii,$Ii
1093	 vpxor		$Xlo,$Zlo,$Zlo
1094	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
1095	  vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
1096	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
1097	vxorps		$Xo,$Tred,$Tred
1098	 vpunpckhqdq	$Ii,$Ii,$T2
1099	 vpxor		$Xhi,$Zhi,$Zhi
1100	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
1101	  vmovdqu	0xb0-0x40($Htbl),$HK
1102	 vpxor		$Ii,$T2,$T2
1103	 vpxor		$Xmi,$Zmi,$Zmi
1104
1105	  vmovdqu	($inp),$Ij		# I[0]
1106	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
1107	  vpshufb	$bswap,$Ij,$Ij
1108	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
1109	  vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
1110	vpxor		$Tred,$Ij,$Ij
1111	 vpclmulqdq	\$0x10,$HK,  $T2,$Xmi
1112	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
1113
1114	lea		0x80($inp),$inp
1115	sub		\$0x80,$len
1116	jnc		.Loop8x_avx
1117
1118	add		\$0x80,$len
1119	jmp		.Ltail_no_xor_avx
1120
1121.align	32
1122.Lshort_avx:
1123	vmovdqu		-0x10($inp,$len),$Ii	# very last word
1124	lea		($inp,$len),$inp
1125	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
1126	vmovdqu		0x20-0x40($Htbl),$HK
1127	vpshufb		$bswap,$Ii,$Ij
1128
1129	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
1130	vmovdqa		$Xhi,$Zhi		# $Zhi and
1131	vmovdqa		$Xmi,$Zmi		# $Zmi
1132	sub		\$0x10,$len
1133	jz		.Ltail_avx
1134
1135	vpunpckhqdq	$Ij,$Ij,$T1
1136	vpxor		$Xlo,$Zlo,$Zlo
1137	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
1138	vpxor		$Ij,$T1,$T1
1139	 vmovdqu	-0x20($inp),$Ii
1140	vpxor		$Xhi,$Zhi,$Zhi
1141	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
1142	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
1143	 vpshufb	$bswap,$Ii,$Ij
1144	vpxor		$Xmi,$Zmi,$Zmi
1145	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
1146	vpsrldq		\$8,$HK,$HK
1147	sub		\$0x10,$len
1148	jz		.Ltail_avx
1149
1150	vpunpckhqdq	$Ij,$Ij,$T1
1151	vpxor		$Xlo,$Zlo,$Zlo
1152	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
1153	vpxor		$Ij,$T1,$T1
1154	 vmovdqu	-0x30($inp),$Ii
1155	vpxor		$Xhi,$Zhi,$Zhi
1156	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
1157	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
1158	 vpshufb	$bswap,$Ii,$Ij
1159	vpxor		$Xmi,$Zmi,$Zmi
1160	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
1161	vmovdqu		0x50-0x40($Htbl),$HK
1162	sub		\$0x10,$len
1163	jz		.Ltail_avx
1164
1165	vpunpckhqdq	$Ij,$Ij,$T1
1166	vpxor		$Xlo,$Zlo,$Zlo
1167	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
1168	vpxor		$Ij,$T1,$T1
1169	 vmovdqu	-0x40($inp),$Ii
1170	vpxor		$Xhi,$Zhi,$Zhi
1171	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
1172	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
1173	 vpshufb	$bswap,$Ii,$Ij
1174	vpxor		$Xmi,$Zmi,$Zmi
1175	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
1176	vpsrldq		\$8,$HK,$HK
1177	sub		\$0x10,$len
1178	jz		.Ltail_avx
1179
1180	vpunpckhqdq	$Ij,$Ij,$T1
1181	vpxor		$Xlo,$Zlo,$Zlo
1182	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
1183	vpxor		$Ij,$T1,$T1
1184	 vmovdqu	-0x50($inp),$Ii
1185	vpxor		$Xhi,$Zhi,$Zhi
1186	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
1187	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
1188	 vpshufb	$bswap,$Ii,$Ij
1189	vpxor		$Xmi,$Zmi,$Zmi
1190	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
1191	vmovdqu		0x80-0x40($Htbl),$HK
1192	sub		\$0x10,$len
1193	jz		.Ltail_avx
1194
1195	vpunpckhqdq	$Ij,$Ij,$T1
1196	vpxor		$Xlo,$Zlo,$Zlo
1197	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
1198	vpxor		$Ij,$T1,$T1
1199	 vmovdqu	-0x60($inp),$Ii
1200	vpxor		$Xhi,$Zhi,$Zhi
1201	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
1202	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
1203	 vpshufb	$bswap,$Ii,$Ij
1204	vpxor		$Xmi,$Zmi,$Zmi
1205	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
1206	vpsrldq		\$8,$HK,$HK
1207	sub		\$0x10,$len
1208	jz		.Ltail_avx
1209
1210	vpunpckhqdq	$Ij,$Ij,$T1
1211	vpxor		$Xlo,$Zlo,$Zlo
1212	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
1213	vpxor		$Ij,$T1,$T1
1214	 vmovdqu	-0x70($inp),$Ii
1215	vpxor		$Xhi,$Zhi,$Zhi
1216	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
1217	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
1218	 vpshufb	$bswap,$Ii,$Ij
1219	vpxor		$Xmi,$Zmi,$Zmi
1220	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
1221	vmovq		0xb8-0x40($Htbl),$HK
1222	sub		\$0x10,$len
1223	jmp		.Ltail_avx
1224
1225.align	32
1226.Ltail_avx:
1227	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
1228.Ltail_no_xor_avx:
1229	vpunpckhqdq	$Ij,$Ij,$T1
1230	vpxor		$Xlo,$Zlo,$Zlo
1231	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
1232	vpxor		$Ij,$T1,$T1
1233	vpxor		$Xhi,$Zhi,$Zhi
1234	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
1235	vpxor		$Xmi,$Zmi,$Zmi
1236	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
1237
1238	vmovdqu		(%r10),$Tred
1239
1240	vpxor		$Xlo,$Zlo,$Xi
1241	vpxor		$Xhi,$Zhi,$Xo
1242	vpxor		$Xmi,$Zmi,$Zmi
1243
1244	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
1245	vpxor		$Xo, $Zmi,$Zmi
1246	vpslldq		\$8, $Zmi,$T2
1247	vpsrldq		\$8, $Zmi,$Zmi
1248	vpxor		$T2, $Xi, $Xi
1249	vpxor		$Zmi,$Xo, $Xo
1250
1251	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
1252	vpalignr	\$8,$Xi,$Xi,$Xi
1253	vpxor		$T2,$Xi,$Xi
1254
1255	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
1256	vpalignr	\$8,$Xi,$Xi,$Xi
1257	vpxor		$Xo,$Xi,$Xi
1258	vpxor		$T2,$Xi,$Xi
1259
1260	cmp		\$0,$len
1261	jne		.Lshort_avx
1262
1263	vpshufb		$bswap,$Xi,$Xi
1264	vmovdqu		$Xi,($Xip)
1265	vzeroupper
1266___
1267$code.=<<___ if ($win64);
1268	movaps	(%rsp),%xmm6
1269	movaps	0x10(%rsp),%xmm7
1270	movaps	0x20(%rsp),%xmm8
1271	movaps	0x30(%rsp),%xmm9
1272	movaps	0x40(%rsp),%xmm10
1273	movaps	0x50(%rsp),%xmm11
1274	movaps	0x60(%rsp),%xmm12
1275	movaps	0x70(%rsp),%xmm13
1276	movaps	0x80(%rsp),%xmm14
1277	movaps	0x90(%rsp),%xmm15
1278	lea	0xa8(%rsp),%rsp
1279___
1280$code.=<<___;
1281	ret
1282.cfi_endproc
1283.seh_endproc
1284.size	gcm_ghash_avx,.-gcm_ghash_avx
1285___
1286} else {
1287$code.=<<___;
1288	jmp	.L_ghash_clmul
1289.size	gcm_ghash_avx,.-gcm_ghash_avx
1290___
1291}
1292
1293$code.=<<___;
1294.section .rodata
1295.align	64
1296.Lbswap_mask:
1297	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1298.L0x1c2_polynomial:
1299	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
1300.L7_mask:
1301	.long	7,0,7,0
1302.align	64
1303
1304.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1305.align	64
1306.text
1307___
1308
1309$code =~ s/\`([^\`]*)\`/eval($1)/gem;
1310
1311print $code;
1312
1313close STDOUT or die "error closing STDOUT: $!";
1314