#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses a 256-byte per-key table [+128 bytes shared table]. The GHASH
# function features a so-called "528B" variant utilizing an additional
# 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte; less is better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because the C results are
#	for the vanilla "256B" implementation, while the assembler
#	results are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same
#	as for Opteron;

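# The GF(2^128) arithmetic referred to above can be pinned down with a small
# reference model. The sub below is our own illustration (the name
# gf128_mul_ref is made up, nothing in this file calls it, and a 64-bit perl
# is assumed): the textbook bit-at-a-time multiply-and-reduce loop, with
# 128-bit blocks held as (hi,lo) pairs of 64-bit integers in GCM's bit order
# and R = 0xE1||0^120 as the reduction constant.
sub gf128_mul_ref {
my ($xh,$xl,$yh,$yl)=@_;		# X and Y as hi/lo 64-bit halves
my ($zh,$zl)=(0,0);			# accumulator Z
my ($vh,$vl)=($yh,$yl);			# V starts out as Y
    for my $i (0..127) {
	my $bit = $i<64 ? ($xh>>(63-$i))&1 : ($xl>>(127-$i))&1;
	if ($bit) { $zh^=$vh; $zl^=$vl; }	# Z ^= V when bit i of X is set
	my $carry = $vl&1;			# bit about to fall off V
	$vl = ($vl>>1) | (($vh&1)<<63);
	$vh >>= 1;
	$vh ^= 0xe1<<56 if ($carry);		# fold R back in on carry-out
    }
    return ($zh,$zl);
}
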
# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter: ghash-x86.pl argues that it makes less sense to
# increase the aggregate factor. Then why increase it here? The
# critical path consists of 3 independent pclmulqdq instructions,
# Karatsuba post-processing and reduction. "On top" of this we lay
# down the aggregated multiplication operations, triplets of
# independent pclmulqdq's. As the issue rate for pclmulqdq is
# limited, it makes little sense to aggregate more multiplications
# than it takes to perform the remaining non-multiplication
# operations. 2x is a near-optimal coefficient for contemporary
# Intel CPUs (hence the modest improvement coefficient), but not
# for Bulldozer. The latter is because its logical SIMD operations
# are twice as slow as Intel's, so the critical path is longer. A
# CPU with a higher pclmulqdq issue rate would also benefit from a
# higher aggregate factor...
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%) (if system doesn't support AVX)
# Broadwell	0.45(+110%)(if system doesn't support AVX)
# Skylake	0.44(+110%)(if system doesn't support AVX)
# Bulldozer	1.49(+27%)
# Silvermont	2.88(+13%)
# Knights L	2.12(-)    (if system doesn't support AVX)
# Goldmont	1.08(+24%)

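# To make the aggregation trade-off discussed above concrete: plain streamed
# GHASH is just Horner's rule, one multiplication and one reduction per
# 16-byte block. The sketch below is our own baseline illustration (it reuses
# the hypothetical gf128_mul_ref reference model defined earlier and is not
# called anywhere); the assembly further down computes the same sum, but
# multiplies several blocks by descending powers of H in parallel and pays
# for a single reduction per group.
sub ghash_blocks_ref {
my ($xh,$xl,$hh,$hl,@blocks)=@_;	# Xi, H and a list of [hi,lo] blocks
    for my $b (@blocks) {
	$xh ^= $b->[0];			# Xi ^= Ii
	$xl ^= $b->[1];
	($xh,$xl) = gf128_mul_ref($xh,$xl,$hh,$hl);	# Xi = (Xi^Ii)*H
    }
    return ($xh,$xl);
}
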
# March 2013
#
# ... the 8x aggregate factor AVX code path uses the reduction algorithm
# suggested by Shay Gueron [1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to the above-mentioned version. But thanks
# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we know that
# it runs at 0.41 cycles per byte on a Haswell processor, at
# 0.29 on Broadwell, and at 0.36 on Skylake.
#
# Knights Landing achieves 1.09 cpb.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be
# computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 1;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$do4xaggr=1;


$code=<<___;
.text
___


######################################################################
# PCLMULQDQ version.

@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

sub clmul64x64_T2 {	# minimal register pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pshufd		\$0b01001110,$Hkey,$T2
	pxor		$Xi,$T1			#
	pxor		$Hkey,$T2
___
} else {
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1			#
___
}
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor		$Xi,$T1			#
	pxor		$Xhi,$T1		#

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
}
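
# For reference, the 3-multiplication Karatsuba identity that clmul64x64_T2
# implements with pclmulqdq, spelled out as plain Perl. Both subs are our own
# illustration (the names are made up, nothing in this file calls them, and a
# 64-bit perl is assumed). With X = Xh:Xl and H = Hh:Hl,
#
#	X*H = (Xh*Hh)<<128 ^ (Xl*Hl) ^ ((Xh^Xl)*(Hh^Hl) ^ Xh*Hh ^ Xl*Hl)<<64
#
# which is exactly how the three carry-less products are combined above.
sub clmul64_ref {			# carry-less 64x64->128 multiply
my ($x,$y)=@_;
my ($hi,$lo)=(0,0);
    for my $i (0..63) {
	next if (!(($x>>$i)&1));
	$lo ^= ($y<<$i) & 0xffffffffffffffff;
	$hi ^= $y>>(64-$i) if ($i);	# bits pushed out of the low word
    }
    return ($hi,$lo);
}

sub clmul128_karatsuba_ref {		# 256-bit product as four 64-bit words
my ($xh,$xl,$hh,$hl)=@_;
my @lo  = clmul64_ref($xl,$hl);		# Xl*Hl
my @hi  = clmul64_ref($xh,$hh);		# Xh*Hh
my @mid = clmul64_ref($xh^$xl,$hh^$hl);	# (Xh^Xl)*(Hh^Hl)
    $mid[0] ^= $lo[0]^$hi[0];		# Karatsuba post-processing
    $mid[1] ^= $lo[1]^$hi[1];
    return ($hi[0], $hi[1]^$mid[0], $lo[0]^$mid[1], $lo[1]);
}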

sub reduction_alg9 {	# 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa		$Xi,$T2			#
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#

	# 2nd phase
	movdqa		$Xi,$T2
	psrlq		\$1,$Xi
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
___
}
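
# A note on what reduction_alg9 does: on entry $Xhi:$Xi holds a 256-bit
# carry-less product, and on exit $Xi holds that value reduced modulo the GCM
# polynomial x^128+x^7+x^2+x+1. Because the data is kept bit-reflected (see
# the pshufb with .Lbswap_mask and the <<1 twist in gcm_init_clmul below),
# the polynomial's x, x^2 and x^7 terms show up, roughly speaking, as the
# per-lane left shifts by 63, 62 and 57 in the 1st phase, and as the right
# shifts by 1, 2 and 7 (composed as successive shifts by 1, 5 and 1) in the
# 2nd phase.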

{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
.L_init_clmul:
___
$code.=<<___ if ($win64);
	sub	\$0x18,%rsp
.seh_stackalloc	0x18
	movaps	%xmm6,(%rsp)
.seh_savexmm	%xmm6, 0
.seh_endprologue
___
$code.=<<___;
	movdqu		($Xip),$Hkey
	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa		$Hkey,$T1
	psllq		\$1,$Hkey
	pxor		$T3,$T3			#
	psrlq		\$63,$T1
	pcmpgtd		$T2,$T3			# broadcast carry bit
	pslldq		\$8,$T1
	por		$T1,$Hkey		# H<<=1

	# magic reduction
	pand		.L0x1c2_polynomial(%rip),$T3
	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd		\$0b01001110,$Hkey,$HK
	movdqa		$Hkey,$Xi
	pxor		$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$Hkey,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$Hkey,$T1		# Karatsuba pre-processing
	movdqu		$Hkey,0x00($Htbl)	# save H
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x10($Htbl)		# save H^2
	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa		$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$T3,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$T3,$T1			# Karatsuba pre-processing
	movdqu		$T3,0x30($Htbl)		# save H^3
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x40($Htbl)		# save H^4
	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_init_clmul,.-gcm_init_clmul
___
}
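
# A reader's summary of the $Htbl layout produced above (derived from the
# stores above, not an interface contract):
#
#	0x00	H				0x30	H^3
#	0x10	H^2				0x40	H^4
#	0x20	H.lo^H.hi : H^2.lo^H^2.hi	0x50	likewise for H^3/H^4
#
# i.e. each pair of powers is stored next to the pre-xored halves ("salt")
# that the Karatsuba middle multiplication needs, so the GHASH loops never
# have to recompute them per block.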

{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.cfi_startproc
	_CET_ENDBR
.L_gmult_clmul:
	movdqu		($Xip),$Xi
	movdqa		.Lbswap_mask(%rip),$T3
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$T2
	pshufb		$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
	# experimental alternative. the special thing about it is that
	# there is no dependency between the two multiplications...
	mov		\$`0xE1<<1`,%eax
	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov		\$0x07,%r11d
	movq		%rax,$T1
	movq		%r10,$T2
	movq		%r11,$T3		# borrow $T3
	pand		$Xi,$T3
	pshufb		$T3,$T2			# ($Xi&7)·0xE0
	movq		%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
	pxor		$Xi,$T2
	pslldq		\$15,$T2
	paddd		$T2,$T2			# <<(64+56+1)
	pxor		$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq		\$1,$T1
	pxor		$T1,$Xhi
	pslldq		\$7,$Xi
	pxor		$Xhi,$Xi
___
$code.=<<___;
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
	ret
.cfi_endproc
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}
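
# gcm_gmult_clmul above is the single-block entry point: it loads the 16-byte
# state from ($Xip), byte-swaps it, multiplies it by H ($Htbl+0x00, with the
# pre-xored halves at $Htbl+0x20 feeding the Karatsuba middle term), reduces
# and stores the result back. It is the one-multiplication special case of
# the streamed gcm_ghash_clmul below.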

{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));

$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	32
gcm_ghash_clmul:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
	lea	-0x20(%rax),%rsp
.seh_stackalloc	0x20+0x88
	movaps	%xmm6,-0x20(%rax)
.seh_savexmm	%xmm6, 0x20-0x20
	movaps	%xmm7,-0x10(%rax)
.seh_savexmm	%xmm7, 0x20-0x10
	movaps	%xmm8,0(%rax)
.seh_savexmm	%xmm8, 0x20+0
	movaps	%xmm9,0x10(%rax)
.seh_savexmm	%xmm9, 0x20+0x10
	movaps	%xmm10,0x20(%rax)
.seh_savexmm	%xmm10, 0x20+0x20
	movaps	%xmm11,0x30(%rax)
.seh_savexmm	%xmm11, 0x20+0x30
	movaps	%xmm12,0x40(%rax)
.seh_savexmm	%xmm12, 0x20+0x40
	movaps	%xmm13,0x50(%rax)
.seh_savexmm	%xmm13, 0x20+0x50
	movaps	%xmm14,0x60(%rax)
.seh_savexmm	%xmm14, 0x20+0x60
	movaps	%xmm15,0x70(%rax)
.seh_savexmm	%xmm15, 0x20+0x70
.seh_endprologue
___
$code.=<<___;
	movdqa		.Lbswap_mask(%rip),$T3

	movdqu		($Xip),$Xi
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$HK
	pshufb		$T3,$Xi

	sub		\$0x10,$len
	jz		.Lodd_tail

	movdqu		0x10($Htbl),$Hkey2
___
if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));

$code.=<<___;
	cmp		\$0x30,$len
	jb		.Lskip4x

	sub		\$0x30,$len
	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
	movdqu		0x30($Htbl),$Hkey3
	movdqu		0x40($Htbl),$Hkey4

	#######
	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
	#
	movdqu		0x30($inp),$Xln
	 movdqu		0x20($inp),$Xl
	pshufb		$T3,$Xln
	 pshufb		$T3,$Xl
	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	pclmulqdq	\$0x10,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	xorps		$Xm,$Xmn

	movdqu		0x10($inp),$Xl
	 movdqu		0($inp),$T1
	pshufb		$T3,$Xl
	 pshufb		$T3,$T1
	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	 pxor		$T1,$Xi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	 movdqa		$Xi,$Xhi
	 pshufd		\$0b01001110,$Xi,$T1
	 pxor		$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jc	.Ltail4x

	jmp	.Lmod4_loop
.align	32
.Lmod4_loop:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps		$Xm,$Xmn
	 movdqu		0x30($inp),$Xl
	 pshufb		$T3,$Xl
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps		$Xln,$Xi
	 movdqu		0x20($inp),$Xln
	 movdqa		$Xl,$Xh
	pclmulqdq	\$0x10,$HK,$T1
	 pshufd		\$0b01001110,$Xl,$Xm
	xorps		$Xhn,$Xhi
	 pxor		$Xl,$Xm
	 pshufb		$T3,$Xln
	movups		0x20($Htbl),$HK
	xorps		$Xmn,$T1
	 pclmulqdq	\$0x00,$Hkey,$Xl
	 pshufd		\$0b01001110,$Xln,$Xmn

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	 movdqa		$Xln,$Xhn
	pxor		$Xhi,$T1		#
	 pxor		$Xln,$Xmn
	movdqa		$T1,$T2			#
	 pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq		\$8,$T1
	psrldq		\$8,$T2			#
	pxor		$T1,$Xi
	movdqa		.L7_mask(%rip),$T1
	pxor		$T2,$Xhi		#
	movq		%rax,$T2

	pand		$Xi,$T1			# 1st phase
	pshufb		$T1,$T2			#
	pxor		$Xi,$T2			#
	 pclmulqdq	\$0x00,$HK,$Xm
	psllq		\$57,$T2		#
	movdqa		$T2,$T1			#
	pslldq		\$8,$T2
	 pclmulqdq	\$0x00,$Hkey2,$Xln
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#
	movdqu		0($inp),$T1

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	 pclmulqdq	\$0x11,$Hkey2,$Xhn
	 xorps		$Xl,$Xln
	 movdqu		0x10($inp),$Xl
	 pshufb		$T3,$Xl
	 pclmulqdq	\$0x10,$HK,$Xmn
	 xorps		$Xh,$Xhn
	 movups		0x50($Htbl),$HK
	pshufb		$T3,$T1
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi

	 movdqa		$Xl,$Xh
	 pxor		$Xm,$Xmn
	 pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T2,$Xi			#
	pxor		$T1,$Xhi
	 pxor		$Xl,$Xm
	 pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
	movdqa		$Xi,$Xhi
	 pclmulqdq	\$0x11,$Hkey3,$Xh
	 xorps		$Xl,$Xln
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	 pclmulqdq	\$0x00,$HK,$Xm
	 xorps		$Xh,$Xhn

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jnc	.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	pclmulqdq	\$0x10,$HK,$T1
	xorps		$Xm,$Xmn
	xorps		$Xln,$Xi
	xorps		$Xhn,$Xhi
	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor		$Xmn,$T1

	pxor		$Xhi,$T1		#
	pxor		$Xi,$Xhi

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
	&reduction_alg9($Xhi,$Xi);
$code.=<<___;
	add	\$0x40,$len
	jz	.Ldone
	movdqu	0x20($Htbl),$HK
	sub	\$0x10,$len
	jz	.Lodd_tail
.Lskip4x:
___
}
$code.=<<___;
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
	movdqu		($inp),$T1		# Ii
	movdqu		16($inp),$Xln		# Ii+1
	pshufb		$T3,$T1
	pshufb		$T3,$Xln
	pxor		$T1,$Xi			# Ii+Xi

	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	lea		32($inp),$inp		# i+=2
	nop
	sub		\$0x20,$len
	jbe		.Leven_tail
	nop
	jmp		.Lmod_loop

.align	32
.Lmod_loop:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	  movdqu	($inp),$T2		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	  pshufb	$T3,$T2
	  movdqu	16($inp),$Xln		# Ii+1

	pxor		$Xhi,$T1
	  pxor		$T2,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$Xmn
	 pshufb		$T3,$Xln
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#

	movdqa		$Xln,$Xhn		#

	  movdqa	$Xi,$T2			# 1st phase
	  movdqa	$Xi,$T1
	  psllq		\$5,$Xi
	  pxor		$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	  psllq		\$1,$Xi
	  pxor		$T1,$Xi			#
	  psllq		\$57,$Xi		#
	  movdqa	$Xi,$T1			#
	  pslldq	\$8,$Xi
	  psrldq	\$8,$T1			#
	  pxor		$T2,$Xi
	pshufd		\$0b01001110,$Xhn,$Xmn
	  pxor		$T1,$Xhi		#
	pxor		$Xhn,$Xmn		#

	  movdqa	$Xi,$T2			# 2nd phase
	  psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	  pxor		$T2,$Xhi		#
	  pxor		$Xi,$T2
	  psrlq		\$5,$Xi
	  pxor		$T2,$Xi			#
	lea		32($inp),$inp
	  psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	  pxor		$Xhi,$Xi		#

	sub		\$0x20,$len
	ja		.Lmod_loop

.Leven_tail:
	 movdqa		$Xi,$Xhi
	 movdqa		$Xmn,$T1
	 pshufd		\$0b01001110,$Xi,$Xmn	#
	 pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	pxor		$Xi,$T1
	pxor		$Xhi,$T1
	pxor		$T1,$Xmn
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	test		$len,$len
	jnz		.Ldone

.Lodd_tail:
	movdqu		($inp),$T1		# Ii
	pshufb		$T3,$T1
	pxor		$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
.Ldone:
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
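
# gcm_ghash_clmul($Xip,$Htbl,$inp,$len) above is the streamed version: it
# folds $len bytes from $inp (the code expects a multiple of 16) into the
# 16-byte state at ($Xip). Blocks are consumed four at a time in .Lmod4_loop
# using the identity quoted above,
#
#	Xi+4 = [H*Ii+3 + H^2*Ii+2 + H^3*Ii+1 + H^4*(Ii+Xi)] mod P
#
# while .Lskip4x, .Lmod_loop, .Leven_tail and .Lodd_tail handle inputs that
# are shorter than, or not a multiple of, four blocks with 2x and 1x steps.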

$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

$code.=<<___ if ($win64);
	sub	\$0x18,%rsp
.seh_stackalloc	0x18
	movaps	%xmm6,(%rsp)
.seh_savexmm	%xmm6, 0
.seh_endprologue
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Hkey
	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq		\$63,$Hkey,$T1
	vpsllq		\$1,$Hkey,$Hkey
	vpxor		$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq		\$8,$T1,$T1
	vpor		$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa		$Hkey,$Xi
	vpxor		$Hkey,$HK,$HK
	mov		\$4,%r10		# up to H^8
	jmp		.Linit_start_avx
___

sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
}

sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
}

$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
	vpshufd		\$0b01001110,$T3,$T1
	vpshufd		\$0b01001110,$Xi,$T2
	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea		0x30($Htbl),$Htbl
	sub		\$1,%r10
	jnz		.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu		$T3,-0x10($Htbl)

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
___
$code.=<<___;
	ret
.seh_endproc
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}
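
# The AVX initialization differs from gcm_init_clmul above only in how much
# it precomputes: with %r10 = 4 the loop emits, every 0x30 bytes of $Htbl, an
# odd power of H, the next even power and their pre-xored Karatsuba halves,
# i.e. H^1 through H^8 plus "salt", which is what the 8x aggregated loop in
# gcm_ghash_avx below consumes.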

$code.=<<___;
.globl	gcm_gmult_avx
.type	gcm_gmult_avx,\@abi-omnipotent
.align	32
gcm_gmult_avx:
.cfi_startproc
	_CET_ENDBR
	jmp	.L_gmult_clmul
.cfi_endproc
.size	gcm_gmult_avx,.-gcm_gmult_avx
___

$code.=<<___;
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,\@abi-omnipotent
.align	32
gcm_ghash_avx:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
	lea	-0x20(%rax),%rsp
.seh_stackalloc	0x20+0x88
	movaps	%xmm6,-0x20(%rax)
.seh_savexmm	%xmm6, 0x20-0x20
	movaps	%xmm7,-0x10(%rax)
.seh_savexmm	%xmm7, 0x20-0x10
	movaps	%xmm8,0(%rax)
.seh_savexmm	%xmm8, 0x20+0
	movaps	%xmm9,0x10(%rax)
.seh_savexmm	%xmm9, 0x20+0x10
	movaps	%xmm10,0x20(%rax)
.seh_savexmm	%xmm10, 0x20+0x20
	movaps	%xmm11,0x30(%rax)
.seh_savexmm	%xmm11, 0x20+0x30
	movaps	%xmm12,0x40(%rax)
.seh_savexmm	%xmm12, 0x20+0x40
	movaps	%xmm13,0x50(%rax)
.seh_savexmm	%xmm13, 0x20+0x50
	movaps	%xmm14,0x60(%rax)
.seh_savexmm	%xmm14, 0x20+0x60
	movaps	%xmm15,0x70(%rax)
.seh_savexmm	%xmm15, 0x20+0x70
.seh_endprologue
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Xi		# load $Xi
	lea		.L0x1c2_polynomial(%rip),%r10
	lea		0x40($Htbl),$Htbl	# size optimization
	vmovdqu		.Lbswap_mask(%rip),$bswap
	vpshufb		$bswap,$Xi,$Xi
	cmp		\$0x80,$len
	jb		.Lshort_avx
	sub		\$0x80,$len

	vmovdqu		0x70($inp),$Ii		# I[7]
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb		$bswap,$Ii,$Ii
	vmovdqu		0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	 vmovdqu	0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Ii,$T2,$T2
	 vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vmovdqu	0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	 vpxor		$Ij,$T1,$T1

	 vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	 vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	 vpxor		$Ii,$T2,$T2
	 vmovdqu	0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	 vmovdqu	0x50-0x40($Htbl),$HK

	 vpshufb	$bswap,$Ij,$Ij
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	 vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	 vpxor		$Ij,$T1,$T1

	 vmovdqu	0x30($inp),$Ii		# I[3]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	 vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	 vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor		$Zmi,$Xmi,$Xmi
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	 vmovdqu	0x80-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2

	 vmovdqu	0x20($inp),$Ij		# I[2]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	 vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor		$Xmi,$Zmi,$Zmi
	 vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	 vpxor		$Ij,$T1,$T1

	 vmovdqu	0x10($inp),$Ii		# I[1]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	 vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	 vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor		$Zmi,$Xmi,$Xmi
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	 vmovdqu	0xb0-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2

	 vmovdqu	($inp),$Ij		# I[0]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	 vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea		0x80($inp),$inp
	cmp		\$0x80,$len
	jb		.Ltail_avx

	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
	sub		\$0x80,$len
	jmp		.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	 vmovdqu	0x70($inp),$Ii		# I[7]
	vpxor		$Xlo,$Zlo,$Zlo
	vpxor		$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	 vpshufb	$bswap,$Ii,$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	 vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	 vmovdqu	0x20-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2

	  vmovdqu	0x60($inp),$Ij		# I[6]
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Zlo,$Xi,$Xi		# collect result
	  vpshufb	$bswap,$Ij,$Ij
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps		$Zhi,$Xo,$Xo
	  vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
	vpxor		$Zmi,$Tred,$Tred
	 vxorps		$Ij,$T1,$T1

	  vmovdqu	0x50($inp),$Ii		# I[5]
	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Xo,$Tred,$Tred
	vpslldq		\$8,$Tred,$T2
	 vpxor		$Xlo,$Zlo,$Zlo
	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq		\$8,$Tred,$Tred
	vpxor		$T2, $Xi, $Xi
	  vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	  vpshufb	$bswap,$Ii,$Ii
	vxorps		$Tred,$Xo, $Xo
	 vpxor		$Xhi,$Zhi,$Zhi
	 vpunpckhqdq	$Ii,$Ii,$T2
	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
	  vmovdqu	0x50-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2
	 vpxor		$Xmi,$Zmi,$Zmi

	  vmovdqu	0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	  vpshufb	$bswap,$Ij,$Ij
	 vpxor		$Zlo,$Xlo,$Xlo
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	  vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vpxor		$Zhi,$Xhi,$Xhi
	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
	 vxorps		$Ij,$T1,$T1
	 vpxor		$Zmi,$Xmi,$Xmi

	  vmovdqu	0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	  vpshufb	$bswap,$Ii,$Ii
	 vpxor		$Xlo,$Zlo,$Zlo
	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	  vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	 vpunpckhqdq	$Ii,$Ii,$T2
	 vpxor		$Xhi,$Zhi,$Zhi
	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
	  vmovdqu	0x80-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2
	 vpxor		$Xmi,$Zmi,$Zmi

	  vmovdqu	0x20($inp),$Ij		# I[2]
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	  vpshufb	$bswap,$Ij,$Ij
	 vpxor		$Zlo,$Xlo,$Xlo
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	  vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vpxor		$Zhi,$Xhi,$Xhi
	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
	 vpxor		$Ij,$T1,$T1
	 vpxor		$Zmi,$Xmi,$Xmi
	vxorps		$Tred,$Xi,$Xi

	  vmovdqu	0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	  vpshufb	$bswap,$Ii,$Ii
	 vpxor		$Xlo,$Zlo,$Zlo
	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	  vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps		$Xo,$Tred,$Tred
	 vpunpckhqdq	$Ii,$Ii,$T2
	 vpxor		$Xhi,$Zhi,$Zhi
	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
	  vmovdqu	0xb0-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2
	 vpxor		$Xmi,$Zmi,$Zmi

	  vmovdqu	($inp),$Ij		# I[0]
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	  vpshufb	$bswap,$Ij,$Ij
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	  vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Tred,$Ij,$Ij
	 vpclmulqdq	\$0x10,$HK,  $T2,$Xmi
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi

	lea		0x80($inp),$inp
	sub		\$0x80,$len
	jnc		.Loop8x_avx

	add		\$0x80,$len
	jmp		.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu		-0x10($inp,$len),$Ii	# very last word
	lea		($inp,$len),$inp
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu		0x20-0x40($Htbl),$HK
	vpshufb		$bswap,$Ii,$Ij

	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa		$Xhi,$Zhi		# $Zhi and
	vmovdqa		$Xmi,$Zmi		# $Zmi
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x20($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x30($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x50-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x40($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x50($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x80-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x60($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x70($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq		0xb8-0x40($Htbl),$HK
	sub		\$0x10,$len
	jmp		.Ltail_avx

.align	32
.Ltail_avx:
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu		(%r10),$Tred

	vpxor		$Xlo,$Zlo,$Xi
	vpxor		$Xhi,$Zhi,$Xo
	vpxor		$Xmi,$Zmi,$Zmi

	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor		$Xo, $Zmi,$Zmi
	vpslldq		\$8, $Zmi,$T2
	vpsrldq		\$8, $Zmi,$Zmi
	vpxor		$T2, $Xi, $Xi
	vpxor		$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$Xo,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	cmp		\$0,$len
	jne		.Lshort_avx

	vpshufb		$bswap,$Xi,$Xi
	vmovdqu		$Xi,($Xip)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
$code.=<<___;
	jmp	.L_ghash_clmul
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}
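
# The AVX GHASH above mirrors gcm_ghash_clmul but with an 8x aggregate
# factor: .Loop8x_avx multiplies eight input blocks by $Hkey^8..$Hkey^1,
# collects the three Karatsuba partial sums in $Zlo/$Zhi/$Zmi, and performs
# one reduction per 0x80 bytes via two vpclmulqdq/vpalignr "phases" against
# .L0x1c2_polynomial (the Gueron-style reduction mentioned in the March 2013
# note above). .Lshort_avx and .Ltail_avx walk tails of fewer than eight
# blocks one block at a time with descending powers of H.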

$code.=<<___;
.section .rodata
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.align	64

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.text
___

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";