#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March, June 2010
#
# The module implements the "4-bit" GCM GHASH function and the underlying
# single multiplication operation in GF(2^128). "4-bit" means that
# it uses a 256-byte per-key table [+128 bytes shared table]. The GHASH
# function features a so-called "528B" variant utilizing an additional
# 256+16 bytes of per-key storage [+512 bytes shared table].
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	the comparison is not completely fair, because the C results are
#	for the vanilla "256B" implementation, while the assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why the Core2 result is not the same as
#	Opteron's;
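#
# For reference, the streamed GHASH operation measured above processes
# 16-byte input blocks Ii as
#
#	Xi+1 = (Xi + Ii)*H mod P,	P(x) = x^128+x^7+x^2+x+1
#
# with "+" and "*" denoting addition (XOR) and multiplication in the
# field GF(2^128).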

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase the reduction aggregate factor to 4x. As for
# the latter, ghash-x86.pl argues that it makes little sense to
# increase the aggregate factor. Then why increase it here? The critical
# path consists of 3 independent pclmulqdq instructions, Karatsuba post-
# processing and reduction. "On top" of this we lay down aggregated
# multiplication operations, triplets of independent pclmulqdq's. As
# the issue rate for pclmulqdq is limited, it makes little sense to
# aggregate more multiplications than it takes to perform the remaining
# non-multiplication operations. 2x is a near-optimal coefficient for
# contemporary Intel CPUs (hence the modest improvement coefficient),
# but not for Bulldozer. The latter is because its logical SIMD operations
# are twice as slow as Intel's, so the critical path is longer. A CPU
# with a higher pclmulqdq issue rate would also benefit from a higher
# aggregate factor...
#
# Westmere	1.78(+13%)
# Sandy Bridge	1.80(+8%)
# Ivy Bridge	1.80(+7%)
# Haswell	0.55(+93%) (if system doesn't support AVX)
# Broadwell	0.45(+110%)(if system doesn't support AVX)
# Skylake	0.44(+110%)(if system doesn't support AVX)
# Bulldozer	1.49(+27%)
# Silvermont	2.88(+13%)
# Knights L	2.12(-)    (if system doesn't support AVX)
# Goldmont	1.08(+24%)

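# With 4x aggregation the main loop therefore gathers four blocks per
# reduction, computing (see the matching comment in gcm_ghash_clmul below)
#
#	Xi+4 = [(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
#
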
# March 2013
#
# ... the 8x aggregate factor AVX code path uses the reduction algorithm
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to the above-mentioned version. But thanks
# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we know that
# it performs at 0.41 cycles per byte on a Haswell processor, at
# 0.29 on Broadwell, and at 0.36 on Skylake.
#
# Knights Landing achieves 1.09 cpb.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest

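# In this module the 8x AVX code path performs that reduction with two
# vpclmulqdq instructions against .L0x1c2_polynomial (see .Loop8x_avx and
# .Ltail_avx below) rather than with the shift-and-XOR sequence used by
# reduction_alg9.
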
# This file was patched in BoringSSL to remove the variable-time 4-bit
# implementation.

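# For illustration only: a bit-by-bit reference model of the GF(2^128)
# multiplication implemented below (NIST SP 800-38D, Algorithm 1, with the
# 16-byte blocks read big-endian into Math::BigInt).  It is not called by
# this generator and exists purely as a cross-check one can run by hand;
# the name gf128_mul_ref is local to this sketch.
sub gf128_mul_ref {
	my ($x, $y) = @_;		# Math::BigInt, 0 <= $x,$y < 2^128
	require Math::BigInt;
	my $R = Math::BigInt->from_hex("e1000000000000000000000000000000");
	my $z = Math::BigInt->bzero();
	my $v = $y->copy();
	for my $i (0 .. 127) {
		# bit 0 in the GCM convention is the leftmost bit of the
		# block, i.e. bit 127 of the integer
		$z->bxor($v) if $x->copy()->brsft(127 - $i)->band(1)->is_one();
		my $lsb = $v->copy()->band(1)->is_one();
		$v->brsft(1);
		$v->bxor($R) if $lsb;
	}
	return $z;			# $x * $y mod x^128+x^7+x^2+x+1
}
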
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# See the notes about |$avx| in aesni-gcm-x86_64.pl; otherwise tags will be
# computed incorrectly.
#
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 1;

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$do4xaggr=1;


$code=<<___;
.text
___


######################################################################
# PCLMULQDQ version.

@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order
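# For reference, the C-level prototypes these argument registers are mapped
# onto by the GCM glue code are, give or take typedef names, along the
# lines of:
#
#	void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
#	void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
#	void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16],
#			     const u8 *inp, size_t len);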

($Xi,$Xhi)=("%xmm0","%xmm1");	$Hkey="%xmm2";
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");

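# clmul64x64_T2 below performs the 128x128->256-bit carry-less
# multiplication with three pclmulqdq's via Karatsuba: with X = Xh:Xl
# and H = Hh:Hl (64-bit halves),
#
#	X*H = (Xh*Hh)<<128 ^ (Xl*Hl)
#	    ^ ((Xh^Xl)*(Hh^Hl) ^ Xh*Hh ^ Xl*Hl)<<64
#
# $HK, when provided by the caller, holds the pre-computed Hh^Hl half.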
sub clmul64x64_T2 {	# minimal register pressure
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pshufd		\$0b01001110,$Hkey,$T2
	pxor		$Xi,$T1			#
	pxor		$Hkey,$T2
___
} else {
$code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1			#
___
}
$code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor		$Xi,$T1			#
	pxor		$Xhi,$T1		#

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
}

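# reduction_alg9 folds the 256-bit carry-less product Xhi:Xi back into
# 128 bits modulo the GHASH polynomial x^128+x^7+x^2+x+1 (all data is
# kept byte-reflected, see .Lbswap_mask).  Both phases use only shifts
# and XORs; reduction_avx further down is the same dataflow expressed
# with three-operand AVX instructions.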
sub reduction_alg9 {	# 17/11 times faster than Intel version
my ($Xhi,$Xi) = @_;

$code.=<<___;
	# 1st phase
	movdqa		$Xi,$T2			#
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#

	# 2nd phase
	movdqa		$Xi,$T2
	psrlq		\$1,$Xi
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
___
}

{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
.L_init_clmul:
___
$code.=<<___ if ($win64);
	sub	\$0x18,%rsp
.seh_allocstack	0x18
	movaps	%xmm6,(%rsp)
.seh_savexmm128	%xmm6, 0
___
$code.=<<___;
	movdqu		($Xip),$Hkey
	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa		$Hkey,$T1
	psllq		\$1,$Hkey
	pxor		$T3,$T3			#
	psrlq		\$63,$T1
	pcmpgtd		$T2,$T3			# broadcast carry bit
	pslldq		\$8,$T1
	por		$T1,$Hkey		# H<<=1

	# magic reduction
	pand		.L0x1c2_polynomial(%rip),$T3
	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd		\$0b01001110,$Hkey,$HK
	movdqa		$Hkey,$Xi
	pxor		$Hkey,$HK
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$Hkey,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$Hkey,$T1		# Karatsuba pre-processing
	movdqu		$Hkey,0x00($Htbl)	# save H
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x10($Htbl)		# save H^2
	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa		$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$T3,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$T3,$T1			# Karatsuba pre-processing
	movdqu		$T3,0x30($Htbl)		# save H^3
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x40($Htbl)		# save H^4
	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_init_clmul,.-gcm_init_clmul
___
}

{ my ($Xip,$Htbl)=@_4args;

$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.cfi_startproc
	_CET_ENDBR
.L_gmult_clmul:
	movdqu		($Xip),$Xi
	movdqa		.Lbswap_mask(%rip),$T3
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$T2
	pshufb		$T3,$Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
	# experimental alternative. the special thing about it is that there
	# is no dependency between the two multiplications...
	mov		\$`0xE1<<1`,%eax
	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov		\$0x07,%r11d
	movq		%rax,$T1
	movq		%r10,$T2
	movq		%r11,$T3		# borrow $T3
	pand		$Xi,$T3
	pshufb		$T3,$T2			# ($Xi&7)·0xE0
	movq		%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
	pxor		$Xi,$T2
	pslldq		\$15,$T2
	paddd		$T2,$T2			# <<(64+56+1)
	pxor		$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq		\$1,$T1
	pxor		$T1,$Xhi
	pslldq		\$7,$Xi
	pxor		$Xhi,$Xi
___
$code.=<<___;
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
	ret
.cfi_endproc
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}

{ my ($Xip,$Htbl,$inp,$len)=@_4args;
  my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
  my ($T1,$T2,$T3)=map("%xmm$_",(8..10));

$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	32
gcm_ghash_clmul:
.cfi_startproc
.seh_startproc
	_CET_ENDBR
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
	lea	-0x88(%rsp),%rax
	lea	-0x20(%rax),%rsp
.seh_allocstack	0x20+0x88
	movaps	%xmm6,-0x20(%rax)
.seh_savexmm128	%xmm6, 0x20-0x20
	movaps	%xmm7,-0x10(%rax)
.seh_savexmm128	%xmm7, 0x20-0x10
	movaps	%xmm8,0(%rax)
.seh_savexmm128	%xmm8, 0x20+0
	movaps	%xmm9,0x10(%rax)
.seh_savexmm128	%xmm9, 0x20+0x10
	movaps	%xmm10,0x20(%rax)
.seh_savexmm128	%xmm10, 0x20+0x20
	movaps	%xmm11,0x30(%rax)
.seh_savexmm128	%xmm11, 0x20+0x30
	movaps	%xmm12,0x40(%rax)
.seh_savexmm128	%xmm12, 0x20+0x40
	movaps	%xmm13,0x50(%rax)
.seh_savexmm128	%xmm13, 0x20+0x50
	movaps	%xmm14,0x60(%rax)
.seh_savexmm128	%xmm14, 0x20+0x60
	movaps	%xmm15,0x70(%rax)
.seh_savexmm128	%xmm15, 0x20+0x70
___
$code.=<<___;
	movdqa		.Lbswap_mask(%rip),$T3

	movdqu		($Xip),$Xi
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$HK
	pshufb		$T3,$Xi

	sub		\$0x10,$len
	jz		.Lodd_tail

	movdqu		0x10($Htbl),$Hkey2
___
if ($do4xaggr) {
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));

$code.=<<___;
	cmp		\$0x30,$len
	jb		.Lskip4x

	sub		\$0x30,$len
	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
	movdqu		0x30($Htbl),$Hkey3
	movdqu		0x40($Htbl),$Hkey4

	#######
	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
	#
	movdqu		0x30($inp),$Xln
	 movdqu		0x20($inp),$Xl
	pshufb		$T3,$Xln
	 pshufb		$T3,$Xl
	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	pclmulqdq	\$0x10,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	xorps		$Xm,$Xmn

	movdqu		0x10($inp),$Xl
	 movdqu		0($inp),$T1
	pshufb		$T3,$Xl
	 pshufb		$T3,$T1
	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	 pxor		$T1,$Xi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	 movdqa		$Xi,$Xhi
	 pshufd		\$0b01001110,$Xi,$T1
	 pxor		$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jc	.Ltail4x

	jmp	.Lmod4_loop
.align	32
.Lmod4_loop:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps		$Xm,$Xmn
	 movdqu		0x30($inp),$Xl
	 pshufb		$T3,$Xl
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps		$Xln,$Xi
	 movdqu		0x20($inp),$Xln
	 movdqa		$Xl,$Xh
	pclmulqdq	\$0x10,$HK,$T1
	 pshufd		\$0b01001110,$Xl,$Xm
	xorps		$Xhn,$Xhi
	 pxor		$Xl,$Xm
	 pshufb		$T3,$Xln
	movups		0x20($Htbl),$HK
	xorps		$Xmn,$T1
	 pclmulqdq	\$0x00,$Hkey,$Xl
	 pshufd		\$0b01001110,$Xln,$Xmn

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	 movdqa		$Xln,$Xhn
	pxor		$Xhi,$T1		#
	 pxor		$Xln,$Xmn
	movdqa		$T1,$T2			#
	 pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq		\$8,$T1
	psrldq		\$8,$T2			#
	pxor		$T1,$Xi
	movdqa		.L7_mask(%rip),$T1
	pxor		$T2,$Xhi		#
	movq		%rax,$T2

	pand		$Xi,$T1			# 1st phase
	pshufb		$T1,$T2			#
	pxor		$Xi,$T2			#
	 pclmulqdq	\$0x00,$HK,$Xm
	psllq		\$57,$T2		#
	movdqa		$T2,$T1			#
	pslldq		\$8,$T2
	 pclmulqdq	\$0x00,$Hkey2,$Xln
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#
	movdqu		0($inp),$T1

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	 pclmulqdq	\$0x11,$Hkey2,$Xhn
	 xorps		$Xl,$Xln
	 movdqu		0x10($inp),$Xl
	 pshufb		$T3,$Xl
	 pclmulqdq	\$0x10,$HK,$Xmn
	 xorps		$Xh,$Xhn
	 movups		0x50($Htbl),$HK
	pshufb		$T3,$T1
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi

	 movdqa		$Xl,$Xh
	 pxor		$Xm,$Xmn
	 pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T2,$Xi			#
	pxor		$T1,$Xhi
	 pxor		$Xl,$Xm
	 pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
	movdqa		$Xi,$Xhi
	 pclmulqdq	\$0x11,$Hkey3,$Xh
	 xorps		$Xl,$Xln
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	 pclmulqdq	\$0x00,$HK,$Xm
	 xorps		$Xh,$Xhn

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jnc	.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	pclmulqdq	\$0x10,$HK,$T1
	xorps		$Xm,$Xmn
	xorps		$Xln,$Xi
	xorps		$Xhn,$Xhi
	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor		$Xmn,$T1

	pxor		$Xhi,$T1		#
	pxor		$Xi,$Xhi

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
	&reduction_alg9($Xhi,$Xi);
$code.=<<___;
	add	\$0x40,$len
	jz	.Ldone
	movdqu	0x20($Htbl),$HK
	sub	\$0x10,$len
	jz	.Lodd_tail
.Lskip4x:
___
}
$code.=<<___;
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
	movdqu		($inp),$T1		# Ii
	movdqu		16($inp),$Xln		# Ii+1
	pshufb		$T3,$T1
	pshufb		$T3,$Xln
	pxor		$T1,$Xi			# Ii+Xi

	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	lea		32($inp),$inp		# i+=2
	nop
	sub		\$0x20,$len
	jbe		.Leven_tail
	nop
	jmp		.Lmod_loop

.align	32
.Lmod_loop:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	  movdqu	($inp),$T2		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	  pshufb	$T3,$T2
	  movdqu	16($inp),$Xln		# Ii+1

	pxor		$Xhi,$T1
	  pxor		$T2,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$Xmn
	 pshufb		$T3,$Xln
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#

	movdqa		$Xln,$Xhn		#

	  movdqa	$Xi,$T2			# 1st phase
	  movdqa	$Xi,$T1
	  psllq		\$5,$Xi
	  pxor		$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	  psllq		\$1,$Xi
	  pxor		$T1,$Xi			#
	  psllq		\$57,$Xi		#
	  movdqa	$Xi,$T1			#
	  pslldq	\$8,$Xi
	  psrldq	\$8,$T1			#
	  pxor		$T2,$Xi
	pshufd		\$0b01001110,$Xhn,$Xmn
	  pxor		$T1,$Xhi		#
	pxor		$Xhn,$Xmn		#

	  movdqa	$Xi,$T2			# 2nd phase
	  psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	  pxor		$T2,$Xhi		#
	  pxor		$Xi,$T2
	  psrlq		\$5,$Xi
	  pxor		$T2,$Xi			#
	lea		32($inp),$inp
	  psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	  pxor		$Xhi,$Xi		#

	sub		\$0x20,$len
	ja		.Lmod_loop

.Leven_tail:
	 movdqa		$Xi,$Xhi
	 movdqa		$Xmn,$T1
	 pshufd		\$0b01001110,$Xi,$Xmn	#
	 pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	pxor		$Xi,$T1
	pxor		$Xhi,$T1
	pxor		$T1,$Xmn
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#
___
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	test		$len,$len
	jnz		.Ldone

.Lodd_tail:
	movdqu		($inp),$T1		# Ii
	pshufb		$T3,$T1
	pxor		$T1,$Xi			# Ii+Xi
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H*(Ii+Xi)
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
.Ldone:
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}

$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
.cfi_startproc
	_CET_ENDBR
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

$code.=<<___ if ($win64);
.seh_startproc
	sub	\$0x18,%rsp
.seh_allocstack	0x18
	movaps	%xmm6,(%rsp)
.seh_savexmm128	%xmm6, 0
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Hkey
	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq		\$63,$Hkey,$T1
	vpsllq		\$1,$Hkey,$Hkey
	vpxor		$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq		\$8,$T1,$T1
	vpor		$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa		$Hkey,$Xi
	vpxor		$Hkey,$HK,$HK
	mov		\$4,%r10		# up to H^8
	jmp		.Linit_start_avx
___

sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
}

sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
}

$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
	vpshufd		\$0b01001110,$T3,$T1
	vpshufd		\$0b01001110,$Xi,$T2
	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea		0x30($Htbl),$Htbl
	sub		\$1,%r10
	jnz		.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu		$T3,-0x10($Htbl)

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
___
$code.=<<___;
	ret
.seh_endproc
.cfi_endproc
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}

$code.=<<___;
.globl	gcm_gmult_avx
.type	gcm_gmult_avx,\@abi-omnipotent
.align	32
gcm_gmult_avx:
.cfi_startproc
	_CET_ENDBR
	jmp	.L_gmult_clmul
.cfi_endproc
.size	gcm_gmult_avx,.-gcm_gmult_avx
___

$code.=<<___;
.globl	gcm_ghash_avx
.type	gcm_ghash_avx,\@abi-omnipotent
.align	32
gcm_ghash_avx:
.cfi_startproc
	_CET_ENDBR
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
my ($Xlo,$Xhi,$Xmi,
    $Zlo,$Zhi,$Zmi,
    $Hkey,$HK,$T1,$T2,
    $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));

$code.=<<___ if ($win64);
.seh_startproc
	lea	-0x88(%rsp),%rax
	lea	-0x20(%rax),%rsp
.seh_allocstack	0x20+0x88
	movaps	%xmm6,-0x20(%rax)
.seh_savexmm128	%xmm6, 0x20-0x20
	movaps	%xmm7,-0x10(%rax)
.seh_savexmm128	%xmm7, 0x20-0x10
	movaps	%xmm8,0(%rax)
.seh_savexmm128	%xmm8, 0x20+0
	movaps	%xmm9,0x10(%rax)
.seh_savexmm128	%xmm9, 0x20+0x10
	movaps	%xmm10,0x20(%rax)
.seh_savexmm128	%xmm10, 0x20+0x20
	movaps	%xmm11,0x30(%rax)
.seh_savexmm128	%xmm11, 0x20+0x30
	movaps	%xmm12,0x40(%rax)
.seh_savexmm128	%xmm12, 0x20+0x40
	movaps	%xmm13,0x50(%rax)
.seh_savexmm128	%xmm13, 0x20+0x50
	movaps	%xmm14,0x60(%rax)
.seh_savexmm128	%xmm14, 0x20+0x60
	movaps	%xmm15,0x70(%rax)
.seh_savexmm128	%xmm15, 0x20+0x70
___
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Xi		# load $Xi
	lea		.L0x1c2_polynomial(%rip),%r10
	lea		0x40($Htbl),$Htbl	# size optimization
	vmovdqu		.Lbswap_mask(%rip),$bswap
	vpshufb		$bswap,$Xi,$Xi
	cmp		\$0x80,$len
	jb		.Lshort_avx
	sub		\$0x80,$len

	vmovdqu		0x70($inp),$Ii		# I[7]
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vpshufb		$bswap,$Ii,$Ii
	vmovdqu		0x20-0x40($Htbl),$HK

	vpunpckhqdq	$Ii,$Ii,$T2
	 vmovdqu	0x60($inp),$Ij		# I[6]
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Ii,$T2,$T2
	 vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vmovdqu	0x50($inp),$Ii		# I[5]
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	 vpxor		$Ij,$T1,$T1

	 vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	 vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	 vpxor		$Ii,$T2,$T2
	 vmovdqu	0x40($inp),$Ij		# I[4]
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	 vmovdqu	0x50-0x40($Htbl),$HK

	 vpshufb	$bswap,$Ij,$Ij
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	 vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	 vpxor		$Ij,$T1,$T1

	 vmovdqu	0x30($inp),$Ii		# I[3]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	 vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	 vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	vpxor		$Zmi,$Xmi,$Xmi
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	 vmovdqu	0x80-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2

	 vmovdqu	0x20($inp),$Ij		# I[2]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	 vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	vpxor		$Xmi,$Zmi,$Zmi
	 vpunpckhqdq	$Ij,$Ij,$T1
	vpclmulqdq	\$0x00,$HK,$T2,$Xmi
	 vpxor		$Ij,$T1,$T1

	 vmovdqu	0x10($inp),$Ii		# I[1]
	vpxor		$Zlo,$Xlo,$Xlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Zhi,$Xhi,$Xhi
	 vpshufb	$bswap,$Ii,$Ii
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	 vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpxor		$Zmi,$Xmi,$Xmi
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpclmulqdq	\$0x10,$HK,$T1,$Zmi
	 vmovdqu	0xb0-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2

	 vmovdqu	($inp),$Ij		# I[0]
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Xhi,$Zhi,$Zhi
	 vpshufb	$bswap,$Ij,$Ij
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	 vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x10,$HK,$T2,$Xmi

	lea		0x80($inp),$inp
	cmp		\$0x80,$len
	jb		.Ltail_avx

	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
	sub		\$0x80,$len
	jmp		.Loop8x_avx

.align	32
.Loop8x_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	 vmovdqu	0x70($inp),$Ii		# I[7]
	vpxor		$Xlo,$Zlo,$Zlo
	vpxor		$Ij,$T1,$T1
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xi
	 vpshufb	$bswap,$Ii,$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xo
	 vmovdqu	0x00-0x40($Htbl),$Hkey	# $Hkey^1
	 vpunpckhqdq	$Ii,$Ii,$T2
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Tred
	 vmovdqu	0x20-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2

	  vmovdqu	0x60($inp),$Ij		# I[6]
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	vpxor		$Zlo,$Xi,$Xi		# collect result
	  vpshufb	$bswap,$Ij,$Ij
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	vxorps		$Zhi,$Xo,$Xo
	  vmovdqu	0x10-0x40($Htbl),$Hkey	# $Hkey^2
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
	vpxor		$Zmi,$Tred,$Tred
	 vxorps		$Ij,$T1,$T1

	  vmovdqu	0x50($inp),$Ii		# I[5]
	vpxor		$Xi,$Tred,$Tred		# aggregated Karatsuba post-processing
	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	vpxor		$Xo,$Tred,$Tred
	vpslldq		\$8,$Tred,$T2
	 vpxor		$Xlo,$Zlo,$Zlo
	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	vpsrldq		\$8,$Tred,$Tred
	vpxor		$T2, $Xi, $Xi
	  vmovdqu	0x30-0x40($Htbl),$Hkey	# $Hkey^3
	  vpshufb	$bswap,$Ii,$Ii
	vxorps		$Tred,$Xo, $Xo
	 vpxor		$Xhi,$Zhi,$Zhi
	 vpunpckhqdq	$Ii,$Ii,$T2
	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
	  vmovdqu	0x50-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2
	 vpxor		$Xmi,$Zmi,$Zmi

	  vmovdqu	0x40($inp),$Ij		# I[4]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 1st phase
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	  vpshufb	$bswap,$Ij,$Ij
	 vpxor		$Zlo,$Xlo,$Xlo
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	  vmovdqu	0x40-0x40($Htbl),$Hkey	# $Hkey^4
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vpxor		$Zhi,$Xhi,$Xhi
	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
	 vxorps		$Ij,$T1,$T1
	 vpxor		$Zmi,$Xmi,$Xmi

	  vmovdqu	0x30($inp),$Ii		# I[3]
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	  vpshufb	$bswap,$Ii,$Ii
	 vpxor		$Xlo,$Zlo,$Zlo
	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	  vmovdqu	0x60-0x40($Htbl),$Hkey	# $Hkey^5
	 vpunpckhqdq	$Ii,$Ii,$T2
	 vpxor		$Xhi,$Zhi,$Zhi
	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
	  vmovdqu	0x80-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2
	 vpxor		$Xmi,$Zmi,$Zmi

	  vmovdqu	0x20($inp),$Ij		# I[2]
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	  vpshufb	$bswap,$Ij,$Ij
	 vpxor		$Zlo,$Xlo,$Xlo
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	  vmovdqu	0x70-0x40($Htbl),$Hkey	# $Hkey^6
	 vpunpckhqdq	$Ij,$Ij,$T1
	 vpxor		$Zhi,$Xhi,$Xhi
	 vpclmulqdq	\$0x00,$HK,  $T2,$Xmi
	 vpxor		$Ij,$T1,$T1
	 vpxor		$Zmi,$Xmi,$Xmi
	vxorps		$Tred,$Xi,$Xi

	  vmovdqu	0x10($inp),$Ii		# I[1]
	vpalignr	\$8,$Xi,$Xi,$Tred	# 2nd phase
	 vpclmulqdq	\$0x00,$Hkey,$Ij,$Zlo
	  vpshufb	$bswap,$Ii,$Ii
	 vpxor		$Xlo,$Zlo,$Zlo
	 vpclmulqdq	\$0x11,$Hkey,$Ij,$Zhi
	  vmovdqu	0x90-0x40($Htbl),$Hkey	# $Hkey^7
	vpclmulqdq	\$0x10,(%r10),$Xi,$Xi
	vxorps		$Xo,$Tred,$Tred
	 vpunpckhqdq	$Ii,$Ii,$T2
	 vpxor		$Xhi,$Zhi,$Zhi
	 vpclmulqdq	\$0x10,$HK,  $T1,$Zmi
	  vmovdqu	0xb0-0x40($Htbl),$HK
	 vpxor		$Ii,$T2,$T2
	 vpxor		$Xmi,$Zmi,$Zmi

	  vmovdqu	($inp),$Ij		# I[0]
	 vpclmulqdq	\$0x00,$Hkey,$Ii,$Xlo
	  vpshufb	$bswap,$Ij,$Ij
	 vpclmulqdq	\$0x11,$Hkey,$Ii,$Xhi
	  vmovdqu	0xa0-0x40($Htbl),$Hkey	# $Hkey^8
	vpxor		$Tred,$Ij,$Ij
	 vpclmulqdq	\$0x10,$HK,  $T2,$Xmi
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi

	lea		0x80($inp),$inp
	sub		\$0x80,$len
	jnc		.Loop8x_avx

	add		\$0x80,$len
	jmp		.Ltail_no_xor_avx

.align	32
.Lshort_avx:
	vmovdqu		-0x10($inp,$len),$Ii	# very last word
	lea		($inp,$len),$inp
	vmovdqu		0x00-0x40($Htbl),$Hkey	# $Hkey^1
	vmovdqu		0x20-0x40($Htbl),$HK
	vpshufb		$bswap,$Ii,$Ij

	vmovdqa		$Xlo,$Zlo		# subtle way to zero $Zlo,
	vmovdqa		$Xhi,$Zhi		# $Zhi and
	vmovdqa		$Xmi,$Zmi		# $Zmi
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x20($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x10-0x40($Htbl),$Hkey	# $Hkey^2
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x30($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x30-0x40($Htbl),$Hkey	# $Hkey^3
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x50-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x40($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x40-0x40($Htbl),$Hkey	# $Hkey^4
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x50($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x60-0x40($Htbl),$Hkey	# $Hkey^5
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovdqu		0x80-0x40($Htbl),$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x60($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x70-0x40($Htbl),$Hkey	# $Hkey^6
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vpsrldq		\$8,$HK,$HK
	sub		\$0x10,$len
	jz		.Ltail_avx

	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	 vmovdqu	-0x70($inp),$Ii
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vmovdqu		0x90-0x40($Htbl),$Hkey	# $Hkey^7
	 vpshufb	$bswap,$Ii,$Ij
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi
	vmovq		0xb8-0x40($Htbl),$HK
	sub		\$0x10,$len
	jmp		.Ltail_avx

.align	32
.Ltail_avx:
	vpxor		$Xi,$Ij,$Ij		# accumulate $Xi
.Ltail_no_xor_avx:
	vpunpckhqdq	$Ij,$Ij,$T1
	vpxor		$Xlo,$Zlo,$Zlo
	vpclmulqdq	\$0x00,$Hkey,$Ij,$Xlo
	vpxor		$Ij,$T1,$T1
	vpxor		$Xhi,$Zhi,$Zhi
	vpclmulqdq	\$0x11,$Hkey,$Ij,$Xhi
	vpxor		$Xmi,$Zmi,$Zmi
	vpclmulqdq	\$0x00,$HK,$T1,$Xmi

	vmovdqu		(%r10),$Tred

	vpxor		$Xlo,$Zlo,$Xi
	vpxor		$Xhi,$Zhi,$Xo
	vpxor		$Xmi,$Zmi,$Zmi

	vpxor		$Xi, $Zmi,$Zmi		# aggregated Karatsuba post-processing
	vpxor		$Xo, $Zmi,$Zmi
	vpslldq		\$8, $Zmi,$T2
	vpsrldq		\$8, $Zmi,$Zmi
	vpxor		$T2, $Xi, $Xi
	vpxor		$Zmi,$Xo, $Xo

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 1st phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpclmulqdq	\$0x10,$Tred,$Xi,$T2	# 2nd phase
	vpalignr	\$8,$Xi,$Xi,$Xi
	vpxor		$Xo,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	cmp		\$0,$len
	jne		.Lshort_avx

	vpshufb		$bswap,$Xi,$Xi
	vmovdqu		$Xi,($Xip)
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	lea	0xa8(%rsp),%rsp
___
$code.=<<___;
	ret
.cfi_endproc
.seh_endproc
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
$code.=<<___;
	jmp	.L_ghash_clmul
.size	gcm_ghash_avx,.-gcm_ghash_avx
___
}

$code.=<<___;
.section .rodata
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
	.long	7,0,7,0
.align	64

.asciz	"GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.text
___

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";