• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# sha1_block procedure for x86_64.
11#
12# It was brought to my attention that on EM64T compiler-generated code
13# was far behind 32-bit assembler implementation. This is unlike on
14# Opteron where compiler-generated code was only 15% behind 32-bit
15# assembler, which originally made it hard to motivate the effort.
16# There was suggestion to mechanically translate 32-bit code, but I
17# dismissed it, reasoning that x86_64 offers enough register bank
18# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
19# implementation:-) However! While 64-bit code does perform better
20# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
21# x86_64 does offer larger *addressable* bank, but out-of-order core
22# reaches for even more registers through dynamic aliasing, and EM64T
23# core must have managed to run-time optimize even 32-bit code just as
24# good as 64-bit one. Performance improvement is summarized in the
25# following table:
26#
27#		gcc 3.4		32-bit asm	cycles/byte
28# Opteron	+45%		+20%		6.8
29# Xeon P4	+65%		+0%		9.9
30# Core2		+60%		+10%		7.0
31
32# August 2009.
33#
34# The code was revised to minimize code size and to maximize
35# "distance" between instructions producing input to 'lea'
36# instruction and the 'lea' instruction itself, which is essential
37# for Intel Atom core.
38
39# October 2010.
40#
41# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
42# is to offload message schedule denoted by Wt in NIST specification,
43# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
44# for background and implementation details. The only difference from
45# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
46# to free temporary registers.
47
48# April 2011.
49#
50# Add AVX code path. See sha1-586.pl for further information.
51
52# May 2013.
53#
54# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
55# and loading pair of consecutive blocks to 256-bit %ymm registers)
56# did not provide impressive performance improvement till a crucial
57# hint regarding the number of Xupdate iterations to pre-compute in
58# advance was provided by Ilya Albrekht of Intel Corp.
59
60# March 2014.
61#
62# Add support for Intel SHA Extensions.
63
64######################################################################
65# Current performance is summarized in following table. Numbers are
66# CPU clock cycles spent to process single byte (less is better).
67#
68#		x86_64		SSSE3		AVX[2]
69# P4		9.05		-
70# Opteron	6.26		-
71# Core2		6.55		6.05/+8%	-
72# Westmere	6.73		5.30/+27%	-
73# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
74# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
75# Haswell	5.45		4.15/+31%	3.57/+53%
76# Skylake	5.18		4.06/+28%	3.54/+46%
77# Bulldozer	9.11		5.95/+53%
78# VIA Nano	9.32		7.15/+30%
79# Atom		10.3		9.17/+12%
80# Silvermont	13.1(*)		9.37/+40%
81# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
82#
83# (*)	obviously suboptimal result, nothing was done about it,
84#	because SSSE3 code is compiled unconditionally;
85# (**)	SHAEXT result
86
87$flavour = shift;
88$output  = shift;
89if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
90
91$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
92
93$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
94( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
95( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
96die "can't locate x86_64-xlate.pl";
97
98# In upstream, this is controlled by shelling out to the compiler to check
99# versions, but BoringSSL is intended to be used with pre-generated perlasm
100# output, so this isn't useful anyway.
101#
102# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
103# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
104# did not tie them together until after $shaext was added.
105$avx = 1;
106
107# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
108# been tested.
109$shaext=0;	### set to zero if compiling for 1.0.1
110$avx=1		if (!$shaext && $avx);
111
112open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
113*STDOUT=*OUT;
114
115$ctx="%rdi";	# 1st arg
116$inp="%rsi";	# 2nd arg
117$num="%rdx";	# 3rd arg
118
119# reassign arguments in order to produce more compact code
120$ctx="%r8";
121$inp="%r9";
122$num="%r10";
123
124$t0="%eax";
125$t1="%ebx";
126$t2="%ecx";
127@xi=("%edx","%ebp","%r14d");
128$A="%esi";
129$B="%edi";
130$C="%r11d";
131$D="%r12d";
132$E="%r13d";
133
134@V=($A,$B,$C,$D,$E);
135
136sub BODY_00_19 {
137my ($i,$a,$b,$c,$d,$e)=@_;
138my $j=$i+1;
139$code.=<<___ if ($i==0);
140	mov	`4*$i`($inp),$xi[0]
141	bswap	$xi[0]
142___
143$code.=<<___ if ($i<15);
144	mov	`4*$j`($inp),$xi[1]
145	mov	$d,$t0
146	mov	$xi[0],`4*$i`(%rsp)
147	mov	$a,$t2
148	bswap	$xi[1]
149	xor	$c,$t0
150	rol	\$5,$t2
151	and	$b,$t0
152	lea	0x5a827999($xi[0],$e),$e
153	add	$t2,$e
154	xor	$d,$t0
155	rol	\$30,$b
156	add	$t0,$e
157___
158$code.=<<___ if ($i>=15);
159	xor	`4*($j%16)`(%rsp),$xi[1]
160	mov	$d,$t0
161	mov	$xi[0],`4*($i%16)`(%rsp)
162	mov	$a,$t2
163	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
164	xor	$c,$t0
165	rol	\$5,$t2
166	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
167	and	$b,$t0
168	lea	0x5a827999($xi[0],$e),$e
169	rol	\$30,$b
170	xor	$d,$t0
171	add	$t2,$e
172	rol	\$1,$xi[1]
173	add	$t0,$e
174___
175push(@xi,shift(@xi));
176}
177
178sub BODY_20_39 {
179my ($i,$a,$b,$c,$d,$e)=@_;
180my $j=$i+1;
181my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
182$code.=<<___ if ($i<79);
183	xor	`4*($j%16)`(%rsp),$xi[1]
184	mov	$b,$t0
185	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
186	mov	$a,$t2
187	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
188	xor	$d,$t0
189	rol	\$5,$t2
190	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
191	lea	$K($xi[0],$e),$e
192	xor	$c,$t0
193	add	$t2,$e
194	rol	\$30,$b
195	add	$t0,$e
196	rol	\$1,$xi[1]
197___
198$code.=<<___ if ($i==79);
199	mov	$b,$t0
200	mov	$a,$t2
201	xor	$d,$t0
202	lea	$K($xi[0],$e),$e
203	rol	\$5,$t2
204	xor	$c,$t0
205	add	$t2,$e
206	rol	\$30,$b
207	add	$t0,$e
208___
209push(@xi,shift(@xi));
210}
211
212sub BODY_40_59 {
213my ($i,$a,$b,$c,$d,$e)=@_;
214my $j=$i+1;
215$code.=<<___;
216	xor	`4*($j%16)`(%rsp),$xi[1]
217	mov	$d,$t0
218	mov	$xi[0],`4*($i%16)`(%rsp)
219	mov	$d,$t1
220	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
221	and	$c,$t0
222	mov	$a,$t2
223	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
224	lea	0x8f1bbcdc($xi[0],$e),$e
225	xor	$c,$t1
226	rol	\$5,$t2
227	add	$t0,$e
228	rol	\$1,$xi[1]
229	and	$b,$t1
230	add	$t2,$e
231	rol	\$30,$b
232	add	$t1,$e
233___
234push(@xi,shift(@xi));
235}
236
237$code.=<<___;
238.text
239.extern	OPENSSL_ia32cap_P
240
241.globl	sha1_block_data_order
242.type	sha1_block_data_order,\@function,3
243.align	16
244sha1_block_data_order:
245	leaq	OPENSSL_ia32cap_P(%rip),%r10
246	mov	0(%r10),%r9d
247	mov	4(%r10),%r8d
248	mov	8(%r10),%r10d
249	test	\$`1<<9`,%r8d		# check SSSE3 bit
250	jz	.Lialu
251___
252$code.=<<___ if ($shaext);
253	test	\$`1<<29`,%r10d		# check SHA bit
254	jnz	_shaext_shortcut
255___
256$code.=<<___ if ($avx>1);
257	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
258	cmp	\$`1<<3|1<<5|1<<8`,%r10d
259	je	_avx2_shortcut
260___
261$code.=<<___ if ($avx);
262	and	\$`1<<28`,%r8d		# mask AVX bit
263	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
264	or	%r9d,%r8d
265	cmp	\$`1<<28|1<<30`,%r8d
266	je	_avx_shortcut
267___
268$code.=<<___;
269	jmp	_ssse3_shortcut
270
271.align	16
272.Lialu:
273	mov	%rsp,%rax
274	push	%rbx
275	push	%rbp
276	push	%r12
277	push	%r13
278	push	%r14
279	mov	%rdi,$ctx	# reassigned argument
280	sub	\$`8+16*4`,%rsp
281	mov	%rsi,$inp	# reassigned argument
282	and	\$-64,%rsp
283	mov	%rdx,$num	# reassigned argument
284	mov	%rax,`16*4`(%rsp)
285.Lprologue:
286
287	mov	0($ctx),$A
288	mov	4($ctx),$B
289	mov	8($ctx),$C
290	mov	12($ctx),$D
291	mov	16($ctx),$E
292	jmp	.Lloop
293
294.align	16
295.Lloop:
296___
297for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
298for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
299for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
300for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
301$code.=<<___;
302	add	0($ctx),$A
303	add	4($ctx),$B
304	add	8($ctx),$C
305	add	12($ctx),$D
306	add	16($ctx),$E
307	mov	$A,0($ctx)
308	mov	$B,4($ctx)
309	mov	$C,8($ctx)
310	mov	$D,12($ctx)
311	mov	$E,16($ctx)
312
313	sub	\$1,$num
314	lea	`16*4`($inp),$inp
315	jnz	.Lloop
316
317	mov	`16*4`(%rsp),%rsi
318	mov	-40(%rsi),%r14
319	mov	-32(%rsi),%r13
320	mov	-24(%rsi),%r12
321	mov	-16(%rsi),%rbp
322	mov	-8(%rsi),%rbx
323	lea	(%rsi),%rsp
324.Lepilogue:
325	ret
326.size	sha1_block_data_order,.-sha1_block_data_order
327___
328if ($shaext) {{{
329######################################################################
330# Intel SHA Extensions implementation of SHA1 update function.
331#
332my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
333my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
334my @MSG=map("%xmm$_",(4..7));
335
336$code.=<<___;
337.type	sha1_block_data_order_shaext,\@function,3
338.align	32
339sha1_block_data_order_shaext:
340_shaext_shortcut:
341___
342$code.=<<___ if ($win64);
343	lea	`-8-4*16`(%rsp),%rsp
344	movaps	%xmm6,-8-4*16(%rax)
345	movaps	%xmm7,-8-3*16(%rax)
346	movaps	%xmm8,-8-2*16(%rax)
347	movaps	%xmm9,-8-1*16(%rax)
348.Lprologue_shaext:
349___
350$code.=<<___;
351	movdqu	($ctx),$ABCD
352	movd	16($ctx),$E
353	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap
354
355	movdqu	($inp),@MSG[0]
356	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
357	movdqu	0x10($inp),@MSG[1]
358	pshufd	\$0b00011011,$E,$E		# flip word order
359	movdqu	0x20($inp),@MSG[2]
360	pshufb	$BSWAP,@MSG[0]
361	movdqu	0x30($inp),@MSG[3]
362	pshufb	$BSWAP,@MSG[1]
363	pshufb	$BSWAP,@MSG[2]
364	movdqa	$E,$E_SAVE			# offload $E
365	pshufb	$BSWAP,@MSG[3]
366	jmp	.Loop_shaext
367
368.align	16
369.Loop_shaext:
370	dec		$num
371	lea		0x40($inp),%r8		# next input block
372	paddd		@MSG[0],$E
373	cmovne		%r8,$inp
374	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
375___
376for($i=0;$i<20-4;$i+=2) {
377$code.=<<___;
378	sha1msg1	@MSG[1],@MSG[0]
379	movdqa		$ABCD,$E_
380	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
381	sha1nexte	@MSG[1],$E_
382	pxor		@MSG[2],@MSG[0]
383	sha1msg1	@MSG[2],@MSG[1]
384	sha1msg2	@MSG[3],@MSG[0]
385
386	movdqa		$ABCD,$E
387	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
388	sha1nexte	@MSG[2],$E
389	pxor		@MSG[3],@MSG[1]
390	sha1msg2	@MSG[0],@MSG[1]
391___
392	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
393}
394$code.=<<___;
395	movdqu		($inp),@MSG[0]
396	movdqa		$ABCD,$E_
397	sha1rnds4	\$3,$E,$ABCD		# 64-67
398	sha1nexte	@MSG[1],$E_
399	movdqu		0x10($inp),@MSG[1]
400	pshufb		$BSWAP,@MSG[0]
401
402	movdqa		$ABCD,$E
403	sha1rnds4	\$3,$E_,$ABCD		# 68-71
404	sha1nexte	@MSG[2],$E
405	movdqu		0x20($inp),@MSG[2]
406	pshufb		$BSWAP,@MSG[1]
407
408	movdqa		$ABCD,$E_
409	sha1rnds4	\$3,$E,$ABCD		# 72-75
410	sha1nexte	@MSG[3],$E_
411	movdqu		0x30($inp),@MSG[3]
412	pshufb		$BSWAP,@MSG[2]
413
414	movdqa		$ABCD,$E
415	sha1rnds4	\$3,$E_,$ABCD		# 76-79
416	sha1nexte	$E_SAVE,$E
417	pshufb		$BSWAP,@MSG[3]
418
419	paddd		$ABCD_SAVE,$ABCD
420	movdqa		$E,$E_SAVE		# offload $E
421
422	jnz		.Loop_shaext
423
424	pshufd	\$0b00011011,$ABCD,$ABCD
425	pshufd	\$0b00011011,$E,$E
426	movdqu	$ABCD,($ctx)
427	movd	$E,16($ctx)
428___
429$code.=<<___ if ($win64);
430	movaps	-8-4*16(%rax),%xmm6
431	movaps	-8-3*16(%rax),%xmm7
432	movaps	-8-2*16(%rax),%xmm8
433	movaps	-8-1*16(%rax),%xmm9
434	mov	%rax,%rsp
435.Lepilogue_shaext:
436___
437$code.=<<___;
438	ret
439.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
440___
441}}}
442{{{
443my $Xi=4;
444my @X=map("%xmm$_",(4..7,0..3));
445my @Tx=map("%xmm$_",(8..10));
446my $Kx="%xmm11";
447my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
448my @T=("%esi","%edi");
449my $j=0;
450my $rx=0;
451my $K_XX_XX="%r14";
452my $fp="%r11";
453
454my $_rol=sub { &rol(@_) };
455my $_ror=sub { &ror(@_) };
456
457{ my $sn;
458sub align32() {
459  ++$sn;
460$code.=<<___;
461	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
462.align	32
463.Lalign32_$sn:
464___
465}
466}
467
468$code.=<<___;
469.type	sha1_block_data_order_ssse3,\@function,3
470.align	16
471sha1_block_data_order_ssse3:
472_ssse3_shortcut:
473	mov	%rsp,$fp	# frame pointer
474	push	%rbx
475	push	%rbp
476	push	%r12
477	push	%r13		# redundant, done to share Win64 SE handler
478	push	%r14
479	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
480___
481$code.=<<___ if ($win64);
482	movaps	%xmm6,-40-6*16($fp)
483	movaps	%xmm7,-40-5*16($fp)
484	movaps	%xmm8,-40-4*16($fp)
485	movaps	%xmm9,-40-3*16($fp)
486	movaps	%xmm10,-40-2*16($fp)
487	movaps	%xmm11,-40-1*16($fp)
488.Lprologue_ssse3:
489___
490$code.=<<___;
491	and	\$-64,%rsp
492	mov	%rdi,$ctx	# reassigned argument
493	mov	%rsi,$inp	# reassigned argument
494	mov	%rdx,$num	# reassigned argument
495
496	shl	\$6,$num
497	add	$inp,$num
498	lea	K_XX_XX+64(%rip),$K_XX_XX
499
500	mov	0($ctx),$A		# load context
501	mov	4($ctx),$B
502	mov	8($ctx),$C
503	mov	12($ctx),$D
504	mov	$B,@T[0]		# magic seed
505	mov	16($ctx),$E
506	mov	$C,@T[1]
507	xor	$D,@T[1]
508	and	@T[1],@T[0]
509
510	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
511	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
512	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
513	movdqu	16($inp),@X[-3&7]
514	movdqu	32($inp),@X[-2&7]
515	movdqu	48($inp),@X[-1&7]
516	pshufb	@X[2],@X[-4&7]		# byte swap
517	pshufb	@X[2],@X[-3&7]
518	pshufb	@X[2],@X[-2&7]
519	add	\$64,$inp
520	paddd	@Tx[1],@X[-4&7]		# add K_00_19
521	pshufb	@X[2],@X[-1&7]
522	paddd	@Tx[1],@X[-3&7]
523	paddd	@Tx[1],@X[-2&7]
524	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
525	psubd	@Tx[1],@X[-4&7]		# restore X[]
526	movdqa	@X[-3&7],16(%rsp)
527	psubd	@Tx[1],@X[-3&7]
528	movdqa	@X[-2&7],32(%rsp)
529	psubd	@Tx[1],@X[-2&7]
530	jmp	.Loop_ssse3
531___
532
533sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
534{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
535  my $arg = pop;
536    $arg = "\$$arg" if ($arg*1 eq $arg);
537    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
538}
539
540sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
541{ use integer;
542  my $body = shift;
543  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
544  my ($a,$b,$c,$d,$e);
545
546	 eval(shift(@insns));		# ror
547	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
548	 eval(shift(@insns));
549	&movdqa	(@Tx[0],@X[-1&7]);
550	  &paddd	(@Tx[1],@X[-1&7]);
551	 eval(shift(@insns));
552	 eval(shift(@insns));
553
554	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
555	 eval(shift(@insns));
556	 eval(shift(@insns));		# rol
557	 eval(shift(@insns));
558	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
559	 eval(shift(@insns));
560	 eval(shift(@insns));
561
562	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
563	 eval(shift(@insns));
564	 eval(shift(@insns));		# ror
565	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
566	 eval(shift(@insns));
567	 eval(shift(@insns));
568	 eval(shift(@insns));
569
570	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
571	 eval(shift(@insns));
572	 eval(shift(@insns));		# rol
573	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
574	 eval(shift(@insns));
575	 eval(shift(@insns));
576
577	&movdqa	(@Tx[2],@X[0]);
578	 eval(shift(@insns));
579	 eval(shift(@insns));
580	 eval(shift(@insns));		# ror
581	&movdqa	(@Tx[0],@X[0]);
582	 eval(shift(@insns));
583
584	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
585	&paddd	(@X[0],@X[0]);
586	 eval(shift(@insns));
587	 eval(shift(@insns));
588
589	&psrld	(@Tx[0],31);
590	 eval(shift(@insns));
591	 eval(shift(@insns));		# rol
592	 eval(shift(@insns));
593	&movdqa	(@Tx[1],@Tx[2]);
594	 eval(shift(@insns));
595	 eval(shift(@insns));
596
597	&psrld	(@Tx[2],30);
598	 eval(shift(@insns));
599	 eval(shift(@insns));		# ror
600	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
601	 eval(shift(@insns));
602	 eval(shift(@insns));
603	 eval(shift(@insns));
604
605	&pslld	(@Tx[1],2);
606	&pxor	(@X[0],@Tx[2]);
607	 eval(shift(@insns));
608	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
609	 eval(shift(@insns));		# rol
610	 eval(shift(@insns));
611	 eval(shift(@insns));
612
613	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
614	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
615
616	 foreach (@insns) { eval; }	# remaining instructions [if any]
617
618  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
619		push(@Tx,shift(@Tx));
620}
621
622sub Xupdate_ssse3_32_79()
623{ use integer;
624  my $body = shift;
625  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
626  my ($a,$b,$c,$d,$e);
627
628	 eval(shift(@insns))		if ($Xi==8);
629	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
630	 eval(shift(@insns))		if ($Xi==8);
631	 eval(shift(@insns));		# body_20_39
632	 eval(shift(@insns));
633	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
634	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
635	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
636	 eval(shift(@insns));
637	 eval(shift(@insns));		# rol
638
639	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
640	 eval(shift(@insns));
641	 eval(shift(@insns));
642	if ($Xi%5) {
643	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
644	} else {			# ... or load next one
645	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
646	}
647	 eval(shift(@insns));		# ror
648	  &paddd	(@Tx[1],@X[-1&7]);
649	 eval(shift(@insns));
650
651	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
652	 eval(shift(@insns));		# body_20_39
653	 eval(shift(@insns));
654	 eval(shift(@insns));
655	 eval(shift(@insns));		# rol
656	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
657
658	&movdqa	(@Tx[0],@X[0]);
659	 eval(shift(@insns));
660	 eval(shift(@insns));
661	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
662	 eval(shift(@insns));		# ror
663	 eval(shift(@insns));
664	 eval(shift(@insns));		# body_20_39
665
666	&pslld	(@X[0],2);
667	 eval(shift(@insns));
668	 eval(shift(@insns));
669	&psrld	(@Tx[0],30);
670	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
671	 eval(shift(@insns));
672	 eval(shift(@insns));
673	 eval(shift(@insns));		# ror
674
675	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
676	 eval(shift(@insns));
677	 eval(shift(@insns));		# body_20_39
678	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
679	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
680	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
681	 eval(shift(@insns));
682	 eval(shift(@insns));		# rol
683	 eval(shift(@insns));
684	 eval(shift(@insns));
685	 eval(shift(@insns));		# rol
686	 eval(shift(@insns));
687
688	 foreach (@insns) { eval; }	# remaining instructions
689
690  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
691		push(@Tx,shift(@Tx));
692}
693
694sub Xuplast_ssse3_80()
695{ use integer;
696  my $body = shift;
697  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
698  my ($a,$b,$c,$d,$e);
699
700	 eval(shift(@insns));
701	 eval(shift(@insns));
702	 eval(shift(@insns));
703	 eval(shift(@insns));
704	  &paddd	(@Tx[1],@X[-1&7]);
705	 eval(shift(@insns));
706	 eval(shift(@insns));
707
708	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
709
710	 foreach (@insns) { eval; }		# remaining instructions
711
712	&cmp	($inp,$num);
713	&je	(".Ldone_ssse3");
714
715	unshift(@Tx,pop(@Tx));
716
717	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
718	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
719	&movdqu	(@X[-4&7],"0($inp)");		# load input
720	&movdqu	(@X[-3&7],"16($inp)");
721	&movdqu	(@X[-2&7],"32($inp)");
722	&movdqu	(@X[-1&7],"48($inp)");
723	&pshufb	(@X[-4&7],@X[2]);		# byte swap
724	&add	($inp,64);
725
726  $Xi=0;
727}
728
729sub Xloop_ssse3()
730{ use integer;
731  my $body = shift;
732  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
733  my ($a,$b,$c,$d,$e);
734
735	 eval(shift(@insns));
736	 eval(shift(@insns));
737	 eval(shift(@insns));
738	&pshufb	(@X[($Xi-3)&7],@X[2]);
739	 eval(shift(@insns));
740	 eval(shift(@insns));
741	 eval(shift(@insns));
742	 eval(shift(@insns));
743	&paddd	(@X[($Xi-4)&7],@Tx[1]);
744	 eval(shift(@insns));
745	 eval(shift(@insns));
746	 eval(shift(@insns));
747	 eval(shift(@insns));
748	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
749	 eval(shift(@insns));
750	 eval(shift(@insns));
751	 eval(shift(@insns));
752	 eval(shift(@insns));
753	&psubd	(@X[($Xi-4)&7],@Tx[1]);
754
755	foreach (@insns) { eval; }
756  $Xi++;
757}
758
759sub Xtail_ssse3()
760{ use integer;
761  my $body = shift;
762  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
763  my ($a,$b,$c,$d,$e);
764
765	foreach (@insns) { eval; }
766}
767
768sub body_00_19 () {	# ((c^d)&b)^d
769	# on start @T[0]=(c^d)&b
770	return &body_20_39() if ($rx==19); $rx++;
771	(
772	'($a,$b,$c,$d,$e)=@V;'.
773	'&$_ror	($b,$j?7:2)',	# $b>>>2
774	'&xor	(@T[0],$d)',
775	'&mov	(@T[1],$a)',	# $b for next round
776
777	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
778	'&xor	($b,$c)',	# $c^$d for next round
779
780	'&$_rol	($a,5)',
781	'&add	($e,@T[0])',
782	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round
783
784	'&xor	($b,$c)',	# restore $b
785	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
786	);
787}
788
789sub body_20_39 () {	# b^d^c
790	# on entry @T[0]=b^d
791	return &body_40_59() if ($rx==39); $rx++;
792	(
793	'($a,$b,$c,$d,$e)=@V;'.
794	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
795	'&xor	(@T[0],$d)	if($j==19);'.
796	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
797	'&mov	(@T[1],$a)',	# $b for next round
798
799	'&$_rol	($a,5)',
800	'&add	($e,@T[0])',
801	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round
802
803	'&$_ror	($b,7)',	# $b>>>2
804	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
805	);
806}
807
808sub body_40_59 () {	# ((b^c)&(c^d))^c
809	# on entry @T[0]=(b^c), (c^=d)
810	$rx++;
811	(
812	'($a,$b,$c,$d,$e)=@V;'.
813	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
814	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
815	'&xor	($c,$d)		if ($j>=40)',	# restore $c
816
817	'&$_ror	($b,7)',	# $b>>>2
818	'&mov	(@T[1],$a)',	# $b for next round
819	'&xor	(@T[0],$c)',
820
821	'&$_rol	($a,5)',
822	'&add	($e,@T[0])',
823	'&xor	(@T[1],$c)	if ($j==59);'.
824	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round
825
826	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
827	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
828	);
829}
830$code.=<<___;
831.align	16
832.Loop_ssse3:
833___
834	&Xupdate_ssse3_16_31(\&body_00_19);
835	&Xupdate_ssse3_16_31(\&body_00_19);
836	&Xupdate_ssse3_16_31(\&body_00_19);
837	&Xupdate_ssse3_16_31(\&body_00_19);
838	&Xupdate_ssse3_32_79(\&body_00_19);
839	&Xupdate_ssse3_32_79(\&body_20_39);
840	&Xupdate_ssse3_32_79(\&body_20_39);
841	&Xupdate_ssse3_32_79(\&body_20_39);
842	&Xupdate_ssse3_32_79(\&body_20_39);
843	&Xupdate_ssse3_32_79(\&body_20_39);
844	&Xupdate_ssse3_32_79(\&body_40_59);
845	&Xupdate_ssse3_32_79(\&body_40_59);
846	&Xupdate_ssse3_32_79(\&body_40_59);
847	&Xupdate_ssse3_32_79(\&body_40_59);
848	&Xupdate_ssse3_32_79(\&body_40_59);
849	&Xupdate_ssse3_32_79(\&body_20_39);
850	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
851
852				$saved_j=$j; @saved_V=@V;
853
854	&Xloop_ssse3(\&body_20_39);
855	&Xloop_ssse3(\&body_20_39);
856	&Xloop_ssse3(\&body_20_39);
857
858$code.=<<___;
859	add	0($ctx),$A			# update context
860	add	4($ctx),@T[0]
861	add	8($ctx),$C
862	add	12($ctx),$D
863	mov	$A,0($ctx)
864	add	16($ctx),$E
865	mov	@T[0],4($ctx)
866	mov	@T[0],$B			# magic seed
867	mov	$C,8($ctx)
868	mov	$C,@T[1]
869	mov	$D,12($ctx)
870	xor	$D,@T[1]
871	mov	$E,16($ctx)
872	and	@T[1],@T[0]
873	jmp	.Loop_ssse3
874
875.align	16
876.Ldone_ssse3:
877___
878				$j=$saved_j; @V=@saved_V;
879
880	&Xtail_ssse3(\&body_20_39);
881	&Xtail_ssse3(\&body_20_39);
882	&Xtail_ssse3(\&body_20_39);
883
884$code.=<<___;
885	add	0($ctx),$A			# update context
886	add	4($ctx),@T[0]
887	add	8($ctx),$C
888	mov	$A,0($ctx)
889	add	12($ctx),$D
890	mov	@T[0],4($ctx)
891	add	16($ctx),$E
892	mov	$C,8($ctx)
893	mov	$D,12($ctx)
894	mov	$E,16($ctx)
895___
896$code.=<<___ if ($win64);
897	movaps	-40-6*16($fp),%xmm6
898	movaps	-40-5*16($fp),%xmm7
899	movaps	-40-4*16($fp),%xmm8
900	movaps	-40-3*16($fp),%xmm9
901	movaps	-40-2*16($fp),%xmm10
902	movaps	-40-1*16($fp),%xmm11
903___
904$code.=<<___;
905	mov	-40($fp),%r14
906	mov	-32($fp),%r13
907	mov	-24($fp),%r12
908	mov	-16($fp),%rbp
909	mov	-8($fp),%rbx
910	lea	($fp),%rsp
911.Lepilogue_ssse3:
912	ret
913.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
914___
915
916if ($avx) {
917$Xi=4;				# reset variables
918@X=map("%xmm$_",(4..7,0..3));
919@Tx=map("%xmm$_",(8..10));
920$j=0;
921$rx=0;
922
923my $done_avx_label=".Ldone_avx";
924
925my $_rol=sub { &shld(@_[0],@_) };
926my $_ror=sub { &shrd(@_[0],@_) };
927
928$code.=<<___;
929.type	sha1_block_data_order_avx,\@function,3
930.align	16
931sha1_block_data_order_avx:
932_avx_shortcut:
933	mov	%rsp,$fp
934	push	%rbx
935	push	%rbp
936	push	%r12
937	push	%r13		# redundant, done to share Win64 SE handler
938	push	%r14
939	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
940	vzeroupper
941___
942$code.=<<___ if ($win64);
943	vmovaps	%xmm6,-40-6*16($fp)
944	vmovaps	%xmm7,-40-5*16($fp)
945	vmovaps	%xmm8,-40-4*16($fp)
946	vmovaps	%xmm9,-40-3*16($fp)
947	vmovaps	%xmm10,-40-2*16($fp)
948	vmovaps	%xmm11,-40-1*16($fp)
949.Lprologue_avx:
950___
951$code.=<<___;
952	and	\$-64,%rsp
953	mov	%rdi,$ctx	# reassigned argument
954	mov	%rsi,$inp	# reassigned argument
955	mov	%rdx,$num	# reassigned argument
956
957	shl	\$6,$num
958	add	$inp,$num
959	lea	K_XX_XX+64(%rip),$K_XX_XX
960
961	mov	0($ctx),$A		# load context
962	mov	4($ctx),$B
963	mov	8($ctx),$C
964	mov	12($ctx),$D
965	mov	$B,@T[0]		# magic seed
966	mov	16($ctx),$E
967	mov	$C,@T[1]
968	xor	$D,@T[1]
969	and	@T[1],@T[0]
970
971	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
972	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
973	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
974	vmovdqu	16($inp),@X[-3&7]
975	vmovdqu	32($inp),@X[-2&7]
976	vmovdqu	48($inp),@X[-1&7]
977	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
978	add	\$64,$inp
979	vpshufb	@X[2],@X[-3&7],@X[-3&7]
980	vpshufb	@X[2],@X[-2&7],@X[-2&7]
981	vpshufb	@X[2],@X[-1&7],@X[-1&7]
982	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
983	vpaddd	$Kx,@X[-3&7],@X[1]
984	vpaddd	$Kx,@X[-2&7],@X[2]
985	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
986	vmovdqa	@X[1],16(%rsp)
987	vmovdqa	@X[2],32(%rsp)
988	jmp	.Loop_avx
989___
990
991sub Xupdate_avx_16_31()		# recall that $Xi starts wtih 4
992{ use integer;
993  my $body = shift;
994  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
995  my ($a,$b,$c,$d,$e);
996
997	 eval(shift(@insns));
998	 eval(shift(@insns));
999	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
1000	 eval(shift(@insns));
1001	 eval(shift(@insns));
1002
1003	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
1004	 eval(shift(@insns));
1005	 eval(shift(@insns));
1006	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
1007	 eval(shift(@insns));
1008	 eval(shift(@insns));
1009	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
1010	 eval(shift(@insns));
1011	 eval(shift(@insns));
1012
1013	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
1014	 eval(shift(@insns));
1015	 eval(shift(@insns));
1016	 eval(shift(@insns));
1017	 eval(shift(@insns));
1018
1019	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
1020	 eval(shift(@insns));
1021	 eval(shift(@insns));
1022	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1023	 eval(shift(@insns));
1024	 eval(shift(@insns));
1025
1026	&vpsrld	(@Tx[0],@X[0],31);
1027	 eval(shift(@insns));
1028	 eval(shift(@insns));
1029	 eval(shift(@insns));
1030	 eval(shift(@insns));
1031
1032	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
1033	&vpaddd	(@X[0],@X[0],@X[0]);
1034	 eval(shift(@insns));
1035	 eval(shift(@insns));
1036	 eval(shift(@insns));
1037	 eval(shift(@insns));
1038
1039	&vpsrld	(@Tx[1],@Tx[2],30);
1040	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
1041	 eval(shift(@insns));
1042	 eval(shift(@insns));
1043	 eval(shift(@insns));
1044	 eval(shift(@insns));
1045
1046	&vpslld	(@Tx[2],@Tx[2],2);
1047	&vpxor	(@X[0],@X[0],@Tx[1]);
1048	 eval(shift(@insns));
1049	 eval(shift(@insns));
1050	 eval(shift(@insns));
1051	 eval(shift(@insns));
1052
1053	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
1054	 eval(shift(@insns));
1055	 eval(shift(@insns));
1056	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
1057	 eval(shift(@insns));
1058	 eval(shift(@insns));
1059
1060
1061	 foreach (@insns) { eval; }	# remaining instructions [if any]
1062
1063  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
1064}
1065
1066sub Xupdate_avx_32_79()
1067{ use integer;
1068  my $body = shift;
1069  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
1070  my ($a,$b,$c,$d,$e);
1071
1072	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
1073	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
1074	 eval(shift(@insns));		# body_20_39
1075	 eval(shift(@insns));
1076	 eval(shift(@insns));
1077	 eval(shift(@insns));		# rol
1078
1079	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
1080	 eval(shift(@insns));
1081	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
1082	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
1083	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
1084	 eval(shift(@insns));		# ror
1085	 eval(shift(@insns));
1086
1087	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
1088	 eval(shift(@insns));		# body_20_39
1089	 eval(shift(@insns));
1090	 eval(shift(@insns));
1091	 eval(shift(@insns));		# rol
1092
1093	&vpsrld	(@Tx[0],@X[0],30);
1094	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1095	 eval(shift(@insns));
1096	 eval(shift(@insns));
1097	 eval(shift(@insns));		# ror
1098	 eval(shift(@insns));
1099
1100	&vpslld	(@X[0],@X[0],2);
1101	 eval(shift(@insns));		# body_20_39
1102	 eval(shift(@insns));
1103	 eval(shift(@insns));
1104	 eval(shift(@insns));		# rol
1105	 eval(shift(@insns));
1106	 eval(shift(@insns));
1107	 eval(shift(@insns));		# ror
1108	 eval(shift(@insns));
1109
1110	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
1111	 eval(shift(@insns));		# body_20_39
1112	 eval(shift(@insns));
1113	 eval(shift(@insns));
1114	 eval(shift(@insns));		# rol
1115	 eval(shift(@insns));
1116	 eval(shift(@insns));
1117	 eval(shift(@insns));		# rol
1118	 eval(shift(@insns));
1119
1120	 foreach (@insns) { eval; }	# remaining instructions
1121
1122  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
1123}
1124
1125sub Xuplast_avx_80()
1126{ use integer;
1127  my $body = shift;
1128  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
1129  my ($a,$b,$c,$d,$e);
1130
1131	 eval(shift(@insns));
1132	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
1133	 eval(shift(@insns));
1134	 eval(shift(@insns));
1135	 eval(shift(@insns));
1136	 eval(shift(@insns));
1137
1138	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
1139
1140	 foreach (@insns) { eval; }		# remaining instructions
1141
1142	&cmp	($inp,$num);
1143	&je	($done_avx_label);
1144
1145	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
1146	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
1147	&vmovdqu(@X[-4&7],"0($inp)");		# load input
1148	&vmovdqu(@X[-3&7],"16($inp)");
1149	&vmovdqu(@X[-2&7],"32($inp)");
1150	&vmovdqu(@X[-1&7],"48($inp)");
1151	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
1152	&add	($inp,64);
1153
1154  $Xi=0;
1155}
1156
1157sub Xloop_avx()
1158{ use integer;
1159  my $body = shift;
1160  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
1161  my ($a,$b,$c,$d,$e);
1162
1163	 eval(shift(@insns));
1164	 eval(shift(@insns));
1165	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
1166	 eval(shift(@insns));
1167	 eval(shift(@insns));
1168	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
1169	 eval(shift(@insns));
1170	 eval(shift(@insns));
1171	 eval(shift(@insns));
1172	 eval(shift(@insns));
1173	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
1174	 eval(shift(@insns));
1175	 eval(shift(@insns));
1176
1177	foreach (@insns) { eval; }
1178  $Xi++;
1179}
1180
1181sub Xtail_avx()
1182{ use integer;
1183  my $body = shift;
1184  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
1185  my ($a,$b,$c,$d,$e);
1186
1187	foreach (@insns) { eval; }
1188}
1189
1190$code.=<<___;
1191.align	16
1192.Loop_avx:
1193___
1194	&Xupdate_avx_16_31(\&body_00_19);
1195	&Xupdate_avx_16_31(\&body_00_19);
1196	&Xupdate_avx_16_31(\&body_00_19);
1197	&Xupdate_avx_16_31(\&body_00_19);
1198	&Xupdate_avx_32_79(\&body_00_19);
1199	&Xupdate_avx_32_79(\&body_20_39);
1200	&Xupdate_avx_32_79(\&body_20_39);
1201	&Xupdate_avx_32_79(\&body_20_39);
1202	&Xupdate_avx_32_79(\&body_20_39);
1203	&Xupdate_avx_32_79(\&body_20_39);
1204	&Xupdate_avx_32_79(\&body_40_59);
1205	&Xupdate_avx_32_79(\&body_40_59);
1206	&Xupdate_avx_32_79(\&body_40_59);
1207	&Xupdate_avx_32_79(\&body_40_59);
1208	&Xupdate_avx_32_79(\&body_40_59);
1209	&Xupdate_avx_32_79(\&body_20_39);
1210	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
1211
1212				$saved_j=$j; @saved_V=@V;
1213
1214	&Xloop_avx(\&body_20_39);
1215	&Xloop_avx(\&body_20_39);
1216	&Xloop_avx(\&body_20_39);
1217
1218$code.=<<___;
1219	add	0($ctx),$A			# update context
1220	add	4($ctx),@T[0]
1221	add	8($ctx),$C
1222	add	12($ctx),$D
1223	mov	$A,0($ctx)
1224	add	16($ctx),$E
1225	mov	@T[0],4($ctx)
1226	mov	@T[0],$B			# magic seed
1227	mov	$C,8($ctx)
1228	mov	$C,@T[1]
1229	mov	$D,12($ctx)
1230	xor	$D,@T[1]
1231	mov	$E,16($ctx)
1232	and	@T[1],@T[0]
1233	jmp	.Loop_avx
1234
1235.align	16
1236$done_avx_label:
1237___
1238				$j=$saved_j; @V=@saved_V;
1239
1240	&Xtail_avx(\&body_20_39);
1241	&Xtail_avx(\&body_20_39);
1242	&Xtail_avx(\&body_20_39);
1243
1244$code.=<<___;
1245	vzeroupper
1246
1247	add	0($ctx),$A			# update context
1248	add	4($ctx),@T[0]
1249	add	8($ctx),$C
1250	mov	$A,0($ctx)
1251	add	12($ctx),$D
1252	mov	@T[0],4($ctx)
1253	add	16($ctx),$E
1254	mov	$C,8($ctx)
1255	mov	$D,12($ctx)
1256	mov	$E,16($ctx)
1257___
1258$code.=<<___ if ($win64);
1259	movaps	-40-6*16($fp),%xmm6
1260	movaps	-40-5*16($fp),%xmm7
1261	movaps	-40-4*16($fp),%xmm8
1262	movaps	-40-3*16($fp),%xmm9
1263	movaps	-40-2*16($fp),%xmm10
1264	movaps	-40-1*16($fp),%xmm11
1265___
1266$code.=<<___;
1267	mov	-40($fp),%r14
1268	mov	-32($fp),%r13
1269	mov	-24($fp),%r12
1270	mov	-16($fp),%rbp
1271	mov	-8($fp),%rbx
1272	lea	($fp),%rsp
1273.Lepilogue_avx:
1274	ret
1275.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
1276___
1277
1278if ($avx>1) {
1279use integer;
1280$Xi=4;					# reset variables
1281@X=map("%ymm$_",(4..7,0..3));
1282@Tx=map("%ymm$_",(8..10));
1283$Kx="%ymm11";
1284$j=0;
1285
1286my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
1287my ($a5,$t0)=("%r12d","%edi");
1288
1289my ($A,$F,$B,$C,$D,$E)=@ROTX;
1290my $rx=0;
1291my $frame="%r13";
1292
1293$code.=<<___;
1294.type	sha1_block_data_order_avx2,\@function,3
1295.align	16
1296sha1_block_data_order_avx2:
1297_avx2_shortcut:
1298	mov	%rsp,$fp
1299	push	%rbx
1300	push	%rbp
1301	push	%r12
1302	push	%r13
1303	push	%r14
1304	vzeroupper
1305___
1306$code.=<<___ if ($win64);
1307	lea	-6*16(%rsp),%rsp
1308	vmovaps	%xmm6,-40-6*16($fp)
1309	vmovaps	%xmm7,-40-5*16($fp)
1310	vmovaps	%xmm8,-40-4*16($fp)
1311	vmovaps	%xmm9,-40-3*16($fp)
1312	vmovaps	%xmm10,-40-2*16($fp)
1313	vmovaps	%xmm11,-40-1*16($fp)
1314.Lprologue_avx2:
1315___
1316$code.=<<___;
1317	mov	%rdi,$ctx		# reassigned argument
1318	mov	%rsi,$inp		# reassigned argument
1319	mov	%rdx,$num		# reassigned argument
1320
1321	lea	-640(%rsp),%rsp
1322	shl	\$6,$num
1323	 lea	64($inp),$frame
1324	and	\$-128,%rsp
1325	add	$inp,$num
1326	lea	K_XX_XX+64(%rip),$K_XX_XX
1327
1328	mov	0($ctx),$A		# load context
1329	 cmp	$num,$frame
1330	 cmovae	$inp,$frame		# next or same block
1331	mov	4($ctx),$F
1332	mov	8($ctx),$C
1333	mov	12($ctx),$D
1334	mov	16($ctx),$E
1335	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask
1336
1337	vmovdqu		($inp),%xmm0
1338	vmovdqu		16($inp),%xmm1
1339	vmovdqu		32($inp),%xmm2
1340	vmovdqu		48($inp),%xmm3
1341	lea		64($inp),$inp
1342	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
1343	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
1344	vpshufb		@X[2],@X[-4&7],@X[-4&7]
1345	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
1346	vpshufb		@X[2],@X[-3&7],@X[-3&7]
1347	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
1348	vpshufb		@X[2],@X[-2&7],@X[-2&7]
1349	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
1350	vpshufb		@X[2],@X[-1&7],@X[-1&7]
1351
1352	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
1353	vpaddd	$Kx,@X[-3&7],@X[1]
1354	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
1355	vpaddd	$Kx,@X[-2&7],@X[2]
1356	vmovdqu	@X[1],32(%rsp)
1357	vpaddd	$Kx,@X[-1&7],@X[3]
1358	vmovdqu	@X[2],64(%rsp)
1359	vmovdqu	@X[3],96(%rsp)
1360___
1361for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
1362    use integer;
1363
1364	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
1365	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
1366	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
1367	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
1368	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
1369	&vpsrld	(@Tx[0],@X[0],31);
1370	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
1371	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
1372	&vpaddd	(@X[0],@X[0],@X[0]);
1373	&vpsrld	(@Tx[1],@Tx[2],30);
1374	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
1375	&vpslld	(@Tx[2],@Tx[2],2);
1376	&vpxor	(@X[0],@X[0],@Tx[1]);
1377	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
1378	&vpaddd	(@Tx[1],@X[0],$Kx);
1379	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1380
1381	push(@X,shift(@X));	# "rotate" X[]
1382}
1383$code.=<<___;
1384	lea	128(%rsp),$frame
1385	jmp	.Loop_avx2
1386.align	32
1387.Loop_avx2:
1388	rorx	\$2,$F,$B
1389	andn	$D,$F,$t0
1390	and	$C,$F
1391	xor	$t0,$F
1392___
1393sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
1394	# at start $f=(b&c)^(~b&d), $b>>>=2
1395	return &bodyx_20_39() if ($rx==19); $rx++;
1396	(
1397	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
1398
1399	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
1400	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
1401	'&andn	($t0,$a,$c)',			# ~b&d for next round
1402
1403	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
1404	'&rorx	($a5,$a,27)',			# a<<<5
1405	'&rorx	($f,$a,2)',			# b>>>2 for next round
1406	'&and	($a,$b)',			# b&c for next round
1407
1408	'&add	($e,$a5)',			# e+=a<<<5
1409	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round
1410
1411	'unshift(@ROTX,pop(@ROTX)); $j++;'
1412	)
1413}
1414
1415sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
1416	# on entry $f=b^c^d, $b>>>=2
1417	return &bodyx_40_59() if ($rx==39); $rx++;
1418	(
1419	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
1420
1421	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
1422	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
1423
1424	'&lea	($e,"($e,$f)")',		# e+=b^c^d
1425	'&rorx	($a5,$a,27)',			# a<<<5
1426	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
1427	'&xor	($a,$b)		if ($j<79)',	# b^c for next round
1428
1429	'&add	($e,$a5)',			# e+=a<<<5
1430	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round
1431
1432	'unshift(@ROTX,pop(@ROTX)); $j++;'
1433	)
1434}
1435
1436sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
1437	# on entry $f=((b^c)&(c^d)), $b>>>=2
1438	$rx++;
1439	(
1440	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
1441
1442	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
1443	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
1444	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
1445	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
1446	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round
1447
1448	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
1449	'&rorx	($a5,$a,27)',			# a<<<5
1450	'&rorx	($f,$a,2)',			# b>>>2 in next round
1451	'&xor	($a,$b)',			# b^c for next round
1452
1453	'&add	($e,$a5)',			# e+=a<<<5
1454	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
1455	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round
1456
1457	'unshift(@ROTX,pop(@ROTX)); $j++;'
1458	)
1459}
1460
1461sub Xupdate_avx2_16_31()		# recall that $Xi starts wtih 4
1462{ use integer;
1463  my $body = shift;
1464  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
1465  my ($a,$b,$c,$d,$e);
1466
1467	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
1468	 eval(shift(@insns));
1469	 eval(shift(@insns));
1470	 eval(shift(@insns));
1471	 eval(shift(@insns));
1472
1473	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
1474	 eval(shift(@insns));
1475	 eval(shift(@insns));
1476	 eval(shift(@insns));
1477
1478	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
1479	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
1480	 eval(shift(@insns));
1481	 eval(shift(@insns));
1482
1483	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
1484	 eval(shift(@insns));
1485	 eval(shift(@insns));
1486	 eval(shift(@insns));
1487	 eval(shift(@insns));
1488
1489	&vpsrld	(@Tx[0],@X[0],31);
1490	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
1491	 eval(shift(@insns));
1492	 eval(shift(@insns));
1493	 eval(shift(@insns));
1494
1495	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
1496	&vpaddd	(@X[0],@X[0],@X[0]);
1497	 eval(shift(@insns));
1498	 eval(shift(@insns));
1499
1500	&vpsrld	(@Tx[1],@Tx[2],30);
1501	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
1502	 eval(shift(@insns));
1503	 eval(shift(@insns));
1504
1505	&vpslld	(@Tx[2],@Tx[2],2);
1506	&vpxor	(@X[0],@X[0],@Tx[1]);
1507	 eval(shift(@insns));
1508	 eval(shift(@insns));
1509
1510	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
1511	 eval(shift(@insns));
1512	 eval(shift(@insns));
1513	 eval(shift(@insns));
1514
1515	&vpaddd	(@Tx[1],@X[0],$Kx);
1516	 eval(shift(@insns));
1517	 eval(shift(@insns));
1518	 eval(shift(@insns));
1519	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1520
1521	 foreach (@insns) { eval; }	# remaining instructions [if any]
1522
1523	$Xi++;
1524	push(@X,shift(@X));	# "rotate" X[]
1525}
1526
1527sub Xupdate_avx2_32_79()
1528{ use integer;
1529  my $body = shift;
1530  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
1531  my ($a,$b,$c,$d,$e);
1532
1533	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
1534	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
1535	 eval(shift(@insns));
1536	 eval(shift(@insns));
1537
1538	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
1539	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
1540	 eval(shift(@insns));
1541	 eval(shift(@insns));
1542	 eval(shift(@insns));
1543
1544	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
1545	 eval(shift(@insns));
1546	 eval(shift(@insns));
1547	 eval(shift(@insns));
1548
1549	&vpsrld	(@Tx[0],@X[0],30);
1550	&vpslld	(@X[0],@X[0],2);
1551	 eval(shift(@insns));
1552	 eval(shift(@insns));
1553	 eval(shift(@insns));
1554
1555	#&vpslld	(@X[0],@X[0],2);
1556	 eval(shift(@insns));
1557	 eval(shift(@insns));
1558	 eval(shift(@insns));
1559
1560	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
1561	 eval(shift(@insns));
1562	 eval(shift(@insns));
1563	 eval(shift(@insns));
1564	 eval(shift(@insns));
1565
1566	&vpaddd	(@Tx[1],@X[0],$Kx);
1567	 eval(shift(@insns));
1568	 eval(shift(@insns));
1569	 eval(shift(@insns));
1570	 eval(shift(@insns));
1571
1572	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1573
1574	 foreach (@insns) { eval; }	# remaining instructions
1575
1576	$Xi++;
1577	push(@X,shift(@X));	# "rotate" X[]
1578}
1579
1580sub Xloop_avx2()
1581{ use integer;
1582  my $body = shift;
1583  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
1584  my ($a,$b,$c,$d,$e);
1585
1586	 foreach (@insns) { eval; }
1587}
1588
1589	&align32();
1590	&Xupdate_avx2_32_79(\&bodyx_00_19);
1591	&Xupdate_avx2_32_79(\&bodyx_00_19);
1592	&Xupdate_avx2_32_79(\&bodyx_00_19);
1593	&Xupdate_avx2_32_79(\&bodyx_00_19);
1594
1595	&Xupdate_avx2_32_79(\&bodyx_20_39);
1596	&Xupdate_avx2_32_79(\&bodyx_20_39);
1597	&Xupdate_avx2_32_79(\&bodyx_20_39);
1598	&Xupdate_avx2_32_79(\&bodyx_20_39);
1599
1600	&align32();
1601	&Xupdate_avx2_32_79(\&bodyx_40_59);
1602	&Xupdate_avx2_32_79(\&bodyx_40_59);
1603	&Xupdate_avx2_32_79(\&bodyx_40_59);
1604	&Xupdate_avx2_32_79(\&bodyx_40_59);
1605
1606	&Xloop_avx2(\&bodyx_20_39);
1607	&Xloop_avx2(\&bodyx_20_39);
1608	&Xloop_avx2(\&bodyx_20_39);
1609	&Xloop_avx2(\&bodyx_20_39);
1610
1611$code.=<<___;
1612	lea	128($inp),$frame
1613	lea	128($inp),%rdi			# borrow $t0
1614	cmp	$num,$frame
1615	cmovae	$inp,$frame			# next or previous block
1616
1617	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
1618	add	0($ctx),@ROTX[0]		# update context
1619	add	4($ctx),@ROTX[1]
1620	add	8($ctx),@ROTX[3]
1621	mov	@ROTX[0],0($ctx)
1622	add	12($ctx),@ROTX[4]
1623	mov	@ROTX[1],4($ctx)
1624	 mov	@ROTX[0],$A			# A=d
1625	add	16($ctx),@ROTX[5]
1626	 mov	@ROTX[3],$a5
1627	mov	@ROTX[3],8($ctx)
1628	 mov	@ROTX[4],$D			# D=b
1629	 #xchg	@ROTX[5],$F			# F=c, C=f
1630	mov	@ROTX[4],12($ctx)
1631	 mov	@ROTX[1],$F			# F=e
1632	mov	@ROTX[5],16($ctx)
1633	#mov	$F,16($ctx)
1634	 mov	@ROTX[5],$E			# E=c
1635	 mov	$a5,$C				# C=f
1636	 #xchg	$F,$E				# E=c, F=e
1637
1638	cmp	$num,$inp
1639	je	.Ldone_avx2
1640___
1641
1642$Xi=4;				# reset variables
1643@X=map("%ymm$_",(4..7,0..3));
1644
1645$code.=<<___;
1646	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
1647	cmp	$num,%rdi			# borrowed $t0
1648	ja	.Last_avx2
1649
1650	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
1651	vmovdqu		-48(%rdi),%xmm1
1652	vmovdqu		-32(%rdi),%xmm2
1653	vmovdqu		-16(%rdi),%xmm3
1654	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
1655	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
1656	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
1657	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
1658	jmp	.Last_avx2
1659
1660.align	32
1661.Last_avx2:
1662	lea	128+16(%rsp),$frame
1663	rorx	\$2,$F,$B
1664	andn	$D,$F,$t0
1665	and	$C,$F
1666	xor	$t0,$F
1667	sub	\$-128,$inp
1668___
1669	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);
1670
1671	&Xloop_avx2	(\&bodyx_00_19);
1672	&Xloop_avx2	(\&bodyx_00_19);
1673	&Xloop_avx2	(\&bodyx_00_19);
1674	&Xloop_avx2	(\&bodyx_00_19);
1675
1676	&Xloop_avx2	(\&bodyx_20_39);
1677	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
1678	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
1679	&Xloop_avx2	(\&bodyx_20_39);
1680	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
1681	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
1682	&Xloop_avx2	(\&bodyx_20_39);
1683	  &vmovdqu	("0(%rsp)",@Tx[0]);
1684	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
1685	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
1686	&Xloop_avx2	(\&bodyx_20_39);
1687	  &vmovdqu	("32(%rsp)",@Tx[1]);
1688	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
1689	  &vpaddd	(@X[2],@X[-2&7],$Kx);
1690
1691	&Xloop_avx2	(\&bodyx_40_59);
1692	&align32	();
1693	  &vmovdqu	("64(%rsp)",@X[2]);
1694	  &vpaddd	(@X[3],@X[-1&7],$Kx);
1695	&Xloop_avx2	(\&bodyx_40_59);
1696	  &vmovdqu	("96(%rsp)",@X[3]);
1697	&Xloop_avx2	(\&bodyx_40_59);
1698	&Xupdate_avx2_16_31(\&bodyx_40_59);
1699
1700	&Xupdate_avx2_16_31(\&bodyx_20_39);
1701	&Xupdate_avx2_16_31(\&bodyx_20_39);
1702	&Xupdate_avx2_16_31(\&bodyx_20_39);
1703	&Xloop_avx2	(\&bodyx_20_39);
1704
1705$code.=<<___;
1706	lea	128(%rsp),$frame
1707
1708	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
1709	add	0($ctx),@ROTX[0]		# update context
1710	add	4($ctx),@ROTX[1]
1711	add	8($ctx),@ROTX[3]
1712	mov	@ROTX[0],0($ctx)
1713	add	12($ctx),@ROTX[4]
1714	mov	@ROTX[1],4($ctx)
1715	 mov	@ROTX[0],$A			# A=d
1716	add	16($ctx),@ROTX[5]
1717	 mov	@ROTX[3],$a5
1718	mov	@ROTX[3],8($ctx)
1719	 mov	@ROTX[4],$D			# D=b
1720	 #xchg	@ROTX[5],$F			# F=c, C=f
1721	mov	@ROTX[4],12($ctx)
1722	 mov	@ROTX[1],$F			# F=e
1723	mov	@ROTX[5],16($ctx)
1724	#mov	$F,16($ctx)
1725	 mov	@ROTX[5],$E			# E=c
1726	 mov	$a5,$C				# C=f
1727	 #xchg	$F,$E				# E=c, F=e
1728
1729	cmp	$num,$inp
1730	jbe	.Loop_avx2
1731
1732.Ldone_avx2:
1733	vzeroupper
1734___
1735$code.=<<___ if ($win64);
1736	movaps	-40-6*16($fp),%xmm6
1737	movaps	-40-5*16($fp),%xmm7
1738	movaps	-40-4*16($fp),%xmm8
1739	movaps	-40-3*16($fp),%xmm9
1740	movaps	-40-2*16($fp),%xmm10
1741	movaps	-40-1*16($fp),%xmm11
1742___
1743$code.=<<___;
1744	mov	-40($fp),%r14
1745	mov	-32($fp),%r13
1746	mov	-24($fp),%r12
1747	mov	-16($fp),%rbp
1748	mov	-8($fp),%rbx
1749	lea	($fp),%rsp
1750.Lepilogue_avx2:
1751	ret
1752.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
1753___
1754}
1755}
1756$code.=<<___;
1757.align	64
1758K_XX_XX:
1759.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1760.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1761.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1762.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1763.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1764.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1765.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1766.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1767.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
1768.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
1769.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
1770___
1771}}}
1772$code.=<<___;
1773.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1774.align	64
1775___
1776
1777# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1778#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1779if ($win64) {
1780$rec="%rcx";
1781$frame="%rdx";
1782$context="%r8";
1783$disp="%r9";
1784
1785$code.=<<___;
1786.extern	__imp_RtlVirtualUnwind
1787.type	se_handler,\@abi-omnipotent
1788.align	16
1789se_handler:
1790	push	%rsi
1791	push	%rdi
1792	push	%rbx
1793	push	%rbp
1794	push	%r12
1795	push	%r13
1796	push	%r14
1797	push	%r15
1798	pushfq
1799	sub	\$64,%rsp
1800
1801	mov	120($context),%rax	# pull context->Rax
1802	mov	248($context),%rbx	# pull context->Rip
1803
1804	lea	.Lprologue(%rip),%r10
1805	cmp	%r10,%rbx		# context->Rip<.Lprologue
1806	jb	.Lcommon_seh_tail
1807
1808	mov	152($context),%rax	# pull context->Rsp
1809
1810	lea	.Lepilogue(%rip),%r10
1811	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1812	jae	.Lcommon_seh_tail
1813
1814	mov	`16*4`(%rax),%rax	# pull saved stack pointer
1815
1816	mov	-8(%rax),%rbx
1817	mov	-16(%rax),%rbp
1818	mov	-24(%rax),%r12
1819	mov	-32(%rax),%r13
1820	mov	-40(%rax),%r14
1821	mov	%rbx,144($context)	# restore context->Rbx
1822	mov	%rbp,160($context)	# restore context->Rbp
1823	mov	%r12,216($context)	# restore context->R12
1824	mov	%r13,224($context)	# restore context->R13
1825	mov	%r14,232($context)	# restore context->R14
1826
1827	jmp	.Lcommon_seh_tail
1828.size	se_handler,.-se_handler
1829___
1830
1831$code.=<<___ if ($shaext);
1832.type	shaext_handler,\@abi-omnipotent
1833.align	16
1834shaext_handler:
1835	push	%rsi
1836	push	%rdi
1837	push	%rbx
1838	push	%rbp
1839	push	%r12
1840	push	%r13
1841	push	%r14
1842	push	%r15
1843	pushfq
1844	sub	\$64,%rsp
1845
1846	mov	120($context),%rax	# pull context->Rax
1847	mov	248($context),%rbx	# pull context->Rip
1848
1849	lea	.Lprologue_shaext(%rip),%r10
1850	cmp	%r10,%rbx		# context->Rip<.Lprologue
1851	jb	.Lcommon_seh_tail
1852
1853	lea	.Lepilogue_shaext(%rip),%r10
1854	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1855	jae	.Lcommon_seh_tail
1856
1857	lea	-8-4*16(%rax),%rsi
1858	lea	512($context),%rdi	# &context.Xmm6
1859	mov	\$8,%ecx
1860	.long	0xa548f3fc		# cld; rep movsq
1861
1862	jmp	.Lcommon_seh_tail
1863.size	shaext_handler,.-shaext_handler
1864___
1865
1866$code.=<<___;
1867.type	ssse3_handler,\@abi-omnipotent
1868.align	16
1869ssse3_handler:
1870	push	%rsi
1871	push	%rdi
1872	push	%rbx
1873	push	%rbp
1874	push	%r12
1875	push	%r13
1876	push	%r14
1877	push	%r15
1878	pushfq
1879	sub	\$64,%rsp
1880
1881	mov	120($context),%rax	# pull context->Rax
1882	mov	248($context),%rbx	# pull context->Rip
1883
1884	mov	8($disp),%rsi		# disp->ImageBase
1885	mov	56($disp),%r11		# disp->HandlerData
1886
1887	mov	0(%r11),%r10d		# HandlerData[0]
1888	lea	(%rsi,%r10),%r10	# prologue label
1889	cmp	%r10,%rbx		# context->Rip<prologue label
1890	jb	.Lcommon_seh_tail
1891
1892	mov	208($context),%rax	# pull context->R11
1893
1894	mov	4(%r11),%r10d		# HandlerData[1]
1895	lea	(%rsi,%r10),%r10	# epilogue label
1896	cmp	%r10,%rbx		# context->Rip>=epilogue label
1897	jae	.Lcommon_seh_tail
1898
1899	lea	-40-6*16(%rax),%rsi
1900	lea	512($context),%rdi	# &context.Xmm6
1901	mov	\$12,%ecx
1902	.long	0xa548f3fc		# cld; rep movsq
1903
1904	mov	-8(%rax),%rbx
1905	mov	-16(%rax),%rbp
1906	mov	-24(%rax),%r12
1907	mov	-32(%rax),%r13
1908	mov	-40(%rax),%r14
1909	mov	%rbx,144($context)	# restore context->Rbx
1910	mov	%rbp,160($context)	# restore context->Rbp
1911	mov	%r12,216($context)	# restore cotnext->R12
1912	mov	%r13,224($context)	# restore cotnext->R13
1913	mov	%r14,232($context)	# restore cotnext->R14
1914
1915.Lcommon_seh_tail:
1916	mov	8(%rax),%rdi
1917	mov	16(%rax),%rsi
1918	mov	%rax,152($context)	# restore context->Rsp
1919	mov	%rsi,168($context)	# restore context->Rsi
1920	mov	%rdi,176($context)	# restore context->Rdi
1921
1922	mov	40($disp),%rdi		# disp->ContextRecord
1923	mov	$context,%rsi		# context
1924	mov	\$154,%ecx		# sizeof(CONTEXT)
1925	.long	0xa548f3fc		# cld; rep movsq
1926
1927	mov	$disp,%rsi
1928	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1929	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1930	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1931	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1932	mov	40(%rsi),%r10		# disp->ContextRecord
1933	lea	56(%rsi),%r11		# &disp->HandlerData
1934	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1935	mov	%r10,32(%rsp)		# arg5
1936	mov	%r11,40(%rsp)		# arg6
1937	mov	%r12,48(%rsp)		# arg7
1938	mov	%rcx,56(%rsp)		# arg8, (NULL)
1939	call	*__imp_RtlVirtualUnwind(%rip)
1940
1941	mov	\$1,%eax		# ExceptionContinueSearch
1942	add	\$64,%rsp
1943	popfq
1944	pop	%r15
1945	pop	%r14
1946	pop	%r13
1947	pop	%r12
1948	pop	%rbp
1949	pop	%rbx
1950	pop	%rdi
1951	pop	%rsi
1952	ret
1953.size	ssse3_handler,.-ssse3_handler
1954
1955.section	.pdata
1956.align	4
1957	.rva	.LSEH_begin_sha1_block_data_order
1958	.rva	.LSEH_end_sha1_block_data_order
1959	.rva	.LSEH_info_sha1_block_data_order
1960___
1961$code.=<<___ if ($shaext);
1962	.rva	.LSEH_begin_sha1_block_data_order_shaext
1963	.rva	.LSEH_end_sha1_block_data_order_shaext
1964	.rva	.LSEH_info_sha1_block_data_order_shaext
1965___
1966$code.=<<___;
1967	.rva	.LSEH_begin_sha1_block_data_order_ssse3
1968	.rva	.LSEH_end_sha1_block_data_order_ssse3
1969	.rva	.LSEH_info_sha1_block_data_order_ssse3
1970___
1971$code.=<<___ if ($avx);
1972	.rva	.LSEH_begin_sha1_block_data_order_avx
1973	.rva	.LSEH_end_sha1_block_data_order_avx
1974	.rva	.LSEH_info_sha1_block_data_order_avx
1975___
1976$code.=<<___ if ($avx>1);
1977	.rva	.LSEH_begin_sha1_block_data_order_avx2
1978	.rva	.LSEH_end_sha1_block_data_order_avx2
1979	.rva	.LSEH_info_sha1_block_data_order_avx2
1980___
1981$code.=<<___;
1982.section	.xdata
1983.align	8
1984.LSEH_info_sha1_block_data_order:
1985	.byte	9,0,0,0
1986	.rva	se_handler
1987___
1988$code.=<<___ if ($shaext);
1989.LSEH_info_sha1_block_data_order_shaext:
1990	.byte	9,0,0,0
1991	.rva	shaext_handler
1992___
1993$code.=<<___;
1994.LSEH_info_sha1_block_data_order_ssse3:
1995	.byte	9,0,0,0
1996	.rva	ssse3_handler
1997	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
1998___
1999$code.=<<___ if ($avx);
2000.LSEH_info_sha1_block_data_order_avx:
2001	.byte	9,0,0,0
2002	.rva	ssse3_handler
2003	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2004___
2005$code.=<<___ if ($avx>1);
2006.LSEH_info_sha1_block_data_order_avx2:
2007	.byte	9,0,0,0
2008	.rva	ssse3_handler
2009	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2010___
2011}
2012
2013####################################################################
2014
2015sub sha1rnds4 {
2016    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
2017      my @opcode=(0x0f,0x3a,0xcc);
2018	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
2019	my $c=$1;
2020	push @opcode,$c=~/^0/?oct($c):$c;
2021	return ".byte\t".join(',',@opcode);
2022    } else {
2023	return "sha1rnds4\t".@_[0];
2024    }
2025}
2026
2027sub sha1op38 {
2028    my $instr = shift;
2029    my %opcodelet = (
2030		"sha1nexte" => 0xc8,
2031  		"sha1msg1"  => 0xc9,
2032		"sha1msg2"  => 0xca	);
2033
2034    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
2035      my @opcode=(0x0f,0x38);
2036      my $rex=0;
2037	$rex|=0x04			if ($2>=8);
2038	$rex|=0x01			if ($1>=8);
2039	unshift @opcode,0x40|$rex	if ($rex);
2040	push @opcode,$opcodelet{$instr};
2041	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
2042	return ".byte\t".join(',',@opcode);
2043    } else {
2044	return $instr."\t".@_[0];
2045    }
2046}
2047
2048foreach (split("\n",$code)) {
2049	s/\`([^\`]*)\`/eval $1/geo;
2050
2051	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
2052	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;
2053
2054	print $_,"\n";
2055}
2056close STDOUT;
2057