#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# well as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.
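#
# For reference, the schedule being offloaded is the FIPS 180-4
# recurrence
#
#	W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1,  16<=t<=79,
#
# evaluated four elements at a time in the SIMD code paths below.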

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
# and loading pair of consecutive blocks to 256-bit %ymm registers)
# did not provide impressive performance improvement till a crucial
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
$avx = 1;

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext=0;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);
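# Note that the round bodies below are emitted with @V rotated by one each
# time (see the unshift(@V,pop(@V)) in the round loops), so the register
# holding $e in one round serves as $d in the next; the rotation is pure
# renaming and never moves data between the working registers.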

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}
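# BODY_00_19 evaluates Ch(b,c,d)=(b&c)|(~b&d) via the equivalent
# ((c^d)&b)^d, which needs only a single temporary register.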

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}
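# BODY_40_59 evaluates Maj(b,c,d) as (c&d) + (b&(c^d)); the two terms have
# no bits in common, so the two separate additions into $e amount to adding
# Maj(b,c,d) itself.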

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
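# All 80 rounds are fully unrolled: 20 Ch rounds, 20 parity rounds, 20 Maj
# rounds and a final 20 parity rounds.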
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
___
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
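# Each sha1rnds4 performs four rounds and its immediate selects the
# round-constant/function group, so int($i/5) steps it through 0..3 over
# the 16 four-round bundles emitted above; the final four bundles (rounds
# 64-79) are spelled out explicitly below.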
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r14";
my $fp="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}
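# The forward jump in align32 skips over the .align filler, so the padding
# bytes are never executed; the 32-byte alignment itself is what the
# "Decoded ICache" reference above is about.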

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	mov	%rsp,$fp	# frame pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
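# Thus a call such as &psrld(@Tx[0],31) below simply appends
# "\tpsrld\t\$31,%xmm8" to $code: the last argument becomes an immediate
# when it is numeric, and the operand order is reversed to AT&T convention.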

sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
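# The eval(shift(@insns)) calls above feed one scalar round instruction at
# a time in between the SIMD schedule updates, keeping the integer and
# vector units busy simultaneously; the "# rol"/"# ror" markers track where
# the round bodies' rotate instructions land.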

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
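# body_00_19/20_39/40_59 return lists of single-instruction strings rather
# than emitting code directly, precisely so that the Xupdate_* routines
# above can eval() them one at a time in between the SIMD instructions.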
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	mov	%rsp,$fp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___

if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
	mov	%rsp,$fp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	 lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	 cmp	$num,$frame
	 cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu		($inp),%xmm0
	vmovdqu		16($inp),%xmm1
	vmovdqu		32($inp),%xmm2
	vmovdqu		48($inp),%xmm3
	lea		64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb		@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb		@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb		@X[2],@X[-2&7],@X[-2&7]
	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
	vpshufb		@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
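# Note the two-blocks-at-a-time layout of the AVX2 path: the low lane of
# each %ymm register holds schedule words for the current block and the
# high lane (filled by vinserti128 above) those of the next block, so one
# pass over the message schedule serves both.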
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
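# The bodyx_* variants lean on BMI1/BMI2: andn computes ~b&d in one step
# and rorx rotates without touching the flags, which is what keeps the
# critical path at the 2-3 cycles per round quoted above.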

sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 foreach (@insns) { eval; }
}

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi			# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame			# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
	cmp	$num,%rdi			# borrowed $t0
	ja	.Last_avx2

	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
	vmovdqu		-48(%rdi),%xmm1
	vmovdqu		-32(%rdi),%xmm2
	vmovdqu		-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);

$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
	mov	-32($fp),%r13
	mov	-24($fp),%r12
	mov	-16($fp),%rbp
	mov	-8($fp),%rbx
	lea	($fp),%rsp
.Lepilogue_avx2:
	ret
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
}
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

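# Encoders for the SHA extension instructions: the mnemonics are rewritten
# as raw opcode bytes (0f 3a cc /r ib for sha1rnds4; 0f 38 c8/c9/ca for
# sha1nexte/sha1msg1/sha1msg2), so the output assembles even with
# assemblers that predate these instructions.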
sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
  		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
      my $rex=0;
	$rex|=0x04			if ($2>=8);
	$rex|=0x01			if ($1>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;