• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/env perl
2# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16#
17# sha1_block procedure for x86_64.
18#
19# It was brought to my attention that on EM64T compiler-generated code
20# was far behind 32-bit assembler implementation. This is unlike on
21# Opteron where compiler-generated code was only 15% behind 32-bit
22# assembler, which originally made it hard to motivate the effort.
23# There was a suggestion to mechanically translate 32-bit code, but I
24# dismissed it, reasoning that x86_64 offers enough register bank
25# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
26# implementation:-) However! While 64-bit code does perform better
27# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
28# x86_64 does offer larger *addressable* bank, but out-of-order core
29# reaches for even more registers through dynamic aliasing, and EM64T
30# core must have managed to run-time optimize even 32-bit code just as
31# well as the 64-bit one. Performance improvement is summarized in the
32# following table:
33#
34#		gcc 3.4		32-bit asm	cycles/byte
35# Opteron	+45%		+20%		6.8
36# Xeon P4	+65%		+0%		9.9
37# Core2		+60%		+10%		7.0
38
39# August 2009.
40#
41# The code was revised to minimize code size and to maximize
42# "distance" between instructions producing input to 'lea'
43# instruction and the 'lea' instruction itself, which is essential
44# for Intel Atom core.
45
46# October 2010.
47#
48# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
49# is to offload message schedule denoted by Wt in NIST specification,
50# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
51# for background and implementation details. The only difference from
52# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
53# to free temporary registers.
54
55# April 2011.
56#
57# Add AVX code path. See sha1-586.pl for further information.
58
59# May 2013.
60#
61# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
62# and loading pair of consecutive blocks to 256-bit %ymm registers)
63# did not provide impressive performance improvement till a crucial
64# hint regarding the number of Xupdate iterations to pre-compute in
65# advance was provided by Ilya Albrekht of Intel Corp.
66
67# March 2014.
68#
69# Add support for Intel SHA Extensions.
70
71######################################################################
72# Current performance is summarized in the following table. Numbers are
73# CPU clock cycles spent to process single byte (less is better).
74#
75#		x86_64		SSSE3		AVX[2]
76# P4		9.05		-
77# Opteron	6.26		-
78# Core2		6.55		6.05/+8%	-
79# Westmere	6.73		5.30/+27%	-
80# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
81# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
82# Haswell	5.45		4.15/+31%	3.57/+53%
83# Skylake	5.18		4.06/+28%	3.54/+46%
84# Bulldozer	9.11		5.95/+53%
85# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
86# VIA Nano	9.32		7.15/+30%
87# Atom		10.3		9.17/+12%
88# Silvermont	13.1(*)		9.37/+40%
89# Knights L	13.2(*)		9.68/+36%	8.30/+59%
90# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
91#
92# (*)	obviously suboptimal result, nothing was done about it,
93#	because SSSE3 code is compiled unconditionally;
94# (**)	SHAEXT result
95
# Command line is "[flavour] output".  A first argument containing a dot is
# taken to be the output file name, in which case no flavour is set.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 builds get extra code that saves and restores %xmm6 and up.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script first, then in the shared
# perlasm directory three levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
$avx = 1;

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext=0;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

# From here on everything written to STDOUT is piped through the translator.
# Fail loudly if the pipe cannot be set up; otherwise the script would run to
# completion and silently produce nothing.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
123
# Register names used by the integer-only code path.  The three arguments
# arrive in %rdi/%rsi/%rdx and are copied to the registers named below by
# the generated prologue.
124$ctx="%rdi";	# 1st arg
125$inp="%rsi";	# 2nd arg
126$num="%rdx";	# 3rd arg
127
128# reassign arguments in order to produce more compact code
129$ctx="%r8";
130$inp="%r9";
131$num="%r10";
132
# Scratch registers used by the BODY_* round generators.
133$t0="%eax";
134$t1="%ebx";
135$t2="%ecx";
# Registers carrying message words; the BODY_* generators rotate this list
# after every round.
136@xi=("%edx","%ebp","%r14d");
# Registers holding the five working variables.
137$A="%esi";
138$B="%edi";
139$C="%r11d";
140$D="%r12d";
141$E="%r13d";
142
# Passed to the BODY_* generators and rotated by the caller after each round.
143@V=($A,$B,$C,$D,$E);
144
# BODY_00_19($i,$a,$b,$c,$d,$e): append the code for round $i of the
# integer-only path (constant 0x5a827999) to $code.  $a..$e are the names of
# the registers holding the working variables for this round.
145sub BODY_00_19 {
146my ($i,$a,$b,$c,$d,$e)=@_;
147my $j=$i+1;
# Round 0 only: load and byte-swap the first input word into $xi[0].
148$code.=<<___ if ($i==0);
149	mov	`4*$i`($inp),$xi[0]
150	bswap	$xi[0]
151___
# Rounds 0..14: the word for the next round is loaded from the input block
# and byte-swapped while the current word is saved on the stack.
152$code.=<<___ if ($i<15);
153	mov	`4*$j`($inp),$xi[1]
154	mov	$d,$t0
155	mov	$xi[0],`4*$i`(%rsp)
156	mov	$a,$t2
157	bswap	$xi[1]
158	xor	$c,$t0
159	rol	\$5,$t2
160	and	$b,$t0
161	lea	0x5a827999($xi[0],$e),$e
162	add	$t2,$e
163	xor	$d,$t0
164	rol	\$30,$b
165	add	$t0,$e
166___
# Rounds 15 and up: the word for the next round is produced by xor-ing three
# stack slots into $xi[1] and rotating the result left by one bit.
167$code.=<<___ if ($i>=15);
168	xor	`4*($j%16)`(%rsp),$xi[1]
169	mov	$d,$t0
170	mov	$xi[0],`4*($i%16)`(%rsp)
171	mov	$a,$t2
172	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
173	xor	$c,$t0
174	rol	\$5,$t2
175	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
176	and	$b,$t0
177	lea	0x5a827999($xi[0],$e),$e
178	rol	\$30,$b
179	xor	$d,$t0
180	add	$t2,$e
181	rol	\$1,$xi[1]
182	add	$t0,$e
183___
# Rotate the register names so that this round's $xi[1] is the next round's
# $xi[0].
184push(@xi,shift(@xi));
185}
186
# BODY_20_39($i,$a,$b,$c,$d,$e): append the code for a round that combines
# $b, $c and $d with xor.  The caller uses it for rounds 20..39 and 60..79;
# the constant is selected by whether $i is below 40.
187sub BODY_20_39 {
188my ($i,$a,$b,$c,$d,$e)=@_;
189my $j=$i+1;
190my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
# All rounds but the last also prepare the word for the next round; the
# store of the current word to the stack is emitted only while $i<72.
191$code.=<<___ if ($i<79);
192	xor	`4*($j%16)`(%rsp),$xi[1]
193	mov	$b,$t0
194	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
195	mov	$a,$t2
196	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
197	xor	$d,$t0
198	rol	\$5,$t2
199	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
200	lea	$K($xi[0],$e),$e
201	xor	$c,$t0
202	add	$t2,$e
203	rol	\$30,$b
204	add	$t0,$e
205	rol	\$1,$xi[1]
206___
# Round 79: no further message word is prepared.
207$code.=<<___ if ($i==79);
208	mov	$b,$t0
209	mov	$a,$t2
210	xor	$d,$t0
211	lea	$K($xi[0],$e),$e
212	rol	\$5,$t2
213	xor	$c,$t0
214	add	$t2,$e
215	rol	\$30,$b
216	add	$t0,$e
217___
# Rotate the message-word register names for the next round.
218push(@xi,shift(@xi));
219}
220
# BODY_40_59($i,$a,$b,$c,$d,$e): append the code for round $i using constant
# 0x8f1bbcdc.  The round adds ($c & $d) and (($c ^ $d) & $b) to $e as two
# separate terms, and prepares the message word for the next round.
221sub BODY_40_59 {
222my ($i,$a,$b,$c,$d,$e)=@_;
223my $j=$i+1;
224$code.=<<___;
225	xor	`4*($j%16)`(%rsp),$xi[1]
226	mov	$d,$t0
227	mov	$xi[0],`4*($i%16)`(%rsp)
228	mov	$d,$t1
229	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
230	and	$c,$t0
231	mov	$a,$t2
232	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
233	lea	0x8f1bbcdc($xi[0],$e),$e
234	xor	$c,$t1
235	rol	\$5,$t2
236	add	$t0,$e
237	rol	\$1,$xi[1]
238	and	$b,$t1
239	add	$t2,$e
240	rol	\$30,$b
241	add	$t1,$e
242___
# Rotate the message-word register names for the next round.
243push(@xi,shift(@xi));
244}
245
# Entry point: load the three OPENSSL_ia32cap_P words and branch to the
# integer-only code (.Lialu) when the SSSE3 bit is clear.
246$code.=<<___;
247.text
248.extern	OPENSSL_ia32cap_P
249
250.globl	sha1_block_data_order
251.type	sha1_block_data_order,\@function,3
252.align	16
253sha1_block_data_order:
254.cfi_startproc
255	leaq	OPENSSL_ia32cap_P(%rip),%r10
256	mov	0(%r10),%r9d
257	mov	4(%r10),%r8d
258	mov	8(%r10),%r10d
259	test	\$`1<<9`,%r8d		# check SSSE3 bit
260	jz	.Lialu
261___
# The dispatch tests below are emitted only for the code paths that are
# enabled by $shaext and $avx.
262$code.=<<___ if ($shaext);
263	test	\$`1<<29`,%r10d		# check SHA bit
264	jnz	_shaext_shortcut
265___
266$code.=<<___ if ($avx>1);
267	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
268	cmp	\$`1<<3|1<<5|1<<8`,%r10d
269	je	_avx2_shortcut
270___
271$code.=<<___ if ($avx);
272	and	\$`1<<28`,%r8d		# mask AVX bit
273	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
274	or	%r9d,%r8d
275	cmp	\$`1<<28|1<<30`,%r8d
276	je	_avx_shortcut
277___
# Fall back to SSSE3; then the prologue of the integer-only path, which saves
# registers, aligns the stack to 64 bytes, keeps the original %rsp at
# 64(%rsp) and loads the five state words.
278$code.=<<___;
279	jmp	_ssse3_shortcut
280
281.align	16
282.Lialu:
283	mov	%rsp,%rax
284.cfi_def_cfa_register	%rax
285	push	%rbx
286.cfi_push	%rbx
287	push	%rbp
288.cfi_push	%rbp
289	push	%r12
290.cfi_push	%r12
291	push	%r13
292.cfi_push	%r13
293	push	%r14
294.cfi_push	%r14
295	mov	%rdi,$ctx	# reassigned argument
296	sub	\$`8+16*4`,%rsp
297	mov	%rsi,$inp	# reassigned argument
298	and	\$-64,%rsp
299	mov	%rdx,$num	# reassigned argument
300	mov	%rax,`16*4`(%rsp)
301.cfi_cfa_expression	%rsp+64,deref,+8
302.Lprologue:
303
304	mov	0($ctx),$A
305	mov	4($ctx),$B
306	mov	8($ctx),$C
307	mov	12($ctx),$D
308	mov	16($ctx),$E
309	jmp	.Lloop
310
311.align	16
312.Lloop:
313___
# Generate the 80 rounds of the integer-only path.  Rounds 0..19 and 40..59
# have dedicated generators; rounds 20..39 and 60..79 share BODY_20_39.  The
# working-variable register names are rotated after every round, and $i is
# left at 80 as before.
for ($i=0; $i<80; $i++) {
	my $round = ($i<20)            ? \&BODY_00_19
	          : ($i>=40 && $i<60)  ? \&BODY_40_59
	          :                      \&BODY_20_39;
	&$round($i,@V);
	unshift(@V,pop(@V));
}
# End of the integer-only loop body: add the previous state to the working
# variables, store the result, advance $inp by 64 bytes and loop until $num
# reaches zero.  Then restore the saved registers and the original stack
# pointer (kept at 64(%rsp) by the prologue) and return.
318$code.=<<___;
319	add	0($ctx),$A
320	add	4($ctx),$B
321	add	8($ctx),$C
322	add	12($ctx),$D
323	add	16($ctx),$E
324	mov	$A,0($ctx)
325	mov	$B,4($ctx)
326	mov	$C,8($ctx)
327	mov	$D,12($ctx)
328	mov	$E,16($ctx)
329
330	sub	\$1,$num
331	lea	`16*4`($inp),$inp
332	jnz	.Lloop
333
334	mov	`16*4`(%rsp),%rsi
335.cfi_def_cfa	%rsi,8
336	mov	-40(%rsi),%r14
337.cfi_restore	%r14
338	mov	-32(%rsi),%r13
339.cfi_restore	%r13
340	mov	-24(%rsi),%r12
341.cfi_restore	%r12
342	mov	-16(%rsi),%rbp
343.cfi_restore	%rbp
344	mov	-8(%rsi),%rbx
345.cfi_restore	%rbx
346	lea	(%rsi),%rsp
347.cfi_def_cfa_register	%rsp
348.Lepilogue:
349	ret
350.cfi_endproc
351.size	sha1_block_data_order,.-sha1_block_data_order
352___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
# This path uses the arguments in their incoming registers and keeps the
# whole state in %xmm registers: $ABCD holds four state words, $E/$E_
# alternate as the fifth, and @MSG holds the four message vectors.
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
# Win64 only: make room for and save %xmm6-%xmm9.
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
# Load the state and the first input block, byte-swapping the message words.
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
# Each iteration emits two groups of four rounds and rotates @MSG by two.
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
# Last 16 rounds, interleaved with loading and byte-swapping the next block.
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
# Win64 only: restore %xmm6-%xmm9 and the stack pointer.
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
# The return instruction is emitted before .cfi_endproc so that it is still
# covered by the function's unwind information.
$code.=<<___;
	ret
.cfi_endproc
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
469{{{
# State shared by the SIMD code generators below.
# $Xi counts message-schedule vectors; @X and @Tx are rotated by the Xupdate
# routines.
470my $Xi=4;
471my @X=map("%xmm$_",(4..7,0..3));
472my @Tx=map("%xmm$_",(8..10));
473my $Kx="%xmm11";
# Note that this also reassigns the global $A..$E register names.
474my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
475my @T=("%esi","%edi");
# $j is advanced by the generated round snippets, $rx by the body_* routines.
476my $j=0;
477my $rx=0;
# $K_XX_XX is loaded with the address K_XX_XX+64 and $fp with the caller's
# %rsp by the generated prologues.
478my $K_XX_XX="%r14";
479my $fp="%r11";
480
# Rotate helpers referenced by the round snippets; the AVX section below
# declares its own versions.
481my $_rol=sub { &rol(@_) };
482my $_ror=sub { &ror(@_) };
483
# align32(): append a jump over 32-byte alignment padding to $code.  Every
# call uses a fresh, numbered local label; the counter is private to this
# routine.
{ my $sn;
sub align32() {
  $sn += 1;
  $code .= <<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}
494
# SSSE3 entry point: keep the caller's %rsp in $fp, save registers and
# reserve 64 bytes of stack (plus room for six %xmm registers on Win64).
495$code.=<<___;
496.type	sha1_block_data_order_ssse3,\@function,3
497.align	16
498sha1_block_data_order_ssse3:
499_ssse3_shortcut:
500.cfi_startproc
501	mov	%rsp,$fp	# frame pointer
502.cfi_def_cfa_register	$fp
503	push	%rbx
504.cfi_push	%rbx
505	push	%rbp
506.cfi_push	%rbp
507	push	%r12
508.cfi_push	%r12
509	push	%r13		# redundant, done to share Win64 SE handler
510.cfi_push	%r13
511	push	%r14
512.cfi_push	%r14
513	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
514___
# Win64 only: save %xmm6-%xmm11 below the pushed registers.
515$code.=<<___ if ($win64);
516	movaps	%xmm6,-40-6*16($fp)
517	movaps	%xmm7,-40-5*16($fp)
518	movaps	%xmm8,-40-4*16($fp)
519	movaps	%xmm9,-40-3*16($fp)
520	movaps	%xmm10,-40-2*16($fp)
521	movaps	%xmm11,-40-1*16($fp)
522.Lprologue_ssse3:
523___
# Align the stack, turn $num into the end-of-input pointer, load the state,
# then load and byte-swap the first block and store the first three vectors
# with the round constant added for the scalar code to pick up.
524$code.=<<___;
525	and	\$-64,%rsp
526	mov	%rdi,$ctx	# reassigned argument
527	mov	%rsi,$inp	# reassigned argument
528	mov	%rdx,$num	# reassigned argument
529
530	shl	\$6,$num
531	add	$inp,$num
532	lea	K_XX_XX+64(%rip),$K_XX_XX
533
534	mov	0($ctx),$A		# load context
535	mov	4($ctx),$B
536	mov	8($ctx),$C
537	mov	12($ctx),$D
538	mov	$B,@T[0]		# magic seed
539	mov	16($ctx),$E
540	mov	$C,@T[1]
541	xor	$D,@T[1]
542	and	@T[1],@T[0]
543
544	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
545	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
546	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
547	movdqu	16($inp),@X[-3&7]
548	movdqu	32($inp),@X[-2&7]
549	movdqu	48($inp),@X[-1&7]
550	pshufb	@X[2],@X[-4&7]		# byte swap
551	pshufb	@X[2],@X[-3&7]
552	pshufb	@X[2],@X[-2&7]
553	add	\$64,$inp
554	paddd	@Tx[1],@X[-4&7]		# add K_00_19
555	pshufb	@X[2],@X[-1&7]
556	paddd	@Tx[1],@X[-3&7]
557	paddd	@Tx[1],@X[-2&7]
558	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
559	psubd	@Tx[1],@X[-4&7]		# restore X[]
560	movdqa	@X[-3&7],16(%rsp)
561	psubd	@Tx[1],@X[-3&7]
562	movdqa	@X[-2&7],32(%rsp)
563	psubd	@Tx[1],@X[-2&7]
564	jmp	.Loop_ssse3
565___
566
# Thunk for [simplified] 32-bit style perlasm: a call to any undefined
# routine, e.g. &add($dst,$src), appends "\tadd\t$src,$dst\n" to $code.  The
# routine name becomes the mnemonic and the operands are emitted in reverse
# order; a purely numeric last argument gets a '$' prefix.
sub AUTOLOAD()
{
  (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# strip the package qualifier
  my $last = pop;
  $last = "\$$last" if ($last*1 eq $last);	# numeric => immediate
  my @operands = ($last, reverse @_);
  $code .= "\t$mnemonic\t" . join(',', @operands) . "\n";
}
573
# Xupdate_ssse3_16_31($body): emit the SSE instructions that compute one
# vector of message-schedule words into @X[0], interleaved with the scalar
# round snippets returned by four calls of $body.  The result with the round
# constant added is stored on the stack for the scalar code.  Finally $Xi is
# advanced and @X and @Tx are rotated.
574sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
575{ use integer;
576  my $body = shift;
577  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
# These lexicals are assigned by the eval'ed snippets.
578  my ($a,$b,$c,$d,$e);
579
580	 eval(shift(@insns));		# ror
581	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
582	 eval(shift(@insns));
583	&movdqa	(@Tx[0],@X[-1&7]);
584	  &paddd	(@Tx[1],@X[-1&7]);
585	 eval(shift(@insns));
586	 eval(shift(@insns));
587
588	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
589	 eval(shift(@insns));
590	 eval(shift(@insns));		# rol
591	 eval(shift(@insns));
592	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
593	 eval(shift(@insns));
594	 eval(shift(@insns));
595
596	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
597	 eval(shift(@insns));
598	 eval(shift(@insns));		# ror
599	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
600	 eval(shift(@insns));
601	 eval(shift(@insns));
602	 eval(shift(@insns));
603
604	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
605	 eval(shift(@insns));
606	 eval(shift(@insns));		# rol
607	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
608	 eval(shift(@insns));
609	 eval(shift(@insns));
610
611	&movdqa	(@Tx[2],@X[0]);
612	 eval(shift(@insns));
613	 eval(shift(@insns));
614	 eval(shift(@insns));		# ror
615	&movdqa	(@Tx[0],@X[0]);
616	 eval(shift(@insns));
617
618	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
619	&paddd	(@X[0],@X[0]);
620	 eval(shift(@insns));
621	 eval(shift(@insns));
622
623	&psrld	(@Tx[0],31);
624	 eval(shift(@insns));
625	 eval(shift(@insns));		# rol
626	 eval(shift(@insns));
627	&movdqa	(@Tx[1],@Tx[2]);
628	 eval(shift(@insns));
629	 eval(shift(@insns));
630
631	&psrld	(@Tx[2],30);
632	 eval(shift(@insns));
633	 eval(shift(@insns));		# ror
634	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
635	 eval(shift(@insns));
636	 eval(shift(@insns));
637	 eval(shift(@insns));
638
639	&pslld	(@Tx[1],2);
640	&pxor	(@X[0],@Tx[2]);
641	 eval(shift(@insns));
642	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
643	 eval(shift(@insns));		# rol
644	 eval(shift(@insns));
645	 eval(shift(@insns));
646
647	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
648	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
649
650	 foreach (@insns) { eval; }	# remaining instructions [if any]
651
652  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
653		push(@Tx,shift(@Tx));
654}
655
# Xupdate_ssse3_32_79($body): like Xupdate_ssse3_16_31, but uses the schedule
# form that xors "X[-32]", "X[-16]", "X[-28]" and "X[-6]" and rotates left by
# two.  Several scalar snippets are consumed conditionally, depending on
# whether the upcoming snippet is a rotate, so the interleaving adapts to the
# body in use.  The next round constant is loaded when $Xi is a multiple of 5.
656sub Xupdate_ssse3_32_79()
657{ use integer;
658  my $body = shift;
659  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
# These lexicals are assigned by the eval'ed snippets.
660  my ($a,$b,$c,$d,$e);
661
662	 eval(shift(@insns))		if ($Xi==8);
663	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
664	 eval(shift(@insns))		if ($Xi==8);
665	 eval(shift(@insns));		# body_20_39
666	 eval(shift(@insns));
667	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
668	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
669	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
670	 eval(shift(@insns));
671	 eval(shift(@insns));		# rol
672
673	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
674	 eval(shift(@insns));
675	 eval(shift(@insns));
676	if ($Xi%5) {
677	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
678	} else {			# ... or load next one
679	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
680	}
681	 eval(shift(@insns));		# ror
682	  &paddd	(@Tx[1],@X[-1&7]);
683	 eval(shift(@insns));
684
685	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
686	 eval(shift(@insns));		# body_20_39
687	 eval(shift(@insns));
688	 eval(shift(@insns));
689	 eval(shift(@insns));		# rol
690	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
691
692	&movdqa	(@Tx[0],@X[0]);
693	 eval(shift(@insns));
694	 eval(shift(@insns));
695	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
696	 eval(shift(@insns));		# ror
697	 eval(shift(@insns));
698	 eval(shift(@insns));		# body_20_39
699
700	&pslld	(@X[0],2);
701	 eval(shift(@insns));
702	 eval(shift(@insns));
703	&psrld	(@Tx[0],30);
704	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
705	 eval(shift(@insns));
706	 eval(shift(@insns));
707	 eval(shift(@insns));		# ror
708
709	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
710	 eval(shift(@insns));
711	 eval(shift(@insns));		# body_20_39
712	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
713	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
714	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
715	 eval(shift(@insns));
716	 eval(shift(@insns));		# rol
717	 eval(shift(@insns));
718	 eval(shift(@insns));
719	 eval(shift(@insns));		# rol
720	 eval(shift(@insns));
721
722	 foreach (@insns) { eval; }	# remaining instructions
723
724  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
725		push(@Tx,shift(@Tx));
726}
727
# Xuplast_ssse3_80($body): emit four scalar rounds together with the store of
# the last schedule vector plus constant, then compare $inp with $num and
# branch to .Ldone_ssse3 when the input is exhausted.  Otherwise the code
# that follows starts loading and byte-swapping the next block.  $Xi is reset
# to 0 for the Xloop_ssse3 calls that follow.
728sub Xuplast_ssse3_80()
729{ use integer;
730  my $body = shift;
731  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
# These lexicals are assigned by the eval'ed snippets.
732  my ($a,$b,$c,$d,$e);
733
734	 eval(shift(@insns));
735	 eval(shift(@insns));
736	 eval(shift(@insns));
737	 eval(shift(@insns));
738	  &paddd	(@Tx[1],@X[-1&7]);
739	 eval(shift(@insns));
740	 eval(shift(@insns));
741
742	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU
743
744	 foreach (@insns) { eval; }		# remaining instructions
745
746	&cmp	($inp,$num);
747	&je	(".Ldone_ssse3");
748
# Rotate @Tx in the direction opposite to the Xupdate routines.
749	unshift(@Tx,pop(@Tx));
750
751	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
752	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
753	&movdqu	(@X[-4&7],"0($inp)");		# load input
754	&movdqu	(@X[-3&7],"16($inp)");
755	&movdqu	(@X[-2&7],"32($inp)");
756	&movdqu	(@X[-1&7],"48($inp)");
757	&pshufb	(@X[-4&7],@X[2]);		# byte swap
758	&add	($inp,64);
759
760  $Xi=0;
761}
762
# Xloop_ssse3($body): emit four scalar rounds interleaved with preparing one
# vector of the next block: byte-swap the following vector, add the round
# constant to the current one, store it at 16*$Xi(%rsp) and subtract the
# constant again.  Advances $Xi.
763sub Xloop_ssse3()
764{ use integer;
765  my $body = shift;
766  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
# These lexicals are assigned by the eval'ed snippets.
767  my ($a,$b,$c,$d,$e);
768
769	 eval(shift(@insns));
770	 eval(shift(@insns));
771	 eval(shift(@insns));
772	&pshufb	(@X[($Xi-3)&7],@X[2]);
773	 eval(shift(@insns));
774	 eval(shift(@insns));
775	 eval(shift(@insns));
776	 eval(shift(@insns));
777	&paddd	(@X[($Xi-4)&7],@Tx[1]);
778	 eval(shift(@insns));
779	 eval(shift(@insns));
780	 eval(shift(@insns));
781	 eval(shift(@insns));
782	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
783	 eval(shift(@insns));
784	 eval(shift(@insns));
785	 eval(shift(@insns));
786	 eval(shift(@insns));
787	&psubd	(@X[($Xi-4)&7],@Tx[1]);
788
789	foreach (@insns) { eval; }
790  $Xi++;
791}
792
# Xtail_ssse3($body): emit four scalar rounds with no SIMD work interleaved.
# $body is called four times and every snippet it returns is eval'ed in
# order.
sub Xtail_ssse3()
{ use integer;
  my $round = shift;
  my ($a,$b,$c,$d,$e);		# assigned by the eval'ed snippets
  my @snippets = map { &$round } 1..4;	# 32 instructions

  eval for @snippets;
}
801
# body_00_19(): return the list of code snippets (strings to be eval'ed by
# the caller) for one scalar round of the first group.  Once $rx has reached
# 19 the call is forwarded to body_20_39 instead.  The last snippet advances
# $j and rotates @V and @T.
802sub body_00_19 () {	# ((c^d)&b)^d
803	# on start @T[0]=(c^d)&b
804	return &body_20_39() if ($rx==19); $rx++;
805	(
806	'($a,$b,$c,$d,$e)=@V;'.
807	'&$_ror	($b,$j?7:2)',	# $b>>>2
808	'&xor	(@T[0],$d)',
809	'&mov	(@T[1],$a)',	# $b for next round
810
811	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
812	'&xor	($b,$c)',	# $c^$d for next round
813
814	'&$_rol	($a,5)',
815	'&add	($e,@T[0])',
816	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round
817
818	'&xor	($b,$c)',	# restore $b
819	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
820	);
821}
822
# body_20_39(): return the list of code snippets for one scalar round of the
# xor-only round function.  Once $rx has reached 39 the call is forwarded to
# body_40_59 instead.  Some snippets are conditional on the round counter $j
# at the time they are eval'ed.
823sub body_20_39 () {	# b^d^c
824	# on entry @T[0]=b^d
825	return &body_40_59() if ($rx==39); $rx++;
826	(
827	'($a,$b,$c,$d,$e)=@V;'.
828	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
829	'&xor	(@T[0],$d)	if($j==19);'.
830	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
831	'&mov	(@T[1],$a)',	# $b for next round
832
833	'&$_rol	($a,5)',
834	'&add	($e,@T[0])',
835	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round
836
837	'&$_ror	($b,7)',	# $b>>>2
838	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
839	);
840}
841
# body_40_59(): return the list of code snippets for one scalar round of the
# third group and advance $rx.  Snippets conditional on $j handle the
# transitions into and out of this group.
842sub body_40_59 () {	# ((b^c)&(c^d))^c
843	# on entry @T[0]=(b^c), (c^=d)
844	$rx++;
845	(
846	'($a,$b,$c,$d,$e)=@V;'.
847	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
848	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
849	'&xor	($c,$d)		if ($j>=40)',	# restore $c
850
851	'&$_ror	($b,7)',	# $b>>>2
852	'&mov	(@T[1],$a)',	# $b for next round
853	'&xor	(@T[0],$c)',
854
855	'&$_rol	($a,5)',
856	'&add	($e,@T[0])',
857	'&xor	(@T[1],$c)	if ($j==59);'.
858	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round
859
860	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
861	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
862	);
863}
# Generate the body of the SSSE3 loop: sixteen schedule updates (four of the
# 16_31 form, twelve of the 32_79 form) and the Xuplast step, each carrying
# four scalar rounds.
864$code.=<<___;
865.align	16
866.Loop_ssse3:
867___
868	&Xupdate_ssse3_16_31(\&body_00_19);
869	&Xupdate_ssse3_16_31(\&body_00_19);
870	&Xupdate_ssse3_16_31(\&body_00_19);
871	&Xupdate_ssse3_16_31(\&body_00_19);
872	&Xupdate_ssse3_32_79(\&body_00_19);
873	&Xupdate_ssse3_32_79(\&body_20_39);
874	&Xupdate_ssse3_32_79(\&body_20_39);
875	&Xupdate_ssse3_32_79(\&body_20_39);
876	&Xupdate_ssse3_32_79(\&body_20_39);
877	&Xupdate_ssse3_32_79(\&body_20_39);
878	&Xupdate_ssse3_32_79(\&body_40_59);
879	&Xupdate_ssse3_32_79(\&body_40_59);
880	&Xupdate_ssse3_32_79(\&body_40_59);
881	&Xupdate_ssse3_32_79(\&body_40_59);
882	&Xupdate_ssse3_32_79(\&body_40_59);
883	&Xupdate_ssse3_32_79(\&body_20_39);
884	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"
885
# Remember the generator state at the point of the jump so that the code at
# .Ldone_ssse3 can be generated from the same state.
886				$saved_j=$j; @saved_V=@V;
887
888	&Xloop_ssse3(\&body_20_39);
889	&Xloop_ssse3(\&body_20_39);
890	&Xloop_ssse3(\&body_20_39);
891
# More input follows: fold the working variables into the context, set up
# @T[0] for the first round again and loop.
892$code.=<<___;
893	add	0($ctx),$A			# update context
894	add	4($ctx),@T[0]
895	add	8($ctx),$C
896	add	12($ctx),$D
897	mov	$A,0($ctx)
898	add	16($ctx),$E
899	mov	@T[0],4($ctx)
900	mov	@T[0],$B			# magic seed
901	mov	$C,8($ctx)
902	mov	$C,@T[1]
903	mov	$D,12($ctx)
904	xor	$D,@T[1]
905	mov	$E,16($ctx)
906	and	@T[1],@T[0]
907	jmp	.Loop_ssse3
908
909.align	16
910.Ldone_ssse3:
911___
# Restore the generator state saved above, then emit the remaining rounds
# without any SIMD work.
912				$j=$saved_j; @V=@saved_V;
913
914	&Xtail_ssse3(\&body_20_39);
915	&Xtail_ssse3(\&body_20_39);
916	&Xtail_ssse3(\&body_20_39);
917
# Final context update.
918$code.=<<___;
919	add	0($ctx),$A			# update context
920	add	4($ctx),@T[0]
921	add	8($ctx),$C
922	mov	$A,0($ctx)
923	add	12($ctx),$D
924	mov	@T[0],4($ctx)
925	add	16($ctx),$E
926	mov	$C,8($ctx)
927	mov	$D,12($ctx)
928	mov	$E,16($ctx)
929___
# Win64 only: restore %xmm6-%xmm11.
930$code.=<<___ if ($win64);
931	movaps	-40-6*16($fp),%xmm6
932	movaps	-40-5*16($fp),%xmm7
933	movaps	-40-4*16($fp),%xmm8
934	movaps	-40-3*16($fp),%xmm9
935	movaps	-40-2*16($fp),%xmm10
936	movaps	-40-1*16($fp),%xmm11
937___
# Restore the saved registers and the stack pointer from $fp and return.
938$code.=<<___;
939	mov	-40($fp),%r14
940.cfi_restore	%r14
941	mov	-32($fp),%r13
942.cfi_restore	%r13
943	mov	-24($fp),%r12
944.cfi_restore	%r12
945	mov	-16($fp),%rbp
946.cfi_restore	%rbp
947	mov	-8($fp),%rbx
948.cfi_restore	%rbx
949	lea	($fp),%rsp
950.cfi_def_cfa_register	%rsp
951.Lepilogue_ssse3:
952	ret
953.cfi_endproc
954.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
955___
956
957if ($avx) {
# Bring the generator state back to its initial values for the AVX path.
958$Xi=4;				# reset variables
959@X=map("%xmm$_",(4..7,0..3));
960@Tx=map("%xmm$_",(8..10));
961$j=0;
962$rx=0;
963
964my $done_avx_label=".Ldone_avx";
965
# The AVX path emits the rotates in the round snippets as shld/shrd with the
# register given as both operands.
966my $_rol=sub { &shld(@_[0],@_) };
967my $_ror=sub { &shrd(@_[0],@_) };
968
# AVX entry point: keep the caller's %rsp in $fp, save registers and reserve
# 64 bytes of stack (plus room for six %xmm registers on Win64).
969$code.=<<___;
970.type	sha1_block_data_order_avx,\@function,3
971.align	16
972sha1_block_data_order_avx:
973_avx_shortcut:
974.cfi_startproc
975	mov	%rsp,$fp
976.cfi_def_cfa_register	$fp
977	push	%rbx
978.cfi_push	%rbx
979	push	%rbp
980.cfi_push	%rbp
981	push	%r12
982.cfi_push	%r12
983	push	%r13		# redundant, done to share Win64 SE handler
984.cfi_push	%r13
985	push	%r14
986.cfi_push	%r14
987	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
988	vzeroupper
989___
# Win64 only: save %xmm6-%xmm11 below the pushed registers.
990$code.=<<___ if ($win64);
991	vmovaps	%xmm6,-40-6*16($fp)
992	vmovaps	%xmm7,-40-5*16($fp)
993	vmovaps	%xmm8,-40-4*16($fp)
994	vmovaps	%xmm9,-40-3*16($fp)
995	vmovaps	%xmm10,-40-2*16($fp)
996	vmovaps	%xmm11,-40-1*16($fp)
997.Lprologue_avx:
998___
# Align the stack, turn $num into the end-of-input pointer, load the state,
# then load and byte-swap the first block and store the first three vectors
# with the round constant added for the scalar code to pick up.
999$code.=<<___;
1000	and	\$-64,%rsp
1001	mov	%rdi,$ctx	# reassigned argument
1002	mov	%rsi,$inp	# reassigned argument
1003	mov	%rdx,$num	# reassigned argument
1004
1005	shl	\$6,$num
1006	add	$inp,$num
1007	lea	K_XX_XX+64(%rip),$K_XX_XX
1008
1009	mov	0($ctx),$A		# load context
1010	mov	4($ctx),$B
1011	mov	8($ctx),$C
1012	mov	12($ctx),$D
1013	mov	$B,@T[0]		# magic seed
1014	mov	16($ctx),$E
1015	mov	$C,@T[1]
1016	xor	$D,@T[1]
1017	and	@T[1],@T[0]
1018
1019	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
1020	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
1021	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
1022	vmovdqu	16($inp),@X[-3&7]
1023	vmovdqu	32($inp),@X[-2&7]
1024	vmovdqu	48($inp),@X[-1&7]
1025	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
1026	add	\$64,$inp
1027	vpshufb	@X[2],@X[-3&7],@X[-3&7]
1028	vpshufb	@X[2],@X[-2&7],@X[-2&7]
1029	vpshufb	@X[2],@X[-1&7],@X[-1&7]
1030	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
1031	vpaddd	$Kx,@X[-3&7],@X[1]
1032	vpaddd	$Kx,@X[-2&7],@X[2]
1033	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
1034	vmovdqa	@X[1],16(%rsp)
1035	vmovdqa	@X[2],32(%rsp)
1036	jmp	.Loop_avx
1037___
1038
# Xupdate_avx_16_31($body): AVX counterpart of Xupdate_ssse3_16_31.  Emits
# the three-operand vector instructions that compute one vector of schedule
# words into @X[0], interleaved with the scalar snippets returned by four
# calls of $body.  The round constant lives in $Kx and is reloaded when $Xi
# is a multiple of 5.  Only @X is rotated at the end.
1039sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
1040{ use integer;
1041  my $body = shift;
1042  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
# These lexicals are assigned by the eval'ed snippets.
1043  my ($a,$b,$c,$d,$e);
1044
1045	 eval(shift(@insns));
1046	 eval(shift(@insns));
1047	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
1048	 eval(shift(@insns));
1049	 eval(shift(@insns));
1050
1051	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
1052	 eval(shift(@insns));
1053	 eval(shift(@insns));
1054	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
1055	 eval(shift(@insns));
1056	 eval(shift(@insns));
1057	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
1058	 eval(shift(@insns));
1059	 eval(shift(@insns));
1060
1061	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
1062	 eval(shift(@insns));
1063	 eval(shift(@insns));
1064	 eval(shift(@insns));
1065	 eval(shift(@insns));
1066
1067	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
1068	 eval(shift(@insns));
1069	 eval(shift(@insns));
1070	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1071	 eval(shift(@insns));
1072	 eval(shift(@insns));
1073
1074	&vpsrld	(@Tx[0],@X[0],31);
1075	 eval(shift(@insns));
1076	 eval(shift(@insns));
1077	 eval(shift(@insns));
1078	 eval(shift(@insns));
1079
1080	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
1081	&vpaddd	(@X[0],@X[0],@X[0]);
1082	 eval(shift(@insns));
1083	 eval(shift(@insns));
1084	 eval(shift(@insns));
1085	 eval(shift(@insns));
1086
1087	&vpsrld	(@Tx[1],@Tx[2],30);
1088	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
1089	 eval(shift(@insns));
1090	 eval(shift(@insns));
1091	 eval(shift(@insns));
1092	 eval(shift(@insns));
1093
1094	&vpslld	(@Tx[2],@Tx[2],2);
1095	&vpxor	(@X[0],@X[0],@Tx[1]);
1096	 eval(shift(@insns));
1097	 eval(shift(@insns));
1098	 eval(shift(@insns));
1099	 eval(shift(@insns));
1100
1101	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
1102	 eval(shift(@insns));
1103	 eval(shift(@insns));
1104	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
1105	 eval(shift(@insns));
1106	 eval(shift(@insns));
1107
1108
1109	 foreach (@insns) { eval; }	# remaining instructions [if any]
1110
1111  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
1112}
1113
# Xupdate_avx_32_79($body): AVX counterpart of Xupdate_ssse3_32_79.  Computes
# the next schedule vector from "X[-32]", "X[-16]", "X[-28]" and "X[-6]" with
# a rotate left by two, interleaved with the scalar snippets from four calls
# of $body.  $Kx is reloaded when $Xi is a multiple of 5.
1114sub Xupdate_avx_32_79()
1115{ use integer;
1116  my $body = shift;
1117  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
# These lexicals are assigned by the eval'ed snippets.
1118  my ($a,$b,$c,$d,$e);
1119
1120	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
1121	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
1122	 eval(shift(@insns));		# body_20_39
1123	 eval(shift(@insns));
1124	 eval(shift(@insns));
1125	 eval(shift(@insns));		# rol
1126
1127	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
1128	 eval(shift(@insns));
1129	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
1130	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
1131	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
1132	 eval(shift(@insns));		# ror
1133	 eval(shift(@insns));
1134
1135	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
1136	 eval(shift(@insns));		# body_20_39
1137	 eval(shift(@insns));
1138	 eval(shift(@insns));
1139	 eval(shift(@insns));		# rol
1140
1141	&vpsrld	(@Tx[0],@X[0],30);
1142	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1143	 eval(shift(@insns));
1144	 eval(shift(@insns));
1145	 eval(shift(@insns));		# ror
1146	 eval(shift(@insns));
1147
1148	&vpslld	(@X[0],@X[0],2);
1149	 eval(shift(@insns));		# body_20_39
1150	 eval(shift(@insns));
1151	 eval(shift(@insns));
1152	 eval(shift(@insns));		# rol
1153	 eval(shift(@insns));
1154	 eval(shift(@insns));
1155	 eval(shift(@insns));		# ror
1156	 eval(shift(@insns));
1157
1158	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
1159	 eval(shift(@insns));		# body_20_39
1160	 eval(shift(@insns));
1161	 eval(shift(@insns));
1162	 eval(shift(@insns));		# rol
1163	 eval(shift(@insns));
1164	 eval(shift(@insns));
1165	 eval(shift(@insns));		# rol
1166	 eval(shift(@insns));
1167
1168	 foreach (@insns) { eval; }	# remaining instructions
1169
1170  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
1171}
1172
# Xuplast_avx_80(\&body): last schedule step of a block for the AVX path.
#
# Evaluates the snippets of four $body invocations, slipping in the final
# "previous X[] + K" addition and its store to the stack.  Afterwards it
# emits the end-of-input test (jump to $done_avx_label when $inp equals
# $num) followed by the loads for the next 64-byte block: byte-swap mask,
# K_00_19, four unaligned input loads, byte swap of the first quad and the
# $inp advance.  Resets $Xi to 0 for the Xloop_avx calls that follow.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my ($a,$b,$c,$d,$e);			# scratch lexicals seen by the eval'ed snippets
  my @insns;				# 32 instructions
  for my $round (1..4) { push(@insns,&$body); }

	eval(shift(@insns));
	&vpaddd	($Tx[1],$Kx,$X[-1&7]);
	for my $slot (1..4) { eval(shift(@insns)); }

	my $offset = 16*(($Xi-1)&3);
	&vmovdqa("$offset(%rsp)",$Tx[1]);	# X[]+K xfer IALU

	eval($_) for @insns;			# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa($X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	my $disp = 0;
	for my $reg (@X[-4&7,-3&7,-2&7,-1&7]) {	# load input
	    &vmovdqu($reg,"$disp($inp)");
	    $disp += 16;
	}
	&vpshufb($X[-4&7],$X[-4&7],$X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}
1204
# Xloop_avx(\&body): one of the closing steps of a block when more input
# follows (AVX path).
#
# While the snippets of four $body invocations are evaluated, the quad of
# the next block selected by $Xi is prepared: the following quad is byte
# swapped, K is added to quad ($Xi-4)&7 and the sum is stored at
# 16*$Xi(%rsp).  Increments $Xi.
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my ($a,$b,$c,$d,$e);			# scratch lexicals seen by the eval'ed snippets
  my @insns;				# 32 instructions
  for my $round (1..4) { push(@insns,&$body); }

	for my $slot (1..2) { eval(shift(@insns)); }
	&vpshufb($X[($Xi-3)&7],$X[($Xi-3)&7],$X[2]);

	for my $slot (1..2) { eval(shift(@insns)); }
	&vpaddd	($X[$Xi&7],$X[($Xi-4)&7],$Kx);

	for my $slot (1..4) { eval(shift(@insns)); }
	my $offset = 16*$Xi;
	&vmovdqa("$offset(%rsp)",$X[$Xi&7]);	# X[]+K xfer to IALU

	for my $slot (1..2) { eval(shift(@insns)); }

	eval($_) for @insns;			# whatever is left
  $Xi++;
}
1228
# Xtail_avx(\&body): closing step of the very last block (AVX path).
# Nothing is interleaved; the snippets of four $body invocations are simply
# evaluated in order.
sub Xtail_avx()
{ use integer;
  my $body = shift;
  my ($a,$b,$c,$d,$e);			# scratch lexicals seen by the eval'ed snippets
  my @insns;				# 32 instructions
  for my $round (1..4) { push(@insns,&$body); }

	eval($_) for @insns;
}
1237
# Body of sha1_block_data_order_avx: the per-block loop, the two ways of
# leaving it and the function epilogue.
1238$code.=<<___;
1239.align	16
1240.Loop_avx:
1241___
# 20 generator calls with four body invocations each, 80 in total.  The
# body_20_39 generator is passed again for the final 20.
1242	&Xupdate_avx_16_31(\&body_00_19);
1243	&Xupdate_avx_16_31(\&body_00_19);
1244	&Xupdate_avx_16_31(\&body_00_19);
1245	&Xupdate_avx_16_31(\&body_00_19);
1246	&Xupdate_avx_32_79(\&body_00_19);
1247	&Xupdate_avx_32_79(\&body_20_39);
1248	&Xupdate_avx_32_79(\&body_20_39);
1249	&Xupdate_avx_32_79(\&body_20_39);
1250	&Xupdate_avx_32_79(\&body_20_39);
1251	&Xupdate_avx_32_79(\&body_20_39);
1252	&Xupdate_avx_32_79(\&body_40_59);
1253	&Xupdate_avx_32_79(\&body_40_59);
1254	&Xupdate_avx_32_79(\&body_40_59);
1255	&Xupdate_avx_32_79(\&body_40_59);
1256	&Xupdate_avx_32_79(\&body_40_59);
1257	&Xupdate_avx_32_79(\&body_20_39);
1258	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"
1259
# Remember the generator state here: the same closing steps are emitted
# twice, once for "more input follows" and once after $done_avx_label.
1260				$saved_j=$j; @saved_V=@V;
1261
1262	&Xloop_avx(\&body_20_39);
1263	&Xloop_avx(\&body_20_39);
1264	&Xloop_avx(\&body_20_39);
1265
1266$code.=<<___;
1267	add	0($ctx),$A			# update context
1268	add	4($ctx),@T[0]
1269	add	8($ctx),$C
1270	add	12($ctx),$D
1271	mov	$A,0($ctx)
1272	add	16($ctx),$E
1273	mov	@T[0],4($ctx)
1274	mov	@T[0],$B			# magic seed
1275	mov	$C,8($ctx)
1276	mov	$C,@T[1]
1277	mov	$D,12($ctx)
1278	xor	$D,@T[1]
1279	mov	$E,16($ctx)
1280	and	@T[1],@T[0]
1281	jmp	.Loop_avx
1282
1283.align	16
1284$done_avx_label:
1285___
# Rewind to the state saved above before generating the exit path.
1286				$j=$saved_j; @V=@saved_V;
1287
1288	&Xtail_avx(\&body_20_39);
1289	&Xtail_avx(\&body_20_39);
1290	&Xtail_avx(\&body_20_39);
1291
1292$code.=<<___;
1293	vzeroupper
1294
1295	add	0($ctx),$A			# update context
1296	add	4($ctx),@T[0]
1297	add	8($ctx),$C
1298	mov	$A,0($ctx)
1299	add	12($ctx),$D
1300	mov	@T[0],4($ctx)
1301	add	16($ctx),$E
1302	mov	$C,8($ctx)
1303	mov	$D,12($ctx)
1304	mov	$E,16($ctx)
1305___
# Win64 only: reload %xmm6-%xmm11 from their save slots below $fp.
1306$code.=<<___ if ($win64);
1307	movaps	-40-6*16($fp),%xmm6
1308	movaps	-40-5*16($fp),%xmm7
1309	movaps	-40-4*16($fp),%xmm8
1310	movaps	-40-3*16($fp),%xmm9
1311	movaps	-40-2*16($fp),%xmm10
1312	movaps	-40-1*16($fp),%xmm11
1313___
# Common epilogue: restore %r14,%r13,%r12,%rbp,%rbx and the stack pointer.
1314$code.=<<___;
1315	mov	-40($fp),%r14
1316.cfi_restore	%r14
1317	mov	-32($fp),%r13
1318.cfi_restore	%r13
1319	mov	-24($fp),%r12
1320.cfi_restore	%r12
1321	mov	-16($fp),%rbp
1322.cfi_restore	%rbp
1323	mov	-8($fp),%rbx
1324.cfi_restore	%rbx
1325	lea	($fp),%rsp
1326.cfi_def_cfa_register	%rsp
1327.Lepilogue_avx:
1328	ret
1329.cfi_endproc
1330.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
1331___
1332
# AVX2 code path, generated only when $avx is greater than 1.
1333if ($avx>1) {
1334use integer;
1335$Xi=4;					# reset variables
1336@X=map("%ymm$_",(4..7,0..3));		# X[] window, now in %ymm registers
1337@Tx=map("%ymm$_",(8..10));		# vector temporaries
1338$Kx="%ymm11";				# current round constant
1339$j=0;
1340
1341my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");	# rotating scalar state
1342my ($a5,$t0)=("%r12d","%edi");		# scalar temporaries
1343
1344my ($A,$F,$B,$C,$D,$E)=@ROTX;		# initial names for the @ROTX slots
1345my $rx=0;				# counter used by bodyx_* to switch round type
1346my $frame="%r13";
1347
# Function entry: save callee-saved registers with CFI annotations.
1348$code.=<<___;
1349.type	sha1_block_data_order_avx2,\@function,3
1350.align	16
1351sha1_block_data_order_avx2:
1352_avx2_shortcut:
1353.cfi_startproc
1354	mov	%rsp,$fp
1355.cfi_def_cfa_register	$fp
1356	push	%rbx
1357.cfi_push	%rbx
1358	push	%rbp
1359.cfi_push	%rbp
1360	push	%r12
1361.cfi_push	%r12
1362	push	%r13
1363.cfi_push	%r13
1364	push	%r14
1365.cfi_push	%r14
1366	vzeroupper
1367___
# Win64 only: make room for and save %xmm6-%xmm11 below the pushed registers.
1368$code.=<<___ if ($win64);
1369	lea	-6*16(%rsp),%rsp
1370	vmovaps	%xmm6,-40-6*16($fp)
1371	vmovaps	%xmm7,-40-5*16($fp)
1372	vmovaps	%xmm8,-40-4*16($fp)
1373	vmovaps	%xmm9,-40-3*16($fp)
1374	vmovaps	%xmm10,-40-2*16($fp)
1375	vmovaps	%xmm11,-40-1*16($fp)
1376.Lprologue_avx2:
1377___
# Set up the frame, load the context and the first input.  The low 128-bit
# halves come from ($inp), the high halves from ($frame), which is $inp+64
# unless that reaches the end pointer $num, in which case it is $inp again.
# The first four X[]+K vectors are stored at 0..96(%rsp).
1378$code.=<<___;
1379	mov	%rdi,$ctx		# reassigned argument
1380	mov	%rsi,$inp		# reassigned argument
1381	mov	%rdx,$num		# reassigned argument
1382
1383	lea	-640(%rsp),%rsp
1384	shl	\$6,$num
1385	 lea	64($inp),$frame
1386	and	\$-128,%rsp
1387	add	$inp,$num
1388	lea	K_XX_XX+64(%rip),$K_XX_XX
1389
1390	mov	0($ctx),$A		# load context
1391	 cmp	$num,$frame
1392	 cmovae	$inp,$frame		# next or same block
1393	mov	4($ctx),$F
1394	mov	8($ctx),$C
1395	mov	12($ctx),$D
1396	mov	16($ctx),$E
1397	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask
1398
1399	vmovdqu		($inp),%xmm0
1400	vmovdqu		16($inp),%xmm1
1401	vmovdqu		32($inp),%xmm2
1402	vmovdqu		48($inp),%xmm3
1403	lea		64($inp),$inp
1404	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
1405	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
1406	vpshufb		@X[2],@X[-4&7],@X[-4&7]
1407	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
1408	vpshufb		@X[2],@X[-3&7],@X[-3&7]
1409	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
1410	vpshufb		@X[2],@X[-2&7],@X[-2&7]
1411	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
1412	vpshufb		@X[2],@X[-1&7],@X[-1&7]
1413
1414	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
1415	vpaddd	$Kx,@X[-3&7],@X[1]
1416	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
1417	vpaddd	$Kx,@X[-2&7],@X[2]
1418	vmovdqu	@X[1],32(%rsp)
1419	vpaddd	$Kx,@X[-1&7],@X[3]
1420	vmovdqu	@X[2],64(%rsp)
1421	vmovdqu	@X[3],96(%rsp)
1422___
# Straight-line Xupdate_avx2_16_31: schedule steps for $Xi = 4..7 with no
# scalar code interleaved.  Every pass derives the next X[] vector, adds
# the round constant and parks the sum at 32*$Xi(%rsp).
while ($Xi<8) {
    use integer;

    # @X is rotated at the bottom of each pass, so these fixed positions
    # name a sliding window over the schedule.
    my ($cur,$back1,$back2,$back3,$back4) = @X[0,-1&7,-2&7,-3&7,-4&7];
    my ($tmp_a,$tmp_b,$tmp_c) = @Tx;

	&vpalignr($cur,$back3,$back4,8);	# compose "X[-14]" in "X[0]"
	&vpsrldq($tmp_a,$back1,4);		# "X[-3]", 3 dwords
	&vpxor	($cur,$cur,$back4);		# "X[0]"^="X[-16]"
	&vpxor	($tmp_a,$tmp_a,$back2);		# "X[-3]"^"X[-8]"
	&vpxor	($cur,$cur,$tmp_a);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	($tmp_a,$cur,31);
	if ($Xi%5==0) {				# K_XX_XX
	    &vmovdqu($Kx,(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	&vpslldq($tmp_c,$cur,12);		# "X[0]"<<96, extract one dword
	&vpaddd	($cur,$cur,$cur);
	&vpsrld	($tmp_b,$tmp_c,30);
	&vpor	($cur,$cur,$tmp_a);		# "X[0]"<<<=1
	&vpslld	($tmp_c,$tmp_c,2);
	&vpxor	($cur,$cur,$tmp_b);
	&vpxor	($cur,$cur,$tmp_c);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	($tmp_b,$cur,$Kx);
	&vmovdqu("32*$Xi(%rsp)",$tmp_b);	# X[]+K xfer to IALU

    push(@X,shift(@X));			# "rotate" X[]
    $Xi++;
}
# Entry to the AVX2 main loop.  $frame is set to 128(%rsp); the round code
# addresses it with displacements from -128 to +124.  The four instructions
# after the label establish what bodyx_00_19 expects on entry:
# f=(b&c)^(~b&d) and b rotated right by 2.
1445$code.=<<___;
1446	lea	128(%rsp),$frame
1447	jmp	.Loop_avx2
1448.align	32
1449.Loop_avx2:
1450	rorx	\$2,$F,$B
1451	andn	$D,$F,$t0
1452	and	$C,$F
1453	xor	$t0,$F
1454___
# bodyx_00_19(): generator for one round of the first group (AVX2 path).
#
# Returns a list of code strings to be eval'ed by the caller; strings joined
# with '.' form a single list element, ',' starts a new one.  Once $rx has
# reached 19 the call is forwarded to bodyx_20_39 instead.  The X[]+K
# operand is read at ((32*($j/4)+4*($j%4))%256-128)($frame), i.e. four
# dwords out of every 32 bytes, and $frame is advanced by 256 when
# $j%32==31.  The last element rotates @ROTX and increments $j.
1455sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
1456	# at start $f=(b&c)^(~b&d), $b>>>=2
1457	return &bodyx_20_39() if ($rx==19); $rx++;
1458	(
1459	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
1460
1461	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
1462	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
1463	'&andn	($t0,$a,$c)',			# ~b&d for next round
1464
1465	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
1466	'&rorx	($a5,$a,27)',			# a<<<5
1467	'&rorx	($f,$a,2)',			# b>>>2 for next round
1468	'&and	($a,$b)',			# b&c for next round
1469
1470	'&add	($e,$a5)',			# e+=a<<<5
1471	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round
1472
1473	'unshift(@ROTX,pop(@ROTX)); $j++;'
1474	)
1475}
1476
# bodyx_20_39(): generator for one round using f=b^c^d (AVX2 path).
#
# Same return convention as bodyx_00_19: a list of code strings for the
# caller to eval.  Once $rx has reached 39 the call is forwarded to
# bodyx_40_59.  The "next round" preparation snippets are guarded by
# ($j<79) and therefore emit nothing in the final round.
1477sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
1478	# on entry $f=b^c^d, $b>>>=2
1479	return &bodyx_40_59() if ($rx==39); $rx++;
1480	(
1481	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
1482
1483	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
1484	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
1485
1486	'&lea	($e,"($e,$f)")',		# e+=b^c^d
1487	'&rorx	($a5,$a,27)',			# a<<<5
1488	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
1489	'&xor	($a,$b)		if ($j<79)',	# b^c for next round
1490
1491	'&add	($e,$a5)',			# e+=a<<<5
1492	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round
1493
1494	'unshift(@ROTX,pop(@ROTX)); $j++;'
1495	)
1496}
1497
# bodyx_40_59(): generator for one round of the third group (AVX2 path).
#
# Same return convention as the other bodyx_* generators.  Unlike them it
# never forwards to another generator; it only increments $rx.  The
# $j-dependent guards handle the group boundaries: the extra xor with $c is
# skipped for $j==40, and for $j==59 the next-round value is prepared as
# b^c^d instead of (b^c)&(c^d).
1498sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
1499	# on entry $f=((b^c)&(c^d)), $b>>>=2
1500	$rx++;
1501	(
1502	'($a,$f,$b,$c,$d,$e)=@ROTX;'.
1503
1504	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
1505	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
1506	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
1507	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
1508	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round
1509
1510	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
1511	'&rorx	($a5,$a,27)',			# a<<<5
1512	'&rorx	($f,$a,2)',			# b>>>2 in next round
1513	'&xor	($a,$b)',			# b^c for next round
1514
1515	'&add	($e,$a5)',			# e+=a<<<5
1516	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
1517	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round
1518
1519	'unshift(@ROTX,pop(@ROTX)); $j++;'
1520	)
1521}
1522
# Xupdate_avx2_16_31(\&body): AVX2 schedule step for the early part of a
# block, interleaved with the snippets of five $body invocations.
#
# The vector instructions are the same sequence as in the straight-line
# loop that follows the AVX2 prologue; here the eval'ed scalar snippets are
# spread between them.  The new X[] plus $Kx is stored at 32*$Xi(%rsp).
# On exit $Xi has been incremented and @X rotated by one position.
1523sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
1524{ use integer;
1525  my $body = shift;
1526  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
1527  my ($a,$b,$c,$d,$e);	# lexicals assigned by the eval'ed bodyx_* snippets
1528
1529	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
1530	 eval(shift(@insns));
1531	 eval(shift(@insns));
1532	 eval(shift(@insns));
1533	 eval(shift(@insns));
1534
1535	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
1536	 eval(shift(@insns));
1537	 eval(shift(@insns));
1538	 eval(shift(@insns));
1539
1540	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
1541	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
1542	 eval(shift(@insns));
1543	 eval(shift(@insns));
1544
1545	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
1546	 eval(shift(@insns));
1547	 eval(shift(@insns));
1548	 eval(shift(@insns));
1549	 eval(shift(@insns));
1550
1551	&vpsrld	(@Tx[0],@X[0],31);
1552	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
1553	 eval(shift(@insns));
1554	 eval(shift(@insns));
1555	 eval(shift(@insns));
1556
1557	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
1558	&vpaddd	(@X[0],@X[0],@X[0]);
1559	 eval(shift(@insns));
1560	 eval(shift(@insns));
1561
1562	&vpsrld	(@Tx[1],@Tx[2],30);
1563	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
1564	 eval(shift(@insns));
1565	 eval(shift(@insns));
1566
1567	&vpslld	(@Tx[2],@Tx[2],2);
1568	&vpxor	(@X[0],@X[0],@Tx[1]);
1569	 eval(shift(@insns));
1570	 eval(shift(@insns));
1571
1572	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
1573	 eval(shift(@insns));
1574	 eval(shift(@insns));
1575	 eval(shift(@insns));
1576
1577	&vpaddd	(@Tx[1],@X[0],$Kx);
1578	 eval(shift(@insns));
1579	 eval(shift(@insns));
1580	 eval(shift(@insns));
1581	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1582
1583	 foreach (@insns) { eval; }	# remaining instructions [if any]
1584
1585	$Xi++;
1586	push(@X,shift(@X));	# "rotate" X[]
1587}
1588
# Xupdate_avx2_32_79(\&body): AVX2 schedule step for the later part of a
# block, interleaved with the snippets of five $body invocations.
#
# Per the inline notes the vector part computes
#	X[0] = (X[-32] ^ X[-16] ^ X[-28] ^ X[-6]) <<< 2
# then adds $Kx and stores the sum at 32*$Xi(%rsp); the displacement is
# emitted as the literal text "32*$Xi", not as a precomputed number.  $Kx
# is reloaded from the K_XX_XX table whenever $Xi is a multiple of 5.
# On exit $Xi has been incremented and @X rotated by one position.
1589sub Xupdate_avx2_32_79()
1590{ use integer;
1591  my $body = shift;
1592  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
1593  my ($a,$b,$c,$d,$e);	# lexicals assigned by the eval'ed bodyx_* snippets
1594
1595	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
1596	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
1597	 eval(shift(@insns));
1598	 eval(shift(@insns));
1599
1600	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
1601	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
1602	 eval(shift(@insns));
1603	 eval(shift(@insns));
1604	 eval(shift(@insns));
1605
1606	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
1607	 eval(shift(@insns));
1608	 eval(shift(@insns));
1609	 eval(shift(@insns));
1610
1611	&vpsrld	(@Tx[0],@X[0],30);
1612	&vpslld	(@X[0],@X[0],2);
1613	 eval(shift(@insns));
1614	 eval(shift(@insns));
1615	 eval(shift(@insns));
1616
	# the shift is issued above; the commented-out copy marks an alternative slot
1617	#&vpslld	(@X[0],@X[0],2);
1618	 eval(shift(@insns));
1619	 eval(shift(@insns));
1620	 eval(shift(@insns));
1621
1622	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
1623	 eval(shift(@insns));
1624	 eval(shift(@insns));
1625	 eval(shift(@insns));
1626	 eval(shift(@insns));
1627
1628	&vpaddd	(@Tx[1],@X[0],$Kx);
1629	 eval(shift(@insns));
1630	 eval(shift(@insns));
1631	 eval(shift(@insns));
1632	 eval(shift(@insns));
1633
1634	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
1635
1636	 foreach (@insns) { eval; }	# remaining instructions
1637
1638	$Xi++;
1639	push(@X,shift(@X));	# "rotate" X[]
1640}
1641
# Xloop_avx2(\&body): scalar-only step for the AVX2 path.  Evaluates, in
# order, the snippets returned by five invocations of $body; no vector
# instructions are interleaved.
sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my ($a,$b,$c,$d,$e);			# lexicals assigned by the eval'ed snippets
  my @insns;
  for my $round (1..5) { push(@insns,&$body); }

	eval($_) for @insns;
}
1650
	# First half of .Loop_avx2: 16 generator calls with five body
	# invocations each, 80 in total.  The first twelve also advance the
	# vector schedule; the last four are scalar only.
	# NOTE(review): align32() is defined outside this part of the file.
1651	&align32();
1652	&Xupdate_avx2_32_79(\&bodyx_00_19);
1653	&Xupdate_avx2_32_79(\&bodyx_00_19);
1654	&Xupdate_avx2_32_79(\&bodyx_00_19);
1655	&Xupdate_avx2_32_79(\&bodyx_00_19);
1656
1657	&Xupdate_avx2_32_79(\&bodyx_20_39);
1658	&Xupdate_avx2_32_79(\&bodyx_20_39);
1659	&Xupdate_avx2_32_79(\&bodyx_20_39);
1660	&Xupdate_avx2_32_79(\&bodyx_20_39);
1661
1662	&align32();
1663	&Xupdate_avx2_32_79(\&bodyx_40_59);
1664	&Xupdate_avx2_32_79(\&bodyx_40_59);
1665	&Xupdate_avx2_32_79(\&bodyx_40_59);
1666	&Xupdate_avx2_32_79(\&bodyx_40_59);
1667
1668	&Xloop_avx2(\&bodyx_20_39);
1669	&Xloop_avx2(\&bodyx_20_39);
1670	&Xloop_avx2(\&bodyx_20_39);
1671	&Xloop_avx2(\&bodyx_20_39);
1672
# Fold the result into the context, move the values back into the registers
# named $A,$F,$C,$D,$E, and leave if $inp has reached $num.  @ROTX is
# interpolated in its rotated state at this point of generation.
1673$code.=<<___;
1674	lea	128($inp),$frame
1675	lea	128($inp),%rdi			# borrow $t0
1676	cmp	$num,$frame
1677	cmovae	$inp,$frame			# next or previous block
1678
1679	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
1680	add	0($ctx),@ROTX[0]		# update context
1681	add	4($ctx),@ROTX[1]
1682	add	8($ctx),@ROTX[3]
1683	mov	@ROTX[0],0($ctx)
1684	add	12($ctx),@ROTX[4]
1685	mov	@ROTX[1],4($ctx)
1686	 mov	@ROTX[0],$A			# A=d
1687	add	16($ctx),@ROTX[5]
1688	 mov	@ROTX[3],$a5
1689	mov	@ROTX[3],8($ctx)
1690	 mov	@ROTX[4],$D			# D=b
1691	 #xchg	@ROTX[5],$F			# F=c, C=f
1692	mov	@ROTX[4],12($ctx)
1693	 mov	@ROTX[1],$F			# F=e
1694	mov	@ROTX[5],16($ctx)
1695	#mov	$F,16($ctx)
1696	 mov	@ROTX[5],$E			# E=c
1697	 mov	$a5,$C				# C=f
1698	 #xchg	$F,$E				# E=c, F=e
1699
1700	cmp	$num,$inp
1701	je	.Ldone_avx2
1702___
1703
# Restart the schedule bookkeeping for the code emitted after this point.
1704$Xi=4;				# reset variables
1705@X=map("%ymm$_",(4..7,0..3));
1706
# Unless %rdi (set to $inp+128 before the context update) lies beyond $num,
# load the next input: low halves from -64(%rdi), high halves from ($frame).
# .Last_avx2 then points $frame at 128+16(%rsp), primes f and b>>>2 as at
# .Loop_avx2 and advances $inp by 128.
1707$code.=<<___;
1708	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
1709	cmp	$num,%rdi			# borrowed $t0
1710	ja	.Last_avx2
1711
1712	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
1713	vmovdqu		-48(%rdi),%xmm1
1714	vmovdqu		-32(%rdi),%xmm2
1715	vmovdqu		-16(%rdi),%xmm3
1716	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
1717	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
1718	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
1719	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
1720	jmp	.Last_avx2
1721
1722.align	32
1723.Last_avx2:
1724	lea	128+16(%rsp),$frame
1725	rorx	\$2,$F,$B
1726	andn	$D,$F,$t0
1727	and	$C,$F
1728	xor	$t0,$F
1729	sub	\$-128,$inp
1730___
	# Second half of .Loop_avx2: again 16 generator calls with five body
	# invocations each, restarted from round state zero.  From the fifth
	# call on, the vector work for the freshly loaded input is slotted in
	# between the calls: byte swap, K_00_19 addition, stores to
	# 0..96(%rsp), and finally four Xupdate_avx2_16_31 steps.
1731	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);
1732
1733	&Xloop_avx2	(\&bodyx_00_19);
1734	&Xloop_avx2	(\&bodyx_00_19);
1735	&Xloop_avx2	(\&bodyx_00_19);
1736	&Xloop_avx2	(\&bodyx_00_19);
1737
1738	&Xloop_avx2	(\&bodyx_20_39);
1739	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
1740	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
1741	&Xloop_avx2	(\&bodyx_20_39);
1742	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
1743	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
1744	&Xloop_avx2	(\&bodyx_20_39);
1745	  &vmovdqu	("0(%rsp)",@Tx[0]);
1746	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
1747	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
1748	&Xloop_avx2	(\&bodyx_20_39);
1749	  &vmovdqu	("32(%rsp)",@Tx[1]);
1750	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
1751	  &vpaddd	(@X[2],@X[-2&7],$Kx);		# overwrites the pbswap mask register
1752
1753	&Xloop_avx2	(\&bodyx_40_59);
1754	&align32	();
1755	  &vmovdqu	("64(%rsp)",@X[2]);
1756	  &vpaddd	(@X[3],@X[-1&7],$Kx);
1757	&Xloop_avx2	(\&bodyx_40_59);
1758	  &vmovdqu	("96(%rsp)",@X[3]);
1759	&Xloop_avx2	(\&bodyx_40_59);
1760	&Xupdate_avx2_16_31(\&bodyx_40_59);
1761
1762	&Xupdate_avx2_16_31(\&bodyx_20_39);
1763	&Xupdate_avx2_16_31(\&bodyx_20_39);
1764	&Xupdate_avx2_16_31(\&bodyx_20_39);
1765	&Xloop_avx2	(\&bodyx_20_39);
1766
1766
# Context update after the second block, the loop-back test and the exit
# label.  The register shuffle mirrors the one after the first block.
1767$code.=<<___;
1768	lea	128(%rsp),$frame
1769
1770	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
1771	add	0($ctx),@ROTX[0]		# update context
1772	add	4($ctx),@ROTX[1]
1773	add	8($ctx),@ROTX[3]
1774	mov	@ROTX[0],0($ctx)
1775	add	12($ctx),@ROTX[4]
1776	mov	@ROTX[1],4($ctx)
1777	 mov	@ROTX[0],$A			# A=d
1778	add	16($ctx),@ROTX[5]
1779	 mov	@ROTX[3],$a5
1780	mov	@ROTX[3],8($ctx)
1781	 mov	@ROTX[4],$D			# D=b
1782	 #xchg	@ROTX[5],$F			# F=c, C=f
1783	mov	@ROTX[4],12($ctx)
1784	 mov	@ROTX[1],$F			# F=e
1785	mov	@ROTX[5],16($ctx)
1786	#mov	$F,16($ctx)
1787	 mov	@ROTX[5],$E			# E=c
1788	 mov	$a5,$C				# C=f
1789	 #xchg	$F,$E				# E=c, F=e
1790
1791	cmp	$num,$inp
1792	jbe	.Loop_avx2
1793
1794.Ldone_avx2:
1795	vzeroupper
1796___
# Win64 only: reload %xmm6-%xmm11 from their save slots below $fp.
1797$code.=<<___ if ($win64);
1798	movaps	-40-6*16($fp),%xmm6
1799	movaps	-40-5*16($fp),%xmm7
1800	movaps	-40-4*16($fp),%xmm8
1801	movaps	-40-3*16($fp),%xmm9
1802	movaps	-40-2*16($fp),%xmm10
1803	movaps	-40-1*16($fp),%xmm11
1804___
# Common epilogue: restore %r14,%r13,%r12,%rbp,%rbx and the stack pointer.
1805$code.=<<___;
1806	mov	-40($fp),%r14
1807.cfi_restore	%r14
1808	mov	-32($fp),%r13
1809.cfi_restore	%r13
1810	mov	-24($fp),%r12
1811.cfi_restore	%r12
1812	mov	-16($fp),%rbp
1813.cfi_restore	%rbp
1814	mov	-8($fp),%rbx
1815.cfi_restore	%rbx
1816	lea	($fp),%rsp
1817.cfi_def_cfa_register	%rsp
1818.Lepilogue_avx2:
1819	ret
1820.cfi_endproc
1821.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
1822___
1823}
1824}
# Constant pool.  Each of the four round constants fills two rows of four
# dwords (32 bytes), followed by two rows of the byte-swap mask.  The code
# above addresses the table relative to K_XX_XX+64.  The trailing .byte row
# is not referenced in this part of the file.
1825$code.=<<___;
1826.align	64
1827K_XX_XX:
1828.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1829.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
1830.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1831.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
1832.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1833.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
1834.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1835.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
1836.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
1837.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
1838.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
1839___
1840}}}
# Identification string embedded in the generated module; the '@' is
# escaped so the heredoc does not interpolate an array.
1841$code.=<<___;
1842.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1843.align	64
1844___
1845
1846# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1847#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured exception handling support.
1848if ($win64) {
# The handler arguments in their Win64 argument registers.
1849$rec="%rcx";
1850$frame="%rdx";
1851$context="%r8";
1852$disp="%r9";
1853
# se_handler: if context->Rip lies between .Lprologue and .Lepilogue, fetch
# the saved stack pointer from 64 bytes above context->Rsp and restore
# Rbx, Rbp and R12-R14 in the context from just below it.  It always ends
# in .Lcommon_seh_tail.
1854$code.=<<___;
1855.extern	__imp_RtlVirtualUnwind
1856.type	se_handler,\@abi-omnipotent
1857.align	16
1858se_handler:
1859	push	%rsi
1860	push	%rdi
1861	push	%rbx
1862	push	%rbp
1863	push	%r12
1864	push	%r13
1865	push	%r14
1866	push	%r15
1867	pushfq
1868	sub	\$64,%rsp
1869
1870	mov	120($context),%rax	# pull context->Rax
1871	mov	248($context),%rbx	# pull context->Rip
1872
1873	lea	.Lprologue(%rip),%r10
1874	cmp	%r10,%rbx		# context->Rip<.Lprologue
1875	jb	.Lcommon_seh_tail
1876
1877	mov	152($context),%rax	# pull context->Rsp
1878
1879	lea	.Lepilogue(%rip),%r10
1880	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1881	jae	.Lcommon_seh_tail
1882
1883	mov	`16*4`(%rax),%rax	# pull saved stack pointer
1884
1885	mov	-8(%rax),%rbx
1886	mov	-16(%rax),%rbp
1887	mov	-24(%rax),%r12
1888	mov	-32(%rax),%r13
1889	mov	-40(%rax),%r14
1890	mov	%rbx,144($context)	# restore context->Rbx
1891	mov	%rbp,160($context)	# restore context->Rbp
1892	mov	%r12,216($context)	# restore context->R12
1893	mov	%r13,224($context)	# restore context->R13
1894	mov	%r14,232($context)	# restore context->R14
1895
1896	jmp	.Lcommon_seh_tail
1897.size	se_handler,.-se_handler
1898___
1899
# shaext_handler (emitted only when $shaext is set): inside the
# .Lprologue_shaext/.Lepilogue_shaext range it copies 8 quadwords from
# -8-4*16(%rax) to context offset 512 (noted as Xmm6) before joining
# .Lcommon_seh_tail.
1900$code.=<<___ if ($shaext);
1901.type	shaext_handler,\@abi-omnipotent
1902.align	16
1903shaext_handler:
1904	push	%rsi
1905	push	%rdi
1906	push	%rbx
1907	push	%rbp
1908	push	%r12
1909	push	%r13
1910	push	%r14
1911	push	%r15
1912	pushfq
1913	sub	\$64,%rsp
1914
1915	mov	120($context),%rax	# pull context->Rax
1916	mov	248($context),%rbx	# pull context->Rip
1917
1918	lea	.Lprologue_shaext(%rip),%r10
1919	cmp	%r10,%rbx		# context->Rip<.Lprologue
1920	jb	.Lcommon_seh_tail
1921
1922	lea	.Lepilogue_shaext(%rip),%r10
1923	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1924	jae	.Lcommon_seh_tail
1925
1926	lea	-8-4*16(%rax),%rsi
1927	lea	512($context),%rdi	# &context.Xmm6
1928	mov	\$8,%ecx
1929	.long	0xa548f3fc		# cld; rep movsq
1930
1931	jmp	.Lcommon_seh_tail
1932.size	shaext_handler,.-shaext_handler
1933___
1934
# ssse3_handler: handler shared by the SIMD code paths.  The prologue and
# epilogue labels are read from HandlerData[] rather than hard-coded.
# Inside that range it copies 12 quadwords of saved XMM state into the
# context and restores Rbx, Rbp and R12-R14 from below the frame pointer
# taken from context->R11.  .Lcommon_seh_tail, used by all three handlers,
# fixes up Rsp/Rsi/Rdi, copies the context to disp->ContextRecord and calls
# RtlVirtualUnwind.  The .pdata section started at the end is continued by
# the statements that follow.
1935$code.=<<___;
1936.type	ssse3_handler,\@abi-omnipotent
1937.align	16
1938ssse3_handler:
1939	push	%rsi
1940	push	%rdi
1941	push	%rbx
1942	push	%rbp
1943	push	%r12
1944	push	%r13
1945	push	%r14
1946	push	%r15
1947	pushfq
1948	sub	\$64,%rsp
1949
1950	mov	120($context),%rax	# pull context->Rax
1951	mov	248($context),%rbx	# pull context->Rip
1952
1953	mov	8($disp),%rsi		# disp->ImageBase
1954	mov	56($disp),%r11		# disp->HandlerData
1955
1956	mov	0(%r11),%r10d		# HandlerData[0]
1957	lea	(%rsi,%r10),%r10	# prologue label
1958	cmp	%r10,%rbx		# context->Rip<prologue label
1959	jb	.Lcommon_seh_tail
1960
1961	mov	208($context),%rax	# pull context->R11
1962
1963	mov	4(%r11),%r10d		# HandlerData[1]
1964	lea	(%rsi,%r10),%r10	# epilogue label
1965	cmp	%r10,%rbx		# context->Rip>=epilogue label
1966	jae	.Lcommon_seh_tail
1967
1968	lea	-40-6*16(%rax),%rsi
1969	lea	512($context),%rdi	# &context.Xmm6
1970	mov	\$12,%ecx
1971	.long	0xa548f3fc		# cld; rep movsq
1972
1973	mov	-8(%rax),%rbx
1974	mov	-16(%rax),%rbp
1975	mov	-24(%rax),%r12
1976	mov	-32(%rax),%r13
1977	mov	-40(%rax),%r14
1978	mov	%rbx,144($context)	# restore context->Rbx
1979	mov	%rbp,160($context)	# restore context->Rbp
1980	mov	%r12,216($context)	# restore context->R12
1981	mov	%r13,224($context)	# restore context->R13
1982	mov	%r14,232($context)	# restore context->R14
1983
1984.Lcommon_seh_tail:
1985	mov	8(%rax),%rdi
1986	mov	16(%rax),%rsi
1987	mov	%rax,152($context)	# restore context->Rsp
1988	mov	%rsi,168($context)	# restore context->Rsi
1989	mov	%rdi,176($context)	# restore context->Rdi
1990
1991	mov	40($disp),%rdi		# disp->ContextRecord
1992	mov	$context,%rsi		# context
1993	mov	\$154,%ecx		# sizeof(CONTEXT)
1994	.long	0xa548f3fc		# cld; rep movsq
1995
1996	mov	$disp,%rsi
1997	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1998	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1999	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2000	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2001	mov	40(%rsi),%r10		# disp->ContextRecord
2002	lea	56(%rsi),%r11		# &disp->HandlerData
2003	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2004	mov	%r10,32(%rsp)		# arg5
2005	mov	%r11,40(%rsp)		# arg6
2006	mov	%r12,48(%rsp)		# arg7
2007	mov	%rcx,56(%rsp)		# arg8, (NULL)
2008	call	*__imp_RtlVirtualUnwind(%rip)
2009
2010	mov	\$1,%eax		# ExceptionContinueSearch
2011	add	\$64,%rsp
2012	popfq
2013	pop	%r15
2014	pop	%r14
2015	pop	%r13
2016	pop	%r12
2017	pop	%rbp
2018	pop	%rbx
2019	pop	%rdi
2020	pop	%rsi
2021	ret
2022.size	ssse3_handler,.-ssse3_handler
2023
2024.section	.pdata
2025.align	4
2026	.rva	.LSEH_begin_sha1_block_data_order
2027	.rva	.LSEH_end_sha1_block_data_order
2028	.rva	.LSEH_info_sha1_block_data_order
2029___
# Remaining .pdata entries, one begin/end/info triple per code path that was
# actually generated.
2030$code.=<<___ if ($shaext);
2031	.rva	.LSEH_begin_sha1_block_data_order_shaext
2032	.rva	.LSEH_end_sha1_block_data_order_shaext
2033	.rva	.LSEH_info_sha1_block_data_order_shaext
2034___
2035$code.=<<___;
2036	.rva	.LSEH_begin_sha1_block_data_order_ssse3
2037	.rva	.LSEH_end_sha1_block_data_order_ssse3
2038	.rva	.LSEH_info_sha1_block_data_order_ssse3
2039___
2040$code.=<<___ if ($avx);
2041	.rva	.LSEH_begin_sha1_block_data_order_avx
2042	.rva	.LSEH_end_sha1_block_data_order_avx
2043	.rva	.LSEH_info_sha1_block_data_order_avx
2044___
2045$code.=<<___ if ($avx>1);
2046	.rva	.LSEH_begin_sha1_block_data_order_avx2
2047	.rva	.LSEH_end_sha1_block_data_order_avx2
2048	.rva	.LSEH_info_sha1_block_data_order_avx2
2049___
# .xdata: the info record of each code path names its handler.  The records
# that use ssse3_handler also carry the prologue/epilogue label pair it
# reads as HandlerData[].
2050$code.=<<___;
2051.section	.xdata
2052.align	8
2053.LSEH_info_sha1_block_data_order:
2054	.byte	9,0,0,0
2055	.rva	se_handler
2056___
2057$code.=<<___ if ($shaext);
2058.LSEH_info_sha1_block_data_order_shaext:
2059	.byte	9,0,0,0
2060	.rva	shaext_handler
2061___
2062$code.=<<___;
2063.LSEH_info_sha1_block_data_order_ssse3:
2064	.byte	9,0,0,0
2065	.rva	ssse3_handler
2066	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2067___
2068$code.=<<___ if ($avx);
2069.LSEH_info_sha1_block_data_order_avx:
2070	.byte	9,0,0,0
2071	.rva	ssse3_handler
2072	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2073___
2074$code.=<<___ if ($avx>1);
2075.LSEH_info_sha1_block_data_order_avx2:
2076	.byte	9,0,0,0
2077	.rva	ssse3_handler
2078	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2079___
2080}
2081
2082####################################################################
2083
# sha1rnds4(OPERANDS): hand-assemble a "sha1rnds4 $imm,%xmmS,%xmmD"
# instruction for assemblers that do not know the mnemonic.
#
# OPERANDS is the operand text that followed the mnemonic.  When it names
# an immediate and two registers from %xmm0-%xmm7, the result is a ".byte"
# directive holding opcode 0F 3A CC, the ModR/M byte and the immediate.
# Any other operand form is handed back as the plain mnemonic, to be dealt
# with by the assembler.
sub sha1rnds4 {
    my ($operands) = @_;

    # No REX prefix is produced here, so only %xmm0-%xmm7 are encodable.
    # The look-ahead stops %xmm10..%xmm15 from matching as %xmm1.
    if ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])(?!\d)/) {
	my ($imm,$src,$dst) = ($1,$2,$3);
	my @opcode = (0x0f,0x3a,0xcc);
	push @opcode, 0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	# immediates with a leading 0 (0x.., 0..) are converted via oct()
	push @opcode, $imm =~ /^0/ ? oct($imm) : $imm;
	return ".byte\t".join(',',@opcode);
    }

    return "sha1rnds4\t".$operands;
}
2095
# sha1op38(INSTR, OPERANDS): hand-assemble the two-operand SHA-1 extension
# instructions that live in the 0F 38 opcode map.
#
# INSTR is the mnemonic, OPERANDS the "%xmmS,%xmmD" text that followed it.
# For a known mnemonic with two %xmm operands the result is a ".byte"
# directive: an optional REX prefix (when either register number is 8 or
# above), 0F 38, the instruction byte and the ModR/M byte.  Anything else is
# returned as "INSTR<tab>OPERANDS" unchanged.
sub sha1op38 {
    my ($instr,$operands) = @_;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($src,$dst) = ($1,$2);
	my @opcode = (0x0f,0x38);
	my $rex = 0;
	$rex |= 0x04			if ($dst>=8);	# REX.R extends ModR/M.reg
	$rex |= 0x01			if ($src>=8);	# REX.B extends ModR/M.rm
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }

    return $instr."\t".$operands;
}
2116
# Post-process the accumulated $code line by line and write it to STDOUT:
#  - text between backticks is replaced by the result of eval'ing it;
#  - sha1rnds4 and the other sha1* mnemonics are passed through the
#    encoders above.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	# sha1rnds4 is tried first; the generic sha1* pattern would match it too
	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
# Buffered write errors surface at close; report the reason along with them.
close STDOUT or die "error closing STDOUT: $!";
2126