#! /usr/bin/env perl
# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind the 32-bit assembler implementation. This is unlike
# Opteron, where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate the 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer a larger *addressable* bank, but the out-of-order core
# reaches for even more registers through dynamic aliasing, and the EM64T
# core must have managed to run-time optimize even 32-bit code just as
# well as the 64-bit one. The performance improvement is summarized in
# the following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.
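
# As a point of reference, a minimal pure-Perl sketch (not used by this
# module) of the schedule recurrence those SIMD paths compute four or
# eight elements at a time: for t >= 16,
# W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1.
sub rol32_ref { my ($v,$n)=@_; (($v<<$n)|($v>>(32-$n)))&0xffffffff }
sub xupdate_ref {
	my @W = @_;			# W[0..15], one 64-byte block
	for my $t (16..79) {
		$W[$t] = rol32_ref($W[$t-3]^$W[$t-8]^$W[$t-14]^$W[$t-16],1);
	}
	return @W;
}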

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
# and loading pair of consecutive blocks to 256-bit %ymm registers)
# did not provide impressive performance improvement till a crucial
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in the following table. Numbers are
# CPU clock cycles spent to process a single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Knights L	13.2(*)		9.68/+36%	8.30/+59%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 2;
$shaext=1;	### set to zero if compiling for 1.0.1

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}
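
# For reference, one round of the 0..19 group that BODY_00_19 emits,
# as plain (unused) Perl; Ch(b,c,d) is computed as ((c^d)&b)^d, the
# same xor/and/xor sequence as above, and rol32_ref is the helper
# defined earlier in this file.
sub sha1_round_00_19_ref {
	my ($a,$b,$c,$d,$e,$w)=@_;
	$e = ($e+rol32_ref($a,5)+((($c^$d)&$b)^$d)+$w+0x5a827999)&0xffffffff;
	$b = rol32_ref($b,30);
	return ($a,$b,$c,$d,$e);	# caller "rotates" the tuple
}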

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
.cfi_startproc
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	mov	0(%r10),%r9d
	mov	4(%r10),%r8d
	mov	8(%r10),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
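# For reference (assumed layout, matching the tests above):
# OPENSSL_ia32cap_P word 0 is CPUID(1).EDX with bit 30 reused by
# OpenSSL as a synthetic "Intel CPU" flag, word 1 is CPUID(1).ECX
# (bit 9 = SSSE3, bit 28 = AVX), and word 2 is CPUID(7).EBX
# (bit 3 = BMI1, bit 5 = AVX2, bit 8 = BMI2, bit 29 = SHA).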
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.cfi_cfa_expression	%rsp+64,deref,+8
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_block_data_order,.-sha1_block_data_order
___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
$code.=<<___ if ($win64);
	mov	%rsp,%rax	# epilogue and shaext_handler expect frame base in %rax
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
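# Each sha1rnds4 performs four rounds, so the `int($i/5)` (and
# `int(($i+1)/5)`) immediates walk 0,0,0,0,0,1,... selecting the round
# function/constant for rounds 0-19, 20-39, 40-59 and 60-79; the loop
# above emits 16 of the 20 sha1rnds4, and the remaining four follow,
# interleaved with loads for the next block.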
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r14";
my $fp="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
.cfi_startproc
	mov	%rsp,$fp	# frame pointer
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
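# E.g. with the thunk above, a call such as &psrld(@Tx[0],31) appends
# "\tpsrld\t\$31,%xmm8\n" to $code: the last argument becomes the
# (immediate-prefixed) first operand and the remaining arguments are
# reversed into AT&T operand order.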

sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
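
# The xor/and/xor sequence above is the standard two-operand reduction
# of the round 0..19 "choice" function: ((c^d)&b)^d == (b&c)|(~b&d)
# for every bit pattern, e.g.
#   perl -e 'for my $x (0..7) { my ($b,$c,$d)=(($x>>2)&1,($x>>1)&1,$x&1);
#     die unless ((($c^$d)&$b)^$d) == ((($b&$c)|(~$b&$d))&1); }'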

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
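# Likewise, ((b^c)&(c^d))^c is the two-operand form of the round
# 40..59 "majority" function (b&c)|(b&d)|(c&d); a quick check:
#   perl -e 'for my $x (0..7) { my ($b,$c,$d)=(($x>>2)&1,($x>>1)&1,$x&1);
#     die unless (((($b^$c)&($c^$d))^$c) == (($b&$c)|($b&$d)|($c&$d))); }'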
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
.cfi_startproc
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___

if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
.cfi_startproc
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	 lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	 cmp	$num,$frame
	 cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu		($inp),%xmm0
	vmovdqu		16($inp),%xmm1
	vmovdqu		32($inp),%xmm2
	vmovdqu		48($inp),%xmm3
	lea		64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb		@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb		@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb		@X[2],@X[-2&7],@X[-2&7]
	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
	vpshufb		@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
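# Note the two-lane layout: the low 128-bit lane of each %ymm holds
# message words of the current block and the high lane (filled by
# vinserti128 above) those of the following block, so every 32-byte
# stack slot carries X[i]+K for two blocks at once.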
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
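# andn computes ~b&d directly, and since b&c and ~b&d can never both
# have the same bit set, the final xor above is equivalent to the
# textbook or: (b&c)^(~b&d) == (b&c)|(~b&d).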

sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 foreach (@insns) { eval; }
}

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi			# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame			# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
	cmp	$num,%rdi			# borrowed $t0
	ja	.Last_avx2

	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
	vmovdqu		-48(%rdi),%xmm1
	vmovdqu		-32(%rdi),%xmm2
	vmovdqu		-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);

$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
}
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
      my $rex=0;
	$rex|=0x04			if ($2>=8);
	$rex|=0x01			if ($1>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
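
# For instance (values worked out by hand, not part of upstream),
# sha1rnds4('$3,%xmm1,%xmm0') returns ".byte\t15,58,204,193,3" and
# sha1op38("sha1nexte","%xmm4,%xmm0") returns ".byte\t15,56,200,196",
# i.e. the raw 0f3acc/0f38c8 encodings with bytes printed in decimal,
# so the translation loop below still works with assemblers that lack
# the SHA extension mnemonics.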

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT";
