#! /usr/bin/env perl
# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind the 32-bit assembler implementation. This is unlike
# Opteron, where compiler-generated code was only 15% behind the 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate the 32-bit code,
# but I dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer a larger *addressable* bank, but the out-of-order
# core reaches for even more registers through dynamic aliasing, and
# the EM64T core must have managed to run-time optimize even 32-bit
# code just as well as the 64-bit one. The performance improvement is
# summarized in the following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize the
# "distance" between instructions producing input to the 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload the message schedule, denoted by Wt in the NIST
# specification, or Xupdate in the OpenSSL source, to the SIMD unit.
# See the sha1-586.pl module for background and implementation
# details. The only difference from the 32-bit code is that the
# 64-bit code doesn't have to spill @X[] elements to free temporary
# registers.

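# (For reference, the schedule recurrence offloaded above is
# W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1 for t = 16..79;
# the SIMD code computes four W[t] values per step and parks the
# X[]+K sums on the stack for the scalar rounds to pick up.)
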
# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. The initial attempt (utilizing BMI
# instructions and loading pairs of consecutive blocks into 256-bit
# %ymm registers) did not provide an impressive performance improvement
# until Ilya Albrekht of Intel Corp. provided a crucial hint regarding
# the number of Xupdate iterations to pre-compute in advance.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in the following table. Numbers
# are CPU clock cycles spent to process a single byte (less is
# better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Knights L	13.2(*)		9.68/+36%	8.30/+59%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 2;
$shaext=1;	### set to zero if compiling for 1.0.1

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

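# The scalar round bodies below use the identity
# Ch(b,c,d) = (b & c) | (~b & d) = ((c ^ d) & b) ^ d,
# which needs one temporary register instead of two.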
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

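# Rounds 40-59 use Maj(b,c,d) = (b&c)|(b&d)|(c&d). BODY_40_59 computes
# it as (c&d) + (b&(c^d)): the two terms are bitwise disjoint, so 'add'
# can stand in for 'or' and the two halves reach $e independently.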
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}

$code.=<<___;
.text

.globl	sha1_block_data_order_nohw
.type	sha1_block_data_order_nohw,\@function,3
.align	16
sha1_block_data_order_nohw:
.cfi_startproc
	_CET_ENDBR
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.cfi_cfa_expression	%rsp+64,deref,+8
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_block_data_order_nohw,.-sha1_block_data_order_nohw
___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

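# A quick SHAEXT primer: sha1rnds4 performs four rounds at a time, with
# its immediate (0..3) selecting the round function and K constant for
# rounds 0-19/20-39/40-59/60-79; sha1nexte derives the next E operand
# from the current state, and sha1msg1/sha1msg2 together implement the
# Wt message schedule recurrence.
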
$code.=<<___;
.globl	sha1_block_data_order_hw
.type	sha1_block_data_order_hw,\@function,3
.align	32
sha1_block_data_order_hw:
.cfi_startproc
	_CET_ENDBR
___
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	prefetcht0	512($inp)
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	sha1_block_data_order_hw,.-sha1_block_data_order_hw
___
}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r14";
my $fp="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}

$code.=<<___;
.globl	sha1_block_data_order_ssse3
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
.cfi_startproc
	_CET_ENDBR
	mov	%rsp,$fp	# frame pointer
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

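# AUTOLOAD catches calls to any sub not defined in this file (such as
# &pshufd or &movdqa) and emits the sub's name verbatim as an assembly
# mnemonic, prefixing '$' to a purely numeric final argument; it is a
# compact stand-in for the full 32-bit perlasm instruction set.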
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

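# The body_* subs below return lists of Perl snippet strings instead of
# emitting code directly; the Xupdate/Xloop drivers eval() them one at
# a time, interleaving roughly one scalar round instruction between
# SIMD schedule instructions so the two execution domains overlap.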
sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.globl	sha1_block_data_order_avx
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
.cfi_startproc
	_CET_ENDBR
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___

if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.globl	sha1_block_data_order_avx2
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
.cfi_startproc
	_CET_ENDBR
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	 lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	 cmp	$num,$frame
	 cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu		($inp),%xmm0
	vmovdqu		16($inp),%xmm1
	vmovdqu		32($inp),%xmm2
	vmovdqu		48($inp),%xmm3
	lea		64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb		@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb		@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb		@X[2],@X[-2&7],@X[-2&7]
	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
	vpshufb		@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
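# The loop below pre-computes Xupdate for iterations 16..31 before
# entering .Loop_avx2; per the May 2013 note above, pre-computing this
# many schedule iterations in advance is what made the two-blocks-per-
# register scheme pay off. Each %ymm holds the same schedule element
# for two consecutive blocks: low 128 bits from block n, high 128 bits
# (filled by vinserti128 above) from block n+1.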
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___
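# The bodyx_* variants lean on BMI1/BMI2: rorx rotates without
# destroying its source or touching flags, and andn yields ~b&d in a
# single instruction, keeping each round's critical path short (cycle
# counts noted per sub).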
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 foreach (@insns) { eval; }
}

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi			# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame			# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
	cmp	$num,%rdi			# borrowed $t0
	ja	.Last_avx2

	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
	vmovdqu		-48(%rdi),%xmm1
	vmovdqu		-32(%rdi),%xmm2
	vmovdqu		-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);

$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
}
$code.=<<___;
.section .rodata
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.text
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
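# Each handler below recovers the interrupted function's saved
# nonvolatile registers (and, for the SIMD paths, the spilled
# %xmm6-%xmm11), patches them into CONTEXT, and then lets
# RtlVirtualUnwind continue unwinding past the frame.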
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order_nohw
	.rva	.LSEH_end_sha1_block_data_order_nohw
	.rva	.LSEH_info_sha1_block_data_order_nohw
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_hw
	.rva	.LSEH_end_sha1_block_data_order_hw
	.rva	.LSEH_info_sha1_block_data_order_hw
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order_nohw:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_hw:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

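# sha1rnds4 and sha1op38 hand-assemble the SHA instruction encodings as
# .byte sequences so the output builds even with assemblers that
# predate the SHA extensions; operands that don't match the expected
# pattern fall through as plain mnemonics.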
sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
      my $rex=0;
	$rex|=0x04			if ($2>=8);
	$rex|=0x01			if ($1>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

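# Final pass over the accumulated $code: evaluate `...` arithmetic,
# then route SHA mnemonics through the encoders above before printing.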
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";