#! /usr/bin/env perl
# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind the 32-bit assembler implementation. This is unlike
# Opteron, where compiler-generated code was only 15% behind the 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate the 32-bit code, but
# I dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Hence this fresh
# implementation:-) However! While the 64-bit code does perform better
# on Opteron, I failed to beat the 32-bit assembler on an EM64T core.
# Well, x86_64 does offer a larger *addressable* register bank, but the
# out-of-order core reaches for even more registers through dynamic
# aliasing, and the EM64T core must have managed to run-time optimize
# even 32-bit code just as well as the 64-bit code. The performance
# improvement is summarized in the following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize the
# "distance" between instructions producing input to the 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for the Intel Atom core.

# October 2010.
#
# Add an SSSE3, Supplemental[!] SSE3, implementation. The idea behind
# it is to offload the message schedule, denoted by Wt in the NIST
# specification, or Xupdate in the OpenSSL source, to the SIMD unit.
# See the sha1-586.pl module for background and implementation details.
# The only difference from the 32-bit code is that the 64-bit code
# doesn't have to spill @X[] elements to free temporary registers.

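# For orientation, the recurrence being offloaded is, per FIPS 180-4,
# W[t] = (W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) <<< 1. Below is a minimal
# pure-Perl sketch of it; Xupdate_ref is a hypothetical helper kept for
# illustration only and is never called (the SIMD paths further down
# compute four W[] values per iteration instead).
sub Xupdate_ref {
my @W = @_;					# 16 big-endian message words
for (my $t=16; $t<80; $t++) {
	my $w = $W[$t-3] ^ $W[$t-8] ^ $W[$t-14] ^ $W[$t-16];
	$W[$t] = (($w<<1) | ($w>>31)) & 0xffffffff;	# <<<1
}
return @W;
}
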
# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

# May 2013.
#
# Add AVX2+BMI code path. The initial attempt (utilizing BMI
# instructions and loading pairs of consecutive blocks into 256-bit
# %ymm registers) did not provide an impressive performance improvement
# until Ilya Albrekht of Intel Corp. provided a crucial hint regarding
# the number of Xupdate iterations to pre-compute in advance.

# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance is summarized in the following table. Numbers
# are CPU clock cycles spent to process a single byte (less is better).
#
#		x86_64		SSSE3		AVX[2]
# P4		9.05		-
# Opteron	6.26		-
# Core2		6.55		6.05/+8%	-
# Westmere	6.73		5.30/+27%	-
# Sandy Bridge	7.70		6.10/+26%	4.99/+54%
# Ivy Bridge	6.06		4.67/+30%	4.60/+32%
# Haswell	5.45		4.15/+31%	3.57/+53%
# Skylake	5.18		4.06/+28%	3.54/+46%
# Bulldozer	9.11		5.95/+53%
# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
# VIA Nano	9.32		7.15/+30%
# Atom		10.3		9.17/+12%
# Silvermont	13.1(*)		9.37/+40%
# Knights L	13.2(*)		9.68/+36%	8.30/+59%
# Goldmont	8.13		6.42/+27%	1.70/+380%(**)
#
# (*)	obviously suboptimal result, nothing was done about it,
#	because the SSSE3 code is compiled unconditionally;
# (**)	SHAEXT result
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 2;
$shaext=1;	### set to zero if compiling for 1.0.1

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}

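# The three bodies above are branchless forms of the FIPS 180-4 round
# functions: BODY_00_19 computes Ch as ((c^d)&b)^d, BODY_40_59 computes
# Maj as the sum of the disjoint terms (c&d) and ((c^d)&b), and
# BODY_20_39 is the plain parity b^c^d. A pure-Perl reference sketch;
# F_ref is a hypothetical helper, defined for illustration only and
# never called:
sub F_ref {
my ($t,$b,$c,$d) = @_;
return (($b&$c)|((~$b&0xffffffff)&$d))	if ($t<20);		# Ch
return (($b&$c)|($b&$d)|($c&$d))	if ($t>=40 && $t<60);	# Maj
return $b^$c^$d;						# Parity
}
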
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
.cfi_startproc
	leaq	OPENSSL_ia32cap_P(%rip),%r10
	mov	0(%r10),%r9d
	mov	4(%r10),%r8d
	mov	8(%r10),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
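# The capability words loaded above come from OPENSSL_ia32cap_P,
# OpenSSL's cached CPUID vector. Assuming the usual OpenSSL convention
# (not restated in this file): word 0 is a copy of leaf-1 EDX with bit 30
# repurposed as a synthetic "genuine Intel" flag, word 1 is leaf-1 ECX
# (bit 9 = SSSE3, bit 28 = AVX), and word 2 is leaf-7 EBX (bits 3/5/8 =
# BMI1/AVX2/BMI2, bit 29 = SHA).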
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.cfi_cfa_expression	%rsp+64,deref,+8
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha1_block_data_order,.-sha1_block_data_order
___
if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

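# Roughly, the four SHA instructions divide the work as follows:
# sha1rnds4 executes four rounds per invocation (its immediate selects
# the round group, hence K and the boolean function), sha1nexte derives
# the next E state and adds it to the incoming scheduled dwords, and
# sha1msg1/sha1msg2 realize the two halves of the W[] recurrence.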
$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec		$num
	lea		0x40($inp),%r8		# next input block
	paddd		@MSG[0],$E
	cmovne		%r8,$inp
	prefetcht0	512($inp)
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz		.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}
{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r14";
my $fp="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
.cfi_startproc
	mov	%rsp,$fp	# frame pointer
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
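# For example, &pshufd(@X[0],@X[-4&7],0xee) below reaches this thunk and
# appends "pshufd	$0xee,%xmm0,%xmm4" to $code: the trailing numeric
# argument becomes the immediate and the register arguments are reversed
# into AT&T order.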

sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

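# The body_* subs below return each round as a list of strings, one
# instruction per element; the Xupdate_* subs above eval those snippets
# one at a time between SIMD instructions, interleaving the scalar
# rounds with the message-schedule computation.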
sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',	# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',	# $c^$d for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor	($b,$c)',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round

	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c

	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',

	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
$Xi=4;				# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
.cfi_startproc
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___

if ($avx>1) {
use integer;
$Xi=4;					# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
.cfi_startproc
	mov	%rsp,$fp
.cfi_def_cfa_register	$fp
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16($fp)
	vmovaps	%xmm7,-40-5*16($fp)
	vmovaps	%xmm8,-40-4*16($fp)
	vmovaps	%xmm9,-40-3*16($fp)
	vmovaps	%xmm10,-40-2*16($fp)
	vmovaps	%xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	 lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	 cmp	$num,$frame
	 cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu		($inp),%xmm0
	vmovdqu		16($inp),%xmm1
	vmovdqu		32($inp),%xmm2
	vmovdqu		48($inp),%xmm3
	lea		64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb		@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb		@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb		@X[2],@X[-2&7],@X[-2&7]
	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
	vpshufb		@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2
.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

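# The bodyx_* variants above lean on BMI: andn computes ~b&d in a single
# instruction and rorx rotates without touching the flags, which is what
# yields the short critical paths quoted in their headers (3, 2 and 3
# cycles respectively).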
sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 foreach (@insns) { eval; }
}

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi			# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame			# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

$Xi=4;				# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]		# pbswap mask
	cmp	$num,%rdi			# borrowed $t0
	ja	.Last_avx2

	vmovdqu		-64(%rdi),%xmm0		# low part of @X[-4&7]
	vmovdqu		-48(%rdi),%xmm1
	vmovdqu		-32(%rdi),%xmm2
	vmovdqu		-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);

$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	 mov	@ROTX[4],$D			# D=b
	 #xchg	@ROTX[5],$F			# F=c, C=f
	mov	@ROTX[4],12($ctx)
	 mov	@ROTX[1],$F			# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	 mov	@ROTX[5],$E			# E=c
	 mov	$a5,$C				# C=f
	 #xchg	$F,$E				# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-40-6*16($fp),%xmm6
	movaps	-40-5*16($fp),%xmm7
	movaps	-40-4*16($fp),%xmm8
	movaps	-40-3*16($fp),%xmm9
	movaps	-40-2*16($fp),%xmm10
	movaps	-40-1*16($fp),%xmm11
___
$code.=<<___;
	mov	-40($fp),%r14
.cfi_restore	%r14
	mov	-32($fp),%r13
.cfi_restore	%r13
	mov	-24($fp),%r12
.cfi_restore	%r12
	mov	-16($fp),%rbp
.cfi_restore	%rbp
	mov	-8($fp),%rbx
.cfi_restore	%rbx
	lea	($fp),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
}
$code.=<<___;
.section .rodata
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
.text
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
  		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
      my $rex=0;
	$rex|=0x04			if ($2>=8);
	$rex|=0x01			if ($1>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

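# The two helpers above hand-assemble the SHA instructions for assemblers
# that predate them. For instance, "sha1msg1 %xmm4,%xmm0" is emitted as a
# .byte sequence encoding NP 0F 38 C9 /r with the destination in the reg
# field (bytes 0f,38,c9,c4 here), while unrecognized operand shapes fall
# through and are printed verbatim.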
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";