#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just straight implementation... I really wonder why
# gcc [being armed with inline assembler] fails to generate code this
# fast. The only thing that is cool about this module is that the
# very same instruction sequence is used for both SHA-256 and SHA-512.
# In the former case the instructions operate on 32-bit operands,
# while in the latter on 64-bit ones. All I had to do was get one
# flavor right; the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you an
# asymptotic performance of 64*1000/1005=63.7MBps times the CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is a very good result for the
# 3-way issue Opteron pipeline with X[16] maintained in memory. So
# *if* there is a way to improve it, *then* the only way would be to
# try to offload the X[16] updates to the SSE unit, but that would
# require "deeper" loop unrolling, which in turn would naturally cause
# size blow-up, not to mention increased complexity! And once again,
# that is only *if* it's actually possible to noticeably improve
# overall ILP, instruction-level parallelism, on the given CPU
# implementation in the first place.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a
# perfect performance ratio of 1.5 between the 64- and 32-bit flavors
# [see above], [currently available] EM64T CPUs are apparently far
# from it. On the contrary, the 64-bit version, sha512_block, is ~30%
# *slower* than the 32-bit sha256_block:-( This is presumably because
# 64-bit shifts/rotates are not atomic instructions there, but are
# implemented in microcode.
#
# May 2012.
#
# An optimization including one of Pavel Semjanov's ideas, the
# alternative Maj, resulted in a >=5% improvement on most CPUs, +20%
# for SHA256 and unfortunately -2% for SHA512 on P4 [which nobody
# should care about that much].
#
# June 2012.
#
# Add SIMD code paths; see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the improvement is
# not estimated to be high enough, noticeably less than 9%, to justify
# the effort, at least not on pre-AVX processors. [The obvious
# exception is VIA Nano, but it has a SHA512 instruction that is
# faster and should be used instead.] For reference, the corresponding
# estimated upper limit for improvement for SSSE3 SHA256 is 28%. The
# fact that higher coefficients are observed on VIA Nano and Bulldozer
# has more to do with specifics of their architecture [which is a
# topic for separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm as operands. The side
# effect is an increased stack frame, 448 additional bytes in SHA256
# and 1152 in SHA512, and a 1.2KB code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -               17.5    -
# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)	whichever is best applicable, including SHAEXT;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the number of SIMD instructions below
#	a certain limit makes no difference/sense; to conserve space the
#	SHA256 XOP code path is therefore omitted;
103$flavour = shift;
104$output  = shift;
105if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
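# A typical invocation passes the perlasm "flavour" followed by the output
# file, for example (the file names here are only illustrative assumptions):
#
#	perl sha512-x86_64.pl elf  sha512-x86_64.S
#	perl sha512-x86_64.pl nasm sha256-x86_64.asm
#
# The output file name is significant: the "512" test further below is what
# selects SHA-512 rather than SHA-256 generation.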
106
107$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
108
109$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
110( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
111( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
112die "can't locate x86_64-xlate.pl";
113
114# In upstream, this is controlled by shelling out to the compiler to check
115# versions, but BoringSSL is intended to be used with pre-generated perlasm
116# output, so this isn't useful anyway.
117#
118# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
119# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
120# did not tie them together until after $shaext was added.
121$avx = 1;
122
123# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
124# been tested.
125$shaext=0;	### set to zero if compiling for 1.0.1
126$avx=1		if (!$shaext && $avx);
127
128open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
129*STDOUT=*OUT;
130
131if ($output =~ /512/) {
132	$func="sha512_block_data_order";
133	$TABLE="K512";
134	$SZ=8;
135	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
136					"%r8", "%r9", "%r10","%r11");
137	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
138	@Sigma0=(28,34,39);
139	@Sigma1=(14,18,41);
140	@sigma0=(1,  8, 7);
141	@sigma1=(19,61, 6);
142	$rounds=80;
143} else {
144	$func="sha256_block_data_order";
145	$TABLE="K256";
146	$SZ=4;
147	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
148					"%r8d","%r9d","%r10d","%r11d");
149	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
150	@Sigma0=( 2,13,22);
151	@Sigma1=( 6,11,25);
152	@sigma0=( 7,18, 3);
153	@sigma1=(17,19,10);
154	$rounds=64;
155}
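
# The @Sigma/@sigma arrays above are just lists of rotate/shift amounts.
# As a reference sketch (never called by the generator; 32-bit SHA-256
# case only), the functions they encode are:

sub _ref_ror32	{ my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff; }
sub _ref_Sigma0	{ my $x=shift; _ref_ror32($x,2)^_ref_ror32($x,13)^_ref_ror32($x,22); }
sub _ref_Sigma1	{ my $x=shift; _ref_ror32($x,6)^_ref_ror32($x,11)^_ref_ror32($x,25); }
sub _ref_sigma0	{ my $x=shift; _ref_ror32($x,7)^_ref_ror32($x,18)^($x>>3); }
sub _ref_sigma1	{ my $x=shift; _ref_ror32($x,17)^_ref_ror32($x,19)^($x>>10); }
sub _ref_Ch	{ my ($e,$f,$g)=@_; ($e&$f)^(~$e&$g); }
sub _ref_Maj	{ my ($a,$b,$c)=@_; ($a&$b)^($a&$c)^($b&$c); }

# The SHA-512 flavor uses the same shapes on 64-bit words, with
# @Sigma0=(28,34,39), @Sigma1=(14,18,41), @sigma0=(1,8,7), @sigma1=(19,61,6).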
156
157$ctx="%rdi";	# 1st arg, zapped by $a3
158$inp="%rsi";	# 2nd arg
159$Tbl="%rbp";
160
161$_ctx="16*$SZ+0*8(%rsp)";
162$_inp="16*$SZ+1*8(%rsp)";
163$_end="16*$SZ+2*8(%rsp)";
164$_rsp="16*$SZ+3*8(%rsp)";
165$framesz="16*$SZ+4*8";
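
# The scalar code keeps the 16-entry message schedule X[] at the bottom of
# the frame and the saved ctx/inp/end/%rsp pointers right above it.  As a
# worked example of the offsets defined above: with $SZ==4 (SHA-256) the
# X[] area occupies bytes 0..63 of the frame, $_ctx is at 64(%rsp), $_inp
# at 72, $_end at 80, $_rsp at 88 and $framesz is 96; with $SZ==8 (SHA-512)
# X[] occupies 0..127, the four saved quadwords sit at 128..159 and
# $framesz is 160.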
166
167
168sub ROUND_00_15()
169{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
170  my $STRIDE=$SZ;
171     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
172
173$code.=<<___;
174	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
175	mov	$f,$a2
176
177	xor	$e,$a0
178	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
179	xor	$g,$a2			# f^g
180
181	mov	$T1,`$SZ*($i&0xf)`(%rsp)
182	xor	$a,$a1
183	and	$e,$a2			# (f^g)&e
184
185	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
186	add	$h,$T1			# T1+=h
187	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
188
189	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
190	xor	$e,$a0
191	add	$a2,$T1			# T1+=Ch(e,f,g)
192
193	mov	$a,$a2
194	add	($Tbl),$T1		# T1+=K[round]
195	xor	$a,$a1
196
197	xor	$b,$a2			# a^b, b^c in next round
198	ror	\$$Sigma1[0],$a0	# Sigma1(e)
199	mov	$b,$h
200
201	and	$a2,$a3
202	ror	\$$Sigma0[0],$a1	# Sigma0(a)
203	add	$a0,$T1			# T1+=Sigma1(e)
204
205	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
206	add	$T1,$d			# d+=T1
207	add	$T1,$h			# h+=T1
208
209	lea	$STRIDE($Tbl),$Tbl	# round++
210___
211$code.=<<___ if ($i<15);
212	add	$a1,$h			# h+=Sigma0(a)
213___
214	($a2,$a3) = ($a3,$a2);
215}
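
# A note on the round above, as a sketch of what the staged rotates compute
# (SHA-256 numbers shown; SHA-512 works the same way with its own constants).
# $a0 starts out as a copy of $e.  Rotating by Sigma1[2]-Sigma1[1]=14, xoring
# in a fresh $e, rotating by Sigma1[1]-Sigma1[0]=5, xoring $e once more and
# finally rotating by Sigma1[0]=6 leaves
#
#	ror($e,25) ^ ror($e,11) ^ ror($e,6) = Sigma1($e)
#
# because the later rotates also rotate the copies xored in earlier.  $a1
# plays the same game with $a and the Sigma0 constants.  The "magic" $a3
# register carries $b^$c over from the previous round, so the Maj step is
#
#	$h = $b ^ (($a^$b) & ($b^$c))
#
# which equals Maj($a,$b,$c): if $a==$b the mask is zero and the result is
# $b (==$a), otherwise it collapses to $c.  This is the alternative Maj
# [Ch(a^b,c,b)] mentioned in the header.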
216
217sub ROUND_16_XX()
218{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
219
220$code.=<<___;
221	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
222	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
223
224	mov	$a0,$T1
225	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
226	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
227	mov	$a2,$a1
228	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
229
230	xor	$T1,$a0
231	shr	\$$sigma0[2],$T1
232	ror	\$$sigma0[0],$a0
233	xor	$a1,$a2
234	shr	\$$sigma1[2],$a1
235
236	ror	\$$sigma1[0],$a2
237	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
238	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
239	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
240
241	add	`$SZ*($i&0xf)`(%rsp),$T1
242	mov	$e,$a0
243	add	$a2,$T1
244	mov	$a,$a1
245___
246	&ROUND_00_15(@_);
247}
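
# ROUND_16_XX implements the message schedule recurrence on the 16-entry
# circular buffer kept on the stack, i.e. (all indices mod 16)
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# with (i+1)&0xf standing for i-15, (i+14)&0xf for i-2 and (i+9)&0xf for
# i-7.  sigma0 and sigma1 are again built from staggered rotates plus one
# plain shift, exactly as sketched in _ref_sigma0/_ref_sigma1 above.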
248
249$code=<<___;
250.text
251
252.extern	OPENSSL_ia32cap_P
253.globl	$func
254.type	$func,\@function,3
255.align	16
256$func:
257___
258$code.=<<___ if ($SZ==4 || $avx);
259	lea	OPENSSL_ia32cap_P(%rip),%r11
260	mov	0(%r11),%r9d
261	mov	4(%r11),%r10d
262	mov	8(%r11),%r11d
263___
264$code.=<<___ if ($SZ==4 && $shaext);
265	test	\$`1<<29`,%r11d		# check for SHA
266	jnz	_shaext_shortcut
267___
268$code.=<<___ if ($avx && $SZ==8);
269	test	\$`1<<11`,%r10d		# check for XOP
270	jnz	.Lxop_shortcut
271___
272$code.=<<___ if ($avx>1);
273	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
274	cmp	\$`1<<8|1<<5|1<<3`,%r11d
275	je	.Lavx2_shortcut
276___
277$code.=<<___ if ($avx);
278	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
279	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
280	or	%r9d,%r10d
281	cmp	\$`1<<28|1<<9|1<<30`,%r10d
282	je	.Lavx_shortcut
283___
284$code.=<<___ if ($SZ==4);
285	test	\$`1<<9`,%r10d
286	jnz	.Lssse3_shortcut
287___
288$code.=<<___;
289	mov	%rsp,%rax		# copy %rsp
290	push	%rbx
291	push	%rbp
292	push	%r12
293	push	%r13
294	push	%r14
295	push	%r15
296	shl	\$4,%rdx		# num*16
297	sub	\$$framesz,%rsp
298	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
299	and	\$-64,%rsp		# align stack frame
300	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
302	mov	%rdx,$_end		# save end pointer, "3rd" arg
303	mov	%rax,$_rsp		# save copy of %rsp
304.Lprologue:
305
306	mov	$SZ*0($ctx),$A
307	mov	$SZ*1($ctx),$B
308	mov	$SZ*2($ctx),$C
309	mov	$SZ*3($ctx),$D
310	mov	$SZ*4($ctx),$E
311	mov	$SZ*5($ctx),$F
312	mov	$SZ*6($ctx),$G
313	mov	$SZ*7($ctx),$H
314	jmp	.Lloop
315
316.align	16
317.Lloop:
318	mov	$B,$a3
319	lea	$TABLE(%rip),$Tbl
320	xor	$C,$a3			# magic
321___
322	for($i=0;$i<16;$i++) {
323		$code.="	mov	$SZ*$i($inp),$T1\n";
324		$code.="	mov	@ROT[4],$a0\n";
325		$code.="	mov	@ROT[0],$a1\n";
326		$code.="	bswap	$T1\n";
327		&ROUND_00_15($i,@ROT);
328		unshift(@ROT,pop(@ROT));
329	}
330$code.=<<___;
331	jmp	.Lrounds_16_xx
332.align	16
333.Lrounds_16_xx:
334___
335	for(;$i<32;$i++) {
336		&ROUND_16_XX($i,@ROT);
337		unshift(@ROT,pop(@ROT));
338	}
339
340$code.=<<___;
341	cmpb	\$0,`$SZ-1`($Tbl)
342	jnz	.Lrounds_16_xx
343
344	mov	$_ctx,$ctx
345	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
346	lea	16*$SZ($inp),$inp
347
348	add	$SZ*0($ctx),$A
349	add	$SZ*1($ctx),$B
350	add	$SZ*2($ctx),$C
351	add	$SZ*3($ctx),$D
352	add	$SZ*4($ctx),$E
353	add	$SZ*5($ctx),$F
354	add	$SZ*6($ctx),$G
355	add	$SZ*7($ctx),$H
356
357	cmp	$_end,$inp
358
359	mov	$A,$SZ*0($ctx)
360	mov	$B,$SZ*1($ctx)
361	mov	$C,$SZ*2($ctx)
362	mov	$D,$SZ*3($ctx)
363	mov	$E,$SZ*4($ctx)
364	mov	$F,$SZ*5($ctx)
365	mov	$G,$SZ*6($ctx)
366	mov	$H,$SZ*7($ctx)
367	jb	.Lloop
368
369	mov	$_rsp,%rsi
370	mov	-48(%rsi),%r15
371	mov	-40(%rsi),%r14
372	mov	-32(%rsi),%r13
373	mov	-24(%rsi),%r12
374	mov	-16(%rsi),%rbp
375	mov	-8(%rsi),%rbx
376	lea	(%rsi),%rsp
377.Lepilogue:
378	ret
379.size	$func,.-$func
380___
381
382if ($SZ==4) {
383$code.=<<___;
384.align	64
385.type	$TABLE,\@object
386$TABLE:
387	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
388	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
389	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
390	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
391	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
392	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
393	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
394	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
395	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
396	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
397	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
398	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
399	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
400	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
401	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
402	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
403	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
404	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
405	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
406	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
407	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
408	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
409	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
410	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
411	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
412	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
413	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
414	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
415	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
416	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
417	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
418	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
419
420	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
421	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
422	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
423	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
424	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
425	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
426	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
427___
428} else {
429$code.=<<___;
430.align	64
431.type	$TABLE,\@object
432$TABLE:
433	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
434	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
435	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
436	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
437	.quad	0x3956c25bf348b538,0x59f111f1b605d019
438	.quad	0x3956c25bf348b538,0x59f111f1b605d019
439	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
440	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
441	.quad	0xd807aa98a3030242,0x12835b0145706fbe
442	.quad	0xd807aa98a3030242,0x12835b0145706fbe
443	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
444	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
445	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
446	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
447	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
448	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
449	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
450	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
451	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
452	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
453	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
454	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
455	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
456	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
457	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
458	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
459	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
460	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
461	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
462	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
463	.quad	0x06ca6351e003826f,0x142929670a0e6e70
464	.quad	0x06ca6351e003826f,0x142929670a0e6e70
465	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
466	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
467	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
468	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
469	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
470	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
471	.quad	0x81c2c92e47edaee6,0x92722c851482353b
472	.quad	0x81c2c92e47edaee6,0x92722c851482353b
473	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
474	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
475	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
476	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
477	.quad	0xd192e819d6ef5218,0xd69906245565a910
478	.quad	0xd192e819d6ef5218,0xd69906245565a910
479	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
480	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
481	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
482	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
483	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
484	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
485	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
486	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
487	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
488	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
489	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
490	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
491	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
492	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
493	.quad	0x90befffa23631e28,0xa4506cebde82bde9
494	.quad	0x90befffa23631e28,0xa4506cebde82bde9
495	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
496	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
497	.quad	0xca273eceea26619c,0xd186b8c721c0c207
498	.quad	0xca273eceea26619c,0xd186b8c721c0c207
499	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
500	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
501	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
502	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
503	.quad	0x113f9804bef90dae,0x1b710b35131c471b
504	.quad	0x113f9804bef90dae,0x1b710b35131c471b
505	.quad	0x28db77f523047d84,0x32caab7b40c72493
506	.quad	0x28db77f523047d84,0x32caab7b40c72493
507	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
508	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
509	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
510	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
511	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
512	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
513
514	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
515	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
516	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
517___
518}
519
520######################################################################
521# SIMD code paths
522#
523if ($SZ==4 && $shaext) {{{
524######################################################################
525# Intel SHA Extensions implementation of SHA256 update function.
526#
527my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
528
529my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
530my @MSG=map("%xmm$_",(3..6));
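
# The SHA extension instructions want the state split across two registers
# in a fixed layout.  As a sketch (lanes written most- to least-significant
# dword, matching the comments below): $ABEF ends up holding { A, B, E, F }
# and $CDGH holds { C, D, G, H }.  That is what the initial
# pshufd/palignr/punpcklqdq shuffle of the little-endian {D,C,B,A}/{H,G,F,E}
# words loaded from $ctx produces, and what the mirror-image shuffle at the
# end converts back before storing.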
531
532$code.=<<___;
533.type	sha256_block_data_order_shaext,\@function,3
534.align	64
535sha256_block_data_order_shaext:
536_shaext_shortcut:
537___
538$code.=<<___ if ($win64);
539	lea	`-8-5*16`(%rsp),%rsp
540	movaps	%xmm6,-8-5*16(%rax)
541	movaps	%xmm7,-8-4*16(%rax)
542	movaps	%xmm8,-8-3*16(%rax)
543	movaps	%xmm9,-8-2*16(%rax)
544	movaps	%xmm10,-8-1*16(%rax)
545.Lprologue_shaext:
546___
547$code.=<<___;
548	lea		K256+0x80(%rip),$Tbl
549	movdqu		($ctx),$ABEF		# DCBA
550	movdqu		16($ctx),$CDGH		# HGFE
551	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
552
553	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
554	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
555	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
556	movdqa		$TMP,$BSWAP		# offload
557	palignr		\$8,$CDGH,$ABEF		# ABEF
558	punpcklqdq	$Wi,$CDGH		# CDGH
559	jmp		.Loop_shaext
560
561.align	16
562.Loop_shaext:
563	movdqu		($inp),@MSG[0]
564	movdqu		0x10($inp),@MSG[1]
565	movdqu		0x20($inp),@MSG[2]
566	pshufb		$TMP,@MSG[0]
567	movdqu		0x30($inp),@MSG[3]
568
569	movdqa		0*32-0x80($Tbl),$Wi
570	paddd		@MSG[0],$Wi
571	pshufb		$TMP,@MSG[1]
572	movdqa		$CDGH,$CDGH_SAVE	# offload
573	sha256rnds2	$ABEF,$CDGH		# 0-3
574	pshufd		\$0x0e,$Wi,$Wi
575	nop
576	movdqa		$ABEF,$ABEF_SAVE	# offload
577	sha256rnds2	$CDGH,$ABEF
578
579	movdqa		1*32-0x80($Tbl),$Wi
580	paddd		@MSG[1],$Wi
581	pshufb		$TMP,@MSG[2]
582	sha256rnds2	$ABEF,$CDGH		# 4-7
583	pshufd		\$0x0e,$Wi,$Wi
584	lea		0x40($inp),$inp
585	sha256msg1	@MSG[1],@MSG[0]
586	sha256rnds2	$CDGH,$ABEF
587
588	movdqa		2*32-0x80($Tbl),$Wi
589	paddd		@MSG[2],$Wi
590	pshufb		$TMP,@MSG[3]
591	sha256rnds2	$ABEF,$CDGH		# 8-11
592	pshufd		\$0x0e,$Wi,$Wi
593	movdqa		@MSG[3],$TMP
594	palignr		\$4,@MSG[2],$TMP
595	nop
596	paddd		$TMP,@MSG[0]
597	sha256msg1	@MSG[2],@MSG[1]
598	sha256rnds2	$CDGH,$ABEF
599
600	movdqa		3*32-0x80($Tbl),$Wi
601	paddd		@MSG[3],$Wi
602	sha256msg2	@MSG[3],@MSG[0]
603	sha256rnds2	$ABEF,$CDGH		# 12-15
604	pshufd		\$0x0e,$Wi,$Wi
605	movdqa		@MSG[0],$TMP
606	palignr		\$4,@MSG[3],$TMP
607	nop
608	paddd		$TMP,@MSG[1]
609	sha256msg1	@MSG[3],@MSG[2]
610	sha256rnds2	$CDGH,$ABEF
611___
612for($i=4;$i<16-3;$i++) {
613$code.=<<___;
614	movdqa		$i*32-0x80($Tbl),$Wi
615	paddd		@MSG[0],$Wi
616	sha256msg2	@MSG[0],@MSG[1]
617	sha256rnds2	$ABEF,$CDGH		# 16-19...
618	pshufd		\$0x0e,$Wi,$Wi
619	movdqa		@MSG[1],$TMP
620	palignr		\$4,@MSG[0],$TMP
621	nop
622	paddd		$TMP,@MSG[2]
623	sha256msg1	@MSG[0],@MSG[3]
624	sha256rnds2	$CDGH,$ABEF
625___
626	push(@MSG,shift(@MSG));
627}
628$code.=<<___;
629	movdqa		13*32-0x80($Tbl),$Wi
630	paddd		@MSG[0],$Wi
631	sha256msg2	@MSG[0],@MSG[1]
632	sha256rnds2	$ABEF,$CDGH		# 52-55
633	pshufd		\$0x0e,$Wi,$Wi
634	movdqa		@MSG[1],$TMP
635	palignr		\$4,@MSG[0],$TMP
636	sha256rnds2	$CDGH,$ABEF
637	paddd		$TMP,@MSG[2]
638
639	movdqa		14*32-0x80($Tbl),$Wi
640	paddd		@MSG[1],$Wi
641	sha256rnds2	$ABEF,$CDGH		# 56-59
642	pshufd		\$0x0e,$Wi,$Wi
643	sha256msg2	@MSG[1],@MSG[2]
644	movdqa		$BSWAP,$TMP
645	sha256rnds2	$CDGH,$ABEF
646
647	movdqa		15*32-0x80($Tbl),$Wi
648	paddd		@MSG[2],$Wi
649	nop
650	sha256rnds2	$ABEF,$CDGH		# 60-63
651	pshufd		\$0x0e,$Wi,$Wi
652	dec		$num
653	nop
654	sha256rnds2	$CDGH,$ABEF
655
656	paddd		$CDGH_SAVE,$CDGH
657	paddd		$ABEF_SAVE,$ABEF
658	jnz		.Loop_shaext
659
660	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
661	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
662	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
663	punpckhqdq	$CDGH,$ABEF		# DCBA
664	palignr		\$8,$TMP,$CDGH		# HGFE
665
666	movdqu	$ABEF,($ctx)
667	movdqu	$CDGH,16($ctx)
668___
669$code.=<<___ if ($win64);
670	movaps	-8-5*16(%rax),%xmm6
671	movaps	-8-4*16(%rax),%xmm7
672	movaps	-8-3*16(%rax),%xmm8
673	movaps	-8-2*16(%rax),%xmm9
674	movaps	-8-1*16(%rax),%xmm10
675	mov	%rax,%rsp
676.Lepilogue_shaext:
677___
678$code.=<<___;
679	ret
680.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
681___
682}}}
683{{{
684
685my $a4=$T1;
686my ($a,$b,$c,$d,$e,$f,$g,$h);
687
688sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
689{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
690  my $arg = pop;
691    $arg = "\$$arg" if ($arg*1 eq $arg);
692    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
693}
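
# As an illustration of the thunk above: a call such as
#
#	&ror	($a0,$Sigma1[2]-$Sigma1[1]);
#
# lands in AUTOLOAD with $opcode="ror", pops the numeric argument, prefixes
# it with '$' and appends "\tror\t\$14,%r13d\n" to $code (SHA-256 register
# assignment assumed), i.e. operands come out in the usual AT&T order with
# the destination last.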
694
695sub body_00_15 () {
696	(
697	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
698
699	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
700	'&mov	($a,$a1)',
701	'&mov	($a4,$f)',
702
703	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
704	'&xor	($a0,$e)',
705	'&xor	($a4,$g)',			# f^g
706
707	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
708	'&xor	($a1,$a)',
709	'&and	($a4,$e)',			# (f^g)&e
710
711	'&xor	($a0,$e)',
712	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
713	'&mov	($a2,$a)',
714
715	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
716	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
717	'&xor	($a2,$b)',			# a^b, b^c in next round
718
719	'&add	($h,$a4)',			# h+=Ch(e,f,g)
720	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
721	'&and	($a3,$a2)',			# (b^c)&(a^b)
722
723	'&xor	($a1,$a)',
724	'&add	($h,$a0)',			# h+=Sigma1(e)
725	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
726
727	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
728	'&add	($d,$h)',			# d+=h
729	'&add	($h,$a3)',			# h+=Maj(a,b,c)
730
731	'&mov	($a0,$d)',
732	'&add	($a1,$h);'.			# h+=Sigma0(a)
733	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
734	);
735}
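
# body_00_15 returns the scalar round as a list of quoted Perl snippets,
# mostly one instruction each, rather than emitting them directly.  The SIMD
# code paths below pull them off with eval(shift(@insns)) in between their
# own vector instructions, so scalar rounds and the X[] update are
# interleaved for better instruction-level parallelism.  The consumption
# pattern is always some variation of
#
#	my @insns = (&$body,&$body,&$body,&$body);	# 4 rounds' worth
#	foreach (Xupdate_...()) { eval; eval(shift(@insns)); }
#	foreach (@insns) { eval; }			# whatever is left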
736
737######################################################################
738# SSSE3 code path
739#
740if ($SZ==4) {	# SHA256 only
741my @X = map("%xmm$_",(0..3));
742my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
743
744$code.=<<___;
745.type	${func}_ssse3,\@function,3
746.align	64
747${func}_ssse3:
748.Lssse3_shortcut:
749	mov	%rsp,%rax		# copy %rsp
750	push	%rbx
751	push	%rbp
752	push	%r12
753	push	%r13
754	push	%r14
755	push	%r15
756	shl	\$4,%rdx		# num*16
757	sub	\$`$framesz+$win64*16*4`,%rsp
758	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
759	and	\$-64,%rsp		# align stack frame
760	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
762	mov	%rdx,$_end		# save end pointer, "3rd" arg
763	mov	%rax,$_rsp		# save copy of %rsp
764___
765$code.=<<___ if ($win64);
766	movaps	%xmm6,16*$SZ+32(%rsp)
767	movaps	%xmm7,16*$SZ+48(%rsp)
768	movaps	%xmm8,16*$SZ+64(%rsp)
769	movaps	%xmm9,16*$SZ+80(%rsp)
770___
771$code.=<<___;
772.Lprologue_ssse3:
773
774	mov	$SZ*0($ctx),$A
775	mov	$SZ*1($ctx),$B
776	mov	$SZ*2($ctx),$C
777	mov	$SZ*3($ctx),$D
778	mov	$SZ*4($ctx),$E
779	mov	$SZ*5($ctx),$F
780	mov	$SZ*6($ctx),$G
781	mov	$SZ*7($ctx),$H
782___
783
784$code.=<<___;
785	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
786	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
787	jmp	.Lloop_ssse3
788.align	16
789.Lloop_ssse3:
790	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
791	movdqu	0x00($inp),@X[0]
792	movdqu	0x10($inp),@X[1]
793	movdqu	0x20($inp),@X[2]
794	pshufb	$t3,@X[0]
795	movdqu	0x30($inp),@X[3]
796	lea	$TABLE(%rip),$Tbl
797	pshufb	$t3,@X[1]
798	movdqa	0x00($Tbl),$t0
799	movdqa	0x20($Tbl),$t1
800	pshufb	$t3,@X[2]
801	paddd	@X[0],$t0
802	movdqa	0x40($Tbl),$t2
803	pshufb	$t3,@X[3]
804	movdqa	0x60($Tbl),$t3
805	paddd	@X[1],$t1
806	paddd	@X[2],$t2
807	paddd	@X[3],$t3
808	movdqa	$t0,0x00(%rsp)
809	mov	$A,$a1
810	movdqa	$t1,0x10(%rsp)
811	mov	$B,$a3
812	movdqa	$t2,0x20(%rsp)
813	xor	$C,$a3			# magic
814	movdqa	$t3,0x30(%rsp)
815	mov	$E,$a0
816	jmp	.Lssse3_00_47
817
818.align	16
819.Lssse3_00_47:
820	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
821___
822sub Xupdate_256_SSSE3 () {
823	(
824	'&movdqa	($t0,@X[1]);',
825	'&movdqa	($t3,@X[3])',
826	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
827	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
828	'&movdqa	($t1,$t0)',
829	'&movdqa	($t2,$t0);',
830	'&psrld		($t0,$sigma0[2])',
831	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
832	'&psrld		($t2,$sigma0[0])',
833	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
834	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
835	'&pxor		($t0,$t2)',
836	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
837	'&pxor		($t0,$t1)',
838	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
839	'&pxor		($t0,$t2);',
840	 '&movdqa	($t2,$t3)',
841	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
842	 '&psrld	($t3,$sigma1[2])',
843	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
844	 '&psrlq	($t2,$sigma1[0])',
845	 '&pxor		($t3,$t2);',
846	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
847	 '&pxor		($t3,$t2)',
848	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
849	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
850	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
851	 '&movdqa	($t2,$t3);',
852	 '&psrld	($t3,$sigma1[2])',
853	 '&psrlq	($t2,$sigma1[0])',
854	 '&pxor		($t3,$t2);',
855	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
856	 '&pxor		($t3,$t2);',
857	'&movdqa	($t2,16*2*$j."($Tbl)")',
858	 '&pshufb	($t3,$t5)',
859	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
860	);
861}
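
# SSE has no packed rotate, so sigma0 above is assembled from shifts: with
# @sigma0=(7,18,3) the psrld/pslld/pxor sequence computes, per 32-bit lane,
#
#	(x>>3) ^ (x>>7) ^ (x<<25) ^ (x>>18) ^ (x<<14)
#	  = (x>>3) ^ ror(x,7) ^ ror(x,18) = sigma0(x)
#
# sigma1 only has to be applied to two lanes at a time (X[14..15], then the
# freshly computed X[16..17]), which is why it uses 64-bit psrlq shifts and
# a shuffle (pshufb here, pshufd+psrldq in the interleaved variant below) to
# put the two results back into the right lanes.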
862
863sub SSSE3_256_00_47 () {
864my $j = shift;
865my $body = shift;
866my @X = @_;
867my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
868
869    if (0) {
870	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
871	    eval;
872	    eval(shift(@insns));
873	    eval(shift(@insns));
874	    eval(shift(@insns));
875	}
876    } else {			# squeeze extra 4% on Westmere and 19% on Atom
877	  eval(shift(@insns));	#@
878	&movdqa		($t0,@X[1]);
879	  eval(shift(@insns));
880	  eval(shift(@insns));
881	&movdqa		($t3,@X[3]);
882	  eval(shift(@insns));	#@
883	  eval(shift(@insns));
884	  eval(shift(@insns));
885	  eval(shift(@insns));	#@
886	  eval(shift(@insns));
887	&palignr	($t0,@X[0],$SZ);	# X[1..4]
888	  eval(shift(@insns));
889	  eval(shift(@insns));
890	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
891	  eval(shift(@insns));
892	  eval(shift(@insns));
893	  eval(shift(@insns));
894	  eval(shift(@insns));	#@
895	&movdqa		($t1,$t0);
896	  eval(shift(@insns));
897	  eval(shift(@insns));
898	&movdqa		($t2,$t0);
899	  eval(shift(@insns));	#@
900	  eval(shift(@insns));
901	&psrld		($t0,$sigma0[2]);
902	  eval(shift(@insns));
903	  eval(shift(@insns));
904	  eval(shift(@insns));
905	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
906	  eval(shift(@insns));	#@
907	  eval(shift(@insns));
908	&psrld		($t2,$sigma0[0]);
909	  eval(shift(@insns));
910	  eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
912	  eval(shift(@insns));
913	  eval(shift(@insns));	#@
914	&pslld		($t1,8*$SZ-$sigma0[1]);
915	  eval(shift(@insns));
916	  eval(shift(@insns));
917	&pxor		($t0,$t2);
918	  eval(shift(@insns));	#@
919	  eval(shift(@insns));
920	  eval(shift(@insns));
921	  eval(shift(@insns));	#@
922	&psrld		($t2,$sigma0[1]-$sigma0[0]);
923	  eval(shift(@insns));
924	&pxor		($t0,$t1);
925	  eval(shift(@insns));
926	  eval(shift(@insns));
927	&pslld		($t1,$sigma0[1]-$sigma0[0]);
928	  eval(shift(@insns));
929	  eval(shift(@insns));
930	&pxor		($t0,$t2);
931	  eval(shift(@insns));
932	  eval(shift(@insns));	#@
933	 &movdqa	($t2,$t3);
934	  eval(shift(@insns));
935	  eval(shift(@insns));
936	&pxor		($t0,$t1);		# sigma0(X[1..4])
937	  eval(shift(@insns));	#@
938	  eval(shift(@insns));
939	  eval(shift(@insns));
940	 &psrld		($t3,$sigma1[2]);
941	  eval(shift(@insns));
942	  eval(shift(@insns));
943	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
944	  eval(shift(@insns));	#@
945	  eval(shift(@insns));
946	 &psrlq		($t2,$sigma1[0]);
947	  eval(shift(@insns));
948	  eval(shift(@insns));
949	  eval(shift(@insns));
950	 &pxor		($t3,$t2);
951	  eval(shift(@insns));	#@
952	  eval(shift(@insns));
953	  eval(shift(@insns));
954	  eval(shift(@insns));	#@
955	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
956	  eval(shift(@insns));
957	  eval(shift(@insns));
958	 &pxor		($t3,$t2);
959	  eval(shift(@insns));	#@
960	  eval(shift(@insns));
961	  eval(shift(@insns));
962	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
963	 &pshufd	($t3,$t3,0b10000000);
964	  eval(shift(@insns));
965	  eval(shift(@insns));
966	  eval(shift(@insns));
967	 &psrldq	($t3,8);
968	  eval(shift(@insns));
969	  eval(shift(@insns));	#@
970	  eval(shift(@insns));
971	  eval(shift(@insns));
972	  eval(shift(@insns));	#@
973	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
974	  eval(shift(@insns));
975	  eval(shift(@insns));
976	  eval(shift(@insns));
977	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
978	  eval(shift(@insns));
979	  eval(shift(@insns));	#@
980	  eval(shift(@insns));
981	 &movdqa	($t2,$t3);
982	  eval(shift(@insns));
983	  eval(shift(@insns));
984	 &psrld		($t3,$sigma1[2]);
985	  eval(shift(@insns));
986	  eval(shift(@insns));	#@
987	 &psrlq		($t2,$sigma1[0]);
988	  eval(shift(@insns));
989	  eval(shift(@insns));
990	 &pxor		($t3,$t2);
991	  eval(shift(@insns));	#@
992	  eval(shift(@insns));
993	  eval(shift(@insns));
994	  eval(shift(@insns));	#@
995	  eval(shift(@insns));
996	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
997	  eval(shift(@insns));
998	  eval(shift(@insns));
999	  eval(shift(@insns));
1000	 &pxor		($t3,$t2);
1001	  eval(shift(@insns));
1002	  eval(shift(@insns));
1003	  eval(shift(@insns));	#@
1004	 #&pshufb	($t3,$t5);
1005	 &pshufd	($t3,$t3,0b00001000);
1006	  eval(shift(@insns));
1007	  eval(shift(@insns));
1008	&movdqa		($t2,16*2*$j."($Tbl)");
1009	  eval(shift(@insns));	#@
1010	  eval(shift(@insns));
1011	 &pslldq	($t3,8);
1012	  eval(shift(@insns));
1013	  eval(shift(@insns));
1014	  eval(shift(@insns));
1015	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1016	  eval(shift(@insns));	#@
1017	  eval(shift(@insns));
1018	  eval(shift(@insns));
1019    }
1020	&paddd		($t2,@X[0]);
1021	  foreach (@insns) { eval; }		# remaining instructions
1022	&movdqa		(16*$j."(%rsp)",$t2);
1023}
1024
1025    for ($i=0,$j=0; $j<4; $j++) {
1026	&SSSE3_256_00_47($j,\&body_00_15,@X);
1027	push(@X,shift(@X));			# rotate(@X)
1028    }
1029	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1030	&jne	(".Lssse3_00_47");
1031
1032    for ($i=0; $i<16; ) {
1033	foreach(body_00_15()) { eval; }
1034    }
1035$code.=<<___;
1036	mov	$_ctx,$ctx
1037	mov	$a1,$A
1038
1039	add	$SZ*0($ctx),$A
1040	lea	16*$SZ($inp),$inp
1041	add	$SZ*1($ctx),$B
1042	add	$SZ*2($ctx),$C
1043	add	$SZ*3($ctx),$D
1044	add	$SZ*4($ctx),$E
1045	add	$SZ*5($ctx),$F
1046	add	$SZ*6($ctx),$G
1047	add	$SZ*7($ctx),$H
1048
1049	cmp	$_end,$inp
1050
1051	mov	$A,$SZ*0($ctx)
1052	mov	$B,$SZ*1($ctx)
1053	mov	$C,$SZ*2($ctx)
1054	mov	$D,$SZ*3($ctx)
1055	mov	$E,$SZ*4($ctx)
1056	mov	$F,$SZ*5($ctx)
1057	mov	$G,$SZ*6($ctx)
1058	mov	$H,$SZ*7($ctx)
1059	jb	.Lloop_ssse3
1060
1061	mov	$_rsp,%rsi
1062___
1063$code.=<<___ if ($win64);
1064	movaps	16*$SZ+32(%rsp),%xmm6
1065	movaps	16*$SZ+48(%rsp),%xmm7
1066	movaps	16*$SZ+64(%rsp),%xmm8
1067	movaps	16*$SZ+80(%rsp),%xmm9
1068___
1069$code.=<<___;
1070	mov	-48(%rsi),%r15
1071	mov	-40(%rsi),%r14
1072	mov	-32(%rsi),%r13
1073	mov	-24(%rsi),%r12
1074	mov	-16(%rsi),%rbp
1075	mov	-8(%rsi),%rbx
1076	lea	(%rsi),%rsp
1077.Lepilogue_ssse3:
1078	ret
1079.size	${func}_ssse3,.-${func}_ssse3
1080___
1081}
1082
1083if ($avx) {{
1084######################################################################
1085# XOP code path
1086#
1087if ($SZ==8) {	# SHA512 only
1088$code.=<<___;
1089.type	${func}_xop,\@function,3
1090.align	64
1091${func}_xop:
1092.Lxop_shortcut:
1093	mov	%rsp,%rax		# copy %rsp
1094	push	%rbx
1095	push	%rbp
1096	push	%r12
1097	push	%r13
1098	push	%r14
1099	push	%r15
1100	shl	\$4,%rdx		# num*16
1101	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1102	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1103	and	\$-64,%rsp		# align stack frame
1104	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1106	mov	%rdx,$_end		# save end pointer, "3rd" arg
1107	mov	%rax,$_rsp		# save copy of %rsp
1108___
1109$code.=<<___ if ($win64);
1110	movaps	%xmm6,16*$SZ+32(%rsp)
1111	movaps	%xmm7,16*$SZ+48(%rsp)
1112	movaps	%xmm8,16*$SZ+64(%rsp)
1113	movaps	%xmm9,16*$SZ+80(%rsp)
1114___
1115$code.=<<___ if ($win64 && $SZ>4);
1116	movaps	%xmm10,16*$SZ+96(%rsp)
1117	movaps	%xmm11,16*$SZ+112(%rsp)
1118___
1119$code.=<<___;
1120.Lprologue_xop:
1121
1122	vzeroupper
1123	mov	$SZ*0($ctx),$A
1124	mov	$SZ*1($ctx),$B
1125	mov	$SZ*2($ctx),$C
1126	mov	$SZ*3($ctx),$D
1127	mov	$SZ*4($ctx),$E
1128	mov	$SZ*5($ctx),$F
1129	mov	$SZ*6($ctx),$G
1130	mov	$SZ*7($ctx),$H
1131	jmp	.Lloop_xop
1132___
1133					if ($SZ==4) {	# SHA256
1134    my @X = map("%xmm$_",(0..3));
1135    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1136
1137$code.=<<___;
1138.align	16
1139.Lloop_xop:
1140	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1141	vmovdqu	0x00($inp),@X[0]
1142	vmovdqu	0x10($inp),@X[1]
1143	vmovdqu	0x20($inp),@X[2]
1144	vmovdqu	0x30($inp),@X[3]
1145	vpshufb	$t3,@X[0],@X[0]
1146	lea	$TABLE(%rip),$Tbl
1147	vpshufb	$t3,@X[1],@X[1]
1148	vpshufb	$t3,@X[2],@X[2]
1149	vpaddd	0x00($Tbl),@X[0],$t0
1150	vpshufb	$t3,@X[3],@X[3]
1151	vpaddd	0x20($Tbl),@X[1],$t1
1152	vpaddd	0x40($Tbl),@X[2],$t2
1153	vpaddd	0x60($Tbl),@X[3],$t3
1154	vmovdqa	$t0,0x00(%rsp)
1155	mov	$A,$a1
1156	vmovdqa	$t1,0x10(%rsp)
1157	mov	$B,$a3
1158	vmovdqa	$t2,0x20(%rsp)
1159	xor	$C,$a3			# magic
1160	vmovdqa	$t3,0x30(%rsp)
1161	mov	$E,$a0
1162	jmp	.Lxop_00_47
1163
1164.align	16
1165.Lxop_00_47:
1166	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1167___
1168sub XOP_256_00_47 () {
1169my $j = shift;
1170my $body = shift;
1171my @X = @_;
1172my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1173
1174	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
1175	  eval(shift(@insns));
1176	  eval(shift(@insns));
1177	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
1178	  eval(shift(@insns));
1179	  eval(shift(@insns));
1180	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
1181	  eval(shift(@insns));
1182	  eval(shift(@insns));
1183	&vpsrld		($t0,$t0,$sigma0[2]);
1184	  eval(shift(@insns));
1185	  eval(shift(@insns));
1186	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
1187	  eval(shift(@insns));
1188	  eval(shift(@insns));
1189	  eval(shift(@insns));
1190	  eval(shift(@insns));
1191	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
1192	  eval(shift(@insns));
1193	  eval(shift(@insns));
1194	&vpxor		($t0,$t0,$t1);
1195	  eval(shift(@insns));
1196	  eval(shift(@insns));
1197	  eval(shift(@insns));
1198	  eval(shift(@insns));
1199	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
1200	  eval(shift(@insns));
1201	  eval(shift(@insns));
1202	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
1203	  eval(shift(@insns));
1204	  eval(shift(@insns));
1205	 &vpsrld	($t2,@X[3],$sigma1[2]);
1206	  eval(shift(@insns));
1207	  eval(shift(@insns));
1208	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
1209	  eval(shift(@insns));
1210	  eval(shift(@insns));
1211	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1212	  eval(shift(@insns));
1213	  eval(shift(@insns));
1214	 &vpxor		($t3,$t3,$t2);
1215	  eval(shift(@insns));
1216	  eval(shift(@insns));
1217	  eval(shift(@insns));
1218	  eval(shift(@insns));
1219	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1220	  eval(shift(@insns));
1221	  eval(shift(@insns));
1222	  eval(shift(@insns));
1223	  eval(shift(@insns));
1224	&vpsrldq	($t3,$t3,8);
1225	  eval(shift(@insns));
1226	  eval(shift(@insns));
1227	  eval(shift(@insns));
1228	  eval(shift(@insns));
1229	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1230	  eval(shift(@insns));
1231	  eval(shift(@insns));
1232	  eval(shift(@insns));
1233	  eval(shift(@insns));
1234	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
1235	  eval(shift(@insns));
1236	  eval(shift(@insns));
1237	 &vpsrld	($t2,@X[0],$sigma1[2]);
1238	  eval(shift(@insns));
1239	  eval(shift(@insns));
1240	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
1241	  eval(shift(@insns));
1242	  eval(shift(@insns));
1243	 &vpxor		($t3,$t3,$t2);
1244	  eval(shift(@insns));
1245	  eval(shift(@insns));
1246	  eval(shift(@insns));
1247	  eval(shift(@insns));
1248	 &vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
1249	  eval(shift(@insns));
1250	  eval(shift(@insns));
1251	  eval(shift(@insns));
1252	  eval(shift(@insns));
1253	&vpslldq	($t3,$t3,8);		# 22 instructions
1254	  eval(shift(@insns));
1255	  eval(shift(@insns));
1256	  eval(shift(@insns));
1257	  eval(shift(@insns));
1258	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
1259	  eval(shift(@insns));
1260	  eval(shift(@insns));
1261	  eval(shift(@insns));
1262	  eval(shift(@insns));
1263	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1264	  foreach (@insns) { eval; }		# remaining instructions
1265	&vmovdqa	(16*$j."(%rsp)",$t2);
1266}
1267
1268    for ($i=0,$j=0; $j<4; $j++) {
1269	&XOP_256_00_47($j,\&body_00_15,@X);
1270	push(@X,shift(@X));			# rotate(@X)
1271    }
1272	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1273	&jne	(".Lxop_00_47");
1274
1275    for ($i=0; $i<16; ) {
1276	foreach(body_00_15()) { eval; }
1277    }
1278
1279					} else {	# SHA512
1280    my @X = map("%xmm$_",(0..7));
1281    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1282
1283$code.=<<___;
1284.align	16
1285.Lloop_xop:
1286	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1287	vmovdqu	0x00($inp),@X[0]
1288	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1289	vmovdqu	0x10($inp),@X[1]
1290	vmovdqu	0x20($inp),@X[2]
1291	vpshufb	$t3,@X[0],@X[0]
1292	vmovdqu	0x30($inp),@X[3]
1293	vpshufb	$t3,@X[1],@X[1]
1294	vmovdqu	0x40($inp),@X[4]
1295	vpshufb	$t3,@X[2],@X[2]
1296	vmovdqu	0x50($inp),@X[5]
1297	vpshufb	$t3,@X[3],@X[3]
1298	vmovdqu	0x60($inp),@X[6]
1299	vpshufb	$t3,@X[4],@X[4]
1300	vmovdqu	0x70($inp),@X[7]
1301	vpshufb	$t3,@X[5],@X[5]
1302	vpaddq	-0x80($Tbl),@X[0],$t0
1303	vpshufb	$t3,@X[6],@X[6]
1304	vpaddq	-0x60($Tbl),@X[1],$t1
1305	vpshufb	$t3,@X[7],@X[7]
1306	vpaddq	-0x40($Tbl),@X[2],$t2
1307	vpaddq	-0x20($Tbl),@X[3],$t3
1308	vmovdqa	$t0,0x00(%rsp)
1309	vpaddq	0x00($Tbl),@X[4],$t0
1310	vmovdqa	$t1,0x10(%rsp)
1311	vpaddq	0x20($Tbl),@X[5],$t1
1312	vmovdqa	$t2,0x20(%rsp)
1313	vpaddq	0x40($Tbl),@X[6],$t2
1314	vmovdqa	$t3,0x30(%rsp)
1315	vpaddq	0x60($Tbl),@X[7],$t3
1316	vmovdqa	$t0,0x40(%rsp)
1317	mov	$A,$a1
1318	vmovdqa	$t1,0x50(%rsp)
1319	mov	$B,$a3
1320	vmovdqa	$t2,0x60(%rsp)
1321	xor	$C,$a3			# magic
1322	vmovdqa	$t3,0x70(%rsp)
1323	mov	$E,$a0
1324	jmp	.Lxop_00_47
1325
1326.align	16
1327.Lxop_00_47:
1328	add	\$`16*2*$SZ`,$Tbl
1329___
1330sub XOP_512_00_47 () {
1331my $j = shift;
1332my $body = shift;
1333my @X = @_;
1334my @insns = (&$body,&$body);			# 52 instructions
1335
1336	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
1337	  eval(shift(@insns));
1338	  eval(shift(@insns));
1339	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
1340	  eval(shift(@insns));
1341	  eval(shift(@insns));
1342	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
1343	  eval(shift(@insns));
1344	  eval(shift(@insns));
1345	&vpsrlq		($t0,$t0,$sigma0[2]);
1346	  eval(shift(@insns));
1347	  eval(shift(@insns));
1348	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
1349	  eval(shift(@insns));
1350	  eval(shift(@insns));
1351	  eval(shift(@insns));
1352	  eval(shift(@insns));
1353	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
1354	  eval(shift(@insns));
1355	  eval(shift(@insns));
1356	&vpxor		($t0,$t0,$t1);
1357	  eval(shift(@insns));
1358	  eval(shift(@insns));
1359	  eval(shift(@insns));
1360	  eval(shift(@insns));
1361	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
1362	  eval(shift(@insns));
1363	  eval(shift(@insns));
1364	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
1365	  eval(shift(@insns));
1366	  eval(shift(@insns));
1367	 &vpsrlq	($t2,@X[7],$sigma1[2]);
1368	  eval(shift(@insns));
1369	  eval(shift(@insns));
1370	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
1371	  eval(shift(@insns));
1372	  eval(shift(@insns));
1373	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
1374	  eval(shift(@insns));
1375	  eval(shift(@insns));
1376	 &vpxor		($t3,$t3,$t2);
1377	  eval(shift(@insns));
1378	  eval(shift(@insns));
1379	  eval(shift(@insns));
1380	  eval(shift(@insns));
1381	 &vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
1382	  eval(shift(@insns));
1383	  eval(shift(@insns));
1384	  eval(shift(@insns));
1385	  eval(shift(@insns));
1386	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
1387	  eval(shift(@insns));
1388	  eval(shift(@insns));
1389	  eval(shift(@insns));
1390	  eval(shift(@insns));
1391	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1392	  foreach (@insns) { eval; }		# remaining instructions
1393	&vmovdqa	(16*$j."(%rsp)",$t2);
1394}
1395
1396    for ($i=0,$j=0; $j<8; $j++) {
1397	&XOP_512_00_47($j,\&body_00_15,@X);
1398	push(@X,shift(@X));			# rotate(@X)
1399    }
1400	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1401	&jne	(".Lxop_00_47");
1402
1403    for ($i=0; $i<16; ) {
1404	foreach(body_00_15()) { eval; }
1405    }
1406}
1407$code.=<<___;
1408	mov	$_ctx,$ctx
1409	mov	$a1,$A
1410
1411	add	$SZ*0($ctx),$A
1412	lea	16*$SZ($inp),$inp
1413	add	$SZ*1($ctx),$B
1414	add	$SZ*2($ctx),$C
1415	add	$SZ*3($ctx),$D
1416	add	$SZ*4($ctx),$E
1417	add	$SZ*5($ctx),$F
1418	add	$SZ*6($ctx),$G
1419	add	$SZ*7($ctx),$H
1420
1421	cmp	$_end,$inp
1422
1423	mov	$A,$SZ*0($ctx)
1424	mov	$B,$SZ*1($ctx)
1425	mov	$C,$SZ*2($ctx)
1426	mov	$D,$SZ*3($ctx)
1427	mov	$E,$SZ*4($ctx)
1428	mov	$F,$SZ*5($ctx)
1429	mov	$G,$SZ*6($ctx)
1430	mov	$H,$SZ*7($ctx)
1431	jb	.Lloop_xop
1432
1433	mov	$_rsp,%rsi
1434	vzeroupper
1435___
1436$code.=<<___ if ($win64);
1437	movaps	16*$SZ+32(%rsp),%xmm6
1438	movaps	16*$SZ+48(%rsp),%xmm7
1439	movaps	16*$SZ+64(%rsp),%xmm8
1440	movaps	16*$SZ+80(%rsp),%xmm9
1441___
1442$code.=<<___ if ($win64 && $SZ>4);
1443	movaps	16*$SZ+96(%rsp),%xmm10
1444	movaps	16*$SZ+112(%rsp),%xmm11
1445___
1446$code.=<<___;
1447	mov	-48(%rsi),%r15
1448	mov	-40(%rsi),%r14
1449	mov	-32(%rsi),%r13
1450	mov	-24(%rsi),%r12
1451	mov	-16(%rsi),%rbp
1452	mov	-8(%rsi),%rbx
1453	lea	(%rsi),%rsp
1454.Lepilogue_xop:
1455	ret
1456.size	${func}_xop,.-${func}_xop
1457___
1458}
1459######################################################################
1460# AVX+shrd code path
1461#
1462local *ror = sub { &shrd(@_[0],@_) };
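
# With the override above, every &ror($reg,$n) in body_00_15 now emits
# "shrd $n,$reg,$reg" instead of "ror $n,$reg" -- a double-precision shift
# of a register into itself is the same rotation, and as noted in the
# header the ror->shrd switch accounts for a fair share of the AVX-path
# improvement on Sandy Bridge.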
1463
1464$code.=<<___;
1465.type	${func}_avx,\@function,3
1466.align	64
1467${func}_avx:
1468.Lavx_shortcut:
1469	mov	%rsp,%rax		# copy %rsp
1470	push	%rbx
1471	push	%rbp
1472	push	%r12
1473	push	%r13
1474	push	%r14
1475	push	%r15
1476	shl	\$4,%rdx		# num*16
1477	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1478	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1479	and	\$-64,%rsp		# align stack frame
1480	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1482	mov	%rdx,$_end		# save end pointer, "3rd" arg
1483	mov	%rax,$_rsp		# save copy of %rsp
1484___
1485$code.=<<___ if ($win64);
1486	movaps	%xmm6,16*$SZ+32(%rsp)
1487	movaps	%xmm7,16*$SZ+48(%rsp)
1488	movaps	%xmm8,16*$SZ+64(%rsp)
1489	movaps	%xmm9,16*$SZ+80(%rsp)
1490___
1491$code.=<<___ if ($win64 && $SZ>4);
1492	movaps	%xmm10,16*$SZ+96(%rsp)
1493	movaps	%xmm11,16*$SZ+112(%rsp)
1494___
1495$code.=<<___;
1496.Lprologue_avx:
1497
1498	vzeroupper
1499	mov	$SZ*0($ctx),$A
1500	mov	$SZ*1($ctx),$B
1501	mov	$SZ*2($ctx),$C
1502	mov	$SZ*3($ctx),$D
1503	mov	$SZ*4($ctx),$E
1504	mov	$SZ*5($ctx),$F
1505	mov	$SZ*6($ctx),$G
1506	mov	$SZ*7($ctx),$H
1507___
1508					if ($SZ==4) {	# SHA256
1509    my @X = map("%xmm$_",(0..3));
1510    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1511
1512$code.=<<___;
1513	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1514	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1515	jmp	.Lloop_avx
1516.align	16
1517.Lloop_avx:
1518	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1519	vmovdqu	0x00($inp),@X[0]
1520	vmovdqu	0x10($inp),@X[1]
1521	vmovdqu	0x20($inp),@X[2]
1522	vmovdqu	0x30($inp),@X[3]
1523	vpshufb	$t3,@X[0],@X[0]
1524	lea	$TABLE(%rip),$Tbl
1525	vpshufb	$t3,@X[1],@X[1]
1526	vpshufb	$t3,@X[2],@X[2]
1527	vpaddd	0x00($Tbl),@X[0],$t0
1528	vpshufb	$t3,@X[3],@X[3]
1529	vpaddd	0x20($Tbl),@X[1],$t1
1530	vpaddd	0x40($Tbl),@X[2],$t2
1531	vpaddd	0x60($Tbl),@X[3],$t3
1532	vmovdqa	$t0,0x00(%rsp)
1533	mov	$A,$a1
1534	vmovdqa	$t1,0x10(%rsp)
1535	mov	$B,$a3
1536	vmovdqa	$t2,0x20(%rsp)
1537	xor	$C,$a3			# magic
1538	vmovdqa	$t3,0x30(%rsp)
1539	mov	$E,$a0
1540	jmp	.Lavx_00_47
1541
1542.align	16
1543.Lavx_00_47:
1544	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1545___
1546sub Xupdate_256_AVX () {
1547	(
1548	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1549	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1550	'&vpsrld	($t2,$t0,$sigma0[0]);',
1551	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1552	'&vpsrld	($t3,$t0,$sigma0[2])',
1553	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1554	'&vpxor		($t0,$t3,$t2)',
1555	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1556	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1557	'&vpxor		($t0,$t0,$t1)',
1558	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1559	'&vpxor		($t0,$t0,$t2)',
1560	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1561	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1562	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1563	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1564	 '&vpxor	($t2,$t2,$t3);',
1565	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1566	 '&vpxor	($t2,$t2,$t3)',
1567	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1568	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1569	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1570	 '&vpsrld	($t2,$t3,$sigma1[2])',
1571	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1572	 '&vpxor	($t2,$t2,$t3);',
1573	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1574	 '&vpxor	($t2,$t2,$t3)',
1575	 '&vpshufb	($t2,$t2,$t5)',
1576	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1577	);
1578}
1579
1580sub AVX_256_00_47 () {
1581my $j = shift;
1582my $body = shift;
1583my @X = @_;
1584my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1585
1586	foreach (Xupdate_256_AVX()) {		# 29 instructions
1587	    eval;
1588	    eval(shift(@insns));
1589	    eval(shift(@insns));
1590	    eval(shift(@insns));
1591	}
1592	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1593	  foreach (@insns) { eval; }		# remaining instructions
1594	&vmovdqa	(16*$j."(%rsp)",$t2);
1595}
1596
1597    for ($i=0,$j=0; $j<4; $j++) {
1598	&AVX_256_00_47($j,\&body_00_15,@X);
1599	push(@X,shift(@X));			# rotate(@X)
1600    }
1601	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1602	&jne	(".Lavx_00_47");
1603
1604    for ($i=0; $i<16; ) {
1605	foreach(body_00_15()) { eval; }
1606    }
1607
1608					} else {	# SHA512
1609    my @X = map("%xmm$_",(0..7));
1610    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1611
1612$code.=<<___;
1613	jmp	.Lloop_avx
1614.align	16
1615.Lloop_avx:
1616	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1617	vmovdqu	0x00($inp),@X[0]
1618	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1619	vmovdqu	0x10($inp),@X[1]
1620	vmovdqu	0x20($inp),@X[2]
1621	vpshufb	$t3,@X[0],@X[0]
1622	vmovdqu	0x30($inp),@X[3]
1623	vpshufb	$t3,@X[1],@X[1]
1624	vmovdqu	0x40($inp),@X[4]
1625	vpshufb	$t3,@X[2],@X[2]
1626	vmovdqu	0x50($inp),@X[5]
1627	vpshufb	$t3,@X[3],@X[3]
1628	vmovdqu	0x60($inp),@X[6]
1629	vpshufb	$t3,@X[4],@X[4]
1630	vmovdqu	0x70($inp),@X[7]
1631	vpshufb	$t3,@X[5],@X[5]
1632	vpaddq	-0x80($Tbl),@X[0],$t0
1633	vpshufb	$t3,@X[6],@X[6]
1634	vpaddq	-0x60($Tbl),@X[1],$t1
1635	vpshufb	$t3,@X[7],@X[7]
1636	vpaddq	-0x40($Tbl),@X[2],$t2
1637	vpaddq	-0x20($Tbl),@X[3],$t3
1638	vmovdqa	$t0,0x00(%rsp)
1639	vpaddq	0x00($Tbl),@X[4],$t0
1640	vmovdqa	$t1,0x10(%rsp)
1641	vpaddq	0x20($Tbl),@X[5],$t1
1642	vmovdqa	$t2,0x20(%rsp)
1643	vpaddq	0x40($Tbl),@X[6],$t2
1644	vmovdqa	$t3,0x30(%rsp)
1645	vpaddq	0x60($Tbl),@X[7],$t3
1646	vmovdqa	$t0,0x40(%rsp)
1647	mov	$A,$a1
1648	vmovdqa	$t1,0x50(%rsp)
1649	mov	$B,$a3
1650	vmovdqa	$t2,0x60(%rsp)
1651	xor	$C,$a3			# magic
1652	vmovdqa	$t3,0x70(%rsp)
1653	mov	$E,$a0
1654	jmp	.Lavx_00_47
1655
1656.align	16
1657.Lavx_00_47:
1658	add	\$`16*2*$SZ`,$Tbl
1659___
1660sub Xupdate_512_AVX () {
1661	(
1662	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1663	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1664	'&vpsrlq	($t2,$t0,$sigma0[0])',
1665	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1666	'&vpsrlq	($t3,$t0,$sigma0[2])',
1667	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1668	 '&vpxor	($t0,$t3,$t2)',
1669	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1670	 '&vpxor	($t0,$t0,$t1)',
1671	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1672	 '&vpxor	($t0,$t0,$t2)',
1673	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1674	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1675	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1676	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1677	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1678	 '&vpxor	($t3,$t3,$t2)',
1679	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1680	 '&vpxor	($t3,$t3,$t1)',
1681	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1682	 '&vpxor	($t3,$t3,$t2)',
1683	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1684	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1685	);
1686}
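
# Same idea as the SSSE3/AVX SHA-256 schedule, but with 64-bit lanes: both
# sigma0 (rotates 1 and 8, shift 7) and sigma1 (rotates 19 and 61, shift 6)
# are assembled from vpsrlq/vpsllq/vpxor pairs, e.g. ror(x,1) becomes
# (x>>1)^(x<<63), and since an %xmm register holds exactly two X[] entries
# per iteration there is no extra lane shuffling to do.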
1687
1688sub AVX_512_00_47 () {
1689my $j = shift;
1690my $body = shift;
1691my @X = @_;
1692my @insns = (&$body,&$body);			# 52 instructions
1693
1694	foreach (Xupdate_512_AVX()) {		# 23 instructions
1695	    eval;
1696	    eval(shift(@insns));
1697	    eval(shift(@insns));
1698	}
1699	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1700	  foreach (@insns) { eval; }		# remaining instructions
1701	&vmovdqa	(16*$j."(%rsp)",$t2);
1702}
1703
1704    for ($i=0,$j=0; $j<8; $j++) {
1705	&AVX_512_00_47($j,\&body_00_15,@X);
1706	push(@X,shift(@X));			# rotate(@X)
1707    }
1708	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1709	&jne	(".Lavx_00_47");
1710
1711    for ($i=0; $i<16; ) {
1712	foreach(body_00_15()) { eval; }
1713    }
1714}
1715$code.=<<___;
1716	mov	$_ctx,$ctx
1717	mov	$a1,$A
1718
1719	add	$SZ*0($ctx),$A
1720	lea	16*$SZ($inp),$inp
1721	add	$SZ*1($ctx),$B
1722	add	$SZ*2($ctx),$C
1723	add	$SZ*3($ctx),$D
1724	add	$SZ*4($ctx),$E
1725	add	$SZ*5($ctx),$F
1726	add	$SZ*6($ctx),$G
1727	add	$SZ*7($ctx),$H
1728
1729	cmp	$_end,$inp
1730
1731	mov	$A,$SZ*0($ctx)
1732	mov	$B,$SZ*1($ctx)
1733	mov	$C,$SZ*2($ctx)
1734	mov	$D,$SZ*3($ctx)
1735	mov	$E,$SZ*4($ctx)
1736	mov	$F,$SZ*5($ctx)
1737	mov	$G,$SZ*6($ctx)
1738	mov	$H,$SZ*7($ctx)
1739	jb	.Lloop_avx
1740
1741	mov	$_rsp,%rsi
1742	vzeroupper
1743___
1744$code.=<<___ if ($win64);
1745	movaps	16*$SZ+32(%rsp),%xmm6
1746	movaps	16*$SZ+48(%rsp),%xmm7
1747	movaps	16*$SZ+64(%rsp),%xmm8
1748	movaps	16*$SZ+80(%rsp),%xmm9
1749___
1750$code.=<<___ if ($win64 && $SZ>4);
1751	movaps	16*$SZ+96(%rsp),%xmm10
1752	movaps	16*$SZ+112(%rsp),%xmm11
1753___
1754$code.=<<___;
1755	mov	-48(%rsi),%r15
1756	mov	-40(%rsi),%r14
1757	mov	-32(%rsi),%r13
1758	mov	-24(%rsi),%r12
1759	mov	-16(%rsi),%rbp
1760	mov	-8(%rsi),%rbx
1761	lea	(%rsi),%rsp
1762.Lepilogue_avx:
1763	ret
1764.size	${func}_avx,.-${func}_avx
1765___
1766
1767if ($avx>1) {{
1768######################################################################
1769# AVX2+BMI code path
1770#
1771my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
1772my $PUSH8=8*2*$SZ;
1773use integer;
1774
1775sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
1777	(
1778	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1779
1780	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
1781	'&and	($a4,$e)',		# f&e
1782	'&rorx	($a0,$e,$Sigma1[2])',
1783	'&rorx	($a2,$e,$Sigma1[1])',
1784
1785	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
1786	'&lea	($h,"($h,$a4)")',
1787	'&andn	($a4,$e,$g)',		# ~e&g
1788	'&xor	($a0,$a2)',
1789
1790	'&rorx	($a1,$e,$Sigma1[0])',
1791	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
1792	'&xor	($a0,$a1)',		# Sigma1(e)
1793	'&mov	($a2,$a)',
1794
1795	'&rorx	($a4,$a,$Sigma0[2])',
1796	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
1797	'&xor	($a2,$b)',		# a^b, b^c in next round
1798	'&rorx	($a1,$a,$Sigma0[1])',
1799
1800	'&rorx	($a0,$a,$Sigma0[0])',
1801	'&lea	($d,"($d,$h)")',	# d+=h
1802	'&and	($a3,$a2)',		# (b^c)&(a^b)
1803	'&xor	($a1,$a4)',
1804
1805	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
1806	'&xor	($a1,$a0)',		# Sigma0(a)
1807	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
1808	'&mov	($a4,$e)',		# copy of f in future
1809
1810	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1811	);
	# and at the finish one still has to do $a+=$a1
1813}
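
# The AVX2 round body above leans on BMI1/BMI2: rorx computes each of the
# three Sigma rotations straight into its own register (no flags, no extra
# copies), and andn gives ~e&g directly, so Ch(e,f,g) is accumulated as
# (e&f)+(~e&g) -- the two terms cannot have bits in common, which is why
# plain lea additions can stand in for the xor.  Note also that h+=Sigma0(a)
# is deferred: each round folds the previous round's $a1 into $a ("from the
# past"), and the very last round's Sigma0 still has to be added by the
# caller, as the comment at the end of the sub says.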
1814
1815$code.=<<___;
1816.type	${func}_avx2,\@function,3
1817.align	64
1818${func}_avx2:
1819.Lavx2_shortcut:
1820	mov	%rsp,%rax		# copy %rsp
1821	push	%rbx
1822	push	%rbp
1823	push	%r12
1824	push	%r13
1825	push	%r14
1826	push	%r15
1827	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1828	shl	\$4,%rdx		# num*16
1829	and	\$-256*$SZ,%rsp		# align stack frame
1830	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1831	add	\$`2*$SZ*($rounds-8)`,%rsp
1832	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1834	mov	%rdx,$_end		# save end pointer, "3rd" arg
1835	mov	%rax,$_rsp		# save copy of %rsp
1836___
1837$code.=<<___ if ($win64);
1838	movaps	%xmm6,16*$SZ+32(%rsp)
1839	movaps	%xmm7,16*$SZ+48(%rsp)
1840	movaps	%xmm8,16*$SZ+64(%rsp)
1841	movaps	%xmm9,16*$SZ+80(%rsp)
1842___
1843$code.=<<___ if ($win64 && $SZ>4);
1844	movaps	%xmm10,16*$SZ+96(%rsp)
1845	movaps	%xmm11,16*$SZ+112(%rsp)
1846___
1847$code.=<<___;
1848.Lprologue_avx2:
1849
1850	vzeroupper
1851	sub	\$-16*$SZ,$inp		# inp++, size optimization
1852	mov	$SZ*0($ctx),$A
1853	mov	$inp,%r12		# borrow $T1
1854	mov	$SZ*1($ctx),$B
1855	cmp	%rdx,$inp		# $_end
1856	mov	$SZ*2($ctx),$C
1857	cmove	%rsp,%r12		# next block or random data
1858	mov	$SZ*3($ctx),$D
1859	mov	$SZ*4($ctx),$E
1860	mov	$SZ*5($ctx),$F
1861	mov	$SZ*6($ctx),$G
1862	mov	$SZ*7($ctx),$H
1863___
					if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));

$code.=<<___;
	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	vmovdqu	-16*$SZ+48($inp),%xmm3
	#mov		$inp,$_inp	# offload $inp
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb		$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb		$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]

	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
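# Each call above emits one batch of four SHA256 rounds: the 29 vector
# instructions from Xupdate_256_AVX() advance the message schedule for
# @X[0], and three scalar instructions from the four bodyx_00_15 bodies
# (96 in total) are slotted in after each of them, with the leftovers
# flushed once vpaddd has added the round constants.  The finished
# X[i]+K[i] row is parked back in the frame, where the scalar rounds
# read it via $base-relative operands and .Lower_avx2 later reuses the
# high halves for the second block.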

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");
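# Loop-termination trick: rather than keeping a counter, peek at byte
# $SZ-1 of the next table row.  That byte is non-zero for every round
# constant and, as far as I can tell from the table layout defined
# earlier in the file, zero in the data that follows K256/K512, so the
# jne falls through exactly when the constants run out.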

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
					} else {	# SHA512
    my @X = map("%ymm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp	# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	 vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	 vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	 vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	 vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	 vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	 vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]

	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x40(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x60(%rsp)
	mov	$F,$a4
	add	\$16*2*$SZ,$Tbl
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
___

sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    if ($_ !~ /\;$/) {
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
	    }
	}
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
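# Same scheme as AVX2_256_00_47 above, adjusted for SHA512: a call
# covers two rounds (48 scalar instructions), %rsp is lowered every
# fourth call rather than every other one, and Xupdate strings ending
# in ';' are emitted back to back instead of being split up by the
# scalar interleave.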

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
}
$code.=<<___;
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
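# .Lower_avx2 replays the rounds for the second block of the pair.  Its
# X[i]+K[i] values already sit in the high 128-bit halves of the rows
# parked in the frame, so this pass is purely scalar: $base is
# "+16($Tbl)", i.e. the high lane of each 32-byte row, and $Tbl is
# walked down through the frame until it meets %rsp.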
    for ($i=0; $i<8; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
    }
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl

.Ldone_avx2:
	lea	($Tbl),%rsp
	mov	$_rsp,%rsi
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue_avx2:
	ret
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
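# The AVX2 prologue aligns %rsp down to a 256*$SZ boundary, so the
# handler cannot walk a fixed distance from the Rsp it observes.
# Repeating the same arithmetic (align down, add 2*$SZ*($rounds-8))
# reproduces the value %rsp had right after the AVX2 prologue, and the
# saved $_rsp slot is then pulled from that frame just below.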
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
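# The two .long 0xa548f3fc blobs above assemble to the bytes
# fc,f3,48,a5, i.e. a hand-encoded "cld; rep movsq": the first copies
# the on-stack xmm save area back into the CONTEXT record, the second
# copies the CONTEXT itself (154 quadwords, matching the 0x4d0-byte
# x86_64 CONTEXT) into disp->ContextRecord before RtlVirtualUnwind is
# called.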

$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___
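# The handler above restores the ten quadwords (five non-volatile xmm
# registers) that the SHA-NI prologue saved at -8-5*16 relative to the
# value it left in %rax, then defers to the common .Lin_prologue tail
# for the general-purpose registers and the RtlVirtualUnwind call.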

$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
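# Each .LSEH_info_* blob is a minimal UNWIND_INFO structure: the
# leading .byte 9,0,0,0 encodes version 1 with UNW_FLAG_EHANDLER set
# and no unwind codes, the first .rva names the language-specific
# handler, and the trailing pair of label RVAs is the HandlerData[]
# that se_handler consults to tell prologue from epilogue.  The shaext
# entry below carries no label pair because shaext_handler checks its
# labels directly.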
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
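# sha256op38() spells register-to-register SHA extension instructions
# out as raw opcode bytes, presumably so the output still assembles
# with tools that predate the mnemonics; anything else is passed
# through verbatim.  For example (just tracing the arithmetic above),
# "sha256rnds2 %xmm0,%xmm1" comes out as
#
#	.byte	0x0f,0x38,0xcb,0xc8	# ModR/M 0xc8: mod=11, reg=xmm1, r/m=xmm0
#
# The output loop below also evaluates any backtick-quoted Perl
# expressions embedded in the assembly text before printing it.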

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;
