1#! /usr/bin/env perl
2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. Rights for redistribution and usage in source and binary
13# forms are granted according to the OpenSSL license.
14# ====================================================================
15#
16# sha256/512_block procedure for x86_64.
17#
18# 40% improvement over compiler-generated code on Opteron. On EM64T
19# sha256 was observed to run >80% faster and sha512 >40%. No magical
20# tricks, just a straight implementation... I really wonder why gcc
21# [even armed with inline assembler] fails to generate code as fast.
22# The only thing which is cool about this module is that the very
23# same instruction sequence is used for both SHA-256 and SHA-512. In
24# the former case the instructions operate on 32-bit operands, while
25# in the latter on 64-bit ones. All I had to do was get one flavor
26# right; the other one passed the test right away:-)
27#
28# sha256_block runs in ~1005 cycles on Opteron, which gives you
29# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
30# frequency in GHz. sha512_block runs in ~1275 cycles, which results
31# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
32# Well, if you compare it to the IA-64 implementation, which maintains
33# X[16] in the register bank[!], tends to 4 instructions per CPU clock
34# cycle and runs in 1003 cycles, then 1275 is a very good result for a
35# 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
36# there is a way to improve it, *then* the only way would be to try to
37# offload X[16] updates to SSE unit, but that would require "deeper"
38# loop unroll, which in turn would naturally cause size blow-up, not
39# to mention increased complexity! And once again, that is only *if*
40# it's actually possible to noticeably improve the overall ILP
41# (instruction-level parallelism) on the given CPU implementation.
42#
43# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
44# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
45# [currently available] EM64T CPUs apparently are far from it. On the
46# contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
47# 32-bit sha256_block:-( This is presumably because 64-bit shifts/rotates
48# are not atomic instructions, but are implemented in microcode.
49#
50# May 2012.
51#
52# Optimization including one of Pavel Semjanov's ideas, alternative
53# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
54# unfortunately -2% SHA512 on P4 [which nobody should care about
55# that much].
56#
57# June 2012.
58#
59# Add SIMD code paths; see below for improvement coefficients. An SSSE3
60# code path was not attempted for SHA512, because the estimated
61# improvement, noticeably less than 9%, is not high enough to justify
62# the effort, at least not on pre-AVX processors. [The obvious exception
63# is VIA Nano, but it has a SHA512 instruction that is faster and
64# should be used instead.] For reference, the corresponding estimated
65# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
66# higher coefficients are observed on VIA Nano and Bulldozer has more
67# to do with the specifics of their architecture [which is a topic for
68# a separate discussion].
69#
70# November 2012.
71#
72# Add AVX2 code path. Two consecutive input blocks are loaded into
73# 256-bit %ymm registers, with data from the first block in the least
74# significant 128-bit halves and data from the second in the most
75# significant halves. The data is then processed with the same SIMD
76# instruction sequence as for AVX, but with %ymm registers as operands.
77# The side effect is an increased stack frame, 448 additional bytes for
78# SHA256 and 1152 for SHA512, plus a 1.2KB code size increase.
79#
80# March 2014.
81#
82# Add support for Intel SHA Extensions.
83
84######################################################################
85# Current performance in cycles per processed byte (less is better):
86#
87#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
88#
89# AMD K8	14.9	-	    -		    9.57    -
90# P4		17.3	-	    -		    30.8    -
91# Core 2	15.6	13.8(+13%)  -		    9.97    -
92# Westmere	14.8	12.3(+19%)  -		    9.58    -
93# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
94# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
95# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
96# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
97# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
98# Ryzen		11.0	9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
99# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
100# Atom		23.0	18.9(+22%)  -		    14.7    -
101# Silvermont	27.4	20.6(+33%)  -               17.5    -
102# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
103# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
104#
105# (*)	whichever is best applicable, including SHAEXT;
106# (**)	the switch from ror to shrd accounts for a fair share of improvement;
107# (***)	execution time is fully determined by the remaining integer-only
108#	part, body_00_15; reducing the number of SIMD instructions
109#	below a certain limit makes no difference; to conserve
110#	space the SHA256 XOP code path is therefore omitted;
111#
112# Modified from upstream OpenSSL to remove the XOP code.
113
114$flavour = shift;
115$output  = shift;
116if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
117
118$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
119
120$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
121( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
122( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
123die "can't locate x86_64-xlate.pl";
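
# This script is normally driven by the build system: the first argument
# selects the perlasm flavour (e.g. elf, macosx, mingw64, nasm) and the
# second the output file, whose name also selects SHA-256 vs SHA-512
# below; a single argument containing a dot is treated as the output file.
# A hypothetical invocation (file names are illustrative only):
#
#	perl sha512-x86_64.pl elf sha512-x86_64.S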
124
125# In upstream, this is controlled by shelling out to the compiler to check
126# versions, but BoringSSL is intended to be used with pre-generated perlasm
127# output, so this isn't useful anyway.
128#
129# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
130# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
131# did not tie them together until after $shaext was added.
132$avx = 1;
133
134# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
135# been tested.
136$shaext=0;	### set to zero if compiling for 1.0.1
137$avx=1		if (!$shaext && $avx);
138
139open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
140*STDOUT=*OUT;
141
142if ($output =~ /512/) {
143	$func="sha512_block_data_order";
144	$TABLE="K512";
145	$SZ=8;
146	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
147					"%r8", "%r9", "%r10","%r11");
148	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
149	@Sigma0=(28,34,39);
150	@Sigma1=(14,18,41);
151	@sigma0=(1,  8, 7);
152	@sigma1=(19,61, 6);
153	$rounds=80;
154} else {
155	$func="sha256_block_data_order";
156	$TABLE="K256";
157	$SZ=4;
158	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
159					"%r8d","%r9d","%r10d","%r11d");
160	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
161	@Sigma0=( 2,13,22);
162	@Sigma1=( 6,11,25);
163	@sigma0=( 7,18, 3);
164	@sigma1=(17,19,10);
165	$rounds=64;
166}
167
168$ctx="%rdi";	# 1st arg, zapped by $a3
169$inp="%rsi";	# 2nd arg
170$Tbl="%rbp";
171
172$_ctx="16*$SZ+0*8(%rsp)";
173$_inp="16*$SZ+1*8(%rsp)";
174$_end="16*$SZ+2*8(%rsp)";
175$_rsp="`16*$SZ+3*8`(%rsp)";
176$framesz="16*$SZ+4*8";
177
178
179sub ROUND_00_15()
180{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
181  my $STRIDE=$SZ;
182     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
183
184$code.=<<___;
185	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
186	mov	$f,$a2
187
188	xor	$e,$a0
189	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
190	xor	$g,$a2			# f^g
191
192	mov	$T1,`$SZ*($i&0xf)`(%rsp)
193	xor	$a,$a1
194	and	$e,$a2			# (f^g)&e
195
196	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
197	add	$h,$T1			# T1+=h
198	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
199
200	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
201	xor	$e,$a0
202	add	$a2,$T1			# T1+=Ch(e,f,g)
203
204	mov	$a,$a2
205	add	($Tbl),$T1		# T1+=K[round]
206	xor	$a,$a1
207
208	xor	$b,$a2			# a^b, b^c in next round
209	ror	\$$Sigma1[0],$a0	# Sigma1(e)
210	mov	$b,$h
211
212	and	$a2,$a3
213	ror	\$$Sigma0[0],$a1	# Sigma0(a)
214	add	$a0,$T1			# T1+=Sigma1(e)
215
216	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
217	add	$T1,$d			# d+=T1
218	add	$T1,$h			# h+=T1
219
220	lea	$STRIDE($Tbl),$Tbl	# round++
221___
222$code.=<<___ if ($i<15);
223	add	$a1,$h			# h+=Sigma0(a)
224___
225	($a2,$a3) = ($a3,$a2);
226}
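
# For reference, a minimal pure-Perl model of the scalar round emitted
# above (SHA-256 flavour; these helpers are illustrative only and are
# never called by the generator).  Note that Maj(a,b,c) is evaluated as
# ((a^b)&(b^c))^b, i.e. Ch(a^b,c,b), which is what allows this round's
# a^b value to be reused as b^c in the next round, and that the real
# code defers adding Sigma0(a) until the following round.
sub ref_ror32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff; }

sub ref_round_00_15 {
	my ($Xi,$Ki,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
	my $S1  = ref_ror32($e,6) ^ ref_ror32($e,11) ^ ref_ror32($e,25);
	my $S0  = ref_ror32($a,2) ^ ref_ror32($a,13) ^ ref_ror32($a,22);
	my $ch  = (($f ^ $g) & $e) ^ $g;		# Ch(e,f,g)
	my $maj = (($a ^ $b) & ($b ^ $c)) ^ $b;		# Maj(a,b,c)
	my $T1  = ($Xi + $Ki + $h + $S1 + $ch) & 0xffffffff;
	return (($T1 + $S0 + $maj) & 0xffffffff,	# new a
		($d + $T1) & 0xffffffff);		# new e
}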
227
228sub ROUND_16_XX()
229{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
230
231$code.=<<___;
232	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
233	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
234
235	mov	$a0,$T1
236	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
237	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
238	mov	$a2,$a1
239	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
240
241	xor	$T1,$a0
242	shr	\$$sigma0[2],$T1
243	ror	\$$sigma0[0],$a0
244	xor	$a1,$a2
245	shr	\$$sigma1[2],$a1
246
247	ror	\$$sigma1[0],$a2
248	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
249	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
250	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
251
252	add	`$SZ*($i&0xf)`(%rsp),$T1
253	mov	$e,$a0
254	add	$a2,$T1
255	mov	$a,$a1
256___
257	&ROUND_00_15(@_);
258}
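
# Likewise, a minimal pure-Perl model of the schedule update above
# (SHA-256 flavour, reusing the ref_ror32 helper defined earlier;
# illustrative only).  @W holds the sixteen most recent schedule words
# indexed mod 16, exactly like the circular buffer kept at (%rsp):
#   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
sub ref_schedule_16_xx {
	my ($i,@W) = @_;
	my $w15 = $W[($i+1)&0xf];		# W[i-15]
	my $w2  = $W[($i+14)&0xf];		# W[i-2]
	my $s0  = ref_ror32($w15,7)  ^ ref_ror32($w15,18) ^ ($w15>>3);
	my $s1  = ref_ror32($w2,17)  ^ ref_ror32($w2,19)  ^ ($w2>>10);
	return ($W[$i&0xf] + $s0 + $W[($i+9)&0xf] + $s1) & 0xffffffff;
}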
259
260$code=<<___;
261.text
262
263.extern	OPENSSL_ia32cap_P
264.globl	$func
265.type	$func,\@function,3
266.align	16
267$func:
268.cfi_startproc
269___
270$code.=<<___ if ($SZ==4 || $avx);
271	leaq	OPENSSL_ia32cap_P(%rip),%r11
272	mov	0(%r11),%r9d
273	mov	4(%r11),%r10d
274	mov	8(%r11),%r11d
275___
276$code.=<<___ if ($SZ==4 && $shaext);
277	test	\$`1<<29`,%r11d		# check for SHA
278	jnz	_shaext_shortcut
279___
280    # XOP codepath removed.
281$code.=<<___ if ($avx>1);
282	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
283	cmp	\$`1<<8|1<<5|1<<3`,%r11d
284	je	.Lavx2_shortcut
285___
286$code.=<<___ if ($avx);
287	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
288	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
289	or	%r9d,%r10d
290	cmp	\$`1<<28|1<<9|1<<30`,%r10d
291	je	.Lavx_shortcut
292___
293$code.=<<___ if ($SZ==4);
294	test	\$`1<<9`,%r10d
295	jnz	.Lssse3_shortcut
296___
297$code.=<<___;
298	mov	%rsp,%rax		# copy %rsp
299.cfi_def_cfa_register	%rax
300	push	%rbx
301.cfi_push	%rbx
302	push	%rbp
303.cfi_push	%rbp
304	push	%r12
305.cfi_push	%r12
306	push	%r13
307.cfi_push	%r13
308	push	%r14
309.cfi_push	%r14
310	push	%r15
311.cfi_push	%r15
312	shl	\$4,%rdx		# num*16
313	sub	\$$framesz,%rsp
314	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
315	and	\$-64,%rsp		# align stack frame
316	mov	$ctx,$_ctx		# save ctx, 1st arg
317	mov	$inp,$_inp		# save inp, 2nd arg
318	mov	%rdx,$_end		# save end pointer, "3rd" arg
319	mov	%rax,$_rsp		# save copy of %rsp
320.cfi_cfa_expression	$_rsp,deref,+8
321.Lprologue:
322
323	mov	$SZ*0($ctx),$A
324	mov	$SZ*1($ctx),$B
325	mov	$SZ*2($ctx),$C
326	mov	$SZ*3($ctx),$D
327	mov	$SZ*4($ctx),$E
328	mov	$SZ*5($ctx),$F
329	mov	$SZ*6($ctx),$G
330	mov	$SZ*7($ctx),$H
331	jmp	.Lloop
332
333.align	16
334.Lloop:
335	mov	$B,$a3
336	lea	$TABLE(%rip),$Tbl
337	xor	$C,$a3			# magic
338___
339	for($i=0;$i<16;$i++) {
340		$code.="	mov	$SZ*$i($inp),$T1\n";
341		$code.="	mov	@ROT[4],$a0\n";
342		$code.="	mov	@ROT[0],$a1\n";
343		$code.="	bswap	$T1\n";
344		&ROUND_00_15($i,@ROT);
345		unshift(@ROT,pop(@ROT));
346	}
347$code.=<<___;
348	jmp	.Lrounds_16_xx
349.align	16
350.Lrounds_16_xx:
351___
352	for(;$i<32;$i++) {
353		&ROUND_16_XX($i,@ROT);
354		unshift(@ROT,pop(@ROT));
355	}
356
357$code.=<<___;
358	cmpb	\$0,`$SZ-1`($Tbl)
359	jnz	.Lrounds_16_xx
360
361	mov	$_ctx,$ctx
362	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
363	lea	16*$SZ($inp),$inp
364
365	add	$SZ*0($ctx),$A
366	add	$SZ*1($ctx),$B
367	add	$SZ*2($ctx),$C
368	add	$SZ*3($ctx),$D
369	add	$SZ*4($ctx),$E
370	add	$SZ*5($ctx),$F
371	add	$SZ*6($ctx),$G
372	add	$SZ*7($ctx),$H
373
374	cmp	$_end,$inp
375
376	mov	$A,$SZ*0($ctx)
377	mov	$B,$SZ*1($ctx)
378	mov	$C,$SZ*2($ctx)
379	mov	$D,$SZ*3($ctx)
380	mov	$E,$SZ*4($ctx)
381	mov	$F,$SZ*5($ctx)
382	mov	$G,$SZ*6($ctx)
383	mov	$H,$SZ*7($ctx)
384	jb	.Lloop
385
386	mov	$_rsp,%rsi
387.cfi_def_cfa	%rsi,8
388	mov	-48(%rsi),%r15
389.cfi_restore	%r15
390	mov	-40(%rsi),%r14
391.cfi_restore	%r14
392	mov	-32(%rsi),%r13
393.cfi_restore	%r13
394	mov	-24(%rsi),%r12
395.cfi_restore	%r12
396	mov	-16(%rsi),%rbp
397.cfi_restore	%rbp
398	mov	-8(%rsi),%rbx
399.cfi_restore	%rbx
400	lea	(%rsi),%rsp
401.cfi_def_cfa_register	%rsp
402.Lepilogue:
403	ret
404.cfi_endproc
405.size	$func,.-$func
406___
407
408if ($SZ==4) {
409$code.=<<___;
410.align	64
411.type	$TABLE,\@object
412$TABLE:
413	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
414	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
415	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
416	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
417	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
418	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
419	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
420	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
421	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
422	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
423	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
424	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
425	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
426	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
427	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
428	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
429	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
430	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
431	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
432	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
433	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
434	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
435	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
436	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
437	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
438	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
439	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
440	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
441	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
442	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
443	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
444	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
445
446	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
447	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
448	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
449	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
450	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
451	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
452	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
453___
454} else {
455$code.=<<___;
456.align	64
457.type	$TABLE,\@object
458$TABLE:
459	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
460	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
461	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
462	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
463	.quad	0x3956c25bf348b538,0x59f111f1b605d019
464	.quad	0x3956c25bf348b538,0x59f111f1b605d019
465	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
466	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
467	.quad	0xd807aa98a3030242,0x12835b0145706fbe
468	.quad	0xd807aa98a3030242,0x12835b0145706fbe
469	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
470	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
471	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
472	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
473	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
474	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
475	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
476	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
477	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
478	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
479	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
480	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
481	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
482	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
483	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
484	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
485	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
486	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
487	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
488	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
489	.quad	0x06ca6351e003826f,0x142929670a0e6e70
490	.quad	0x06ca6351e003826f,0x142929670a0e6e70
491	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
492	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
493	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
494	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
495	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
496	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
497	.quad	0x81c2c92e47edaee6,0x92722c851482353b
498	.quad	0x81c2c92e47edaee6,0x92722c851482353b
499	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
500	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
501	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
502	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
503	.quad	0xd192e819d6ef5218,0xd69906245565a910
504	.quad	0xd192e819d6ef5218,0xd69906245565a910
505	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
506	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
507	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
508	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
509	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
510	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
511	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
512	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
513	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
514	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
515	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
516	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
517	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
518	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
519	.quad	0x90befffa23631e28,0xa4506cebde82bde9
520	.quad	0x90befffa23631e28,0xa4506cebde82bde9
521	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
522	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
523	.quad	0xca273eceea26619c,0xd186b8c721c0c207
524	.quad	0xca273eceea26619c,0xd186b8c721c0c207
525	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
526	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
527	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
528	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
529	.quad	0x113f9804bef90dae,0x1b710b35131c471b
530	.quad	0x113f9804bef90dae,0x1b710b35131c471b
531	.quad	0x28db77f523047d84,0x32caab7b40c72493
532	.quad	0x28db77f523047d84,0x32caab7b40c72493
533	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
534	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
535	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
536	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
537	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
538	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
539
540	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
541	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
542	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
543___
544}
545
546######################################################################
547# SIMD code paths
548#
549if ($SZ==4 && $shaext) {{{
550######################################################################
551# Intel SHA Extensions implementation of SHA256 update function.
552#
553my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
554
555my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
556my @MSG=map("%xmm$_",(3..6));
557
558$code.=<<___;
559.type	sha256_block_data_order_shaext,\@function,3
560.align	64
561sha256_block_data_order_shaext:
562_shaext_shortcut:
563___
564$code.=<<___ if ($win64);
565	lea	`-8-5*16`(%rsp),%rsp
566	movaps	%xmm6,-8-5*16(%rax)
567	movaps	%xmm7,-8-4*16(%rax)
568	movaps	%xmm8,-8-3*16(%rax)
569	movaps	%xmm9,-8-2*16(%rax)
570	movaps	%xmm10,-8-1*16(%rax)
571.Lprologue_shaext:
572___
573$code.=<<___;
574	lea		K256+0x80(%rip),$Tbl
575	movdqu		($ctx),$ABEF		# DCBA
576	movdqu		16($ctx),$CDGH		# HGFE
577	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
578
579	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
580	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
581	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
582	movdqa		$TMP,$BSWAP		# offload
583	palignr		\$8,$CDGH,$ABEF		# ABEF
584	punpcklqdq	$Wi,$CDGH		# CDGH
585	jmp		.Loop_shaext
586
587.align	16
588.Loop_shaext:
589	movdqu		($inp),@MSG[0]
590	movdqu		0x10($inp),@MSG[1]
591	movdqu		0x20($inp),@MSG[2]
592	pshufb		$TMP,@MSG[0]
593	movdqu		0x30($inp),@MSG[3]
594
595	movdqa		0*32-0x80($Tbl),$Wi
596	paddd		@MSG[0],$Wi
597	pshufb		$TMP,@MSG[1]
598	movdqa		$CDGH,$CDGH_SAVE	# offload
599	sha256rnds2	$ABEF,$CDGH		# 0-3
600	pshufd		\$0x0e,$Wi,$Wi
601	nop
602	movdqa		$ABEF,$ABEF_SAVE	# offload
603	sha256rnds2	$CDGH,$ABEF
604
605	movdqa		1*32-0x80($Tbl),$Wi
606	paddd		@MSG[1],$Wi
607	pshufb		$TMP,@MSG[2]
608	sha256rnds2	$ABEF,$CDGH		# 4-7
609	pshufd		\$0x0e,$Wi,$Wi
610	lea		0x40($inp),$inp
611	sha256msg1	@MSG[1],@MSG[0]
612	sha256rnds2	$CDGH,$ABEF
613
614	movdqa		2*32-0x80($Tbl),$Wi
615	paddd		@MSG[2],$Wi
616	pshufb		$TMP,@MSG[3]
617	sha256rnds2	$ABEF,$CDGH		# 8-11
618	pshufd		\$0x0e,$Wi,$Wi
619	movdqa		@MSG[3],$TMP
620	palignr		\$4,@MSG[2],$TMP
621	nop
622	paddd		$TMP,@MSG[0]
623	sha256msg1	@MSG[2],@MSG[1]
624	sha256rnds2	$CDGH,$ABEF
625
626	movdqa		3*32-0x80($Tbl),$Wi
627	paddd		@MSG[3],$Wi
628	sha256msg2	@MSG[3],@MSG[0]
629	sha256rnds2	$ABEF,$CDGH		# 12-15
630	pshufd		\$0x0e,$Wi,$Wi
631	movdqa		@MSG[0],$TMP
632	palignr		\$4,@MSG[3],$TMP
633	nop
634	paddd		$TMP,@MSG[1]
635	sha256msg1	@MSG[3],@MSG[2]
636	sha256rnds2	$CDGH,$ABEF
637___
638for($i=4;$i<16-3;$i++) {
639$code.=<<___;
640	movdqa		$i*32-0x80($Tbl),$Wi
641	paddd		@MSG[0],$Wi
642	sha256msg2	@MSG[0],@MSG[1]
643	sha256rnds2	$ABEF,$CDGH		# 16-19...
644	pshufd		\$0x0e,$Wi,$Wi
645	movdqa		@MSG[1],$TMP
646	palignr		\$4,@MSG[0],$TMP
647	nop
648	paddd		$TMP,@MSG[2]
649	sha256msg1	@MSG[0],@MSG[3]
650	sha256rnds2	$CDGH,$ABEF
651___
652	push(@MSG,shift(@MSG));
653}
654$code.=<<___;
655	movdqa		13*32-0x80($Tbl),$Wi
656	paddd		@MSG[0],$Wi
657	sha256msg2	@MSG[0],@MSG[1]
658	sha256rnds2	$ABEF,$CDGH		# 52-55
659	pshufd		\$0x0e,$Wi,$Wi
660	movdqa		@MSG[1],$TMP
661	palignr		\$4,@MSG[0],$TMP
662	sha256rnds2	$CDGH,$ABEF
663	paddd		$TMP,@MSG[2]
664
665	movdqa		14*32-0x80($Tbl),$Wi
666	paddd		@MSG[1],$Wi
667	sha256rnds2	$ABEF,$CDGH		# 56-59
668	pshufd		\$0x0e,$Wi,$Wi
669	sha256msg2	@MSG[1],@MSG[2]
670	movdqa		$BSWAP,$TMP
671	sha256rnds2	$CDGH,$ABEF
672
673	movdqa		15*32-0x80($Tbl),$Wi
674	paddd		@MSG[2],$Wi
675	nop
676	sha256rnds2	$ABEF,$CDGH		# 60-63
677	pshufd		\$0x0e,$Wi,$Wi
678	dec		$num
679	nop
680	sha256rnds2	$CDGH,$ABEF
681
682	paddd		$CDGH_SAVE,$CDGH
683	paddd		$ABEF_SAVE,$ABEF
684	jnz		.Loop_shaext
685
686	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
687	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
688	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
689	punpckhqdq	$CDGH,$ABEF		# DCBA
690	palignr		\$8,$TMP,$CDGH		# HGFE
691
692	movdqu	$ABEF,($ctx)
693	movdqu	$CDGH,16($ctx)
694___
695$code.=<<___ if ($win64);
696	movaps	-8-5*16(%rax),%xmm6
697	movaps	-8-4*16(%rax),%xmm7
698	movaps	-8-3*16(%rax),%xmm8
699	movaps	-8-2*16(%rax),%xmm9
700	movaps	-8-1*16(%rax),%xmm10
701	mov	%rax,%rsp
702.Lepilogue_shaext:
703___
704$code.=<<___;
705	ret
706.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
707___
708}}}
709{{{
710
711my $a4=$T1;
712my ($a,$b,$c,$d,$e,$f,$g,$h);
713
714sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
715{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
716  my $arg = pop;
717    $arg = "\$$arg" if ($arg*1 eq $arg);
718    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
719}
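
# For example (SHA-256 flavour, expansion shown for illustration): a call
# such as
#
#	&ror	($a0,$Sigma1[2]-$Sigma1[1]);
#
# hits AUTOLOAD because no &ror sub is defined at this point, and simply
# appends "\tror\t\$14,%r13d\n" to $code, which is the same "ror $14,%r13d"
# line that the here-documents above spell out by hand.  The AVX code path
# further down overrides this by locally aliasing &ror to &shrd.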
720
721sub body_00_15 () {
722	(
723	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
724
725	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
726	'&mov	($a,$a1)',
727	'&mov	($a4,$f)',
728
729	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
730	'&xor	($a0,$e)',
731	'&xor	($a4,$g)',			# f^g
732
733	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
734	'&xor	($a1,$a)',
735	'&and	($a4,$e)',			# (f^g)&e
736
737	'&xor	($a0,$e)',
738	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
739	'&mov	($a2,$a)',
740
741	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
742	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
743	'&xor	($a2,$b)',			# a^b, b^c in next round
744
745	'&add	($h,$a4)',			# h+=Ch(e,f,g)
746	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
747	'&and	($a3,$a2)',			# (b^c)&(a^b)
748
749	'&xor	($a1,$a)',
750	'&add	($h,$a0)',			# h+=Sigma1(e)
751	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
752
753	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
754	'&add	($d,$h)',			# d+=h
755	'&add	($h,$a3)',			# h+=Maj(a,b,c)
756
757	'&mov	($a0,$d)',
758	'&add	($a1,$h);'.			# h+=Sigma0(a)
759	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
760	);
761}
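
# body_00_15() emits nothing by itself: it returns a list of strings, each
# of which eval()s to a single &instruction call.  The SIMD drivers below
# grab four rounds' worth of them and interleave a few scalar round
# instructions between every vector instruction of the message-schedule
# update, roughly like this (illustrative only):
#
#	my @insns = (&$body,&$body,&$body,&$body);	# 4 rounds of scalar code
#	&movdqa		($t0,@X[1]);			# one Xupdate step
#	eval(shift(@insns));				# ~3 scalar steps ...
#	eval(shift(@insns));
#	eval(shift(@insns));
#
# so the integer ALUs and the SIMD unit stay busy at the same time.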
762
763######################################################################
764# SSSE3 code path
765#
766if ($SZ==4) {	# SHA256 only
767my @X = map("%xmm$_",(0..3));
768my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
769
770$code.=<<___;
771.type	${func}_ssse3,\@function,3
772.align	64
773${func}_ssse3:
774.cfi_startproc
775.Lssse3_shortcut:
776	mov	%rsp,%rax		# copy %rsp
777.cfi_def_cfa_register	%rax
778	push	%rbx
779.cfi_push	%rbx
780	push	%rbp
781.cfi_push	%rbp
782	push	%r12
783.cfi_push	%r12
784	push	%r13
785.cfi_push	%r13
786	push	%r14
787.cfi_push	%r14
788	push	%r15
789.cfi_push	%r15
790	shl	\$4,%rdx		# num*16
791	sub	\$`$framesz+$win64*16*4`,%rsp
792	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
793	and	\$-64,%rsp		# align stack frame
794	mov	$ctx,$_ctx		# save ctx, 1st arg
795	mov	$inp,$_inp		# save inp, 2nd arg
796	mov	%rdx,$_end		# save end pointer, "3rd" arg
797	mov	%rax,$_rsp		# save copy of %rsp
798.cfi_cfa_expression	$_rsp,deref,+8
799___
800$code.=<<___ if ($win64);
801	movaps	%xmm6,16*$SZ+32(%rsp)
802	movaps	%xmm7,16*$SZ+48(%rsp)
803	movaps	%xmm8,16*$SZ+64(%rsp)
804	movaps	%xmm9,16*$SZ+80(%rsp)
805___
806$code.=<<___;
807.Lprologue_ssse3:
808
809	mov	$SZ*0($ctx),$A
810	mov	$SZ*1($ctx),$B
811	mov	$SZ*2($ctx),$C
812	mov	$SZ*3($ctx),$D
813	mov	$SZ*4($ctx),$E
814	mov	$SZ*5($ctx),$F
815	mov	$SZ*6($ctx),$G
816	mov	$SZ*7($ctx),$H
817___
818
819$code.=<<___;
820	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
821	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
822	jmp	.Lloop_ssse3
823.align	16
824.Lloop_ssse3:
825	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
826	movdqu	0x00($inp),@X[0]
827	movdqu	0x10($inp),@X[1]
828	movdqu	0x20($inp),@X[2]
829	pshufb	$t3,@X[0]
830	movdqu	0x30($inp),@X[3]
831	lea	$TABLE(%rip),$Tbl
832	pshufb	$t3,@X[1]
833	movdqa	0x00($Tbl),$t0
834	movdqa	0x20($Tbl),$t1
835	pshufb	$t3,@X[2]
836	paddd	@X[0],$t0
837	movdqa	0x40($Tbl),$t2
838	pshufb	$t3,@X[3]
839	movdqa	0x60($Tbl),$t3
840	paddd	@X[1],$t1
841	paddd	@X[2],$t2
842	paddd	@X[3],$t3
843	movdqa	$t0,0x00(%rsp)
844	mov	$A,$a1
845	movdqa	$t1,0x10(%rsp)
846	mov	$B,$a3
847	movdqa	$t2,0x20(%rsp)
848	xor	$C,$a3			# magic
849	movdqa	$t3,0x30(%rsp)
850	mov	$E,$a0
851	jmp	.Lssse3_00_47
852
853.align	16
854.Lssse3_00_47:
855	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
856___
857sub Xupdate_256_SSSE3 () {
858	(
859	'&movdqa	($t0,@X[1]);',
860	'&movdqa	($t3,@X[3])',
861	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
862	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
863	'&movdqa	($t1,$t0)',
864	'&movdqa	($t2,$t0);',
865	'&psrld		($t0,$sigma0[2])',
866	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
867	'&psrld		($t2,$sigma0[0])',
868	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
869	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
870	'&pxor		($t0,$t2)',
871	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
872	'&pxor		($t0,$t1)',
873	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
874	'&pxor		($t0,$t2);',
875	 '&movdqa	($t2,$t3)',
876	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
877	 '&psrld	($t3,$sigma1[2])',
878	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
879	 '&psrlq	($t2,$sigma1[0])',
880	 '&pxor		($t3,$t2);',
881	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
882	 '&pxor		($t3,$t2)',
883	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
884	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
885	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
886	 '&movdqa	($t2,$t3);',
887	 '&psrld	($t3,$sigma1[2])',
888	 '&psrlq	($t2,$sigma1[0])',
889	 '&pxor		($t3,$t2);',
890	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
891	 '&pxor		($t3,$t2);',
892	'&movdqa	($t2,16*2*$j."($Tbl)")',
893	 '&pshufb	($t3,$t5)',
894	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
895	);
896}
897
898sub SSSE3_256_00_47 () {
899my $j = shift;
900my $body = shift;
901my @X = @_;
902my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
903
904    if (0) {
905	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
906	    eval;
907	    eval(shift(@insns));
908	    eval(shift(@insns));
909	    eval(shift(@insns));
910	}
911    } else {			# squeeze extra 4% on Westmere and 19% on Atom
912	  eval(shift(@insns));	#@
913	&movdqa		($t0,@X[1]);
914	  eval(shift(@insns));
915	  eval(shift(@insns));
916	&movdqa		($t3,@X[3]);
917	  eval(shift(@insns));	#@
918	  eval(shift(@insns));
919	  eval(shift(@insns));
920	  eval(shift(@insns));	#@
921	  eval(shift(@insns));
922	&palignr	($t0,@X[0],$SZ);	# X[1..4]
923	  eval(shift(@insns));
924	  eval(shift(@insns));
925	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
926	  eval(shift(@insns));
927	  eval(shift(@insns));
928	  eval(shift(@insns));
929	  eval(shift(@insns));	#@
930	&movdqa		($t1,$t0);
931	  eval(shift(@insns));
932	  eval(shift(@insns));
933	&movdqa		($t2,$t0);
934	  eval(shift(@insns));	#@
935	  eval(shift(@insns));
936	&psrld		($t0,$sigma0[2]);
937	  eval(shift(@insns));
938	  eval(shift(@insns));
939	  eval(shift(@insns));
940	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
941	  eval(shift(@insns));	#@
942	  eval(shift(@insns));
943	&psrld		($t2,$sigma0[0]);
944	  eval(shift(@insns));
945	  eval(shift(@insns));
946	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
947	  eval(shift(@insns));
948	  eval(shift(@insns));	#@
949	&pslld		($t1,8*$SZ-$sigma0[1]);
950	  eval(shift(@insns));
951	  eval(shift(@insns));
952	&pxor		($t0,$t2);
953	  eval(shift(@insns));	#@
954	  eval(shift(@insns));
955	  eval(shift(@insns));
956	  eval(shift(@insns));	#@
957	&psrld		($t2,$sigma0[1]-$sigma0[0]);
958	  eval(shift(@insns));
959	&pxor		($t0,$t1);
960	  eval(shift(@insns));
961	  eval(shift(@insns));
962	&pslld		($t1,$sigma0[1]-$sigma0[0]);
963	  eval(shift(@insns));
964	  eval(shift(@insns));
965	&pxor		($t0,$t2);
966	  eval(shift(@insns));
967	  eval(shift(@insns));	#@
968	 &movdqa	($t2,$t3);
969	  eval(shift(@insns));
970	  eval(shift(@insns));
971	&pxor		($t0,$t1);		# sigma0(X[1..4])
972	  eval(shift(@insns));	#@
973	  eval(shift(@insns));
974	  eval(shift(@insns));
975	 &psrld		($t3,$sigma1[2]);
976	  eval(shift(@insns));
977	  eval(shift(@insns));
978	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
979	  eval(shift(@insns));	#@
980	  eval(shift(@insns));
981	 &psrlq		($t2,$sigma1[0]);
982	  eval(shift(@insns));
983	  eval(shift(@insns));
984	  eval(shift(@insns));
985	 &pxor		($t3,$t2);
986	  eval(shift(@insns));	#@
987	  eval(shift(@insns));
988	  eval(shift(@insns));
989	  eval(shift(@insns));	#@
990	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
991	  eval(shift(@insns));
992	  eval(shift(@insns));
993	 &pxor		($t3,$t2);
994	  eval(shift(@insns));	#@
995	  eval(shift(@insns));
996	  eval(shift(@insns));
997	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
998	 &pshufd	($t3,$t3,0b10000000);
999	  eval(shift(@insns));
1000	  eval(shift(@insns));
1001	  eval(shift(@insns));
1002	 &psrldq	($t3,8);
1003	  eval(shift(@insns));
1004	  eval(shift(@insns));	#@
1005	  eval(shift(@insns));
1006	  eval(shift(@insns));
1007	  eval(shift(@insns));	#@
1008	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
1009	  eval(shift(@insns));
1010	  eval(shift(@insns));
1011	  eval(shift(@insns));
1012	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
1013	  eval(shift(@insns));
1014	  eval(shift(@insns));	#@
1015	  eval(shift(@insns));
1016	 &movdqa	($t2,$t3);
1017	  eval(shift(@insns));
1018	  eval(shift(@insns));
1019	 &psrld		($t3,$sigma1[2]);
1020	  eval(shift(@insns));
1021	  eval(shift(@insns));	#@
1022	 &psrlq		($t2,$sigma1[0]);
1023	  eval(shift(@insns));
1024	  eval(shift(@insns));
1025	 &pxor		($t3,$t2);
1026	  eval(shift(@insns));	#@
1027	  eval(shift(@insns));
1028	  eval(shift(@insns));
1029	  eval(shift(@insns));	#@
1030	  eval(shift(@insns));
1031	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1032	  eval(shift(@insns));
1033	  eval(shift(@insns));
1034	  eval(shift(@insns));
1035	 &pxor		($t3,$t2);
1036	  eval(shift(@insns));
1037	  eval(shift(@insns));
1038	  eval(shift(@insns));	#@
1039	 #&pshufb	($t3,$t5);
1040	 &pshufd	($t3,$t3,0b00001000);
1041	  eval(shift(@insns));
1042	  eval(shift(@insns));
1043	&movdqa		($t2,16*2*$j."($Tbl)");
1044	  eval(shift(@insns));	#@
1045	  eval(shift(@insns));
1046	 &pslldq	($t3,8);
1047	  eval(shift(@insns));
1048	  eval(shift(@insns));
1049	  eval(shift(@insns));
1050	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1051	  eval(shift(@insns));	#@
1052	  eval(shift(@insns));
1053	  eval(shift(@insns));
1054    }
1055	&paddd		($t2,@X[0]);
1056	  foreach (@insns) { eval; }		# remaining instructions
1057	&movdqa		(16*$j."(%rsp)",$t2);
1058}
1059
1060    for ($i=0,$j=0; $j<4; $j++) {
1061	&SSSE3_256_00_47($j,\&body_00_15,@X);
1062	push(@X,shift(@X));			# rotate(@X)
1063    }
1064	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1065	&jne	(".Lssse3_00_47");
1066
1067    for ($i=0; $i<16; ) {
1068	foreach(body_00_15()) { eval; }
1069    }
1070$code.=<<___;
1071	mov	$_ctx,$ctx
1072	mov	$a1,$A
1073
1074	add	$SZ*0($ctx),$A
1075	lea	16*$SZ($inp),$inp
1076	add	$SZ*1($ctx),$B
1077	add	$SZ*2($ctx),$C
1078	add	$SZ*3($ctx),$D
1079	add	$SZ*4($ctx),$E
1080	add	$SZ*5($ctx),$F
1081	add	$SZ*6($ctx),$G
1082	add	$SZ*7($ctx),$H
1083
1084	cmp	$_end,$inp
1085
1086	mov	$A,$SZ*0($ctx)
1087	mov	$B,$SZ*1($ctx)
1088	mov	$C,$SZ*2($ctx)
1089	mov	$D,$SZ*3($ctx)
1090	mov	$E,$SZ*4($ctx)
1091	mov	$F,$SZ*5($ctx)
1092	mov	$G,$SZ*6($ctx)
1093	mov	$H,$SZ*7($ctx)
1094	jb	.Lloop_ssse3
1095
1096	mov	$_rsp,%rsi
1097.cfi_def_cfa	%rsi,8
1098___
1099$code.=<<___ if ($win64);
1100	movaps	16*$SZ+32(%rsp),%xmm6
1101	movaps	16*$SZ+48(%rsp),%xmm7
1102	movaps	16*$SZ+64(%rsp),%xmm8
1103	movaps	16*$SZ+80(%rsp),%xmm9
1104___
1105$code.=<<___;
1106	mov	-48(%rsi),%r15
1107.cfi_restore	%r15
1108	mov	-40(%rsi),%r14
1109.cfi_restore	%r14
1110	mov	-32(%rsi),%r13
1111.cfi_restore	%r13
1112	mov	-24(%rsi),%r12
1113.cfi_restore	%r12
1114	mov	-16(%rsi),%rbp
1115.cfi_restore	%rbp
1116	mov	-8(%rsi),%rbx
1117.cfi_restore	%rbx
1118	lea	(%rsi),%rsp
1119.cfi_def_cfa_register	%rsp
1120.Lepilogue_ssse3:
1121	ret
1122.cfi_endproc
1123.size	${func}_ssse3,.-${func}_ssse3
1124___
1125}
1126
1127if ($avx) {{
1128######################################################################
1129# AVX+shrd code path
1130#
1131local *ror = sub { &shrd(@_[0],@_) };
1132
1133$code.=<<___;
1134.type	${func}_avx,\@function,3
1135.align	64
1136${func}_avx:
1137.cfi_startproc
1138.Lavx_shortcut:
1139	mov	%rsp,%rax		# copy %rsp
1140.cfi_def_cfa_register	%rax
1141	push	%rbx
1142.cfi_push	%rbx
1143	push	%rbp
1144.cfi_push	%rbp
1145	push	%r12
1146.cfi_push	%r12
1147	push	%r13
1148.cfi_push	%r13
1149	push	%r14
1150.cfi_push	%r14
1151	push	%r15
1152.cfi_push	%r15
1153	shl	\$4,%rdx		# num*16
1154	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1155	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1156	and	\$-64,%rsp		# align stack frame
1157	mov	$ctx,$_ctx		# save ctx, 1st arg
1158	mov	$inp,$_inp		# save inp, 2nd arg
1159	mov	%rdx,$_end		# save end pointer, "3rd" arg
1160	mov	%rax,$_rsp		# save copy of %rsp
1161.cfi_cfa_expression	$_rsp,deref,+8
1162___
1163$code.=<<___ if ($win64);
1164	movaps	%xmm6,16*$SZ+32(%rsp)
1165	movaps	%xmm7,16*$SZ+48(%rsp)
1166	movaps	%xmm8,16*$SZ+64(%rsp)
1167	movaps	%xmm9,16*$SZ+80(%rsp)
1168___
1169$code.=<<___ if ($win64 && $SZ>4);
1170	movaps	%xmm10,16*$SZ+96(%rsp)
1171	movaps	%xmm11,16*$SZ+112(%rsp)
1172___
1173$code.=<<___;
1174.Lprologue_avx:
1175
1176	vzeroupper
1177	mov	$SZ*0($ctx),$A
1178	mov	$SZ*1($ctx),$B
1179	mov	$SZ*2($ctx),$C
1180	mov	$SZ*3($ctx),$D
1181	mov	$SZ*4($ctx),$E
1182	mov	$SZ*5($ctx),$F
1183	mov	$SZ*6($ctx),$G
1184	mov	$SZ*7($ctx),$H
1185___
1186					if ($SZ==4) {	# SHA256
1187    my @X = map("%xmm$_",(0..3));
1188    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1189
1190$code.=<<___;
1191	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1192	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1193	jmp	.Lloop_avx
1194.align	16
1195.Lloop_avx:
1196	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1197	vmovdqu	0x00($inp),@X[0]
1198	vmovdqu	0x10($inp),@X[1]
1199	vmovdqu	0x20($inp),@X[2]
1200	vmovdqu	0x30($inp),@X[3]
1201	vpshufb	$t3,@X[0],@X[0]
1202	lea	$TABLE(%rip),$Tbl
1203	vpshufb	$t3,@X[1],@X[1]
1204	vpshufb	$t3,@X[2],@X[2]
1205	vpaddd	0x00($Tbl),@X[0],$t0
1206	vpshufb	$t3,@X[3],@X[3]
1207	vpaddd	0x20($Tbl),@X[1],$t1
1208	vpaddd	0x40($Tbl),@X[2],$t2
1209	vpaddd	0x60($Tbl),@X[3],$t3
1210	vmovdqa	$t0,0x00(%rsp)
1211	mov	$A,$a1
1212	vmovdqa	$t1,0x10(%rsp)
1213	mov	$B,$a3
1214	vmovdqa	$t2,0x20(%rsp)
1215	xor	$C,$a3			# magic
1216	vmovdqa	$t3,0x30(%rsp)
1217	mov	$E,$a0
1218	jmp	.Lavx_00_47
1219
1220.align	16
1221.Lavx_00_47:
1222	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1223___
1224sub Xupdate_256_AVX () {
1225	(
1226	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1227	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1228	'&vpsrld	($t2,$t0,$sigma0[0]);',
1229	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1230	'&vpsrld	($t3,$t0,$sigma0[2])',
1231	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1232	'&vpxor		($t0,$t3,$t2)',
1233	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1234	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1235	'&vpxor		($t0,$t0,$t1)',
1236	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1237	'&vpxor		($t0,$t0,$t2)',
1238	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1239	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1240	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1241	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1242	 '&vpxor	($t2,$t2,$t3);',
1243	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1244	 '&vpxor	($t2,$t2,$t3)',
1245	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1246	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1247	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1248	 '&vpsrld	($t2,$t3,$sigma1[2])',
1249	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1250	 '&vpxor	($t2,$t2,$t3);',
1251	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1252	 '&vpxor	($t2,$t2,$t3)',
1253	 '&vpshufb	($t2,$t2,$t5)',
1254	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1255	);
1256}
1257
1258sub AVX_256_00_47 () {
1259my $j = shift;
1260my $body = shift;
1261my @X = @_;
1262my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1263
1264	foreach (Xupdate_256_AVX()) {		# 29 instructions
1265	    eval;
1266	    eval(shift(@insns));
1267	    eval(shift(@insns));
1268	    eval(shift(@insns));
1269	}
1270	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1271	  foreach (@insns) { eval; }		# remaining instructions
1272	&vmovdqa	(16*$j."(%rsp)",$t2);
1273}
1274
1275    for ($i=0,$j=0; $j<4; $j++) {
1276	&AVX_256_00_47($j,\&body_00_15,@X);
1277	push(@X,shift(@X));			# rotate(@X)
1278    }
1279	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1280	&jne	(".Lavx_00_47");
1281
1282    for ($i=0; $i<16; ) {
1283	foreach(body_00_15()) { eval; }
1284    }
1285
1286					} else {	# SHA512
1287    my @X = map("%xmm$_",(0..7));
1288    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1289
1290$code.=<<___;
1291	jmp	.Lloop_avx
1292.align	16
1293.Lloop_avx:
1294	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1295	vmovdqu	0x00($inp),@X[0]
1296	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1297	vmovdqu	0x10($inp),@X[1]
1298	vmovdqu	0x20($inp),@X[2]
1299	vpshufb	$t3,@X[0],@X[0]
1300	vmovdqu	0x30($inp),@X[3]
1301	vpshufb	$t3,@X[1],@X[1]
1302	vmovdqu	0x40($inp),@X[4]
1303	vpshufb	$t3,@X[2],@X[2]
1304	vmovdqu	0x50($inp),@X[5]
1305	vpshufb	$t3,@X[3],@X[3]
1306	vmovdqu	0x60($inp),@X[6]
1307	vpshufb	$t3,@X[4],@X[4]
1308	vmovdqu	0x70($inp),@X[7]
1309	vpshufb	$t3,@X[5],@X[5]
1310	vpaddq	-0x80($Tbl),@X[0],$t0
1311	vpshufb	$t3,@X[6],@X[6]
1312	vpaddq	-0x60($Tbl),@X[1],$t1
1313	vpshufb	$t3,@X[7],@X[7]
1314	vpaddq	-0x40($Tbl),@X[2],$t2
1315	vpaddq	-0x20($Tbl),@X[3],$t3
1316	vmovdqa	$t0,0x00(%rsp)
1317	vpaddq	0x00($Tbl),@X[4],$t0
1318	vmovdqa	$t1,0x10(%rsp)
1319	vpaddq	0x20($Tbl),@X[5],$t1
1320	vmovdqa	$t2,0x20(%rsp)
1321	vpaddq	0x40($Tbl),@X[6],$t2
1322	vmovdqa	$t3,0x30(%rsp)
1323	vpaddq	0x60($Tbl),@X[7],$t3
1324	vmovdqa	$t0,0x40(%rsp)
1325	mov	$A,$a1
1326	vmovdqa	$t1,0x50(%rsp)
1327	mov	$B,$a3
1328	vmovdqa	$t2,0x60(%rsp)
1329	xor	$C,$a3			# magic
1330	vmovdqa	$t3,0x70(%rsp)
1331	mov	$E,$a0
1332	jmp	.Lavx_00_47
1333
1334.align	16
1335.Lavx_00_47:
1336	add	\$`16*2*$SZ`,$Tbl
1337___
1338sub Xupdate_512_AVX () {
1339	(
1340	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1341	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1342	'&vpsrlq	($t2,$t0,$sigma0[0])',
1343	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1344	'&vpsrlq	($t3,$t0,$sigma0[2])',
1345	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1346	 '&vpxor	($t0,$t3,$t2)',
1347	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1348	 '&vpxor	($t0,$t0,$t1)',
1349	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1350	 '&vpxor	($t0,$t0,$t2)',
1351	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1352	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1353	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1354	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1355	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1356	 '&vpxor	($t3,$t3,$t2)',
1357	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1358	 '&vpxor	($t3,$t3,$t1)',
1359	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1360	 '&vpxor	($t3,$t3,$t2)',
1361	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1362	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1363	);
1364}
1365
1366sub AVX_512_00_47 () {
1367my $j = shift;
1368my $body = shift;
1369my @X = @_;
1370my @insns = (&$body,&$body);			# 52 instructions
1371
1372	foreach (Xupdate_512_AVX()) {		# 23 instructions
1373	    eval;
1374	    eval(shift(@insns));
1375	    eval(shift(@insns));
1376	}
1377	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1378	  foreach (@insns) { eval; }		# remaining instructions
1379	&vmovdqa	(16*$j."(%rsp)",$t2);
1380}
1381
1382    for ($i=0,$j=0; $j<8; $j++) {
1383	&AVX_512_00_47($j,\&body_00_15,@X);
1384	push(@X,shift(@X));			# rotate(@X)
1385    }
1386	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1387	&jne	(".Lavx_00_47");
1388
1389    for ($i=0; $i<16; ) {
1390	foreach(body_00_15()) { eval; }
1391    }
1392}
1393$code.=<<___;
1394	mov	$_ctx,$ctx
1395	mov	$a1,$A
1396
1397	add	$SZ*0($ctx),$A
1398	lea	16*$SZ($inp),$inp
1399	add	$SZ*1($ctx),$B
1400	add	$SZ*2($ctx),$C
1401	add	$SZ*3($ctx),$D
1402	add	$SZ*4($ctx),$E
1403	add	$SZ*5($ctx),$F
1404	add	$SZ*6($ctx),$G
1405	add	$SZ*7($ctx),$H
1406
1407	cmp	$_end,$inp
1408
1409	mov	$A,$SZ*0($ctx)
1410	mov	$B,$SZ*1($ctx)
1411	mov	$C,$SZ*2($ctx)
1412	mov	$D,$SZ*3($ctx)
1413	mov	$E,$SZ*4($ctx)
1414	mov	$F,$SZ*5($ctx)
1415	mov	$G,$SZ*6($ctx)
1416	mov	$H,$SZ*7($ctx)
1417	jb	.Lloop_avx
1418
1419	mov	$_rsp,%rsi
1420.cfi_def_cfa	%rsi,8
1421	vzeroupper
1422___
1423$code.=<<___ if ($win64);
1424	movaps	16*$SZ+32(%rsp),%xmm6
1425	movaps	16*$SZ+48(%rsp),%xmm7
1426	movaps	16*$SZ+64(%rsp),%xmm8
1427	movaps	16*$SZ+80(%rsp),%xmm9
1428___
1429$code.=<<___ if ($win64 && $SZ>4);
1430	movaps	16*$SZ+96(%rsp),%xmm10
1431	movaps	16*$SZ+112(%rsp),%xmm11
1432___
1433$code.=<<___;
1434	mov	-48(%rsi),%r15
1435.cfi_restore	%r15
1436	mov	-40(%rsi),%r14
1437.cfi_restore	%r14
1438	mov	-32(%rsi),%r13
1439.cfi_restore	%r13
1440	mov	-24(%rsi),%r12
1441.cfi_restore	%r12
1442	mov	-16(%rsi),%rbp
1443.cfi_restore	%rbp
1444	mov	-8(%rsi),%rbx
1445.cfi_restore	%rbx
1446	lea	(%rsi),%rsp
1447.cfi_def_cfa_register	%rsp
1448.Lepilogue_avx:
1449	ret
1450.cfi_endproc
1451.size	${func}_avx,.-${func}_avx
1452___
1453
1454if ($avx>1) {{
1455######################################################################
1456# AVX2+BMI code path
1457#
1458my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
1459my $PUSH8=8*2*$SZ;
1460use integer;
1461
1462sub bodyx_00_15 () {
1463	# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
1464	(
1465	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1466
1467	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
1468	'&and	($a4,$e)',		# f&e
1469	'&rorx	($a0,$e,$Sigma1[2])',
1470	'&rorx	($a2,$e,$Sigma1[1])',
1471
1472	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
1473	'&lea	($h,"($h,$a4)")',
1474	'&andn	($a4,$e,$g)',		# ~e&g
1475	'&xor	($a0,$a2)',
1476
1477	'&rorx	($a1,$e,$Sigma1[0])',
1478	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
1479	'&xor	($a0,$a1)',		# Sigma1(e)
1480	'&mov	($a2,$a)',
1481
1482	'&rorx	($a4,$a,$Sigma0[2])',
1483	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
1484	'&xor	($a2,$b)',		# a^b, b^c in next round
1485	'&rorx	($a1,$a,$Sigma0[1])',
1486
1487	'&rorx	($a0,$a,$Sigma0[0])',
1488	'&lea	($d,"($d,$h)")',	# d+=h
1489	'&and	($a3,$a2)',		# (b^c)&(a^b)
1490	'&xor	($a1,$a4)',
1491
1492	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
1493	'&xor	($a1,$a0)',		# Sigma0(a)
1494	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
1495	'&mov	($a4,$e)',		# copy of f in future
1496
1497	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1498	);
1499	# and at the finish one has to $a+=$a1
1500}
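
# A note on the Ch() folding above (illustrative): the two lea's are valid
# because (e&f) and (~e&g) can never have a bit set in the same position,
# so (e&f)^(~e&g) equals (e&f)+(~e&g).  For example, with e=0b1100,
# f=0b1010, g=0b0101:
#
#	e&f = 0b1000,  ~e&g = 0b0001,  xor and add both give Ch = 0b1001
#
# which is why Ch(e,f,g) can be accumulated into h with plain additions.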
1501
1502$code.=<<___;
1503.type	${func}_avx2,\@function,3
1504.align	64
1505${func}_avx2:
1506.cfi_startproc
1507.Lavx2_shortcut:
1508	mov	%rsp,%rax		# copy %rsp
1509.cfi_def_cfa_register	%rax
1510	push	%rbx
1511.cfi_push	%rbx
1512	push	%rbp
1513.cfi_push	%rbp
1514	push	%r12
1515.cfi_push	%r12
1516	push	%r13
1517.cfi_push	%r13
1518	push	%r14
1519.cfi_push	%r14
1520	push	%r15
1521.cfi_push	%r15
1522	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1523	shl	\$4,%rdx		# num*16
1524	and	\$-256*$SZ,%rsp		# align stack frame
1525	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1526	add	\$`2*$SZ*($rounds-8)`,%rsp
1527	mov	$ctx,$_ctx		# save ctx, 1st arg
1528	mov	$inp,$_inp		# save inp, 2nd arg
1529	mov	%rdx,$_end		# save end pointer, "3rd" arg
1530	mov	%rax,$_rsp		# save copy of %rsp
1531.cfi_cfa_expression	$_rsp,deref,+8
1532___
1533$code.=<<___ if ($win64);
1534	movaps	%xmm6,16*$SZ+32(%rsp)
1535	movaps	%xmm7,16*$SZ+48(%rsp)
1536	movaps	%xmm8,16*$SZ+64(%rsp)
1537	movaps	%xmm9,16*$SZ+80(%rsp)
1538___
1539$code.=<<___ if ($win64 && $SZ>4);
1540	movaps	%xmm10,16*$SZ+96(%rsp)
1541	movaps	%xmm11,16*$SZ+112(%rsp)
1542___
1543$code.=<<___;
1544.Lprologue_avx2:
1545
1546	vzeroupper
1547	sub	\$-16*$SZ,$inp		# inp++, size optimization
1548	mov	$SZ*0($ctx),$A
1549	mov	$inp,%r12		# borrow $T1
1550	mov	$SZ*1($ctx),$B
1551	cmp	%rdx,$inp		# $_end
1552	mov	$SZ*2($ctx),$C
1553	cmove	%rsp,%r12		# next block or random data
1554	mov	$SZ*3($ctx),$D
1555	mov	$SZ*4($ctx),$E
1556	mov	$SZ*5($ctx),$F
1557	mov	$SZ*6($ctx),$G
1558	mov	$SZ*7($ctx),$H
1559___
1560					if ($SZ==4) {	# SHA256
1561    my @X = map("%ymm$_",(0..3));
1562    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1563
1564$code.=<<___;
1565	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1566	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1567	jmp	.Loop_avx2
1568.align	16
1569.Loop_avx2:
1570	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1571	vmovdqu	-16*$SZ+0($inp),%xmm0
1572	vmovdqu	-16*$SZ+16($inp),%xmm1
1573	vmovdqu	-16*$SZ+32($inp),%xmm2
1574	vmovdqu	-16*$SZ+48($inp),%xmm3
1575	#mov		$inp,$_inp	# offload $inp
1576	vinserti128	\$1,(%r12),@X[0],@X[0]
1577	vinserti128	\$1,16(%r12),@X[1],@X[1]
1578	vpshufb		$t3,@X[0],@X[0]
1579	vinserti128	\$1,32(%r12),@X[2],@X[2]
1580	vpshufb		$t3,@X[1],@X[1]
1581	vinserti128	\$1,48(%r12),@X[3],@X[3]
1582
1583	lea	$TABLE(%rip),$Tbl
1584	vpshufb	$t3,@X[2],@X[2]
1585	vpaddd	0x00($Tbl),@X[0],$t0
1586	vpshufb	$t3,@X[3],@X[3]
1587	vpaddd	0x20($Tbl),@X[1],$t1
1588	vpaddd	0x40($Tbl),@X[2],$t2
1589	vpaddd	0x60($Tbl),@X[3],$t3
1590	vmovdqa	$t0,0x00(%rsp)
1591	xor	$a1,$a1
1592	vmovdqa	$t1,0x20(%rsp)
1593	lea	-$PUSH8(%rsp),%rsp
1594	mov	$B,$a3
1595	vmovdqa	$t2,0x00(%rsp)
1596	xor	$C,$a3			# magic
1597	vmovdqa	$t3,0x20(%rsp)
1598	mov	$F,$a4
1599	sub	\$-16*2*$SZ,$Tbl	# size optimization
1600	jmp	.Lavx2_00_47
1601
1602.align	16
1603.Lavx2_00_47:
1604___
1605
1606sub AVX2_256_00_47 () {
1607my $j = shift;
1608my $body = shift;
1609my @X = @_;
1610my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1611my $base = "+2*$PUSH8(%rsp)";
1612
1613	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
1614	foreach (Xupdate_256_AVX()) {		# 29 instructions
1615	    eval;
1616	    eval(shift(@insns));
1617	    eval(shift(@insns));
1618	    eval(shift(@insns));
1619	}
1620	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1621	  foreach (@insns) { eval; }		# remaining instructions
1622	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1623}
1624
1625    for ($i=0,$j=0; $j<4; $j++) {
1626	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1627	push(@X,shift(@X));			# rotate(@X)
1628    }
1629	&lea	($Tbl,16*2*$SZ."($Tbl)");
1630	&cmpb	(($SZ-1)."($Tbl)",0);
1631	&jne	(".Lavx2_00_47");
1632
1633    for ($i=0; $i<16; ) {
1634	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1635	foreach(bodyx_00_15()) { eval; }
1636    }
1637					} else {	# SHA512
1638    my @X = map("%ymm$_",(0..7));
1639    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1640
1641$code.=<<___;
1642	jmp	.Loop_avx2
1643.align	16
1644.Loop_avx2:
1645	vmovdqu	-16*$SZ($inp),%xmm0
1646	vmovdqu	-16*$SZ+16($inp),%xmm1
1647	vmovdqu	-16*$SZ+32($inp),%xmm2
1648	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1649	vmovdqu	-16*$SZ+48($inp),%xmm3
1650	vmovdqu	-16*$SZ+64($inp),%xmm4
1651	vmovdqu	-16*$SZ+80($inp),%xmm5
1652	vmovdqu	-16*$SZ+96($inp),%xmm6
1653	vmovdqu	-16*$SZ+112($inp),%xmm7
1654	#mov	$inp,$_inp	# offload $inp
1655	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
1656	vinserti128	\$1,(%r12),@X[0],@X[0]
1657	vinserti128	\$1,16(%r12),@X[1],@X[1]
1658	 vpshufb	$t2,@X[0],@X[0]
1659	vinserti128	\$1,32(%r12),@X[2],@X[2]
1660	 vpshufb	$t2,@X[1],@X[1]
1661	vinserti128	\$1,48(%r12),@X[3],@X[3]
1662	 vpshufb	$t2,@X[2],@X[2]
1663	vinserti128	\$1,64(%r12),@X[4],@X[4]
1664	 vpshufb	$t2,@X[3],@X[3]
1665	vinserti128	\$1,80(%r12),@X[5],@X[5]
1666	 vpshufb	$t2,@X[4],@X[4]
1667	vinserti128	\$1,96(%r12),@X[6],@X[6]
1668	 vpshufb	$t2,@X[5],@X[5]
1669	vinserti128	\$1,112(%r12),@X[7],@X[7]
1670
1671	vpaddq	-0x80($Tbl),@X[0],$t0
1672	vpshufb	$t2,@X[6],@X[6]
1673	vpaddq	-0x60($Tbl),@X[1],$t1
1674	vpshufb	$t2,@X[7],@X[7]
1675	vpaddq	-0x40($Tbl),@X[2],$t2
1676	vpaddq	-0x20($Tbl),@X[3],$t3
1677	vmovdqa	$t0,0x00(%rsp)
1678	vpaddq	0x00($Tbl),@X[4],$t0
1679	vmovdqa	$t1,0x20(%rsp)
1680	vpaddq	0x20($Tbl),@X[5],$t1
1681	vmovdqa	$t2,0x40(%rsp)
1682	vpaddq	0x40($Tbl),@X[6],$t2
1683	vmovdqa	$t3,0x60(%rsp)
1684	lea	-$PUSH8(%rsp),%rsp
1685	vpaddq	0x60($Tbl),@X[7],$t3
1686	vmovdqa	$t0,0x00(%rsp)
1687	xor	$a1,$a1
1688	vmovdqa	$t1,0x20(%rsp)
1689	mov	$B,$a3
1690	vmovdqa	$t2,0x40(%rsp)
1691	xor	$C,$a3			# magic
1692	vmovdqa	$t3,0x60(%rsp)
1693	mov	$F,$a4
1694	add	\$16*2*$SZ,$Tbl
1695	jmp	.Lavx2_00_47
1696
1697.align	16
1698.Lavx2_00_47:
1699___
1700
1701sub AVX2_512_00_47 () {
1702my $j = shift;
1703my $body = shift;
1704my @X = @_;
1705my @insns = (&$body,&$body);			# 48 instructions
1706my $base = "+2*$PUSH8(%rsp)";
1707
1708	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
1709	foreach (Xupdate_512_AVX()) {		# 23 instructions
1710	    eval;
1711	    if ($_ !~ /\;$/) {
1712		eval(shift(@insns));
1713		eval(shift(@insns));
1714		eval(shift(@insns));
1715	    }
1716	}
1717	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1718	  foreach (@insns) { eval; }		# remaining instructions
1719	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1720}
1721
1722    for ($i=0,$j=0; $j<8; $j++) {
1723	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
1724	push(@X,shift(@X));			# rotate(@X)
1725    }
1726	&lea	($Tbl,16*2*$SZ."($Tbl)");
1727	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
1728	&jne	(".Lavx2_00_47");
1729
1730    for ($i=0; $i<16; ) {
1731	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1732	foreach(bodyx_00_15()) { eval; }
1733    }
1734}
1735$code.=<<___;
1736	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
1737	add	$a1,$A
1738	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
1739	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
1740
1741	add	$SZ*0($ctx),$A
1742	add	$SZ*1($ctx),$B
1743	add	$SZ*2($ctx),$C
1744	add	$SZ*3($ctx),$D
1745	add	$SZ*4($ctx),$E
1746	add	$SZ*5($ctx),$F
1747	add	$SZ*6($ctx),$G
1748	add	$SZ*7($ctx),$H
1749
1750	mov	$A,$SZ*0($ctx)
1751	mov	$B,$SZ*1($ctx)
1752	mov	$C,$SZ*2($ctx)
1753	mov	$D,$SZ*3($ctx)
1754	mov	$E,$SZ*4($ctx)
1755	mov	$F,$SZ*5($ctx)
1756	mov	$G,$SZ*6($ctx)
1757	mov	$H,$SZ*7($ctx)
1758
1759	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
1760	je	.Ldone_avx2
1761
1762	xor	$a1,$a1
1763	mov	$B,$a3
1764	xor	$C,$a3			# magic
1765	mov	$F,$a4
1766	jmp	.Lower_avx2
1767.align	16
1768.Lower_avx2:
1769___
1770    for ($i=0; $i<8; ) {
1771	my $base="+16($Tbl)";
1772	foreach(bodyx_00_15()) { eval; }
1773    }
1774$code.=<<___;
1775	lea	-$PUSH8($Tbl),$Tbl
1776	cmp	%rsp,$Tbl
1777	jae	.Lower_avx2
1778
1779	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
1780	add	$a1,$A
1781	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
1782	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
1783
1784	add	$SZ*0($ctx),$A
1785	add	$SZ*1($ctx),$B
1786	add	$SZ*2($ctx),$C
1787	add	$SZ*3($ctx),$D
1788	add	$SZ*4($ctx),$E
1789	add	$SZ*5($ctx),$F
1790	lea	`2*16*$SZ`($inp),$inp	# inp+=2
1791	add	$SZ*6($ctx),$G
1792	mov	$inp,%r12
1793	add	$SZ*7($ctx),$H
1794	cmp	$_end,$inp
1795
1796	mov	$A,$SZ*0($ctx)
1797	cmove	%rsp,%r12		# next block or stale data
1798	mov	$B,$SZ*1($ctx)
1799	mov	$C,$SZ*2($ctx)
1800	mov	$D,$SZ*3($ctx)
1801	mov	$E,$SZ*4($ctx)
1802	mov	$F,$SZ*5($ctx)
1803	mov	$G,$SZ*6($ctx)
1804	mov	$H,$SZ*7($ctx)
1805
1806	jbe	.Loop_avx2
1807	lea	(%rsp),$Tbl
1808
1809.Ldone_avx2:
1810	lea	($Tbl),%rsp
1811	mov	$_rsp,%rsi
1812.cfi_def_cfa	%rsi,8
1813	vzeroupper
1814___
1815$code.=<<___ if ($win64);
1816	movaps	16*$SZ+32(%rsp),%xmm6
1817	movaps	16*$SZ+48(%rsp),%xmm7
1818	movaps	16*$SZ+64(%rsp),%xmm8
1819	movaps	16*$SZ+80(%rsp),%xmm9
1820___
1821$code.=<<___ if ($win64 && $SZ>4);
1822	movaps	16*$SZ+96(%rsp),%xmm10
1823	movaps	16*$SZ+112(%rsp),%xmm11
1824___
1825$code.=<<___;
1826	mov	-48(%rsi),%r15
1827.cfi_restore	%r15
1828	mov	-40(%rsi),%r14
1829.cfi_restore	%r14
1830	mov	-32(%rsi),%r13
1831.cfi_restore	%r13
1832	mov	-24(%rsi),%r12
1833.cfi_restore	%r12
1834	mov	-16(%rsi),%rbp
1835.cfi_restore	%rbp
1836	mov	-8(%rsi),%rbx
1837.cfi_restore	%rbx
1838	lea	(%rsi),%rsp
1839.cfi_def_cfa_register	%rsp
1840.Lepilogue_avx2:
1841	ret
1842.cfi_endproc
1843.size	${func}_avx2,.-${func}_avx2
1844___
1845}}
1846}}}}}
1847
1848# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1849#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1850if ($win64) {
1851$rec="%rcx";
1852$frame="%rdx";
1853$context="%r8";
1854$disp="%r9";
1855
1856$code.=<<___;
1857.extern	__imp_RtlVirtualUnwind
1858.type	se_handler,\@abi-omnipotent
1859.align	16
1860se_handler:
1861	push	%rsi
1862	push	%rdi
1863	push	%rbx
1864	push	%rbp
1865	push	%r12
1866	push	%r13
1867	push	%r14
1868	push	%r15
1869	pushfq
1870	sub	\$64,%rsp
1871
1872	mov	120($context),%rax	# pull context->Rax
1873	mov	248($context),%rbx	# pull context->Rip
1874
1875	mov	8($disp),%rsi		# disp->ImageBase
1876	mov	56($disp),%r11		# disp->HandlerData
1877
1878	mov	0(%r11),%r10d		# HandlerData[0]
1879	lea	(%rsi,%r10),%r10	# prologue label
1880	cmp	%r10,%rbx		# context->Rip<prologue label
1881	jb	.Lin_prologue
1882
1883	mov	152($context),%rax	# pull context->Rsp
1884
1885	mov	4(%r11),%r10d		# HandlerData[1]
1886	lea	(%rsi,%r10),%r10	# epilogue label
1887	cmp	%r10,%rbx		# context->Rip>=epilogue label
1888	jae	.Lin_prologue
1889___
1890$code.=<<___ if ($avx>1);
1891	lea	.Lavx2_shortcut(%rip),%r10
1892	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
1893	jb	.Lnot_in_avx2
1894
1895	and	\$-256*$SZ,%rax
1896	add	\$`2*$SZ*($rounds-8)`,%rax
1897.Lnot_in_avx2:
1898___
1899$code.=<<___;
1900	mov	%rax,%rsi		# put aside Rsp
1901	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
1902
1903	mov	-8(%rax),%rbx
1904	mov	-16(%rax),%rbp
1905	mov	-24(%rax),%r12
1906	mov	-32(%rax),%r13
1907	mov	-40(%rax),%r14
1908	mov	-48(%rax),%r15
1909	mov	%rbx,144($context)	# restore context->Rbx
1910	mov	%rbp,160($context)	# restore context->Rbp
1911	mov	%r12,216($context)	# restore context->R12
1912	mov	%r13,224($context)	# restore context->R13
1913	mov	%r14,232($context)	# restore context->R14
1914	mov	%r15,240($context)	# restore context->R15
1915
1916	lea	.Lepilogue(%rip),%r10
1917	cmp	%r10,%rbx
1918	jb	.Lin_prologue		# non-AVX code
1919
1920	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
1921	lea	512($context),%rdi	# &context.Xmm6
1922	mov	\$`$SZ==4?8:12`,%ecx
1923	.long	0xa548f3fc		# cld; rep movsq
1924
1925.Lin_prologue:
1926	mov	8(%rax),%rdi
1927	mov	16(%rax),%rsi
1928	mov	%rax,152($context)	# restore context->Rsp
1929	mov	%rsi,168($context)	# restore context->Rsi
1930	mov	%rdi,176($context)	# restore context->Rdi
1931
1932	mov	40($disp),%rdi		# disp->ContextRecord
1933	mov	$context,%rsi		# context
1934	mov	\$154,%ecx		# sizeof(CONTEXT)
1935	.long	0xa548f3fc		# cld; rep movsq
1936
1937	mov	$disp,%rsi
1938	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1939	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1940	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1941	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1942	mov	40(%rsi),%r10		# disp->ContextRecord
1943	lea	56(%rsi),%r11		# &disp->HandlerData
1944	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1945	mov	%r10,32(%rsp)		# arg5
1946	mov	%r11,40(%rsp)		# arg6
1947	mov	%r12,48(%rsp)		# arg7
1948	mov	%rcx,56(%rsp)		# arg8, (NULL)
1949	call	*__imp_RtlVirtualUnwind(%rip)
1950
1951	mov	\$1,%eax		# ExceptionContinueSearch
1952	add	\$64,%rsp
1953	popfq
1954	pop	%r15
1955	pop	%r14
1956	pop	%r13
1957	pop	%r12
1958	pop	%rbp
1959	pop	%rbx
1960	pop	%rdi
1961	pop	%rsi
1962	ret
1963.size	se_handler,.-se_handler
1964___
1965
1966$code.=<<___ if ($SZ==4 && $shaext);
1967.type	shaext_handler,\@abi-omnipotent
1968.align	16
1969shaext_handler:
1970	push	%rsi
1971	push	%rdi
1972	push	%rbx
1973	push	%rbp
1974	push	%r12
1975	push	%r13
1976	push	%r14
1977	push	%r15
1978	pushfq
1979	sub	\$64,%rsp
1980
1981	mov	120($context),%rax	# pull context->Rax
1982	mov	248($context),%rbx	# pull context->Rip
1983
1984	lea	.Lprologue_shaext(%rip),%r10
1985	cmp	%r10,%rbx		# context->Rip<.Lprologue
1986	jb	.Lin_prologue
1987
1988	lea	.Lepilogue_shaext(%rip),%r10
1989	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1990	jae	.Lin_prologue
1991
1992	lea	-8-5*16(%rax),%rsi
1993	lea	512($context),%rdi	# &context.Xmm6
1994	mov	\$10,%ecx
1995	.long	0xa548f3fc		# cld; rep movsq
1996
1997	jmp	.Lin_prologue
1998.size	shaext_handler,.-shaext_handler
1999___
2000
2001$code.=<<___;
2002.section	.pdata
2003.align	4
2004	.rva	.LSEH_begin_$func
2005	.rva	.LSEH_end_$func
2006	.rva	.LSEH_info_$func
2007___
2008$code.=<<___ if ($SZ==4 && $shaext);
2009	.rva	.LSEH_begin_${func}_shaext
2010	.rva	.LSEH_end_${func}_shaext
2011	.rva	.LSEH_info_${func}_shaext
2012___
2013$code.=<<___ if ($SZ==4);
2014	.rva	.LSEH_begin_${func}_ssse3
2015	.rva	.LSEH_end_${func}_ssse3
2016	.rva	.LSEH_info_${func}_ssse3
2017___
2018$code.=<<___ if ($avx);
2019	.rva	.LSEH_begin_${func}_avx
2020	.rva	.LSEH_end_${func}_avx
2021	.rva	.LSEH_info_${func}_avx
2022___
2023$code.=<<___ if ($avx>1);
2024	.rva	.LSEH_begin_${func}_avx2
2025	.rva	.LSEH_end_${func}_avx2
2026	.rva	.LSEH_info_${func}_avx2
2027___
2028$code.=<<___;
2029.section	.xdata
2030.align	8
2031.LSEH_info_$func:
2032	.byte	9,0,0,0
2033	.rva	se_handler
2034	.rva	.Lprologue,.Lepilogue			# HandlerData[]
2035___
2036$code.=<<___ if ($SZ==4 && $shaext);
2037.LSEH_info_${func}_shaext:
2038	.byte	9,0,0,0
2039	.rva	shaext_handler
2040___
2041$code.=<<___ if ($SZ==4);
2042.LSEH_info_${func}_ssse3:
2043	.byte	9,0,0,0
2044	.rva	se_handler
2045	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2046___
2047$code.=<<___ if ($avx);
2048.LSEH_info_${func}_avx:
2049	.byte	9,0,0,0
2050	.rva	se_handler
2051	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2052___
2053$code.=<<___ if ($avx>1);
2054.LSEH_info_${func}_avx2:
2055	.byte	9,0,0,0
2056	.rva	se_handler
2057	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2058___
2059}
2060
2061sub sha256op38 {
2062    my $instr = shift;
2063    my %opcodelet = (
2064		"sha256rnds2" => 0xcb,
2065  		"sha256msg1"  => 0xcc,
2066		"sha256msg2"  => 0xcd	);
2067
2068    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2069      my @opcode=(0x0f,0x38);
2070	push @opcode,$opcodelet{$instr};
2071	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
2072	return ".byte\t".join(',',@opcode);
2073    } else {
2074	return $instr."\t".@_[0];
2075    }
2076}
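
# For example (illustrative; the SHAEXT path is currently compiled out by
# $shaext=0 above): a generated line such as "sha256rnds2 %xmm1,%xmm2"
# would be rewritten by the loop below into
#
#	.byte	15,56,203,209
#
# that is, the 0x0f,0x38 escape bytes, the 0xcb opcode and ModR/M
# 0xc0|1|(2<<3)=0xd1, so the output still assembles with toolchains that
# predate the SHA extension mnemonics.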
2077
2078foreach (split("\n",$code)) {
2079	s/\`([^\`]*)\`/eval $1/geo;
2080
2081	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2082
2083	print $_,"\n";
2084}
2085close STDOUT or die "error closing STDOUT";
2086