#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just a straight implementation... I really wonder why
# gcc [even armed with inline assembler] fails to generate code this
# fast. The only thing that is cool about this module is that the very
# same instruction sequence is used for both SHA-256 and SHA-512. In
# the former case the instructions operate on 32-bit operands, in the
# latter on 64-bit ones. All I had to do was get one flavor right; the
# other one passed the test right away :-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank [!], tends toward 4 instructions per CPU
# clock cycle and runs in 1003 cycles, 1275 is a very good result for
# the 3-way-issue Opteron pipeline with X[16] maintained in memory. So
# *if* there is a way to improve it, *then* the only way would be to
# offload the X[16] updates to the SSE unit, but that would require
# "deeper" loop unrolling, which in turn would naturally cause size
# blow-up, not to mention increased complexity! And that is only *if*
# it's actually possible to noticeably improve overall ILP, i.e.
# instruction-level parallelism, on a given CPU implementation.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block :-( This is presumably because 64-bit
# shifts/rotates are apparently not atomic instructions, but are
# implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, an alternative
# Maj, resulted in >=5% improvement on most CPUs: +20% for SHA256 and,
# unfortunately, -2% for SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths; see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512 because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious
# exception is VIA Nano, but it has a SHA512 instruction that is faster
# and should be used instead.] For reference, the corresponding
# estimated upper limit on improvement for SSSE3 SHA256 is 28%. The
# fact that higher coefficients are observed on VIA Nano and Bulldozer
# has more to do with specifics of their architecture [which is a topic
# for separate discussion].
#
# November 2012.
#
# Add an AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant ones. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm registers as operands.
# The side effect is an increased stack frame, 448 additional bytes in
# SHA256 and 1152 in SHA512, plus a 1.2KB code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen		11.0	9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -               17.5    -
# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)	whichever is best applicable, including SHAEXT;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining
#	integer-only part, body_00_15; reducing the amount of SIMD
#	instructions below a certain limit makes no difference/sense;
#	to conserve space the SHA256 XOP code path is therefore omitted;
#
# Modified from upstream OpenSSL to remove the XOP code.

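# To make the generated rounds below easier to check against the spec, here
# is a plain-Perl reference model of one scalar SHA-256 round. It is purely
# illustrative: nothing in this generator calls it, and the helper names
# (_ror32, _sha256_round_ref) are invented for the example. It also shows the
# "alternative Maj" identity, Maj(a,b,c) == Ch(a^b,c,b) == ((a^b)&(b^c))^b,
# which the scalar code exploits by carrying b^c over from the previous round.
sub _ror32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }

sub _sha256_round_ref {			# illustrative only, never called
	my ($w,$k,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
	my $Sigma1 = _ror32($e,6)^_ror32($e,11)^_ror32($e,25);
	my $Sigma0 = _ror32($a,2)^_ror32($a,13)^_ror32($a,22);
	my $Ch     = ($e&$f)^(~$e&$g&0xffffffff);
	my $Maj    = (($a^$b)&($b^$c))^$b;	# == (a&b)^(a&c)^(b&c)
	my $T1     = ($h+$Sigma1+$Ch+$k+$w) & 0xffffffff;
	my $T2     = ($Sigma0+$Maj) & 0xffffffff;
	return (($T1+$T2)&0xffffffff,$a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);
}
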
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(briansmith): Address davidben's concerns about the CFI annotations and
# Win64 ABI issues at https://github.com/openssl/openssl/issues/8853.
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
$avx = 1;

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext=0;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

if ($output =~ /sha512-x86_64/) {
	$func="GFp_sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="GFp_sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";
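# For reference, the scalar stack frame set up below is, roughly: 16*$SZ bytes
# at (%rsp) for the 16-entry message schedule X[0..15], followed by the saved
# ctx, inp and end pointers and a copy of the caller's %rsp at the
# $_ctx/$_inp/$_end/$_rsp offsets defined above. (Summary comment only;
# nothing relies on it.)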


sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  my $STRIDE=$SZ;
     # each K row is laid out twice in $TABLE (for the SIMD paths), so the
     # scalar code skips over the duplicate copy every 16/$SZ rounds
     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$f,$a2

	xor	$e,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$g,$a2			# f^g

	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	xor	$a,$a1
	and	$e,$a2			# (f^g)&e

	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	add	$h,$T1			# T1+=h
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g

	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	add	$a2,$T1			# T1+=Ch(e,f,g)

	mov	$a,$a2
	add	($Tbl),$T1		# T1+=K[round]
	xor	$a,$a1

	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	mov	$b,$h

	and	$a2,$a3
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)

	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	add	$T1,$d			# d+=T1
	add	$T1,$h			# h+=T1

	lea	$STRIDE($Tbl),$Tbl	# round++
___
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
___
	($a2,$a3) = ($a3,$a2);
}

sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2

	mov	$a0,$T1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	mov	$a2,$a1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2

	xor	$T1,$a0
	shr	\$$sigma0[2],$T1
	ror	\$$sigma0[0],$a0
	xor	$a1,$a2
	shr	\$$sigma1[2],$a1

	ror	\$$sigma1[0],$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1

	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a2,$T1
	mov	$a,$a1
___
	&ROUND_00_15(@_);
}

$code=<<___;
.text

.extern	GFp_ia32cap_P
.globl	$func
.type	$func,\@function,3
.align	16
$func:
.cfi_startproc
___
$code.=<<___ if ($SZ==4 || $avx);
	leaq	GFp_ia32cap_P(%rip),%r11
	mov	0(%r11),%r9d
	mov	4(%r11),%r10d
	mov	8(%r11),%r11d
___
$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA
	jnz	_shaext_shortcut
___
    # XOP code path removed.
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	.Lavx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	or	%r9d,%r10d
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
	je	.Lavx_shortcut
___
$code.=<<___ if ($SZ==4);
	test	\$`1<<9`,%r10d
	jnz	.Lssse3_shortcut
___
$code.=<<___;
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
.Lprologue:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop

.align	16
.Lloop:
	mov	$B,$a3
	lea	$TABLE(%rip),$Tbl
	xor	$C,$a3			# magic
___
	for($i=0;$i<16;$i++) {
		$code.="	mov	$SZ*$i($inp),$T1\n";
		$code.="	mov	@ROT[4],$a0\n";
		$code.="	mov	@ROT[0],$a1\n";
		$code.="	bswap	$T1\n";
		&ROUND_00_15($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}
$code.=<<___;
	jmp	.Lrounds_16_xx
.align	16
.Lrounds_16_xx:
___
	for(;$i<32;$i++) {
		&ROUND_16_XX($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}

$code.=<<___;
	cmpb	\$0,`$SZ-1`($Tbl)
	jnz	.Lrounds_16_xx

	mov	$_ctx,$ctx
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	$func,.-$func
___

if ($SZ==4) {
$code.=<<___;
.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
} else {
$code.=<<___;
.align	64
.type	$TABLE,\@object
$TABLE:
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817

	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
}

######################################################################
# SIMD code paths
#
if ($SZ==4 && $shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));

$code.=<<___;
.type	GFp_sha256_block_data_order_shaext,\@function,3
.align	64
GFp_sha256_block_data_order_shaext:
_shaext_shortcut:
___
$code.=<<___ if ($win64);
	lea	`-8-5*16`(%rsp),%rsp
	movaps	%xmm6,-8-5*16(%rax)
	movaps	%xmm7,-8-4*16(%rax)
	movaps	%xmm8,-8-3*16(%rax)
	movaps	%xmm9,-8-2*16(%rax)
	movaps	%xmm10,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	lea		K256+0x80(%rip),$Tbl
	movdqu		($ctx),$ABEF		# DCBA
	movdqu		16($ctx),$CDGH		# HGFE
	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask

	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa		$TMP,$BSWAP		# offload
	palignr		\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH		# CDGH
	jmp		.Loop_shaext

.align	16
.Loop_shaext:
	movdqu		($inp),@MSG[0]
	movdqu		0x10($inp),@MSG[1]
	movdqu		0x20($inp),@MSG[2]
	pshufb		$TMP,@MSG[0]
	movdqu		0x30($inp),@MSG[3]

	movdqa		0*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	pshufb		$TMP,@MSG[1]
	movdqa		$CDGH,$CDGH_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH		# 0-3
	pshufd		\$0x0e,$Wi,$Wi
	nop
	movdqa		$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$CDGH,$ABEF

	movdqa		1*32-0x80($Tbl),$Wi
	paddd		@MSG[1],$Wi
	pshufb		$TMP,@MSG[2]
	sha256rnds2	$ABEF,$CDGH		# 4-7
	pshufd		\$0x0e,$Wi,$Wi
	lea		0x40($inp),$inp
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$CDGH,$ABEF

	movdqa		2*32-0x80($Tbl),$Wi
	paddd		@MSG[2],$Wi
	pshufb		$TMP,@MSG[3]
	sha256rnds2	$ABEF,$CDGH		# 8-11
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[3],$TMP
	palignr		\$4,@MSG[2],$TMP
	nop
	paddd		$TMP,@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$CDGH,$ABEF

	movdqa		3*32-0x80($Tbl),$Wi
	paddd		@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256rnds2	$ABEF,$CDGH		# 12-15
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[0],$TMP
	palignr		\$4,@MSG[3],$TMP
	nop
	paddd		$TMP,@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa		$i*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH		# 16-19...
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[1],$TMP
	palignr		\$4,@MSG[0],$TMP
	nop
	paddd		$TMP,@MSG[2]
	sha256msg1	@MSG[0],@MSG[3]
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqa		13*32-0x80($Tbl),$Wi
	paddd		@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH		# 52-55
	pshufd		\$0x0e,$Wi,$Wi
	movdqa		@MSG[1],$TMP
	palignr		\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF
	paddd		$TMP,@MSG[2]

	movdqa		14*32-0x80($Tbl),$Wi
	paddd		@MSG[1],$Wi
	sha256rnds2	$ABEF,$CDGH		# 56-59
	pshufd		\$0x0e,$Wi,$Wi
	sha256msg2	@MSG[1],@MSG[2]
	movdqa		$BSWAP,$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa		15*32-0x80($Tbl),$Wi
	paddd		@MSG[2],$Wi
	nop
	sha256rnds2	$ABEF,$CDGH		# 60-63
	pshufd		\$0x0e,$Wi,$Wi
	dec		$num
	nop
	sha256rnds2	$CDGH,$ABEF

	paddd		$CDGH_SAVE,$CDGH
	paddd		$ABEF_SAVE,$ABEF
	jnz		.Loop_shaext

	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF		# DCBA
	palignr		\$8,$TMP,$CDGH		# HGFE

	movdqu	$ABEF,($ctx)
	movdqu	$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-5*16(%rax),%xmm6
	movaps	-8-4*16(%rax),%xmm7
	movaps	-8-3*16(%rax),%xmm8
	movaps	-8-2*16(%rax),%xmm9
	movaps	-8-1*16(%rax),%xmm10
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	GFp_sha256_block_data_order_shaext,.-GFp_sha256_block_data_order_shaext
___
}}}
{{{

my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
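
# For example, with the SHA-256 register assignment above, a call such as
# &ror($a0,$Sigma1[0]) is caught by AUTOLOAD and appends "\tror\t\$6,%r13d\n"
# to $code; numeric arguments get the '$' immediate prefix automatically.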

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&mov	($a,$a1)',
	'&mov	($a4,$f)',

	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a0,$e)',
	'&xor	($a4,$g)',			# f^g

	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&xor	($a1,$a)',
	'&and	($a4,$e)',			# (f^g)&e

	'&xor	($a0,$e)',
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&mov	($a2,$a)',

	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a2,$b)',			# a^b, b^c in next round

	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&and	($a3,$a2)',			# (b^c)&(a^b)

	'&xor	($a1,$a)',
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($d,$h)',			# d+=h
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&mov	($a0,$d)',
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}
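
# Note that body_00_15() does not emit code itself; it returns the scalar
# round as a list of small instruction strings. The SIMD paths below eval
# these strings a few at a time, interleaving them with the Xupdate message
# schedule so the integer and SIMD units are kept busy in parallel.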

######################################################################
# SSSE3 code path
#
if ($SZ==4) {	# SHA256 only
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

$code.=<<___;
.type	${func}_ssse3,\@function,3
.align	64
${func}_ssse3:
.cfi_startproc
.Lssse3_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*4`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
$code.=<<___;
.Lprologue_ssse3:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___

$code.=<<___;
	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	jmp	.Lloop_ssse3
.align	16
.Lloop_ssse3:
	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	movdqu	0x00($inp),@X[0]
	movdqu	0x10($inp),@X[1]
	movdqu	0x20($inp),@X[2]
	pshufb	$t3,@X[0]
	movdqu	0x30($inp),@X[3]
	lea	$TABLE(%rip),$Tbl
	pshufb	$t3,@X[1]
	movdqa	0x00($Tbl),$t0
	movdqa	0x20($Tbl),$t1
	pshufb	$t3,@X[2]
	paddd	@X[0],$t0
	movdqa	0x40($Tbl),$t2
	pshufb	$t3,@X[3]
	movdqa	0x60($Tbl),$t3
	paddd	@X[1],$t1
	paddd	@X[2],$t2
	paddd	@X[3],$t3
	movdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	movdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	movdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	movdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lssse3_00_47

.align	16
.Lssse3_00_47:
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
___
sub Xupdate_256_SSSE3 () {
	(
	'&movdqa	($t0,@X[1]);',
	'&movdqa	($t3,@X[3])',
	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
	'&movdqa	($t1,$t0)',
	'&movdqa	($t2,$t0);',
	'&psrld		($t0,$sigma0[2])',
	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
	'&psrld		($t2,$sigma0[0])',
	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
	'&pxor		($t0,$t2)',
	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
	'&pxor		($t0,$t1)',
	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
	'&pxor		($t0,$t2);',
	 '&movdqa	($t2,$t3)',
	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
	 '&psrld	($t3,$sigma1[2])',
	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
	 '&psrlq	($t2,$sigma1[0])',
	 '&pxor		($t3,$t2);',
	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
	 '&pxor		($t3,$t2)',
	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
	 '&movdqa	($t2,$t3);',
	 '&psrld	($t3,$sigma1[2])',
	 '&psrlq	($t2,$sigma1[0])',
	 '&pxor		($t3,$t2);',
	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
	 '&pxor		($t3,$t2);',
	'&movdqa	($t2,16*2*$j."($Tbl)")',
	 '&pshufb	($t3,$t5)',
	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
	);
}

sub SSSE3_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

    if (0) {
	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
    } else {			# squeeze extra 4% on Westmere and 19% on Atom
	  eval(shift(@insns));	#@
	&movdqa		($t0,@X[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t3,@X[3]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&palignr	($t0,@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&movdqa		($t1,$t0);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t2,$t0);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrld		($t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	&psrld		($t2,$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&pslld		($t1,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&psrld		($t2,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	&pxor		($t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pslld		($t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	 &movdqa	($t2,$t3);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&pxor		($t0,$t1);		# sigma0(X[1..4])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &psrld		($t3,$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	 &psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
	 &pshufd	($t3,$t3,0b10000000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &psrldq	($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	 &movdqa	($t2,$t3);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &psrld		($t3,$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	 &psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pxor		($t3,$t2);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pxor		($t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	 #&pshufb	($t3,$t5);
	 &pshufd	($t3,$t3,0b00001000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&movdqa		($t2,16*2*$j."($Tbl)");
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	 &pslldq	($t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
    }
	&paddd		($t2,@X[0]);
	  foreach (@insns) { eval; }		# remaining instructions
	&movdqa		(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&SSSE3_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lssse3_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
$code.=<<___;
	mov	$_ctx,$ctx
	mov	$a1,$A

	add	$SZ*0($ctx),$A
	lea	16*$SZ($inp),$inp
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop_ssse3

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_ssse3:
	ret
.cfi_endproc
.size	${func}_ssse3,.-${func}_ssse3
___
}

if ($avx) {{
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };
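# With ror aliased to shrd, AUTOLOAD emits "shrd \$n,%reg,%reg", which computes
# the same rotation as "ror \$n,%reg" but, per the (**) footnote in the
# performance table above, accounts for a fair share of the AVX speedup on
# Sandy Bridge-class CPUs.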

$code.=<<___;
.type	${func}_avx,\@function,3
.align	64
${func}_avx:
.cfi_startproc
.Lavx_shortcut:
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx:

	vzeroupper
	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___
					if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

$code.=<<___;
	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	jmp	.Lloop_avx
.align	16
.Lloop_avx:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lavx_00_47

.align	16
.Lavx_00_47:
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
___
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	 '&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	 '&vpxor	($t2,$t2,$t3);',
	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	 '&vpxor	($t2,$t2,$t3)',
	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	 '&vpsrld	($t2,$t3,$sigma1[2])',
	 '&vpsrlq	($t3,$t3,$sigma1[0])',
	 '&vpxor	($t2,$t2,$t3);',
	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	 '&vpxor	($t2,$t2,$t3)',
	 '&vpshufb	($t2,$t2,$t5)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}

sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

					} else {	# SHA512
    my @X = map("%xmm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));

$code.=<<___;
	jmp	.Lloop_avx
.align	16
.Lloop_avx:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x50(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x60(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x70(%rsp)
	mov	$E,$a0
	jmp	.Lavx_00_47

.align	16
.Lavx_00_47:
	add	\$`16*2*$SZ`,$Tbl
___
sub Xupdate_512_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
	'&vpsrlq	($t2,$t0,$sigma0[0])',
	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
	'&vpsrlq	($t3,$t0,$sigma0[2])',
	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
	 '&vpxor	($t0,$t3,$t2)',
	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	 '&vpxor	($t0,$t0,$t1)',
	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	 '&vpxor	($t0,$t0,$t2)',
	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
	 '&vpxor	($t3,$t3,$t2)',
	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
	 '&vpxor	($t3,$t3,$t1)',
	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
	 '&vpxor	($t3,$t3,$t2)',
	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
	);
}

sub AVX_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 52 instructions

	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lavx_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
}
$code.=<<___;
	mov	$_ctx,$ctx
	mov	$a1,$A

	add	$SZ*0($ctx),$A
	lea	16*$SZ($inp),$inp
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop_avx

	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	${func}_avx,.-${func}_avx
___

}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6 save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
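
# For example, sha256op38("sha256msg1","%xmm3,%xmm4") should return
# ".byte\t15,56,204,227" (0x0f,0x38,0xcc and the ModR/M byte, joined in
# decimal), so the SHA instructions are emitted as raw bytes and assemblers
# without SHA extension support still accept the output; operands that don't
# match %xmm0..%xmm7 are passed through as text unchanged.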

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT";