#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just straight implementation... I really wonder why
# gcc [being armed with inline assembler] fails to generate as fast
# code. The only thing which is cool about this module is that it's
# the very same instruction sequence that is used for both SHA-256 and
# SHA-512. In the former case the instructions operate on 32-bit
# operands, in the latter on 64-bit ones. All I had to do was get one
# flavor right; the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times the CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], tends toward 4 instructions per CPU
# clock cycle and runs in 1003 cycles, then 1275 is a very good result
# for the 3-way issue Opteron pipeline with X[16] maintained in memory.
# So *if* there is a way to improve it, *then* the only way would be
# to offload the X[16] updates to the SSE unit, but that would require
# a "deeper" loop unroll, which in turn would naturally cause size
# blow-up, not to mention increased complexity! And once again, that
# is only *if* it's actually possible to noticeably improve the
# overall ILP, instruction-level parallelism, on the given CPU
# implementation.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a
# perfect performance ratio of 1.5 between the 64- and 32-bit flavors
# [see above], [currently available] EM64T CPUs are apparently far
# from it. On the contrary, the 64-bit version, sha512_block, is ~30%
# *slower* than the 32-bit sha256_block:-( This is presumably because
# 64-bit shifts/rotates are not atomic instructions there, but are
# implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, an alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% for SHA256 and
# unfortunately -2% for SHA512 on P4 [which nobody should care about
# that much].
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. An
# SSSE3 code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious
# exception is VIA Nano, but it has a SHA512 instruction that is
# faster and should be used instead.] For reference, the corresponding
# estimated upper limit for improvement for SSSE3 SHA256 is 28%. The
# fact that higher coefficients are observed on VIA Nano and Bulldozer
# has more to do with the specifics of their architecture [which is a
# topic for a separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second block in the
# most significant halves. The data is then processed with the same
# SIMD instruction sequence as for AVX, but with %ymm registers as
# operands. A side effect is an increased stack frame, 448 additional
# bytes for SHA256 and 1152 for SHA512, and a 1.2KB code size
# increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# P4		17.3	-	    -		    30.8    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen		11.0	9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -               17.5    -
# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
#
# (*)	whichever is best applicable, including SHAEXT;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions below a
#	certain limit makes no difference/sense; to conserve space the
#	SHA256 XOP code path is therefore omitted;
#
# Modified from upstream OpenSSL to remove the XOP code.

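# The per-block cycle counts quoted above convert to throughput exactly as in
# the 64*1000/1005 arithmetic: bytes per block divided by cycles per block,
# times the core clock in GHz. A minimal illustrative helper (our own naming,
# not used by the generator):
sub _mbps_per_ghz_example {
	my ($block_bytes, $cycles_per_block) = @_;	# e.g. (64,1005) or (128,1275)
	return $block_bytes*1000/$cycles_per_block;	# ~63.7 resp. ~100 MBps per GHz
}
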
114my ($flavour, $output) = @ARGV;
115
116if ($output =~ /sha512-x86_64/) {
117	$func="sha512_block_data_order";
118	$TABLE="K512";
119	$SZ=8;
120	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
121					"%r8", "%r9", "%r10","%r11");
122	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
123	@Sigma0=(28,34,39);
124	@Sigma1=(14,18,41);
125	@sigma0=(1,  8, 7);
126	@sigma1=(19,61, 6);
127	$rounds=80;
128} else {
129	$func="sha256_block_data_order";
130	$TABLE="K256";
131	$SZ=4;
132	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
133					"%r8d","%r9d","%r10d","%r11d");
134	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
135	@Sigma0=( 2,13,22);
136	@Sigma1=( 6,11,25);
137	@sigma0=( 7,18, 3);
138	@sigma1=(17,19,10);
139	$rounds=64;
140}
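
# The @Sigma*/@sigma* arrays above are the FIPS 180-4 rotate/shift amounts,
# e.g. SHA-256 Sigma0(x) = ROTR^2(x) ^ ROTR^13(x) ^ ROTR^22(x) and
# sigma0(x) = ROTR^7(x) ^ ROTR^18(x) ^ SHR^3(x); the SHA-512 branch lists the
# corresponding 64-bit amounts. A pure-Perl sketch of the 32-bit Sigma0, for
# illustration only (not used by the generator):
sub _ref_Sigma0_256 {
	my ($x) = @_;
	my $rotr = sub { my ($v,$n) = @_; (($v>>$n)|($v<<(32-$n))) & 0xffffffff };
	return $rotr->($x,2) ^ $rotr->($x,13) ^ $rotr->($x,22);
}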
141
142$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
143
144$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
145( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
146( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
147die "can't locate x86_64-xlate.pl";
148
149# In upstream, this is controlled by shelling out to the compiler to check
150# versions, but BoringSSL is intended to be used with pre-generated perlasm
151# output, so this isn't useful anyway.
152#
153# This file also has an AVX2 implementation, controlled by setting $avx to 2.
154# For now, we intentionally disable it. While it gives a 13-16% perf boost, the
155# CFI annotations are wrong. It allocates stack in a loop and should be
156# rewritten to avoid this.
157$avx = 1;
158$shaext = 1;
159
160open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
161*STDOUT=*OUT;
162
163$ctx="%rdi";	# 1st arg, zapped by $a3
164$inp="%rsi";	# 2nd arg
165$Tbl="%rbp";
166
167$_ctx="16*$SZ+0*8(%rsp)";
168$_inp="16*$SZ+1*8(%rsp)";
169$_end="16*$SZ+2*8(%rsp)";
170$_rsp="`16*$SZ+3*8`(%rsp)";
171$framesz="16*$SZ+4*8";
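
# Stack frame layout for the scalar code path: the low 16*$SZ bytes at (%rsp)
# hold the 16-entry circular message schedule X[], followed by the saved
# ctx/inp/end pointers and the caller's %rsp at the offsets defined above.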
172
173
174sub ROUND_00_15()
175{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
176  my $STRIDE=$SZ;
177     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
178
179$code.=<<___;
180	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
181	mov	$f,$a2
182
183	xor	$e,$a0
184	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
185	xor	$g,$a2			# f^g
186
187	mov	$T1,`$SZ*($i&0xf)`(%rsp)
188	xor	$a,$a1
189	and	$e,$a2			# (f^g)&e
190
191	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
192	add	$h,$T1			# T1+=h
193	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
194
195	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
196	xor	$e,$a0
197	add	$a2,$T1			# T1+=Ch(e,f,g)
198
199	mov	$a,$a2
200	add	($Tbl),$T1		# T1+=K[round]
201	xor	$a,$a1
202
203	xor	$b,$a2			# a^b, b^c in next round
204	ror	\$$Sigma1[0],$a0	# Sigma1(e)
205	mov	$b,$h
206
207	and	$a2,$a3
208	ror	\$$Sigma0[0],$a1	# Sigma0(a)
209	add	$a0,$T1			# T1+=Sigma1(e)
210
211	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
212	add	$T1,$d			# d+=T1
213	add	$T1,$h			# h+=T1
214
215	lea	$STRIDE($Tbl),$Tbl	# round++
216___
217$code.=<<___ if ($i<15);
218	add	$a1,$h			# h+=Sigma0(a)
219___
220	($a2,$a3) = ($a3,$a2);
221}
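
# For reference, each ROUND_00_15 invocation computes (shown for one round;
# illustration only, not used by the generator):
#	T1 = X[i] + h + K[i] + Sigma1(e) + Ch(e,f,g)
#	d += T1;  h = T1 + Sigma0(a) + Maj(a,b,c)
# where Maj(a,b,c) is realized as Ch(a^b,c,b) = ((a^b)&(b^c))^b, which is
# what the "a^b, b^c in next round" bookkeeping above implements:
sub _ref_maj_via_ch {
	my ($a,$b,$c) = @_;
	return (($a ^ $b) & ($b ^ $c)) ^ $b;	# == ($a&$b)^($a&$c)^($b&$c)
}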
222
223sub ROUND_16_XX()
224{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
225
226$code.=<<___;
227	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
228	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
229
230	mov	$a0,$T1
231	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
232	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
233	mov	$a2,$a1
234	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
235
236	xor	$T1,$a0
237	shr	\$$sigma0[2],$T1
238	ror	\$$sigma0[0],$a0
239	xor	$a1,$a2
240	shr	\$$sigma1[2],$a1
241
242	ror	\$$sigma1[0],$a2
243	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
244	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
245	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
246
247	add	`$SZ*($i&0xf)`(%rsp),$T1
248	mov	$e,$a0
249	add	$a2,$T1
250	mov	$a,$a1
251___
252	&ROUND_00_15(@_);
253}
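
# ROUND_16_XX extends the message schedule in the 16-entry circular buffer on
# the stack: in standard notation X[i] = sigma1(X[i-2]) + X[i-7] +
# sigma0(X[i-15]) + X[i-16], which corresponds to slots (i+14)&0xf, (i+9)&0xf,
# (i+1)&0xf and i&0xf above.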
254
255$code=<<___;
256.text
257
258.extern	OPENSSL_ia32cap_P
259.globl	$func
260.type	$func,\@function,3
261.align	16
262$func:
263.cfi_startproc
264	_CET_ENDBR
265___
266$code.=<<___ if ($SZ==4 || $avx);
267	leaq	OPENSSL_ia32cap_P(%rip),%r11
268	mov	0(%r11),%r9d
269	mov	4(%r11),%r10d
270	mov	8(%r11),%r11d
271___
272$code.=<<___ if ($SZ==4 && $shaext);
273	test	\$`1<<29`,%r11d		# check for SHA
274	jnz	.Lshaext_shortcut
275___
276    # XOP codepath removed.
277$code.=<<___ if ($avx>1);
278	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
279	cmp	\$`1<<8|1<<5|1<<3`,%r11d
280	je	.Lavx2_shortcut
281___
282$code.=<<___ if ($avx);
283	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
284	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
285	or	%r9d,%r10d
286	cmp	\$`1<<28|1<<9|1<<30`,%r10d
287	je	.Lavx_shortcut
288___
289$code.=<<___ if ($SZ==4);
290	test	\$`1<<9`,%r10d
291	jnz	.Lssse3_shortcut
292___
293$code.=<<___;
294	mov	%rsp,%rax		# copy %rsp
295.cfi_def_cfa_register	%rax
296	push	%rbx
297.cfi_push	%rbx
298	push	%rbp
299.cfi_push	%rbp
300	push	%r12
301.cfi_push	%r12
302	push	%r13
303.cfi_push	%r13
304	push	%r14
305.cfi_push	%r14
306	push	%r15
307.cfi_push	%r15
308	shl	\$4,%rdx		# num*16
309	sub	\$$framesz,%rsp
310	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
311	and	\$-64,%rsp		# align stack frame
312	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
314	mov	%rdx,$_end		# save end pointer, "3rd" arg
315	mov	%rax,$_rsp		# save copy of %rsp
316.cfi_cfa_expression	$_rsp,deref,+8
317.Lprologue:
318
319	mov	$SZ*0($ctx),$A
320	mov	$SZ*1($ctx),$B
321	mov	$SZ*2($ctx),$C
322	mov	$SZ*3($ctx),$D
323	mov	$SZ*4($ctx),$E
324	mov	$SZ*5($ctx),$F
325	mov	$SZ*6($ctx),$G
326	mov	$SZ*7($ctx),$H
327	jmp	.Lloop
328
329.align	16
330.Lloop:
331	mov	$B,$a3
332	lea	$TABLE(%rip),$Tbl
333	xor	$C,$a3			# magic
334___
335	for($i=0;$i<16;$i++) {
336		$code.="	mov	$SZ*$i($inp),$T1\n";
337		$code.="	mov	@ROT[4],$a0\n";
338		$code.="	mov	@ROT[0],$a1\n";
339		$code.="	bswap	$T1\n";
340		&ROUND_00_15($i,@ROT);
341		unshift(@ROT,pop(@ROT));
342	}
343$code.=<<___;
344	jmp	.Lrounds_16_xx
345.align	16
346.Lrounds_16_xx:
347___
348	for(;$i<32;$i++) {
349		&ROUND_16_XX($i,@ROT);
350		unshift(@ROT,pop(@ROT));
351	}
352
353$code.=<<___;
354	cmpb	\$0,`$SZ-1`($Tbl)
355	jnz	.Lrounds_16_xx
356
357	mov	$_ctx,$ctx
358	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
359	lea	16*$SZ($inp),$inp
360
361	add	$SZ*0($ctx),$A
362	add	$SZ*1($ctx),$B
363	add	$SZ*2($ctx),$C
364	add	$SZ*3($ctx),$D
365	add	$SZ*4($ctx),$E
366	add	$SZ*5($ctx),$F
367	add	$SZ*6($ctx),$G
368	add	$SZ*7($ctx),$H
369
370	cmp	$_end,$inp
371
372	mov	$A,$SZ*0($ctx)
373	mov	$B,$SZ*1($ctx)
374	mov	$C,$SZ*2($ctx)
375	mov	$D,$SZ*3($ctx)
376	mov	$E,$SZ*4($ctx)
377	mov	$F,$SZ*5($ctx)
378	mov	$G,$SZ*6($ctx)
379	mov	$H,$SZ*7($ctx)
380	jb	.Lloop
381
382	mov	$_rsp,%rsi
383.cfi_def_cfa	%rsi,8
384	mov	-48(%rsi),%r15
385.cfi_restore	%r15
386	mov	-40(%rsi),%r14
387.cfi_restore	%r14
388	mov	-32(%rsi),%r13
389.cfi_restore	%r13
390	mov	-24(%rsi),%r12
391.cfi_restore	%r12
392	mov	-16(%rsi),%rbp
393.cfi_restore	%rbp
394	mov	-8(%rsi),%rbx
395.cfi_restore	%rbx
396	lea	(%rsi),%rsp
397.cfi_def_cfa_register	%rsp
398.Lepilogue:
399	ret
400.cfi_endproc
401.size	$func,.-$func
402___
403
404if ($SZ==4) {
405$code.=<<___;
406.section .rodata
407.align	64
408.type	$TABLE,\@object
409$TABLE:
410	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
411	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
412	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
413	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
414	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
415	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
416	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
417	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
418	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
419	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
420	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
421	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
422	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
423	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
424	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
425	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
426	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
427	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
428	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
429	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
430	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
431	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
432	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
433	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
434	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
435	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
436	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
437	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
438	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
439	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
440	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
441	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
442
443	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
444	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
445	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
446	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
447	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
448	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
449	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
450.text
451___
452} else {
453$code.=<<___;
454.section .rodata
455.align	64
456.type	$TABLE,\@object
457$TABLE:
458	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
459	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
460	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
461	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
462	.quad	0x3956c25bf348b538,0x59f111f1b605d019
463	.quad	0x3956c25bf348b538,0x59f111f1b605d019
464	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
465	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
466	.quad	0xd807aa98a3030242,0x12835b0145706fbe
467	.quad	0xd807aa98a3030242,0x12835b0145706fbe
468	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
469	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
470	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
471	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
472	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
473	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
474	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
475	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
476	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
477	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
478	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
479	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
480	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
481	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
482	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
483	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
484	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
485	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
486	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
487	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
488	.quad	0x06ca6351e003826f,0x142929670a0e6e70
489	.quad	0x06ca6351e003826f,0x142929670a0e6e70
490	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
491	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
492	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
493	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
494	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
495	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
496	.quad	0x81c2c92e47edaee6,0x92722c851482353b
497	.quad	0x81c2c92e47edaee6,0x92722c851482353b
498	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
499	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
500	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
501	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
502	.quad	0xd192e819d6ef5218,0xd69906245565a910
503	.quad	0xd192e819d6ef5218,0xd69906245565a910
504	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
505	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
506	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
507	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
508	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
509	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
510	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
511	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
512	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
513	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
514	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
515	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
516	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
517	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
518	.quad	0x90befffa23631e28,0xa4506cebde82bde9
519	.quad	0x90befffa23631e28,0xa4506cebde82bde9
520	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
521	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
522	.quad	0xca273eceea26619c,0xd186b8c721c0c207
523	.quad	0xca273eceea26619c,0xd186b8c721c0c207
524	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
525	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
526	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
527	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
528	.quad	0x113f9804bef90dae,0x1b710b35131c471b
529	.quad	0x113f9804bef90dae,0x1b710b35131c471b
530	.quad	0x28db77f523047d84,0x32caab7b40c72493
531	.quad	0x28db77f523047d84,0x32caab7b40c72493
532	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
533	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
534	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
535	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
536	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
537	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
538
539	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
540	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
541	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
542.text
543___
544}
545
546######################################################################
547# SIMD code paths
548#
549if ($SZ==4 && $shaext) {{{
550######################################################################
551# Intel SHA Extensions implementation of SHA256 update function.
552#
553my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
554
555my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
556my @MSG=map("%xmm$_",(3..6));
557
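# A note on the flow below: each sha256rnds2 performs two rounds and takes its
# two message+constant words from the low dwords of %xmm0 ($Wi); the pshufd
# with immediate 0x0e between a pair of sha256rnds2 moves the upper half of
# $Wi down for the next two rounds, while sha256msg1/sha256msg2 carry most of
# the message-schedule update.
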
558$code.=<<___;
559.type	sha256_block_data_order_shaext,\@function,3
560.align	64
561sha256_block_data_order_shaext:
562.cfi_startproc
563.Lshaext_shortcut:
564___
565$code.=<<___ if ($win64);
566	lea	`-8-5*16`(%rsp),%rsp
567	movaps	%xmm6,-8-5*16(%rax)
568	movaps	%xmm7,-8-4*16(%rax)
569	movaps	%xmm8,-8-3*16(%rax)
570	movaps	%xmm9,-8-2*16(%rax)
571	movaps	%xmm10,-8-1*16(%rax)
572.Lprologue_shaext:
573___
574$code.=<<___;
575	lea		K256+0x80(%rip),$Tbl
576	movdqu		($ctx),$ABEF		# DCBA
577	movdqu		16($ctx),$CDGH		# HGFE
578	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
579
580	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
581	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
582	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
583	movdqa		$TMP,$BSWAP		# offload
584	palignr		\$8,$CDGH,$ABEF		# ABEF
585	punpcklqdq	$Wi,$CDGH		# CDGH
586	jmp		.Loop_shaext
587
588.align	16
589.Loop_shaext:
590	movdqu		($inp),@MSG[0]
591	movdqu		0x10($inp),@MSG[1]
592	movdqu		0x20($inp),@MSG[2]
593	pshufb		$TMP,@MSG[0]
594	movdqu		0x30($inp),@MSG[3]
595
596	movdqa		0*32-0x80($Tbl),$Wi
597	paddd		@MSG[0],$Wi
598	pshufb		$TMP,@MSG[1]
599	movdqa		$CDGH,$CDGH_SAVE	# offload
600	sha256rnds2	$ABEF,$CDGH		# 0-3
601	pshufd		\$0x0e,$Wi,$Wi
602	nop
603	movdqa		$ABEF,$ABEF_SAVE	# offload
604	sha256rnds2	$CDGH,$ABEF
605
606	movdqa		1*32-0x80($Tbl),$Wi
607	paddd		@MSG[1],$Wi
608	pshufb		$TMP,@MSG[2]
609	sha256rnds2	$ABEF,$CDGH		# 4-7
610	pshufd		\$0x0e,$Wi,$Wi
611	lea		0x40($inp),$inp
612	sha256msg1	@MSG[1],@MSG[0]
613	sha256rnds2	$CDGH,$ABEF
614
615	movdqa		2*32-0x80($Tbl),$Wi
616	paddd		@MSG[2],$Wi
617	pshufb		$TMP,@MSG[3]
618	sha256rnds2	$ABEF,$CDGH		# 8-11
619	pshufd		\$0x0e,$Wi,$Wi
620	movdqa		@MSG[3],$TMP
621	palignr		\$4,@MSG[2],$TMP
622	nop
623	paddd		$TMP,@MSG[0]
624	sha256msg1	@MSG[2],@MSG[1]
625	sha256rnds2	$CDGH,$ABEF
626
627	movdqa		3*32-0x80($Tbl),$Wi
628	paddd		@MSG[3],$Wi
629	sha256msg2	@MSG[3],@MSG[0]
630	sha256rnds2	$ABEF,$CDGH		# 12-15
631	pshufd		\$0x0e,$Wi,$Wi
632	movdqa		@MSG[0],$TMP
633	palignr		\$4,@MSG[3],$TMP
634	nop
635	paddd		$TMP,@MSG[1]
636	sha256msg1	@MSG[3],@MSG[2]
637	sha256rnds2	$CDGH,$ABEF
638___
639for($i=4;$i<16-3;$i++) {
640$code.=<<___;
641	movdqa		$i*32-0x80($Tbl),$Wi
642	paddd		@MSG[0],$Wi
643	sha256msg2	@MSG[0],@MSG[1]
644	sha256rnds2	$ABEF,$CDGH		# 16-19...
645	pshufd		\$0x0e,$Wi,$Wi
646	movdqa		@MSG[1],$TMP
647	palignr		\$4,@MSG[0],$TMP
648	nop
649	paddd		$TMP,@MSG[2]
650	sha256msg1	@MSG[0],@MSG[3]
651	sha256rnds2	$CDGH,$ABEF
652___
653	push(@MSG,shift(@MSG));
654}
655$code.=<<___;
656	movdqa		13*32-0x80($Tbl),$Wi
657	paddd		@MSG[0],$Wi
658	sha256msg2	@MSG[0],@MSG[1]
659	sha256rnds2	$ABEF,$CDGH		# 52-55
660	pshufd		\$0x0e,$Wi,$Wi
661	movdqa		@MSG[1],$TMP
662	palignr		\$4,@MSG[0],$TMP
663	sha256rnds2	$CDGH,$ABEF
664	paddd		$TMP,@MSG[2]
665
666	movdqa		14*32-0x80($Tbl),$Wi
667	paddd		@MSG[1],$Wi
668	sha256rnds2	$ABEF,$CDGH		# 56-59
669	pshufd		\$0x0e,$Wi,$Wi
670	sha256msg2	@MSG[1],@MSG[2]
671	movdqa		$BSWAP,$TMP
672	sha256rnds2	$CDGH,$ABEF
673
674	movdqa		15*32-0x80($Tbl),$Wi
675	paddd		@MSG[2],$Wi
676	nop
677	sha256rnds2	$ABEF,$CDGH		# 60-63
678	pshufd		\$0x0e,$Wi,$Wi
679	dec		$num
680	nop
681	sha256rnds2	$CDGH,$ABEF
682
683	paddd		$CDGH_SAVE,$CDGH
684	paddd		$ABEF_SAVE,$ABEF
685	jnz		.Loop_shaext
686
687	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
688	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
689	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
690	punpckhqdq	$CDGH,$ABEF		# DCBA
691	palignr		\$8,$TMP,$CDGH		# HGFE
692
693	movdqu	$ABEF,($ctx)
694	movdqu	$CDGH,16($ctx)
695___
696$code.=<<___ if ($win64);
697	movaps	-8-5*16(%rax),%xmm6
698	movaps	-8-4*16(%rax),%xmm7
699	movaps	-8-3*16(%rax),%xmm8
700	movaps	-8-2*16(%rax),%xmm9
701	movaps	-8-1*16(%rax),%xmm10
702	mov	%rax,%rsp
703.Lepilogue_shaext:
704___
705$code.=<<___;
706	ret
707.cfi_endproc
708.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
709___
710}}}
711{{{
712
713my $a4=$T1;
714my ($a,$b,$c,$d,$e,$f,$g,$h);
715
716sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
717{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
718  my $arg = pop;
719    $arg = "\$$arg" if ($arg*1 eq $arg);
720    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
721}
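
# For example, &ror($a0,$Sigma1[0]) in the listings below appends a line of
# the form "ror	$6,%r13d" to $code in the SHA-256 configuration; numeric
# arguments are detected by the test above and get the '$' immediate prefix.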
722
723sub body_00_15 () {
724	(
725	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
726
727	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
728	'&mov	($a,$a1)',
729	'&mov	($a4,$f)',
730
731	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
732	'&xor	($a0,$e)',
733	'&xor	($a4,$g)',			# f^g
734
735	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
736	'&xor	($a1,$a)',
737	'&and	($a4,$e)',			# (f^g)&e
738
739	'&xor	($a0,$e)',
740	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
741	'&mov	($a2,$a)',
742
743	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
744	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
745	'&xor	($a2,$b)',			# a^b, b^c in next round
746
747	'&add	($h,$a4)',			# h+=Ch(e,f,g)
748	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
749	'&and	($a3,$a2)',			# (b^c)&(a^b)
750
751	'&xor	($a1,$a)',
752	'&add	($h,$a0)',			# h+=Sigma1(e)
753	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
754
755	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
756	'&add	($d,$h)',			# d+=h
757	'&add	($h,$a3)',			# h+=Maj(a,b,c)
758
759	'&mov	($a0,$d)',
760	'&add	($a1,$h);'.			# h+=Sigma0(a)
761	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
762	);
763}
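
# body_00_15() returns a list of 26 strings, each eval'ed to emit roughly one
# integer instruction through AUTOLOAD above. The SIMD code paths below
# interleave these with their own message-schedule instructions (four bodies
# per 16-byte X[] update in the SHA-256 paths, hence the "104 instructions"
# counts) so that integer and vector work overlap.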
764
765######################################################################
766# SSSE3 code path
767#
768if ($SZ==4) {	# SHA256 only
769my @X = map("%xmm$_",(0..3));
770my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
771
772$code.=<<___;
773.type	${func}_ssse3,\@function,3
774.align	64
775${func}_ssse3:
776.cfi_startproc
777.Lssse3_shortcut:
778	mov	%rsp,%rax		# copy %rsp
779.cfi_def_cfa_register	%rax
780	push	%rbx
781.cfi_push	%rbx
782	push	%rbp
783.cfi_push	%rbp
784	push	%r12
785.cfi_push	%r12
786	push	%r13
787.cfi_push	%r13
788	push	%r14
789.cfi_push	%r14
790	push	%r15
791.cfi_push	%r15
792	shl	\$4,%rdx		# num*16
793	sub	\$`$framesz+$win64*16*4`,%rsp
794	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
795	and	\$-64,%rsp		# align stack frame
796	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
798	mov	%rdx,$_end		# save end pointer, "3rd" arg
799	mov	%rax,$_rsp		# save copy of %rsp
800.cfi_cfa_expression	$_rsp,deref,+8
801___
802$code.=<<___ if ($win64);
803	movaps	%xmm6,16*$SZ+32(%rsp)
804	movaps	%xmm7,16*$SZ+48(%rsp)
805	movaps	%xmm8,16*$SZ+64(%rsp)
806	movaps	%xmm9,16*$SZ+80(%rsp)
807___
808$code.=<<___;
809.Lprologue_ssse3:
810
811	mov	$SZ*0($ctx),$A
812	mov	$SZ*1($ctx),$B
813	mov	$SZ*2($ctx),$C
814	mov	$SZ*3($ctx),$D
815	mov	$SZ*4($ctx),$E
816	mov	$SZ*5($ctx),$F
817	mov	$SZ*6($ctx),$G
818	mov	$SZ*7($ctx),$H
819___
820
821$code.=<<___;
822	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
823	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
824	jmp	.Lloop_ssse3
825.align	16
826.Lloop_ssse3:
827	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
828	movdqu	0x00($inp),@X[0]
829	movdqu	0x10($inp),@X[1]
830	movdqu	0x20($inp),@X[2]
831	pshufb	$t3,@X[0]
832	movdqu	0x30($inp),@X[3]
833	lea	$TABLE(%rip),$Tbl
834	pshufb	$t3,@X[1]
835	movdqa	0x00($Tbl),$t0
836	movdqa	0x20($Tbl),$t1
837	pshufb	$t3,@X[2]
838	paddd	@X[0],$t0
839	movdqa	0x40($Tbl),$t2
840	pshufb	$t3,@X[3]
841	movdqa	0x60($Tbl),$t3
842	paddd	@X[1],$t1
843	paddd	@X[2],$t2
844	paddd	@X[3],$t3
845	movdqa	$t0,0x00(%rsp)
846	mov	$A,$a1
847	movdqa	$t1,0x10(%rsp)
848	mov	$B,$a3
849	movdqa	$t2,0x20(%rsp)
850	xor	$C,$a3			# magic
851	movdqa	$t3,0x30(%rsp)
852	mov	$E,$a0
853	jmp	.Lssse3_00_47
854
855.align	16
856.Lssse3_00_47:
857	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
858___
859sub Xupdate_256_SSSE3 () {
860	(
861	'&movdqa	($t0,@X[1]);',
862	'&movdqa	($t3,@X[3])',
863	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
864	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
865	'&movdqa	($t1,$t0)',
866	'&movdqa	($t2,$t0);',
867	'&psrld		($t0,$sigma0[2])',
868	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
869	'&psrld		($t2,$sigma0[0])',
870	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
871	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
872	'&pxor		($t0,$t2)',
873	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
874	'&pxor		($t0,$t1)',
875	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
876	'&pxor		($t0,$t2);',
877	 '&movdqa	($t2,$t3)',
878	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
879	 '&psrld	($t3,$sigma1[2])',
880	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
881	 '&psrlq	($t2,$sigma1[0])',
882	 '&pxor		($t3,$t2);',
883	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
884	 '&pxor		($t3,$t2)',
885	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
886	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
887	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
888	 '&movdqa	($t2,$t3);',
889	 '&psrld	($t3,$sigma1[2])',
890	 '&psrlq	($t2,$sigma1[0])',
891	 '&pxor		($t3,$t2);',
892	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
893	 '&pxor		($t3,$t2);',
894	'&movdqa	($t2,16*2*$j."($Tbl)")',
895	 '&pshufb	($t3,$t5)',
896	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
897	);
898}
899
900sub SSSE3_256_00_47 () {
901my $j = shift;
902my $body = shift;
903my @X = @_;
904my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
905
906    if (0) {
907	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
908	    eval;
909	    eval(shift(@insns));
910	    eval(shift(@insns));
911	    eval(shift(@insns));
912	}
913    } else {			# squeeze extra 4% on Westmere and 19% on Atom
914	  eval(shift(@insns));	#@
915	&movdqa		($t0,@X[1]);
916	  eval(shift(@insns));
917	  eval(shift(@insns));
918	&movdqa		($t3,@X[3]);
919	  eval(shift(@insns));	#@
920	  eval(shift(@insns));
921	  eval(shift(@insns));
922	  eval(shift(@insns));	#@
923	  eval(shift(@insns));
924	&palignr	($t0,@X[0],$SZ);	# X[1..4]
925	  eval(shift(@insns));
926	  eval(shift(@insns));
927	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
928	  eval(shift(@insns));
929	  eval(shift(@insns));
930	  eval(shift(@insns));
931	  eval(shift(@insns));	#@
932	&movdqa		($t1,$t0);
933	  eval(shift(@insns));
934	  eval(shift(@insns));
935	&movdqa		($t2,$t0);
936	  eval(shift(@insns));	#@
937	  eval(shift(@insns));
938	&psrld		($t0,$sigma0[2]);
939	  eval(shift(@insns));
940	  eval(shift(@insns));
941	  eval(shift(@insns));
942	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
943	  eval(shift(@insns));	#@
944	  eval(shift(@insns));
945	&psrld		($t2,$sigma0[0]);
946	  eval(shift(@insns));
947	  eval(shift(@insns));
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
949	  eval(shift(@insns));
950	  eval(shift(@insns));	#@
951	&pslld		($t1,8*$SZ-$sigma0[1]);
952	  eval(shift(@insns));
953	  eval(shift(@insns));
954	&pxor		($t0,$t2);
955	  eval(shift(@insns));	#@
956	  eval(shift(@insns));
957	  eval(shift(@insns));
958	  eval(shift(@insns));	#@
959	&psrld		($t2,$sigma0[1]-$sigma0[0]);
960	  eval(shift(@insns));
961	&pxor		($t0,$t1);
962	  eval(shift(@insns));
963	  eval(shift(@insns));
964	&pslld		($t1,$sigma0[1]-$sigma0[0]);
965	  eval(shift(@insns));
966	  eval(shift(@insns));
967	&pxor		($t0,$t2);
968	  eval(shift(@insns));
969	  eval(shift(@insns));	#@
970	 &movdqa	($t2,$t3);
971	  eval(shift(@insns));
972	  eval(shift(@insns));
973	&pxor		($t0,$t1);		# sigma0(X[1..4])
974	  eval(shift(@insns));	#@
975	  eval(shift(@insns));
976	  eval(shift(@insns));
977	 &psrld		($t3,$sigma1[2]);
978	  eval(shift(@insns));
979	  eval(shift(@insns));
980	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
981	  eval(shift(@insns));	#@
982	  eval(shift(@insns));
983	 &psrlq		($t2,$sigma1[0]);
984	  eval(shift(@insns));
985	  eval(shift(@insns));
986	  eval(shift(@insns));
987	 &pxor		($t3,$t2);
988	  eval(shift(@insns));	#@
989	  eval(shift(@insns));
990	  eval(shift(@insns));
991	  eval(shift(@insns));	#@
992	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
993	  eval(shift(@insns));
994	  eval(shift(@insns));
995	 &pxor		($t3,$t2);
996	  eval(shift(@insns));	#@
997	  eval(shift(@insns));
998	  eval(shift(@insns));
999	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
1000	 &pshufd	($t3,$t3,0b10000000);
1001	  eval(shift(@insns));
1002	  eval(shift(@insns));
1003	  eval(shift(@insns));
1004	 &psrldq	($t3,8);
1005	  eval(shift(@insns));
1006	  eval(shift(@insns));	#@
1007	  eval(shift(@insns));
1008	  eval(shift(@insns));
1009	  eval(shift(@insns));	#@
1010	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
1011	  eval(shift(@insns));
1012	  eval(shift(@insns));
1013	  eval(shift(@insns));
1014	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
1015	  eval(shift(@insns));
1016	  eval(shift(@insns));	#@
1017	  eval(shift(@insns));
1018	 &movdqa	($t2,$t3);
1019	  eval(shift(@insns));
1020	  eval(shift(@insns));
1021	 &psrld		($t3,$sigma1[2]);
1022	  eval(shift(@insns));
1023	  eval(shift(@insns));	#@
1024	 &psrlq		($t2,$sigma1[0]);
1025	  eval(shift(@insns));
1026	  eval(shift(@insns));
1027	 &pxor		($t3,$t2);
1028	  eval(shift(@insns));	#@
1029	  eval(shift(@insns));
1030	  eval(shift(@insns));
1031	  eval(shift(@insns));	#@
1032	  eval(shift(@insns));
1033	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1034	  eval(shift(@insns));
1035	  eval(shift(@insns));
1036	  eval(shift(@insns));
1037	 &pxor		($t3,$t2);
1038	  eval(shift(@insns));
1039	  eval(shift(@insns));
1040	  eval(shift(@insns));	#@
1041	 #&pshufb	($t3,$t5);
1042	 &pshufd	($t3,$t3,0b00001000);
1043	  eval(shift(@insns));
1044	  eval(shift(@insns));
1045	&movdqa		($t2,16*2*$j."($Tbl)");
1046	  eval(shift(@insns));	#@
1047	  eval(shift(@insns));
1048	 &pslldq	($t3,8);
1049	  eval(shift(@insns));
1050	  eval(shift(@insns));
1051	  eval(shift(@insns));
1052	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1053	  eval(shift(@insns));	#@
1054	  eval(shift(@insns));
1055	  eval(shift(@insns));
1056    }
1057	&paddd		($t2,@X[0]);
1058	  foreach (@insns) { eval; }		# remaining instructions
1059	&movdqa		(16*$j."(%rsp)",$t2);
1060}
1061
1062    for ($i=0,$j=0; $j<4; $j++) {
1063	&SSSE3_256_00_47($j,\&body_00_15,@X);
1064	push(@X,shift(@X));			# rotate(@X)
1065    }
1066	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1067	&jne	(".Lssse3_00_47");
1068
1069    for ($i=0; $i<16; ) {
1070	foreach(body_00_15()) { eval; }
1071    }
1072$code.=<<___;
1073	mov	$_ctx,$ctx
1074	mov	$a1,$A
1075
1076	add	$SZ*0($ctx),$A
1077	lea	16*$SZ($inp),$inp
1078	add	$SZ*1($ctx),$B
1079	add	$SZ*2($ctx),$C
1080	add	$SZ*3($ctx),$D
1081	add	$SZ*4($ctx),$E
1082	add	$SZ*5($ctx),$F
1083	add	$SZ*6($ctx),$G
1084	add	$SZ*7($ctx),$H
1085
1086	cmp	$_end,$inp
1087
1088	mov	$A,$SZ*0($ctx)
1089	mov	$B,$SZ*1($ctx)
1090	mov	$C,$SZ*2($ctx)
1091	mov	$D,$SZ*3($ctx)
1092	mov	$E,$SZ*4($ctx)
1093	mov	$F,$SZ*5($ctx)
1094	mov	$G,$SZ*6($ctx)
1095	mov	$H,$SZ*7($ctx)
1096	jb	.Lloop_ssse3
1097
1098	mov	$_rsp,%rsi
1099.cfi_def_cfa	%rsi,8
1100___
1101$code.=<<___ if ($win64);
1102	movaps	16*$SZ+32(%rsp),%xmm6
1103	movaps	16*$SZ+48(%rsp),%xmm7
1104	movaps	16*$SZ+64(%rsp),%xmm8
1105	movaps	16*$SZ+80(%rsp),%xmm9
1106___
1107$code.=<<___;
1108	mov	-48(%rsi),%r15
1109.cfi_restore	%r15
1110	mov	-40(%rsi),%r14
1111.cfi_restore	%r14
1112	mov	-32(%rsi),%r13
1113.cfi_restore	%r13
1114	mov	-24(%rsi),%r12
1115.cfi_restore	%r12
1116	mov	-16(%rsi),%rbp
1117.cfi_restore	%rbp
1118	mov	-8(%rsi),%rbx
1119.cfi_restore	%rbx
1120	lea	(%rsi),%rsp
1121.cfi_def_cfa_register	%rsp
1122.Lepilogue_ssse3:
1123	ret
1124.cfi_endproc
1125.size	${func}_ssse3,.-${func}_ssse3
1126___
1127}
1128
1129if ($avx) {{
1130######################################################################
1131# AVX+shrd code path
1132#
1133local *ror = sub { &shrd(@_[0],@_) };
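
# With this alias, &ror($reg,$n) emits "shrd $n,$reg,$reg", which performs the
# same rotation; per footnote (**) at the top, the switch from ror to shrd
# accounts for a fair share of the AVX-path improvement on Sandy Bridge.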
1134
1135$code.=<<___;
1136.type	${func}_avx,\@function,3
1137.align	64
1138${func}_avx:
1139.cfi_startproc
1140.Lavx_shortcut:
1141	mov	%rsp,%rax		# copy %rsp
1142.cfi_def_cfa_register	%rax
1143	push	%rbx
1144.cfi_push	%rbx
1145	push	%rbp
1146.cfi_push	%rbp
1147	push	%r12
1148.cfi_push	%r12
1149	push	%r13
1150.cfi_push	%r13
1151	push	%r14
1152.cfi_push	%r14
1153	push	%r15
1154.cfi_push	%r15
1155	shl	\$4,%rdx		# num*16
1156	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1157	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1158	and	\$-64,%rsp		# align stack frame
1159	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
1161	mov	%rdx,$_end		# save end pointer, "3rd" arg
1162	mov	%rax,$_rsp		# save copy of %rsp
1163.cfi_cfa_expression	$_rsp,deref,+8
1164___
1165$code.=<<___ if ($win64);
1166	movaps	%xmm6,16*$SZ+32(%rsp)
1167	movaps	%xmm7,16*$SZ+48(%rsp)
1168	movaps	%xmm8,16*$SZ+64(%rsp)
1169	movaps	%xmm9,16*$SZ+80(%rsp)
1170___
1171$code.=<<___ if ($win64 && $SZ>4);
1172	movaps	%xmm10,16*$SZ+96(%rsp)
1173	movaps	%xmm11,16*$SZ+112(%rsp)
1174___
1175$code.=<<___;
1176.Lprologue_avx:
1177
1178	vzeroupper
1179	mov	$SZ*0($ctx),$A
1180	mov	$SZ*1($ctx),$B
1181	mov	$SZ*2($ctx),$C
1182	mov	$SZ*3($ctx),$D
1183	mov	$SZ*4($ctx),$E
1184	mov	$SZ*5($ctx),$F
1185	mov	$SZ*6($ctx),$G
1186	mov	$SZ*7($ctx),$H
1187___
1188					if ($SZ==4) {	# SHA256
1189    my @X = map("%xmm$_",(0..3));
1190    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1191
1192$code.=<<___;
1193	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1194	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1195	jmp	.Lloop_avx
1196.align	16
1197.Lloop_avx:
1198	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1199	vmovdqu	0x00($inp),@X[0]
1200	vmovdqu	0x10($inp),@X[1]
1201	vmovdqu	0x20($inp),@X[2]
1202	vmovdqu	0x30($inp),@X[3]
1203	vpshufb	$t3,@X[0],@X[0]
1204	lea	$TABLE(%rip),$Tbl
1205	vpshufb	$t3,@X[1],@X[1]
1206	vpshufb	$t3,@X[2],@X[2]
1207	vpaddd	0x00($Tbl),@X[0],$t0
1208	vpshufb	$t3,@X[3],@X[3]
1209	vpaddd	0x20($Tbl),@X[1],$t1
1210	vpaddd	0x40($Tbl),@X[2],$t2
1211	vpaddd	0x60($Tbl),@X[3],$t3
1212	vmovdqa	$t0,0x00(%rsp)
1213	mov	$A,$a1
1214	vmovdqa	$t1,0x10(%rsp)
1215	mov	$B,$a3
1216	vmovdqa	$t2,0x20(%rsp)
1217	xor	$C,$a3			# magic
1218	vmovdqa	$t3,0x30(%rsp)
1219	mov	$E,$a0
1220	jmp	.Lavx_00_47
1221
1222.align	16
1223.Lavx_00_47:
1224	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1225___
1226sub Xupdate_256_AVX () {
1227	(
1228	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1229	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1230	'&vpsrld	($t2,$t0,$sigma0[0]);',
1231	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1232	'&vpsrld	($t3,$t0,$sigma0[2])',
1233	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1234	'&vpxor		($t0,$t3,$t2)',
1235	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1236	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1237	'&vpxor		($t0,$t0,$t1)',
1238	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1239	'&vpxor		($t0,$t0,$t2)',
1240	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1241	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1242	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1243	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1244	 '&vpxor	($t2,$t2,$t3);',
1245	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1246	 '&vpxor	($t2,$t2,$t3)',
1247	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1248	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1249	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1250	 '&vpsrld	($t2,$t3,$sigma1[2])',
1251	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1252	 '&vpxor	($t2,$t2,$t3);',
1253	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1254	 '&vpxor	($t2,$t2,$t3)',
1255	 '&vpshufb	($t2,$t2,$t5)',
1256	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1257	);
1258}
1259
1260sub AVX_256_00_47 () {
1261my $j = shift;
1262my $body = shift;
1263my @X = @_;
1264my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1265
1266	foreach (Xupdate_256_AVX()) {		# 29 instructions
1267	    eval;
1268	    eval(shift(@insns));
1269	    eval(shift(@insns));
1270	    eval(shift(@insns));
1271	}
1272	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1273	  foreach (@insns) { eval; }		# remaining instructions
1274	&vmovdqa	(16*$j."(%rsp)",$t2);
1275}
1276
1277    for ($i=0,$j=0; $j<4; $j++) {
1278	&AVX_256_00_47($j,\&body_00_15,@X);
1279	push(@X,shift(@X));			# rotate(@X)
1280    }
1281	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1282	&jne	(".Lavx_00_47");
1283
1284    for ($i=0; $i<16; ) {
1285	foreach(body_00_15()) { eval; }
1286    }
1287
1288					} else {	# SHA512
1289    my @X = map("%xmm$_",(0..7));
1290    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1291
1292$code.=<<___;
1293	jmp	.Lloop_avx
1294.align	16
1295.Lloop_avx:
1296	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1297	vmovdqu	0x00($inp),@X[0]
1298	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1299	vmovdqu	0x10($inp),@X[1]
1300	vmovdqu	0x20($inp),@X[2]
1301	vpshufb	$t3,@X[0],@X[0]
1302	vmovdqu	0x30($inp),@X[3]
1303	vpshufb	$t3,@X[1],@X[1]
1304	vmovdqu	0x40($inp),@X[4]
1305	vpshufb	$t3,@X[2],@X[2]
1306	vmovdqu	0x50($inp),@X[5]
1307	vpshufb	$t3,@X[3],@X[3]
1308	vmovdqu	0x60($inp),@X[6]
1309	vpshufb	$t3,@X[4],@X[4]
1310	vmovdqu	0x70($inp),@X[7]
1311	vpshufb	$t3,@X[5],@X[5]
1312	vpaddq	-0x80($Tbl),@X[0],$t0
1313	vpshufb	$t3,@X[6],@X[6]
1314	vpaddq	-0x60($Tbl),@X[1],$t1
1315	vpshufb	$t3,@X[7],@X[7]
1316	vpaddq	-0x40($Tbl),@X[2],$t2
1317	vpaddq	-0x20($Tbl),@X[3],$t3
1318	vmovdqa	$t0,0x00(%rsp)
1319	vpaddq	0x00($Tbl),@X[4],$t0
1320	vmovdqa	$t1,0x10(%rsp)
1321	vpaddq	0x20($Tbl),@X[5],$t1
1322	vmovdqa	$t2,0x20(%rsp)
1323	vpaddq	0x40($Tbl),@X[6],$t2
1324	vmovdqa	$t3,0x30(%rsp)
1325	vpaddq	0x60($Tbl),@X[7],$t3
1326	vmovdqa	$t0,0x40(%rsp)
1327	mov	$A,$a1
1328	vmovdqa	$t1,0x50(%rsp)
1329	mov	$B,$a3
1330	vmovdqa	$t2,0x60(%rsp)
1331	xor	$C,$a3			# magic
1332	vmovdqa	$t3,0x70(%rsp)
1333	mov	$E,$a0
1334	jmp	.Lavx_00_47
1335
1336.align	16
1337.Lavx_00_47:
1338	add	\$`16*2*$SZ`,$Tbl
1339___
1340sub Xupdate_512_AVX () {
1341	(
1342	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1343	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1344	'&vpsrlq	($t2,$t0,$sigma0[0])',
1345	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1346	'&vpsrlq	($t3,$t0,$sigma0[2])',
1347	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1348	 '&vpxor	($t0,$t3,$t2)',
1349	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1350	 '&vpxor	($t0,$t0,$t1)',
1351	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1352	 '&vpxor	($t0,$t0,$t2)',
1353	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1354	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1355	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1356	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1357	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1358	 '&vpxor	($t3,$t3,$t2)',
1359	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1360	 '&vpxor	($t3,$t3,$t1)',
1361	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1362	 '&vpxor	($t3,$t3,$t2)',
1363	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1364	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1365	);
1366}
1367
1368sub AVX_512_00_47 () {
1369my $j = shift;
1370my $body = shift;
1371my @X = @_;
1372my @insns = (&$body,&$body);			# 52 instructions
1373
1374	foreach (Xupdate_512_AVX()) {		# 23 instructions
1375	    eval;
1376	    eval(shift(@insns));
1377	    eval(shift(@insns));
1378	}
1379	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1380	  foreach (@insns) { eval; }		# remaining instructions
1381	&vmovdqa	(16*$j."(%rsp)",$t2);
1382}
1383
1384    for ($i=0,$j=0; $j<8; $j++) {
1385	&AVX_512_00_47($j,\&body_00_15,@X);
1386	push(@X,shift(@X));			# rotate(@X)
1387    }
1388	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1389	&jne	(".Lavx_00_47");
1390
1391    for ($i=0; $i<16; ) {
1392	foreach(body_00_15()) { eval; }
1393    }
1394}
1395$code.=<<___;
1396	mov	$_ctx,$ctx
1397	mov	$a1,$A
1398
1399	add	$SZ*0($ctx),$A
1400	lea	16*$SZ($inp),$inp
1401	add	$SZ*1($ctx),$B
1402	add	$SZ*2($ctx),$C
1403	add	$SZ*3($ctx),$D
1404	add	$SZ*4($ctx),$E
1405	add	$SZ*5($ctx),$F
1406	add	$SZ*6($ctx),$G
1407	add	$SZ*7($ctx),$H
1408
1409	cmp	$_end,$inp
1410
1411	mov	$A,$SZ*0($ctx)
1412	mov	$B,$SZ*1($ctx)
1413	mov	$C,$SZ*2($ctx)
1414	mov	$D,$SZ*3($ctx)
1415	mov	$E,$SZ*4($ctx)
1416	mov	$F,$SZ*5($ctx)
1417	mov	$G,$SZ*6($ctx)
1418	mov	$H,$SZ*7($ctx)
1419	jb	.Lloop_avx
1420
1421	mov	$_rsp,%rsi
1422.cfi_def_cfa	%rsi,8
1423	vzeroupper
1424___
1425$code.=<<___ if ($win64);
1426	movaps	16*$SZ+32(%rsp),%xmm6
1427	movaps	16*$SZ+48(%rsp),%xmm7
1428	movaps	16*$SZ+64(%rsp),%xmm8
1429	movaps	16*$SZ+80(%rsp),%xmm9
1430___
1431$code.=<<___ if ($win64 && $SZ>4);
1432	movaps	16*$SZ+96(%rsp),%xmm10
1433	movaps	16*$SZ+112(%rsp),%xmm11
1434___
1435$code.=<<___;
1436	mov	-48(%rsi),%r15
1437.cfi_restore	%r15
1438	mov	-40(%rsi),%r14
1439.cfi_restore	%r14
1440	mov	-32(%rsi),%r13
1441.cfi_restore	%r13
1442	mov	-24(%rsi),%r12
1443.cfi_restore	%r12
1444	mov	-16(%rsi),%rbp
1445.cfi_restore	%rbp
1446	mov	-8(%rsi),%rbx
1447.cfi_restore	%rbx
1448	lea	(%rsi),%rsp
1449.cfi_def_cfa_register	%rsp
1450.Lepilogue_avx:
1451	ret
1452.cfi_endproc
1453.size	${func}_avx,.-${func}_avx
1454___
1455
1456}}}}}
1457
1458# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1459#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1460if ($win64) {
1461$rec="%rcx";
1462$frame="%rdx";
1463$context="%r8";
1464$disp="%r9";
1465
1466$code.=<<___;
1467.extern	__imp_RtlVirtualUnwind
1468.type	se_handler,\@abi-omnipotent
1469.align	16
1470se_handler:
1471	push	%rsi
1472	push	%rdi
1473	push	%rbx
1474	push	%rbp
1475	push	%r12
1476	push	%r13
1477	push	%r14
1478	push	%r15
1479	pushfq
1480	sub	\$64,%rsp
1481
1482	mov	120($context),%rax	# pull context->Rax
1483	mov	248($context),%rbx	# pull context->Rip
1484
1485	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
1487
1488	mov	0(%r11),%r10d		# HandlerData[0]
1489	lea	(%rsi,%r10),%r10	# prologue label
1490	cmp	%r10,%rbx		# context->Rip<prologue label
1491	jb	.Lin_prologue
1492
1493	mov	152($context),%rax	# pull context->Rsp
1494
1495	mov	4(%r11),%r10d		# HandlerData[1]
1496	lea	(%rsi,%r10),%r10	# epilogue label
1497	cmp	%r10,%rbx		# context->Rip>=epilogue label
1498	jae	.Lin_prologue
1499___
1500$code.=<<___;
1501	mov	%rax,%rsi		# put aside Rsp
1502	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
1503
1504	mov	-8(%rax),%rbx
1505	mov	-16(%rax),%rbp
1506	mov	-24(%rax),%r12
1507	mov	-32(%rax),%r13
1508	mov	-40(%rax),%r14
1509	mov	-48(%rax),%r15
1510	mov	%rbx,144($context)	# restore context->Rbx
1511	mov	%rbp,160($context)	# restore context->Rbp
1512	mov	%r12,216($context)	# restore context->R12
1513	mov	%r13,224($context)	# restore context->R13
1514	mov	%r14,232($context)	# restore context->R14
1515	mov	%r15,240($context)	# restore context->R15
1516
1517	lea	.Lepilogue(%rip),%r10
1518	cmp	%r10,%rbx
1519	jb	.Lin_prologue		# non-AVX code
1520
1521	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
1522	lea	512($context),%rdi	# &context.Xmm6
1523	mov	\$`$SZ==4?8:12`,%ecx
1524	.long	0xa548f3fc		# cld; rep movsq
1525
1526.Lin_prologue:
1527	mov	8(%rax),%rdi
1528	mov	16(%rax),%rsi
1529	mov	%rax,152($context)	# restore context->Rsp
1530	mov	%rsi,168($context)	# restore context->Rsi
1531	mov	%rdi,176($context)	# restore context->Rdi
1532
1533	mov	40($disp),%rdi		# disp->ContextRecord
1534	mov	$context,%rsi		# context
1535	mov	\$154,%ecx		# sizeof(CONTEXT)
1536	.long	0xa548f3fc		# cld; rep movsq
1537
1538	mov	$disp,%rsi
1539	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1540	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1541	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1542	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1543	mov	40(%rsi),%r10		# disp->ContextRecord
1544	lea	56(%rsi),%r11		# &disp->HandlerData
1545	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1546	mov	%r10,32(%rsp)		# arg5
1547	mov	%r11,40(%rsp)		# arg6
1548	mov	%r12,48(%rsp)		# arg7
1549	mov	%rcx,56(%rsp)		# arg8, (NULL)
1550	call	*__imp_RtlVirtualUnwind(%rip)
1551
1552	mov	\$1,%eax		# ExceptionContinueSearch
1553	add	\$64,%rsp
1554	popfq
1555	pop	%r15
1556	pop	%r14
1557	pop	%r13
1558	pop	%r12
1559	pop	%rbp
1560	pop	%rbx
1561	pop	%rdi
1562	pop	%rsi
1563	ret
1564.size	se_handler,.-se_handler
1565___
1566
1567$code.=<<___ if ($SZ==4 && $shaext);
1568.type	shaext_handler,\@abi-omnipotent
1569.align	16
1570shaext_handler:
1571	push	%rsi
1572	push	%rdi
1573	push	%rbx
1574	push	%rbp
1575	push	%r12
1576	push	%r13
1577	push	%r14
1578	push	%r15
1579	pushfq
1580	sub	\$64,%rsp
1581
1582	mov	120($context),%rax	# pull context->Rax
1583	mov	248($context),%rbx	# pull context->Rip
1584
1585	lea	.Lprologue_shaext(%rip),%r10
1586	cmp	%r10,%rbx		# context->Rip<.Lprologue
1587	jb	.Lin_prologue
1588
1589	lea	.Lepilogue_shaext(%rip),%r10
1590	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1591	jae	.Lin_prologue
1592
1593	lea	-8-5*16(%rax),%rsi
1594	lea	512($context),%rdi	# &context.Xmm6
1595	mov	\$10,%ecx
1596	.long	0xa548f3fc		# cld; rep movsq
1597
1598	jmp	.Lin_prologue
1599.size	shaext_handler,.-shaext_handler
1600___
1601
1602$code.=<<___;
1603.section	.pdata
1604.align	4
1605	.rva	.LSEH_begin_$func
1606	.rva	.LSEH_end_$func
1607	.rva	.LSEH_info_$func
1608___
1609$code.=<<___ if ($SZ==4 && $shaext);
1610	.rva	.LSEH_begin_${func}_shaext
1611	.rva	.LSEH_end_${func}_shaext
1612	.rva	.LSEH_info_${func}_shaext
1613___
1614$code.=<<___ if ($SZ==4);
1615	.rva	.LSEH_begin_${func}_ssse3
1616	.rva	.LSEH_end_${func}_ssse3
1617	.rva	.LSEH_info_${func}_ssse3
1618___
1619$code.=<<___ if ($avx);
1620	.rva	.LSEH_begin_${func}_avx
1621	.rva	.LSEH_end_${func}_avx
1622	.rva	.LSEH_info_${func}_avx
1623___
1624$code.=<<___;
1625.section	.xdata
1626.align	8
1627.LSEH_info_$func:
1628	.byte	9,0,0,0
1629	.rva	se_handler
1630	.rva	.Lprologue,.Lepilogue			# HandlerData[]
1631___
1632$code.=<<___ if ($SZ==4 && $shaext);
1633.LSEH_info_${func}_shaext:
1634	.byte	9,0,0,0
1635	.rva	shaext_handler
1636___
1637$code.=<<___ if ($SZ==4);
1638.LSEH_info_${func}_ssse3:
1639	.byte	9,0,0,0
1640	.rva	se_handler
1641	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
1642___
1643$code.=<<___ if ($avx);
1644.LSEH_info_${func}_avx:
1645	.byte	9,0,0,0
1646	.rva	se_handler
1647	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
1648___
1649}
1650
1651sub sha256op38 {
1652    my $instr = shift;
1653    my %opcodelet = (
1654		"sha256rnds2" => 0xcb,
1655  		"sha256msg1"  => 0xcc,
1656		"sha256msg2"  => 0xcd	);
1657
1658    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
1659      my @opcode=(0x0f,0x38);
1660	push @opcode,$opcodelet{$instr};
1661	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
1662	return ".byte\t".join(',',@opcode);
1663    } else {
1664	return $instr."\t".@_[0];
1665    }
1666}
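
# For example, "sha256rnds2 %xmm0,%xmm1" is expected to come out as
# ".byte	0x0f,0x38,0xcb,0xc8" (ModR/M 0xc0|0|(1<<3)), so the module still
# assembles with toolchains that predate the SHA extensions.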
1667
1668foreach (split("\n",$code)) {
1669	s/\`([^\`]*)\`/eval $1/geo;
1670
1671	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
1672
1673	print $_,"\n";
1674}
1675close STDOUT or die "error closing STDOUT: $!";
1676