1#! /usr/bin/env perl
2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. Rights for redistribution and usage in source and binary
13# forms are granted according to the OpenSSL license.
14# ====================================================================
15#
16# sha256/512_block procedure for x86_64.
17#
18# 40% improvement over compiler-generated code on Opteron. On EM64T
19# sha256 was observed to run >80% faster and sha512 >40% faster. No
20# magical tricks, just a straight implementation... I really wonder why
21# gcc [being armed with an inline assembler] fails to generate code as
22# fast. The only thing which is cool about this module is that the very
23# same instruction sequence is used for both SHA-256 and SHA-512. In the
24# former case the instructions operate on 32-bit operands, while in the
25# latter on 64-bit ones. All I had to do was get one flavor right; the
26# other one passed the test right away:-)
27#
28# sha256_block runs in ~1005 cycles on Opteron, which gives an
29# asymptotic performance of 64*1000/1005=63.7MBps times the CPU clock
30# frequency in GHz. sha512_block runs in ~1275 cycles, which results
31# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
32# Well, compared to the IA-64 implementation, which maintains X[16]
33# in the register bank[!], tends toward 4 instructions per CPU clock
34# cycle and runs in 1003 cycles, 1275 is a very good result for the
35# 3-way issue Opteron pipeline with X[16] maintained in memory. So
36# *if* there is a way to improve it, *then* the only option would be
37# to offload the X[16] updates to the SSE unit, but that would require
38# a "deeper" loop unroll, which in turn would naturally cause size
39# blow-up, not to mention increased complexity! And that is only *if*
40# it is actually possible to noticeably improve overall ILP, i.e.
41# instruction-level parallelism, on a given CPU implementation.
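#
# For reference, that cycles-to-throughput conversion is just the
# following (a minimal sketch, not used by the generator):
#
#	sub mbps_per_ghz { my ($block_bytes, $cycles) = @_; $block_bytes*1000/$cycles }
#	# mbps_per_ghz(64, 1005)  => ~63.7	# sha256_block, 64-byte block
#	# mbps_per_ghz(128, 1275) => ~100	# sha512_block, 128-byte block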
42#
43# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
44# performance ratio of 1.5 between the 64- and 32-bit flavors [see above],
45# [currently available] EM64T CPUs are apparently far from it. On the
46# contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
47# 32-bit sha256_block:-( This is presumably because 64-bit shifts/rotates
48# apparently are not atomic instructions, but are implemented in microcode.
49#
50# May 2012.
51#
52# Optimization including one of Pavel Semjanov's ideas, an alternative
53# Maj, resulted in a >=5% improvement on most CPUs, +20% for SHA256 and
54# unfortunately -2% for SHA512 on P4 [which nobody should care about
55# that much].
56#
57# June 2012.
58#
59# Add SIMD code paths; see below for improvement coefficients. An SSSE3
60# code path was not attempted for SHA512, because the estimated
61# improvement, noticeably less than 9%, is not high enough to justify
62# the effort, at least not on pre-AVX processors. [The obvious exception
63# is VIA Nano, but it has a SHA512 instruction that is faster and
64# should be used instead.] For reference, the corresponding estimated
65# upper limit of improvement for SSSE3 SHA256 is 28%. The fact that
66# higher coefficients are observed on VIA Nano and Bulldozer has more
67# to do with the specifics of their architecture [which is a topic for
68# a separate discussion].
69#
70# November 2012.
71#
72# Add AVX2 code path. Two consecutive input blocks are loaded into
73# 256-bit %ymm registers, with data from the first block in the least
74# significant 128-bit halves and data from the second in the most
75# significant ones. The data is then processed with the same SIMD
76# instruction sequence as for AVX, but with %ymm registers as operands.
77# The side effect is an increased stack frame, 448 additional bytes for
78# SHA256 and 1152 for SHA512, plus a 1.2KB code size increase.
79#
80# March 2014.
81#
82# Add support for Intel SHA Extensions.
83
84######################################################################
85# Current performance in cycles per processed byte (less is better):
86#
87#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
88#
89# AMD K8	14.9	-	    -		    9.57    -
90# P4		17.3	-	    -		    30.8    -
91# Core 2	15.6	13.8(+13%)  -		    9.97    -
92# Westmere	14.8	12.3(+19%)  -		    9.58    -
93# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
94# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
95# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
96# Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
97# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
98# Ryzen		11.0	9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
99# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
100# Atom		23.0	18.9(+22%)  -		    14.7    -
101# Silvermont	27.4	20.6(+33%)  -               17.5    -
102# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
103# Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
104#
105# (*)	whichever is best applicable, including SHAEXT;
106# (**)	the switch from ror to shrd accounts for a fair share of the improvement;
107# (***)	execution time is fully determined by the remaining integer-only
108#	part, body_00_15; reducing the number of SIMD instructions
109#	below a certain limit makes no difference/sense; to conserve
110#	space the SHA256 XOP code path is therefore omitted;
111#
112# Modified from upstream OpenSSL to remove the XOP code.
113
114my ($flavour, $hash, $output) = @ARGV;
115
116if ($hash eq "sha512") {
117	$func="sha512_block_data_order";
118	$TABLE="K512";
119	$SZ=8;
120	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
121					"%r8", "%r9", "%r10","%r11");
122	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
123	@Sigma0=(28,34,39);
124	@Sigma1=(14,18,41);
125	@sigma0=(1,  8, 7);
126	@sigma1=(19,61, 6);
127	$rounds=80;
128} elsif ($hash eq "sha256") {
129	$func="sha256_block_data_order";
130	$TABLE="K256";
131	$SZ=4;
132	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
133					"%r8d","%r9d","%r10d","%r11d");
134	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
135	@Sigma0=( 2,13,22);
136	@Sigma1=( 6,11,25);
137	@sigma0=( 7,18, 3);
138	@sigma1=(17,19,10);
139	$rounds=64;
140} else {
141	die "unknown hash: $hash";
142}
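
# A typical invocation, for reference (the script and flavour names below
# are illustrative; the flavour is whatever x86_64-xlate.pl understands,
# e.g. "elf" for Linux/ELF or "nasm"/"mingw64" for Win64):
#
#	perl sha512-x86_64.pl elf  sha256 sha256-x86_64.S
#	perl sha512-x86_64.pl nasm sha512 sha512-x86_64.asm
#
# i.e. the first argument picks the assembler dialect, the second picks
# SHA-256 vs SHA-512, and the third names the output file handed to the
# xlate script.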
143
144$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
145
146$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
147( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
148( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
149die "can't locate x86_64-xlate.pl";
150
151# In upstream, this is controlled by shelling out to the compiler to check
152# versions, but BoringSSL is intended to be used with pre-generated perlasm
153# output, so this isn't useful anyway.
154#
155# This file also has an AVX2 implementation, controlled by setting $avx to 2.
156# For now, we intentionally disable it. While it gives a 13-16% perf boost, the
157# CFI annotations are wrong. It allocates stack in a loop and should be
158# rewritten to avoid this.
159$avx = 1;
160$shaext = 1;
161
162open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
163*STDOUT=*OUT;
164
165$ctx="%rdi";	# 1st arg, zapped by $a3
166$inp="%rsi";	# 2nd arg
167$Tbl="%rbp";
168
169$_ctx="16*$SZ+0*8(%rsp)";
170$_inp="16*$SZ+1*8(%rsp)";
171$_end="16*$SZ+2*8(%rsp)";
172$_rsp="`16*$SZ+3*8`(%rsp)";
173$framesz="16*$SZ+4*8";
174
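# Scratch frame used by the scalar and SSSE3/AVX code paths below (a
# sketch; offsets are relative to %rsp after the "and \$-64,%rsp"
# alignment in each prologue):
#
#	0 .. 16*$SZ-1		X[0..15], the on-stack message schedule
#	16*$SZ+0*8		saved $ctx (1st arg, later clobbered by $a3)
#	16*$SZ+1*8		saved $inp (2nd arg)
#	16*$SZ+2*8		end-of-input pointer, $inp + num*16*$SZ
#	16*$SZ+3*8		copy of the caller's %rsp, used to unwind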
175
176sub ROUND_00_15()
177{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
178  my $STRIDE=$SZ;
179     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
180
181$code.=<<___;
182	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
183	mov	$f,$a2
184
185	xor	$e,$a0
186	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
187	xor	$g,$a2			# f^g
188
189	mov	$T1,`$SZ*($i&0xf)`(%rsp)
190	xor	$a,$a1
191	and	$e,$a2			# (f^g)&e
192
193	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
194	add	$h,$T1			# T1+=h
195	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
196
197	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
198	xor	$e,$a0
199	add	$a2,$T1			# T1+=Ch(e,f,g)
200
201	mov	$a,$a2
202	add	($Tbl),$T1		# T1+=K[round]
203	xor	$a,$a1
204
205	xor	$b,$a2			# a^b, b^c in next round
206	ror	\$$Sigma1[0],$a0	# Sigma1(e)
207	mov	$b,$h
208
209	and	$a2,$a3
210	ror	\$$Sigma0[0],$a1	# Sigma0(a)
211	add	$a0,$T1			# T1+=Sigma1(e)
212
213	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
214	add	$T1,$d			# d+=T1
215	add	$T1,$h			# h+=T1
216
217	lea	$STRIDE($Tbl),$Tbl	# round++
218___
219$code.=<<___ if ($i<15);
220	add	$a1,$h			# h+=Sigma0(a)
221___
222	($a2,$a3) = ($a3,$a2);
223}
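
# For reference, a plain-Perl sketch of the round computed by ROUND_00_15
# above, with the SHA-256 rotation constants hard-coded (a hypothetical
# helper, never called by the generator). $w is X[i], $k is K256[i]; the
# Maj() identity matches the "Maj(a,b,c)=Ch(a^b,c,b)" trick in the
# assembly, where $a3 carries b^c from the previous round.
sub _ref_round_00_15_sha256 {
	my ($w, $k, $a, $b, $c, $d, $e, $f, $g, $h) = @_;
	my $ror = sub { my ($x, $n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff };
	my $S1  = $ror->($e, 6) ^ $ror->($e, 11) ^ $ror->($e, 25);	# Sigma1(e)
	my $S0  = $ror->($a, 2) ^ $ror->($a, 13) ^ $ror->($a, 22);	# Sigma0(a)
	my $ch  = (($f ^ $g) & $e) ^ $g;			# Ch(e,f,g)
	my $maj = $b ^ (($a ^ $b) & ($b ^ $c));			# Maj(a,b,c)
	my $T1  = ($h + $S1 + $ch + $k + $w) & 0xffffffff;
	$d = ($d + $T1) & 0xffffffff;				# d+=T1
	$h = ($T1 + $S0 + $maj) & 0xffffffff;			# h=T1+Sigma0(a)+Maj(a,b,c)
	return ($a, $b, $c, $d, $e, $f, $g, $h);		# callers rotate this tuple,
}								# as the assembly rotates @ROT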
224
225sub ROUND_16_XX()
226{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
227
228$code.=<<___;
229	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
230	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
231
232	mov	$a0,$T1
233	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
234	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
235	mov	$a2,$a1
236	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
237
238	xor	$T1,$a0
239	shr	\$$sigma0[2],$T1
240	ror	\$$sigma0[0],$a0
241	xor	$a1,$a2
242	shr	\$$sigma1[2],$a1
243
244	ror	\$$sigma1[0],$a2
245	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
246	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
247	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
248
249	add	`$SZ*($i&0xf)`(%rsp),$T1
250	mov	$e,$a0
251	add	$a2,$T1
252	mov	$a,$a1
253___
254	&ROUND_00_15(@_);
255}
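
# Likewise, a plain-Perl sketch of the message-schedule update performed
# by ROUND_16_XX (SHA-256 parameters; a hypothetical helper, never called
# by the generator). @X is the 16-word window kept on the stack at
# $SZ*($i&0xf)(%rsp); the return value is the new X[i&0xf], i.e. T1
# before the round proper runs.
sub _ref_schedule_word_sha256 {
	my ($i, @X) = @_;
	my $ror = sub { my ($x, $n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff };
	my $x1  = $X[($i+1)  & 0xf];				# feeds sigma0
	my $x14 = $X[($i+14) & 0xf];				# feeds sigma1
	my $s0  = $ror->($x1, 7)   ^ $ror->($x1, 18)  ^ ($x1  >> 3);	# sigma0(X[i+1])
	my $s1  = $ror->($x14, 17) ^ $ror->($x14, 19) ^ ($x14 >> 10);	# sigma1(X[i+14])
	return ($X[$i & 0xf] + $s0 + $X[($i+9) & 0xf] + $s1) & 0xffffffff;
}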
256
257$code=<<___;
258.text
259
260.globl	${func}_nohw
261.type	${func}_nohw,\@function,3
262.align	16
263${func}_nohw:
264.cfi_startproc
265	_CET_ENDBR
266	mov	%rsp,%rax		# copy %rsp
267.cfi_def_cfa_register	%rax
268	push	%rbx
269.cfi_push	%rbx
270	push	%rbp
271.cfi_push	%rbp
272	push	%r12
273.cfi_push	%r12
274	push	%r13
275.cfi_push	%r13
276	push	%r14
277.cfi_push	%r14
278	push	%r15
279.cfi_push	%r15
280	shl	\$4,%rdx		# num*16
281	sub	\$$framesz,%rsp
282	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
283	and	\$-64,%rsp		# align stack frame
284	mov	$ctx,$_ctx		# save ctx, 1st arg
285	mov	$inp,$_inp		# save inp, 2nd arg
286	mov	%rdx,$_end		# save end pointer, "3rd" arg
287	mov	%rax,$_rsp		# save copy of %rsp
288.cfi_cfa_expression	$_rsp,deref,+8
289.Lprologue:
290
291	mov	$SZ*0($ctx),$A
292	mov	$SZ*1($ctx),$B
293	mov	$SZ*2($ctx),$C
294	mov	$SZ*3($ctx),$D
295	mov	$SZ*4($ctx),$E
296	mov	$SZ*5($ctx),$F
297	mov	$SZ*6($ctx),$G
298	mov	$SZ*7($ctx),$H
299	jmp	.Lloop
300
301.align	16
302.Lloop:
303	mov	$B,$a3
304	lea	$TABLE(%rip),$Tbl
305	xor	$C,$a3			# magic
306___
307	for($i=0;$i<16;$i++) {
308		$code.="	mov	$SZ*$i($inp),$T1\n";
309		$code.="	mov	@ROT[4],$a0\n";
310		$code.="	mov	@ROT[0],$a1\n";
311		$code.="	bswap	$T1\n";
312		&ROUND_00_15($i,@ROT);
313		unshift(@ROT,pop(@ROT));
314	}
315$code.=<<___;
316	jmp	.Lrounds_16_xx
317.align	16
318.Lrounds_16_xx:
319___
320	for(;$i<32;$i++) {
321		&ROUND_16_XX($i,@ROT);
322		unshift(@ROT,pop(@ROT));
323	}
324
325$code.=<<___;
326	cmpb	\$0,`$SZ-1`($Tbl)
327	jnz	.Lrounds_16_xx
328
329	mov	$_ctx,$ctx
330	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
331	lea	16*$SZ($inp),$inp
332
333	add	$SZ*0($ctx),$A
334	add	$SZ*1($ctx),$B
335	add	$SZ*2($ctx),$C
336	add	$SZ*3($ctx),$D
337	add	$SZ*4($ctx),$E
338	add	$SZ*5($ctx),$F
339	add	$SZ*6($ctx),$G
340	add	$SZ*7($ctx),$H
341
342	cmp	$_end,$inp
343
344	mov	$A,$SZ*0($ctx)
345	mov	$B,$SZ*1($ctx)
346	mov	$C,$SZ*2($ctx)
347	mov	$D,$SZ*3($ctx)
348	mov	$E,$SZ*4($ctx)
349	mov	$F,$SZ*5($ctx)
350	mov	$G,$SZ*6($ctx)
351	mov	$H,$SZ*7($ctx)
352	jb	.Lloop
353
354	mov	$_rsp,%rsi
355.cfi_def_cfa	%rsi,8
356	mov	-48(%rsi),%r15
357.cfi_restore	%r15
358	mov	-40(%rsi),%r14
359.cfi_restore	%r14
360	mov	-32(%rsi),%r13
361.cfi_restore	%r13
362	mov	-24(%rsi),%r12
363.cfi_restore	%r12
364	mov	-16(%rsi),%rbp
365.cfi_restore	%rbp
366	mov	-8(%rsi),%rbx
367.cfi_restore	%rbx
368	lea	(%rsi),%rsp
369.cfi_def_cfa_register	%rsp
370.Lepilogue:
371	ret
372.cfi_endproc
373.size	${func}_nohw,.-${func}_nohw
374___
375
376if ($SZ==4) {
377$code.=<<___;
378.section .rodata
379.align	64
380.type	$TABLE,\@object
381$TABLE:
382	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
383	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
384	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
385	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
386	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
387	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
388	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
389	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
390	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
391	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
392	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
393	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
394	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
395	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
396	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
397	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
398	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
399	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
400	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
401	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
402	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
403	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
404	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
405	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
406	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
407	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
408	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
409	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
410	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
411	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
412	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
413	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
414
415	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
416	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
417	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
418	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
419	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
420	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
421	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
422.text
423___
424} else {
425$code.=<<___;
426.section .rodata
427.align	64
428.type	$TABLE,\@object
429$TABLE:
430	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
431	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
432	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
433	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
434	.quad	0x3956c25bf348b538,0x59f111f1b605d019
435	.quad	0x3956c25bf348b538,0x59f111f1b605d019
436	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
437	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
438	.quad	0xd807aa98a3030242,0x12835b0145706fbe
439	.quad	0xd807aa98a3030242,0x12835b0145706fbe
440	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
441	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
442	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
443	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
444	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
445	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
446	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
447	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
448	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
449	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
450	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
451	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
452	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
453	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
454	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
455	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
456	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
457	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
458	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
459	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
460	.quad	0x06ca6351e003826f,0x142929670a0e6e70
461	.quad	0x06ca6351e003826f,0x142929670a0e6e70
462	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
463	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
464	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
465	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
466	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
467	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
468	.quad	0x81c2c92e47edaee6,0x92722c851482353b
469	.quad	0x81c2c92e47edaee6,0x92722c851482353b
470	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
471	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
472	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
473	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
474	.quad	0xd192e819d6ef5218,0xd69906245565a910
475	.quad	0xd192e819d6ef5218,0xd69906245565a910
476	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
477	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
478	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
479	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
480	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
481	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
482	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
483	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
484	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
485	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
486	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
487	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
488	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
489	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
490	.quad	0x90befffa23631e28,0xa4506cebde82bde9
491	.quad	0x90befffa23631e28,0xa4506cebde82bde9
492	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
493	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
494	.quad	0xca273eceea26619c,0xd186b8c721c0c207
495	.quad	0xca273eceea26619c,0xd186b8c721c0c207
496	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
497	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
498	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
499	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
500	.quad	0x113f9804bef90dae,0x1b710b35131c471b
501	.quad	0x113f9804bef90dae,0x1b710b35131c471b
502	.quad	0x28db77f523047d84,0x32caab7b40c72493
503	.quad	0x28db77f523047d84,0x32caab7b40c72493
504	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
505	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
506	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
507	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
508	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
509	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
510
511	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
512	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
513	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
514.text
515___
516}
517
518######################################################################
519# SIMD code paths
520#
521if ($SZ==4 && $shaext) {{{
522######################################################################
523# Intel SHA Extensions implementation of SHA256 update function.
524#
525my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
526
527my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
528my @MSG=map("%xmm$_",(3..6));
529
530$code.=<<___;
531.globl	sha256_block_data_order_hw
532.type	sha256_block_data_order_hw,\@function,3
533.align	64
534sha256_block_data_order_hw:
535.cfi_startproc
536	_CET_ENDBR
537___
538$code.=<<___ if ($win64);
539	lea	`-8-5*16`(%rsp),%rsp
540	movaps	%xmm6,-8-5*16(%rax)
541	movaps	%xmm7,-8-4*16(%rax)
542	movaps	%xmm8,-8-3*16(%rax)
543	movaps	%xmm9,-8-2*16(%rax)
544	movaps	%xmm10,-8-1*16(%rax)
545.Lprologue_shaext:
546___
547$code.=<<___;
548	lea		K256+0x80(%rip),$Tbl
549	movdqu		($ctx),$ABEF		# DCBA
550	movdqu		16($ctx),$CDGH		# HGFE
551	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask
552
553	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
554	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
555	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
556	movdqa		$TMP,$BSWAP		# offload
557	palignr		\$8,$CDGH,$ABEF		# ABEF
558	punpcklqdq	$Wi,$CDGH		# CDGH
559	jmp		.Loop_shaext
560
561.align	16
562.Loop_shaext:
563	movdqu		($inp),@MSG[0]
564	movdqu		0x10($inp),@MSG[1]
565	movdqu		0x20($inp),@MSG[2]
566	pshufb		$TMP,@MSG[0]
567	movdqu		0x30($inp),@MSG[3]
568
569	movdqa		0*32-0x80($Tbl),$Wi
570	paddd		@MSG[0],$Wi
571	pshufb		$TMP,@MSG[1]
572	movdqa		$CDGH,$CDGH_SAVE	# offload
573	sha256rnds2	$ABEF,$CDGH		# 0-3
574	pshufd		\$0x0e,$Wi,$Wi
575	nop
576	movdqa		$ABEF,$ABEF_SAVE	# offload
577	sha256rnds2	$CDGH,$ABEF
578
579	movdqa		1*32-0x80($Tbl),$Wi
580	paddd		@MSG[1],$Wi
581	pshufb		$TMP,@MSG[2]
582	sha256rnds2	$ABEF,$CDGH		# 4-7
583	pshufd		\$0x0e,$Wi,$Wi
584	lea		0x40($inp),$inp
585	sha256msg1	@MSG[1],@MSG[0]
586	sha256rnds2	$CDGH,$ABEF
587
588	movdqa		2*32-0x80($Tbl),$Wi
589	paddd		@MSG[2],$Wi
590	pshufb		$TMP,@MSG[3]
591	sha256rnds2	$ABEF,$CDGH		# 8-11
592	pshufd		\$0x0e,$Wi,$Wi
593	movdqa		@MSG[3],$TMP
594	palignr		\$4,@MSG[2],$TMP
595	nop
596	paddd		$TMP,@MSG[0]
597	sha256msg1	@MSG[2],@MSG[1]
598	sha256rnds2	$CDGH,$ABEF
599
600	movdqa		3*32-0x80($Tbl),$Wi
601	paddd		@MSG[3],$Wi
602	sha256msg2	@MSG[3],@MSG[0]
603	sha256rnds2	$ABEF,$CDGH		# 12-15
604	pshufd		\$0x0e,$Wi,$Wi
605	movdqa		@MSG[0],$TMP
606	palignr		\$4,@MSG[3],$TMP
607	nop
608	paddd		$TMP,@MSG[1]
609	sha256msg1	@MSG[3],@MSG[2]
610	sha256rnds2	$CDGH,$ABEF
611___
612for($i=4;$i<16-3;$i++) {
613$code.=<<___;
614	movdqa		$i*32-0x80($Tbl),$Wi
615	paddd		@MSG[0],$Wi
616	sha256msg2	@MSG[0],@MSG[1]
617	sha256rnds2	$ABEF,$CDGH		# 16-19...
618	pshufd		\$0x0e,$Wi,$Wi
619	movdqa		@MSG[1],$TMP
620	palignr		\$4,@MSG[0],$TMP
621	nop
622	paddd		$TMP,@MSG[2]
623	sha256msg1	@MSG[0],@MSG[3]
624	sha256rnds2	$CDGH,$ABEF
625___
626	push(@MSG,shift(@MSG));
627}
628$code.=<<___;
629	movdqa		13*32-0x80($Tbl),$Wi
630	paddd		@MSG[0],$Wi
631	sha256msg2	@MSG[0],@MSG[1]
632	sha256rnds2	$ABEF,$CDGH		# 52-55
633	pshufd		\$0x0e,$Wi,$Wi
634	movdqa		@MSG[1],$TMP
635	palignr		\$4,@MSG[0],$TMP
636	sha256rnds2	$CDGH,$ABEF
637	paddd		$TMP,@MSG[2]
638
639	movdqa		14*32-0x80($Tbl),$Wi
640	paddd		@MSG[1],$Wi
641	sha256rnds2	$ABEF,$CDGH		# 56-59
642	pshufd		\$0x0e,$Wi,$Wi
643	sha256msg2	@MSG[1],@MSG[2]
644	movdqa		$BSWAP,$TMP
645	sha256rnds2	$CDGH,$ABEF
646
647	movdqa		15*32-0x80($Tbl),$Wi
648	paddd		@MSG[2],$Wi
649	nop
650	sha256rnds2	$ABEF,$CDGH		# 60-63
651	pshufd		\$0x0e,$Wi,$Wi
652	dec		$num
653	nop
654	sha256rnds2	$CDGH,$ABEF
655
656	paddd		$CDGH_SAVE,$CDGH
657	paddd		$ABEF_SAVE,$ABEF
658	jnz		.Loop_shaext
659
660	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
661	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
662	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
663	punpckhqdq	$CDGH,$ABEF		# DCBA
664	palignr		\$8,$TMP,$CDGH		# HGFE
665
666	movdqu	$ABEF,($ctx)
667	movdqu	$CDGH,16($ctx)
668___
669$code.=<<___ if ($win64);
670	movaps	-8-5*16(%rax),%xmm6
671	movaps	-8-4*16(%rax),%xmm7
672	movaps	-8-3*16(%rax),%xmm8
673	movaps	-8-2*16(%rax),%xmm9
674	movaps	-8-1*16(%rax),%xmm10
675	mov	%rax,%rsp
676.Lepilogue_shaext:
677___
678$code.=<<___;
679	ret
680.cfi_endproc
681.size	sha256_block_data_order_hw,.-sha256_block_data_order_hw
682___
683}}}
684{{{
685
686my $a4=$T1;
687my ($a,$b,$c,$d,$e,$f,$g,$h);
688
689sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
690{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
691  my $arg = pop;
692    $arg = "\$$arg" if ($arg*1 eq $arg);
693    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
694}
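
# For example, with the SHA-256 register assignment above a call such as
# &ror($a0,14) falls through to this thunk and appends
#
#	ror	$14,%r13d
#
# to $code: the last argument becomes the first (immediate) operand and
# the remaining arguments follow in reverse order.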
695
696sub body_00_15 () {
697	(
698	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
699
700	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
701	'&mov	($a,$a1)',
702	'&mov	($a4,$f)',
703
704	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
705	'&xor	($a0,$e)',
706	'&xor	($a4,$g)',			# f^g
707
708	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
709	'&xor	($a1,$a)',
710	'&and	($a4,$e)',			# (f^g)&e
711
712	'&xor	($a0,$e)',
713	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
714	'&mov	($a2,$a)',
715
716	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
717	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
718	'&xor	($a2,$b)',			# a^b, b^c in next round
719
720	'&add	($h,$a4)',			# h+=Ch(e,f,g)
721	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
722	'&and	($a3,$a2)',			# (b^c)&(a^b)
723
724	'&xor	($a1,$a)',
725	'&add	($h,$a0)',			# h+=Sigma1(e)
726	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
727
728	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
729	'&add	($d,$h)',			# d+=h
730	'&add	($h,$a3)',			# h+=Maj(a,b,c)
731
732	'&mov	($a0,$d)',
733	'&add	($a1,$h);'.			# h+=Sigma0(a)
734	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
735	);
736}
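
# The strings returned above are not executed here; the SIMD code paths
# below queue several copies and eval() them a few at a time between
# vector instructions, so scalar and SIMD work interleave. A minimal
# sketch of that driver pattern (hypothetical, simplified from
# SSSE3_256_00_47/AVX_256_00_47 below):
#
#	my @insns = (body_00_15(), body_00_15(), body_00_15(), body_00_15());
#	foreach my $simd (Xupdate_256_AVX()) {
#		eval $simd;				# one vector instruction
#		eval(shift(@insns)) for (1 .. 3);	# three scalar round steps
#	}
#	foreach (@insns) { eval }			# flush the remaining steps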
737
738######################################################################
739# SSSE3 code path
740#
741if ($SZ==4) {	# SHA256 only
742my @X = map("%xmm$_",(0..3));
743my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
744
745$code.=<<___;
746.globl	${func}_ssse3
747.type	${func}_ssse3,\@function,3
748.align	64
749${func}_ssse3:
750.cfi_startproc
751	_CET_ENDBR
752	mov	%rsp,%rax		# copy %rsp
753.cfi_def_cfa_register	%rax
754	push	%rbx
755.cfi_push	%rbx
756	push	%rbp
757.cfi_push	%rbp
758	push	%r12
759.cfi_push	%r12
760	push	%r13
761.cfi_push	%r13
762	push	%r14
763.cfi_push	%r14
764	push	%r15
765.cfi_push	%r15
766	shl	\$4,%rdx		# num*16
767	sub	\$`$framesz+$win64*16*4`,%rsp
768	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
769	and	\$-64,%rsp		# align stack frame
770	mov	$ctx,$_ctx		# save ctx, 1st arg
771	mov	$inp,$_inp		# save inp, 2nd arg
772	mov	%rdx,$_end		# save end pointer, "3rd" arg
773	mov	%rax,$_rsp		# save copy of %rsp
774.cfi_cfa_expression	$_rsp,deref,+8
775___
776$code.=<<___ if ($win64);
777	movaps	%xmm6,16*$SZ+32(%rsp)
778	movaps	%xmm7,16*$SZ+48(%rsp)
779	movaps	%xmm8,16*$SZ+64(%rsp)
780	movaps	%xmm9,16*$SZ+80(%rsp)
781___
782$code.=<<___;
783.Lprologue_ssse3:
784
785	mov	$SZ*0($ctx),$A
786	mov	$SZ*1($ctx),$B
787	mov	$SZ*2($ctx),$C
788	mov	$SZ*3($ctx),$D
789	mov	$SZ*4($ctx),$E
790	mov	$SZ*5($ctx),$F
791	mov	$SZ*6($ctx),$G
792	mov	$SZ*7($ctx),$H
793___
794
795$code.=<<___;
796	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
797	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
798	jmp	.Lloop_ssse3
799.align	16
800.Lloop_ssse3:
801	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
802	movdqu	0x00($inp),@X[0]
803	movdqu	0x10($inp),@X[1]
804	movdqu	0x20($inp),@X[2]
805	pshufb	$t3,@X[0]
806	movdqu	0x30($inp),@X[3]
807	lea	$TABLE(%rip),$Tbl
808	pshufb	$t3,@X[1]
809	movdqa	0x00($Tbl),$t0
810	movdqa	0x20($Tbl),$t1
811	pshufb	$t3,@X[2]
812	paddd	@X[0],$t0
813	movdqa	0x40($Tbl),$t2
814	pshufb	$t3,@X[3]
815	movdqa	0x60($Tbl),$t3
816	paddd	@X[1],$t1
817	paddd	@X[2],$t2
818	paddd	@X[3],$t3
819	movdqa	$t0,0x00(%rsp)
820	mov	$A,$a1
821	movdqa	$t1,0x10(%rsp)
822	mov	$B,$a3
823	movdqa	$t2,0x20(%rsp)
824	xor	$C,$a3			# magic
825	movdqa	$t3,0x30(%rsp)
826	mov	$E,$a0
827	jmp	.Lssse3_00_47
828
829.align	16
830.Lssse3_00_47:
831	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
832___
833sub Xupdate_256_SSSE3 () {
834	(
835	'&movdqa	($t0,@X[1]);',
836	'&movdqa	($t3,@X[3])',
837	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
838	 '&palignr	($t3,@X[2],$SZ);',	# X[9..12]
839	'&movdqa	($t1,$t0)',
840	'&movdqa	($t2,$t0);',
841	'&psrld		($t0,$sigma0[2])',
842	 '&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
843	'&psrld		($t2,$sigma0[0])',
844	 '&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
845	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
846	'&pxor		($t0,$t2)',
847	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
848	'&pxor		($t0,$t1)',
849	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
850	'&pxor		($t0,$t2);',
851	 '&movdqa	($t2,$t3)',
852	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
853	 '&psrld	($t3,$sigma1[2])',
854	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
855	 '&psrlq	($t2,$sigma1[0])',
856	 '&pxor		($t3,$t2);',
857	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
858	 '&pxor		($t3,$t2)',
859	 '&pshufb	($t3,$t4)',		# sigma1(X[14..15])
860	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
861	 '&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
862	 '&movdqa	($t2,$t3);',
863	 '&psrld	($t3,$sigma1[2])',
864	 '&psrlq	($t2,$sigma1[0])',
865	 '&pxor		($t3,$t2);',
866	 '&psrlq	($t2,$sigma1[1]-$sigma1[0])',
867	 '&pxor		($t3,$t2);',
868	'&movdqa	($t2,16*2*$j."($Tbl)")',
869	 '&pshufb	($t3,$t5)',
870	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
871	);
872}
873
874sub SSSE3_256_00_47 () {
875my $j = shift;
876my $body = shift;
877my @X = @_;
878my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
879
880    if (0) {
881	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
882	    eval;
883	    eval(shift(@insns));
884	    eval(shift(@insns));
885	    eval(shift(@insns));
886	}
887    } else {			# squeeze extra 4% on Westmere and 19% on Atom
888	  eval(shift(@insns));	#@
889	&movdqa		($t0,@X[1]);
890	  eval(shift(@insns));
891	  eval(shift(@insns));
892	&movdqa		($t3,@X[3]);
893	  eval(shift(@insns));	#@
894	  eval(shift(@insns));
895	  eval(shift(@insns));
896	  eval(shift(@insns));	#@
897	  eval(shift(@insns));
898	&palignr	($t0,@X[0],$SZ);	# X[1..4]
899	  eval(shift(@insns));
900	  eval(shift(@insns));
901	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
902	  eval(shift(@insns));
903	  eval(shift(@insns));
904	  eval(shift(@insns));
905	  eval(shift(@insns));	#@
906	&movdqa		($t1,$t0);
907	  eval(shift(@insns));
908	  eval(shift(@insns));
909	&movdqa		($t2,$t0);
910	  eval(shift(@insns));	#@
911	  eval(shift(@insns));
912	&psrld		($t0,$sigma0[2]);
913	  eval(shift(@insns));
914	  eval(shift(@insns));
915	  eval(shift(@insns));
916	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
917	  eval(shift(@insns));	#@
918	  eval(shift(@insns));
919	&psrld		($t2,$sigma0[0]);
920	  eval(shift(@insns));
921	  eval(shift(@insns));
922	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
923	  eval(shift(@insns));
924	  eval(shift(@insns));	#@
925	&pslld		($t1,8*$SZ-$sigma0[1]);
926	  eval(shift(@insns));
927	  eval(shift(@insns));
928	&pxor		($t0,$t2);
929	  eval(shift(@insns));	#@
930	  eval(shift(@insns));
931	  eval(shift(@insns));
932	  eval(shift(@insns));	#@
933	&psrld		($t2,$sigma0[1]-$sigma0[0]);
934	  eval(shift(@insns));
935	&pxor		($t0,$t1);
936	  eval(shift(@insns));
937	  eval(shift(@insns));
938	&pslld		($t1,$sigma0[1]-$sigma0[0]);
939	  eval(shift(@insns));
940	  eval(shift(@insns));
941	&pxor		($t0,$t2);
942	  eval(shift(@insns));
943	  eval(shift(@insns));	#@
944	 &movdqa	($t2,$t3);
945	  eval(shift(@insns));
946	  eval(shift(@insns));
947	&pxor		($t0,$t1);		# sigma0(X[1..4])
948	  eval(shift(@insns));	#@
949	  eval(shift(@insns));
950	  eval(shift(@insns));
951	 &psrld		($t3,$sigma1[2]);
952	  eval(shift(@insns));
953	  eval(shift(@insns));
954	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
955	  eval(shift(@insns));	#@
956	  eval(shift(@insns));
957	 &psrlq		($t2,$sigma1[0]);
958	  eval(shift(@insns));
959	  eval(shift(@insns));
960	  eval(shift(@insns));
961	 &pxor		($t3,$t2);
962	  eval(shift(@insns));	#@
963	  eval(shift(@insns));
964	  eval(shift(@insns));
965	  eval(shift(@insns));	#@
966	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
967	  eval(shift(@insns));
968	  eval(shift(@insns));
969	 &pxor		($t3,$t2);
970	  eval(shift(@insns));	#@
971	  eval(shift(@insns));
972	  eval(shift(@insns));
973	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
974	 &pshufd	($t3,$t3,0b10000000);
975	  eval(shift(@insns));
976	  eval(shift(@insns));
977	  eval(shift(@insns));
978	 &psrldq	($t3,8);
979	  eval(shift(@insns));
980	  eval(shift(@insns));	#@
981	  eval(shift(@insns));
982	  eval(shift(@insns));
983	  eval(shift(@insns));	#@
984	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
985	  eval(shift(@insns));
986	  eval(shift(@insns));
987	  eval(shift(@insns));
988	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
989	  eval(shift(@insns));
990	  eval(shift(@insns));	#@
991	  eval(shift(@insns));
992	 &movdqa	($t2,$t3);
993	  eval(shift(@insns));
994	  eval(shift(@insns));
995	 &psrld		($t3,$sigma1[2]);
996	  eval(shift(@insns));
997	  eval(shift(@insns));	#@
998	 &psrlq		($t2,$sigma1[0]);
999	  eval(shift(@insns));
1000	  eval(shift(@insns));
1001	 &pxor		($t3,$t2);
1002	  eval(shift(@insns));	#@
1003	  eval(shift(@insns));
1004	  eval(shift(@insns));
1005	  eval(shift(@insns));	#@
1006	  eval(shift(@insns));
1007	 &psrlq		($t2,$sigma1[1]-$sigma1[0]);
1008	  eval(shift(@insns));
1009	  eval(shift(@insns));
1010	  eval(shift(@insns));
1011	 &pxor		($t3,$t2);
1012	  eval(shift(@insns));
1013	  eval(shift(@insns));
1014	  eval(shift(@insns));	#@
1015	 #&pshufb	($t3,$t5);
1016	 &pshufd	($t3,$t3,0b00001000);
1017	  eval(shift(@insns));
1018	  eval(shift(@insns));
1019	&movdqa		($t2,16*2*$j."($Tbl)");
1020	  eval(shift(@insns));	#@
1021	  eval(shift(@insns));
1022	 &pslldq	($t3,8);
1023	  eval(shift(@insns));
1024	  eval(shift(@insns));
1025	  eval(shift(@insns));
1026	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
1027	  eval(shift(@insns));	#@
1028	  eval(shift(@insns));
1029	  eval(shift(@insns));
1030    }
1031	&paddd		($t2,@X[0]);
1032	  foreach (@insns) { eval; }		# remaining instructions
1033	&movdqa		(16*$j."(%rsp)",$t2);
1034}
1035
1036    for ($i=0,$j=0; $j<4; $j++) {
1037	&SSSE3_256_00_47($j,\&body_00_15,@X);
1038	push(@X,shift(@X));			# rotate(@X)
1039    }
1040	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1041	&jne	(".Lssse3_00_47");
1042
1043    for ($i=0; $i<16; ) {
1044	foreach(body_00_15()) { eval; }
1045    }
1046$code.=<<___;
1047	mov	$_ctx,$ctx
1048	mov	$a1,$A
1049
1050	add	$SZ*0($ctx),$A
1051	lea	16*$SZ($inp),$inp
1052	add	$SZ*1($ctx),$B
1053	add	$SZ*2($ctx),$C
1054	add	$SZ*3($ctx),$D
1055	add	$SZ*4($ctx),$E
1056	add	$SZ*5($ctx),$F
1057	add	$SZ*6($ctx),$G
1058	add	$SZ*7($ctx),$H
1059
1060	cmp	$_end,$inp
1061
1062	mov	$A,$SZ*0($ctx)
1063	mov	$B,$SZ*1($ctx)
1064	mov	$C,$SZ*2($ctx)
1065	mov	$D,$SZ*3($ctx)
1066	mov	$E,$SZ*4($ctx)
1067	mov	$F,$SZ*5($ctx)
1068	mov	$G,$SZ*6($ctx)
1069	mov	$H,$SZ*7($ctx)
1070	jb	.Lloop_ssse3
1071
1072	mov	$_rsp,%rsi
1073.cfi_def_cfa	%rsi,8
1074___
1075$code.=<<___ if ($win64);
1076	movaps	16*$SZ+32(%rsp),%xmm6
1077	movaps	16*$SZ+48(%rsp),%xmm7
1078	movaps	16*$SZ+64(%rsp),%xmm8
1079	movaps	16*$SZ+80(%rsp),%xmm9
1080___
1081$code.=<<___;
1082	mov	-48(%rsi),%r15
1083.cfi_restore	%r15
1084	mov	-40(%rsi),%r14
1085.cfi_restore	%r14
1086	mov	-32(%rsi),%r13
1087.cfi_restore	%r13
1088	mov	-24(%rsi),%r12
1089.cfi_restore	%r12
1090	mov	-16(%rsi),%rbp
1091.cfi_restore	%rbp
1092	mov	-8(%rsi),%rbx
1093.cfi_restore	%rbx
1094	lea	(%rsi),%rsp
1095.cfi_def_cfa_register	%rsp
1096.Lepilogue_ssse3:
1097	ret
1098.cfi_endproc
1099.size	${func}_ssse3,.-${func}_ssse3
1100___
1101}
1102
1103if ($avx) {{
1104######################################################################
1105# AVX+shrd code path
1106#
1107local *ror = sub { &shrd(@_[0],@_) };
1108
1109$code.=<<___;
1110.globl	${func}_avx
1111.type	${func}_avx,\@function,3
1112.align	64
1113${func}_avx:
1114.cfi_startproc
1115	_CET_ENDBR
1116	mov	%rsp,%rax		# copy %rsp
1117.cfi_def_cfa_register	%rax
1118	push	%rbx
1119.cfi_push	%rbx
1120	push	%rbp
1121.cfi_push	%rbp
1122	push	%r12
1123.cfi_push	%r12
1124	push	%r13
1125.cfi_push	%r13
1126	push	%r14
1127.cfi_push	%r14
1128	push	%r15
1129.cfi_push	%r15
1130	shl	\$4,%rdx		# num*16
1131	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1132	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1133	and	\$-64,%rsp		# align stack frame
1134	mov	$ctx,$_ctx		# save ctx, 1st arg
1135	mov	$inp,$_inp		# save inp, 2nd arg
1136	mov	%rdx,$_end		# save end pointer, "3rd" arg
1137	mov	%rax,$_rsp		# save copy of %rsp
1138.cfi_cfa_expression	$_rsp,deref,+8
1139___
1140$code.=<<___ if ($win64);
1141	movaps	%xmm6,16*$SZ+32(%rsp)
1142	movaps	%xmm7,16*$SZ+48(%rsp)
1143	movaps	%xmm8,16*$SZ+64(%rsp)
1144	movaps	%xmm9,16*$SZ+80(%rsp)
1145___
1146$code.=<<___ if ($win64 && $SZ>4);
1147	movaps	%xmm10,16*$SZ+96(%rsp)
1148	movaps	%xmm11,16*$SZ+112(%rsp)
1149___
1150$code.=<<___;
1151.Lprologue_avx:
1152
1153	vzeroupper
1154	mov	$SZ*0($ctx),$A
1155	mov	$SZ*1($ctx),$B
1156	mov	$SZ*2($ctx),$C
1157	mov	$SZ*3($ctx),$D
1158	mov	$SZ*4($ctx),$E
1159	mov	$SZ*5($ctx),$F
1160	mov	$SZ*6($ctx),$G
1161	mov	$SZ*7($ctx),$H
1162___
1163					if ($SZ==4) {	# SHA256
1164    my @X = map("%xmm$_",(0..3));
1165    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1166
1167$code.=<<___;
1168	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1169	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1170	jmp	.Lloop_avx
1171.align	16
1172.Lloop_avx:
1173	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1174	vmovdqu	0x00($inp),@X[0]
1175	vmovdqu	0x10($inp),@X[1]
1176	vmovdqu	0x20($inp),@X[2]
1177	vmovdqu	0x30($inp),@X[3]
1178	vpshufb	$t3,@X[0],@X[0]
1179	lea	$TABLE(%rip),$Tbl
1180	vpshufb	$t3,@X[1],@X[1]
1181	vpshufb	$t3,@X[2],@X[2]
1182	vpaddd	0x00($Tbl),@X[0],$t0
1183	vpshufb	$t3,@X[3],@X[3]
1184	vpaddd	0x20($Tbl),@X[1],$t1
1185	vpaddd	0x40($Tbl),@X[2],$t2
1186	vpaddd	0x60($Tbl),@X[3],$t3
1187	vmovdqa	$t0,0x00(%rsp)
1188	mov	$A,$a1
1189	vmovdqa	$t1,0x10(%rsp)
1190	mov	$B,$a3
1191	vmovdqa	$t2,0x20(%rsp)
1192	xor	$C,$a3			# magic
1193	vmovdqa	$t3,0x30(%rsp)
1194	mov	$E,$a0
1195	jmp	.Lavx_00_47
1196
1197.align	16
1198.Lavx_00_47:
1199	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
1200___
1201sub Xupdate_256_AVX () {
1202	(
1203	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
1204	 '&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
1205	'&vpsrld	($t2,$t0,$sigma0[0]);',
1206	 '&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
1207	'&vpsrld	($t3,$t0,$sigma0[2])',
1208	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
1209	'&vpxor		($t0,$t3,$t2)',
1210	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1211	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1212	'&vpxor		($t0,$t0,$t1)',
1213	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1214	'&vpxor		($t0,$t0,$t2)',
1215	 '&vpsrld	($t2,$t3,$sigma1[2]);',
1216	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
1217	 '&vpsrlq	($t3,$t3,$sigma1[0]);',
1218	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
1219	 '&vpxor	($t2,$t2,$t3);',
1220	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1221	 '&vpxor	($t2,$t2,$t3)',
1222	 '&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
1223	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
1224	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1225	 '&vpsrld	($t2,$t3,$sigma1[2])',
1226	 '&vpsrlq	($t3,$t3,$sigma1[0])',
1227	 '&vpxor	($t2,$t2,$t3);',
1228	 '&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
1229	 '&vpxor	($t2,$t2,$t3)',
1230	 '&vpshufb	($t2,$t2,$t5)',
1231	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
1232	);
1233}
1234
1235sub AVX_256_00_47 () {
1236my $j = shift;
1237my $body = shift;
1238my @X = @_;
1239my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
1240
1241	foreach (Xupdate_256_AVX()) {		# 29 instructions
1242	    eval;
1243	    eval(shift(@insns));
1244	    eval(shift(@insns));
1245	    eval(shift(@insns));
1246	}
1247	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1248	  foreach (@insns) { eval; }		# remaining instructions
1249	&vmovdqa	(16*$j."(%rsp)",$t2);
1250}
1251
1252    for ($i=0,$j=0; $j<4; $j++) {
1253	&AVX_256_00_47($j,\&body_00_15,@X);
1254	push(@X,shift(@X));			# rotate(@X)
1255    }
1256	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
1257	&jne	(".Lavx_00_47");
1258
1259    for ($i=0; $i<16; ) {
1260	foreach(body_00_15()) { eval; }
1261    }
1262
1263					} else {	# SHA512
1264    my @X = map("%xmm$_",(0..7));
1265    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1266
1267$code.=<<___;
1268	jmp	.Lloop_avx
1269.align	16
1270.Lloop_avx:
1271	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1272	vmovdqu	0x00($inp),@X[0]
1273	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1274	vmovdqu	0x10($inp),@X[1]
1275	vmovdqu	0x20($inp),@X[2]
1276	vpshufb	$t3,@X[0],@X[0]
1277	vmovdqu	0x30($inp),@X[3]
1278	vpshufb	$t3,@X[1],@X[1]
1279	vmovdqu	0x40($inp),@X[4]
1280	vpshufb	$t3,@X[2],@X[2]
1281	vmovdqu	0x50($inp),@X[5]
1282	vpshufb	$t3,@X[3],@X[3]
1283	vmovdqu	0x60($inp),@X[6]
1284	vpshufb	$t3,@X[4],@X[4]
1285	vmovdqu	0x70($inp),@X[7]
1286	vpshufb	$t3,@X[5],@X[5]
1287	vpaddq	-0x80($Tbl),@X[0],$t0
1288	vpshufb	$t3,@X[6],@X[6]
1289	vpaddq	-0x60($Tbl),@X[1],$t1
1290	vpshufb	$t3,@X[7],@X[7]
1291	vpaddq	-0x40($Tbl),@X[2],$t2
1292	vpaddq	-0x20($Tbl),@X[3],$t3
1293	vmovdqa	$t0,0x00(%rsp)
1294	vpaddq	0x00($Tbl),@X[4],$t0
1295	vmovdqa	$t1,0x10(%rsp)
1296	vpaddq	0x20($Tbl),@X[5],$t1
1297	vmovdqa	$t2,0x20(%rsp)
1298	vpaddq	0x40($Tbl),@X[6],$t2
1299	vmovdqa	$t3,0x30(%rsp)
1300	vpaddq	0x60($Tbl),@X[7],$t3
1301	vmovdqa	$t0,0x40(%rsp)
1302	mov	$A,$a1
1303	vmovdqa	$t1,0x50(%rsp)
1304	mov	$B,$a3
1305	vmovdqa	$t2,0x60(%rsp)
1306	xor	$C,$a3			# magic
1307	vmovdqa	$t3,0x70(%rsp)
1308	mov	$E,$a0
1309	jmp	.Lavx_00_47
1310
1311.align	16
1312.Lavx_00_47:
1313	add	\$`16*2*$SZ`,$Tbl
1314___
1315sub Xupdate_512_AVX () {
1316	(
1317	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
1318	 '&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
1319	'&vpsrlq	($t2,$t0,$sigma0[0])',
1320	 '&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
1321	'&vpsrlq	($t3,$t0,$sigma0[2])',
1322	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
1323	 '&vpxor	($t0,$t3,$t2)',
1324	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
1325	 '&vpxor	($t0,$t0,$t1)',
1326	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
1327	 '&vpxor	($t0,$t0,$t2)',
1328	 '&vpsrlq	($t3,@X[7],$sigma1[2]);',
1329	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
1330	 '&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
1331	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
1332	 '&vpsrlq	($t1,@X[7],$sigma1[0]);',
1333	 '&vpxor	($t3,$t3,$t2)',
1334	 '&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
1335	 '&vpxor	($t3,$t3,$t1)',
1336	 '&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
1337	 '&vpxor	($t3,$t3,$t2)',
1338	 '&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
1339	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
1340	);
1341}
1342
1343sub AVX_512_00_47 () {
1344my $j = shift;
1345my $body = shift;
1346my @X = @_;
1347my @insns = (&$body,&$body);			# 52 instructions
1348
1349	foreach (Xupdate_512_AVX()) {		# 23 instructions
1350	    eval;
1351	    eval(shift(@insns));
1352	    eval(shift(@insns));
1353	}
1354	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1355	  foreach (@insns) { eval; }		# remaining instructions
1356	&vmovdqa	(16*$j."(%rsp)",$t2);
1357}
1358
1359    for ($i=0,$j=0; $j<8; $j++) {
1360	&AVX_512_00_47($j,\&body_00_15,@X);
1361	push(@X,shift(@X));			# rotate(@X)
1362    }
1363	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1364	&jne	(".Lavx_00_47");
1365
1366    for ($i=0; $i<16; ) {
1367	foreach(body_00_15()) { eval; }
1368    }
1369}
1370$code.=<<___;
1371	mov	$_ctx,$ctx
1372	mov	$a1,$A
1373
1374	add	$SZ*0($ctx),$A
1375	lea	16*$SZ($inp),$inp
1376	add	$SZ*1($ctx),$B
1377	add	$SZ*2($ctx),$C
1378	add	$SZ*3($ctx),$D
1379	add	$SZ*4($ctx),$E
1380	add	$SZ*5($ctx),$F
1381	add	$SZ*6($ctx),$G
1382	add	$SZ*7($ctx),$H
1383
1384	cmp	$_end,$inp
1385
1386	mov	$A,$SZ*0($ctx)
1387	mov	$B,$SZ*1($ctx)
1388	mov	$C,$SZ*2($ctx)
1389	mov	$D,$SZ*3($ctx)
1390	mov	$E,$SZ*4($ctx)
1391	mov	$F,$SZ*5($ctx)
1392	mov	$G,$SZ*6($ctx)
1393	mov	$H,$SZ*7($ctx)
1394	jb	.Lloop_avx
1395
1396	mov	$_rsp,%rsi
1397.cfi_def_cfa	%rsi,8
1398	vzeroupper
1399___
1400$code.=<<___ if ($win64);
1401	movaps	16*$SZ+32(%rsp),%xmm6
1402	movaps	16*$SZ+48(%rsp),%xmm7
1403	movaps	16*$SZ+64(%rsp),%xmm8
1404	movaps	16*$SZ+80(%rsp),%xmm9
1405___
1406$code.=<<___ if ($win64 && $SZ>4);
1407	movaps	16*$SZ+96(%rsp),%xmm10
1408	movaps	16*$SZ+112(%rsp),%xmm11
1409___
1410$code.=<<___;
1411	mov	-48(%rsi),%r15
1412.cfi_restore	%r15
1413	mov	-40(%rsi),%r14
1414.cfi_restore	%r14
1415	mov	-32(%rsi),%r13
1416.cfi_restore	%r13
1417	mov	-24(%rsi),%r12
1418.cfi_restore	%r12
1419	mov	-16(%rsi),%rbp
1420.cfi_restore	%rbp
1421	mov	-8(%rsi),%rbx
1422.cfi_restore	%rbx
1423	lea	(%rsi),%rsp
1424.cfi_def_cfa_register	%rsp
1425.Lepilogue_avx:
1426	ret
1427.cfi_endproc
1428.size	${func}_avx,.-${func}_avx
1429___
1430
1431if ($avx>1) {{
1432######################################################################
1433# AVX2+BMI code path
1434#
1435my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
1436my $PUSH8=8*2*$SZ;
1437use integer;
1438
1439sub bodyx_00_15 () {
1440	# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
1441	(
1442	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1443
1444	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
1445	'&and	($a4,$e)',		# f&e
1446	'&rorx	($a0,$e,$Sigma1[2])',
1447	'&rorx	($a2,$e,$Sigma1[1])',
1448
1449	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
1450	'&lea	($h,"($h,$a4)")',
1451	'&andn	($a4,$e,$g)',		# ~e&g
1452	'&xor	($a0,$a2)',
1453
1454	'&rorx	($a1,$e,$Sigma1[0])',
1455	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
1456	'&xor	($a0,$a1)',		# Sigma1(e)
1457	'&mov	($a2,$a)',
1458
1459	'&rorx	($a4,$a,$Sigma0[2])',
1460	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
1461	'&xor	($a2,$b)',		# a^b, b^c in next round
1462	'&rorx	($a1,$a,$Sigma0[1])',
1463
1464	'&rorx	($a0,$a,$Sigma0[0])',
1465	'&lea	($d,"($d,$h)")',	# d+=h
1466	'&and	($a3,$a2)',		# (b^c)&(a^b)
1467	'&xor	($a1,$a4)',
1468
1469	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
1470	'&xor	($a1,$a0)',		# Sigma0(a)
1471	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
1472	'&mov	($a4,$e)',		# copy of f in future
1473
1474	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1475	);
1476	# and at the finish one has to do $a+=$a1
1477}
1478
1479$code.=<<___;
1480.type	${func}_avx2,\@function,3
1481.align	64
1482${func}_avx2:
1483.cfi_startproc
1484.Lavx2_shortcut:
1485	mov	%rsp,%rax		# copy %rsp
1486.cfi_def_cfa_register	%rax
1487	push	%rbx
1488.cfi_push	%rbx
1489	push	%rbp
1490.cfi_push	%rbp
1491	push	%r12
1492.cfi_push	%r12
1493	push	%r13
1494.cfi_push	%r13
1495	push	%r14
1496.cfi_push	%r14
1497	push	%r15
1498.cfi_push	%r15
1499	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1500	shl	\$4,%rdx		# num*16
1501	and	\$-256*$SZ,%rsp		# align stack frame
1502	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
1503	add	\$`2*$SZ*($rounds-8)`,%rsp
1504	mov	$ctx,$_ctx		# save ctx, 1st arg
1505	mov	$inp,$_inp		# save inp, 2nd arg
1506	mov	%rdx,$_end		# save end pointer, "3rd" arg
1507	mov	%rax,$_rsp		# save copy of %rsp
1508.cfi_cfa_expression	$_rsp,deref,+8
1509___
1510$code.=<<___ if ($win64);
1511	movaps	%xmm6,16*$SZ+32(%rsp)
1512	movaps	%xmm7,16*$SZ+48(%rsp)
1513	movaps	%xmm8,16*$SZ+64(%rsp)
1514	movaps	%xmm9,16*$SZ+80(%rsp)
1515___
1516$code.=<<___ if ($win64 && $SZ>4);
1517	movaps	%xmm10,16*$SZ+96(%rsp)
1518	movaps	%xmm11,16*$SZ+112(%rsp)
1519___
1520$code.=<<___;
1521.Lprologue_avx2:
1522
1523	vzeroupper
1524	sub	\$-16*$SZ,$inp		# inp++, size optimization
1525	mov	$SZ*0($ctx),$A
1526	mov	$inp,%r12		# borrow $T1
1527	mov	$SZ*1($ctx),$B
1528	cmp	%rdx,$inp		# $_end
1529	mov	$SZ*2($ctx),$C
1530	cmove	%rsp,%r12		# next block or random data
1531	mov	$SZ*3($ctx),$D
1532	mov	$SZ*4($ctx),$E
1533	mov	$SZ*5($ctx),$F
1534	mov	$SZ*6($ctx),$G
1535	mov	$SZ*7($ctx),$H
1536___
1537					if ($SZ==4) {	# SHA256
1538    my @X = map("%ymm$_",(0..3));
1539    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1540
1541$code.=<<___;
1542	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1543	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1544	jmp	.Loop_avx2
1545.align	16
1546.Loop_avx2:
1547	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
1548	vmovdqu	-16*$SZ+0($inp),%xmm0
1549	vmovdqu	-16*$SZ+16($inp),%xmm1
1550	vmovdqu	-16*$SZ+32($inp),%xmm2
1551	vmovdqu	-16*$SZ+48($inp),%xmm3
1552	#mov		$inp,$_inp	# offload $inp
1553	vinserti128	\$1,(%r12),@X[0],@X[0]
1554	vinserti128	\$1,16(%r12),@X[1],@X[1]
1555	vpshufb		$t3,@X[0],@X[0]
1556	vinserti128	\$1,32(%r12),@X[2],@X[2]
1557	vpshufb		$t3,@X[1],@X[1]
1558	vinserti128	\$1,48(%r12),@X[3],@X[3]
1559
1560	lea	$TABLE(%rip),$Tbl
1561	vpshufb	$t3,@X[2],@X[2]
1562	vpaddd	0x00($Tbl),@X[0],$t0
1563	vpshufb	$t3,@X[3],@X[3]
1564	vpaddd	0x20($Tbl),@X[1],$t1
1565	vpaddd	0x40($Tbl),@X[2],$t2
1566	vpaddd	0x60($Tbl),@X[3],$t3
1567	vmovdqa	$t0,0x00(%rsp)
1568	xor	$a1,$a1
1569	vmovdqa	$t1,0x20(%rsp)
1570	lea	-$PUSH8(%rsp),%rsp
1571	mov	$B,$a3
1572	vmovdqa	$t2,0x00(%rsp)
1573	xor	$C,$a3			# magic
1574	vmovdqa	$t3,0x20(%rsp)
1575	mov	$F,$a4
1576	sub	\$-16*2*$SZ,$Tbl	# size optimization
1577	jmp	.Lavx2_00_47
1578
1579.align	16
1580.Lavx2_00_47:
1581___
1582
1583sub AVX2_256_00_47 () {
1584my $j = shift;
1585my $body = shift;
1586my @X = @_;
1587my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
1588my $base = "+2*$PUSH8(%rsp)";
1589
1590	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
1591	foreach (Xupdate_256_AVX()) {		# 29 instructions
1592	    eval;
1593	    eval(shift(@insns));
1594	    eval(shift(@insns));
1595	    eval(shift(@insns));
1596	}
1597	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
1598	  foreach (@insns) { eval; }		# remaining instructions
1599	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1600}
1601
1602    for ($i=0,$j=0; $j<4; $j++) {
1603	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
1604	push(@X,shift(@X));			# rotate(@X)
1605    }
1606	&lea	($Tbl,16*2*$SZ."($Tbl)");
1607	&cmpb	(($SZ-1)."($Tbl)",0);
1608	&jne	(".Lavx2_00_47");
1609
1610    for ($i=0; $i<16; ) {
1611	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1612	foreach(bodyx_00_15()) { eval; }
1613    }
1614					} else {	# SHA512
1615    my @X = map("%ymm$_",(0..7));
1616    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1617
1618$code.=<<___;
1619	jmp	.Loop_avx2
1620.align	16
1621.Loop_avx2:
1622	vmovdqu	-16*$SZ($inp),%xmm0
1623	vmovdqu	-16*$SZ+16($inp),%xmm1
1624	vmovdqu	-16*$SZ+32($inp),%xmm2
1625	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
1626	vmovdqu	-16*$SZ+48($inp),%xmm3
1627	vmovdqu	-16*$SZ+64($inp),%xmm4
1628	vmovdqu	-16*$SZ+80($inp),%xmm5
1629	vmovdqu	-16*$SZ+96($inp),%xmm6
1630	vmovdqu	-16*$SZ+112($inp),%xmm7
1631	#mov	$inp,$_inp	# offload $inp
1632	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
1633	vinserti128	\$1,(%r12),@X[0],@X[0]
1634	vinserti128	\$1,16(%r12),@X[1],@X[1]
1635	 vpshufb	$t2,@X[0],@X[0]
1636	vinserti128	\$1,32(%r12),@X[2],@X[2]
1637	 vpshufb	$t2,@X[1],@X[1]
1638	vinserti128	\$1,48(%r12),@X[3],@X[3]
1639	 vpshufb	$t2,@X[2],@X[2]
1640	vinserti128	\$1,64(%r12),@X[4],@X[4]
1641	 vpshufb	$t2,@X[3],@X[3]
1642	vinserti128	\$1,80(%r12),@X[5],@X[5]
1643	 vpshufb	$t2,@X[4],@X[4]
1644	vinserti128	\$1,96(%r12),@X[6],@X[6]
1645	 vpshufb	$t2,@X[5],@X[5]
1646	vinserti128	\$1,112(%r12),@X[7],@X[7]
1647
1648	vpaddq	-0x80($Tbl),@X[0],$t0
1649	vpshufb	$t2,@X[6],@X[6]
1650	vpaddq	-0x60($Tbl),@X[1],$t1
1651	vpshufb	$t2,@X[7],@X[7]
1652	vpaddq	-0x40($Tbl),@X[2],$t2
1653	vpaddq	-0x20($Tbl),@X[3],$t3
1654	vmovdqa	$t0,0x00(%rsp)
1655	vpaddq	0x00($Tbl),@X[4],$t0
1656	vmovdqa	$t1,0x20(%rsp)
1657	vpaddq	0x20($Tbl),@X[5],$t1
1658	vmovdqa	$t2,0x40(%rsp)
1659	vpaddq	0x40($Tbl),@X[6],$t2
1660	vmovdqa	$t3,0x60(%rsp)
1661	lea	-$PUSH8(%rsp),%rsp
1662	vpaddq	0x60($Tbl),@X[7],$t3
1663	vmovdqa	$t0,0x00(%rsp)
1664	xor	$a1,$a1
1665	vmovdqa	$t1,0x20(%rsp)
1666	mov	$B,$a3
1667	vmovdqa	$t2,0x40(%rsp)
1668	xor	$C,$a3			# magic
1669	vmovdqa	$t3,0x60(%rsp)
1670	mov	$F,$a4
1671	add	\$16*2*$SZ,$Tbl
1672	jmp	.Lavx2_00_47
1673
1674.align	16
1675.Lavx2_00_47:
1676___
1677
1678sub AVX2_512_00_47 () {
1679my $j = shift;
1680my $body = shift;
1681my @X = @_;
1682my @insns = (&$body,&$body);			# 48 instructions
1683my $base = "+2*$PUSH8(%rsp)";
1684
1685	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
1686	foreach (Xupdate_512_AVX()) {		# 23 instructions
1687	    eval;
1688	    if ($_ !~ /\;$/) {
1689		eval(shift(@insns));
1690		eval(shift(@insns));
1691		eval(shift(@insns));
1692	    }
1693	}
1694	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
1695	  foreach (@insns) { eval; }		# remaining instructions
1696	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
1697}
1698
1699    for ($i=0,$j=0; $j<8; $j++) {
1700	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
1701	push(@X,shift(@X));			# rotate(@X)
1702    }
1703	&lea	($Tbl,16*2*$SZ."($Tbl)");
1704	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
1705	&jne	(".Lavx2_00_47");
1706
1707    for ($i=0; $i<16; ) {
1708	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1709	foreach(bodyx_00_15()) { eval; }
1710    }
1711}
1712$code.=<<___;
1713	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
1714	add	$a1,$A
1715	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
1716	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
1717
1718	add	$SZ*0($ctx),$A
1719	add	$SZ*1($ctx),$B
1720	add	$SZ*2($ctx),$C
1721	add	$SZ*3($ctx),$D
1722	add	$SZ*4($ctx),$E
1723	add	$SZ*5($ctx),$F
1724	add	$SZ*6($ctx),$G
1725	add	$SZ*7($ctx),$H
1726
1727	mov	$A,$SZ*0($ctx)
1728	mov	$B,$SZ*1($ctx)
1729	mov	$C,$SZ*2($ctx)
1730	mov	$D,$SZ*3($ctx)
1731	mov	$E,$SZ*4($ctx)
1732	mov	$F,$SZ*5($ctx)
1733	mov	$G,$SZ*6($ctx)
1734	mov	$H,$SZ*7($ctx)
1735
1736	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
1737	je	.Ldone_avx2
1738
1739	xor	$a1,$a1
1740	mov	$B,$a3
1741	xor	$C,$a3			# magic
1742	mov	$F,$a4
1743	jmp	.Lower_avx2
1744.align	16
1745.Lower_avx2:
1746___
1747    for ($i=0; $i<8; ) {
1748	my $base="+16($Tbl)";
1749	foreach(bodyx_00_15()) { eval; }
1750    }
1751$code.=<<___;
1752	lea	-$PUSH8($Tbl),$Tbl
1753	cmp	%rsp,$Tbl
1754	jae	.Lower_avx2
1755
1756	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
1757	add	$a1,$A
1758	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
1759	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
1760
1761	add	$SZ*0($ctx),$A
1762	add	$SZ*1($ctx),$B
1763	add	$SZ*2($ctx),$C
1764	add	$SZ*3($ctx),$D
1765	add	$SZ*4($ctx),$E
1766	add	$SZ*5($ctx),$F
1767	lea	`2*16*$SZ`($inp),$inp	# inp+=2
1768	add	$SZ*6($ctx),$G
1769	mov	$inp,%r12
1770	add	$SZ*7($ctx),$H
1771	cmp	$_end,$inp
1772
1773	mov	$A,$SZ*0($ctx)
1774	cmove	%rsp,%r12		# next block or stale data
1775	mov	$B,$SZ*1($ctx)
1776	mov	$C,$SZ*2($ctx)
1777	mov	$D,$SZ*3($ctx)
1778	mov	$E,$SZ*4($ctx)
1779	mov	$F,$SZ*5($ctx)
1780	mov	$G,$SZ*6($ctx)
1781	mov	$H,$SZ*7($ctx)
1782
1783	jbe	.Loop_avx2
1784	lea	(%rsp),$Tbl
1785
1786.Ldone_avx2:
1787	lea	($Tbl),%rsp
1788	mov	$_rsp,%rsi
1789.cfi_def_cfa	%rsi,8
1790	vzeroupper
1791___
1792$code.=<<___ if ($win64);
1793	movaps	16*$SZ+32(%rsp),%xmm6
1794	movaps	16*$SZ+48(%rsp),%xmm7
1795	movaps	16*$SZ+64(%rsp),%xmm8
1796	movaps	16*$SZ+80(%rsp),%xmm9
1797___
1798$code.=<<___ if ($win64 && $SZ>4);
1799	movaps	16*$SZ+96(%rsp),%xmm10
1800	movaps	16*$SZ+112(%rsp),%xmm11
1801___
1802$code.=<<___;
1803	mov	-48(%rsi),%r15
1804.cfi_restore	%r15
1805	mov	-40(%rsi),%r14
1806.cfi_restore	%r14
1807	mov	-32(%rsi),%r13
1808.cfi_restore	%r13
1809	mov	-24(%rsi),%r12
1810.cfi_restore	%r12
1811	mov	-16(%rsi),%rbp
1812.cfi_restore	%rbp
1813	mov	-8(%rsi),%rbx
1814.cfi_restore	%rbx
1815	lea	(%rsi),%rsp
1816.cfi_def_cfa_register	%rsp
1817.Lepilogue_avx2:
1818	ret
1819.cfi_endproc
1820.size	${func}_avx2,.-${func}_avx2
1821___
1822}}
1823}}}}}
1824
1825# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1826#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1827if ($win64) {
1828$rec="%rcx";
1829$frame="%rdx";
1830$context="%r8";
1831$disp="%r9";
1832
1833$code.=<<___;
1834.extern	__imp_RtlVirtualUnwind
1835.type	se_handler,\@abi-omnipotent
1836.align	16
1837se_handler:
1838	push	%rsi
1839	push	%rdi
1840	push	%rbx
1841	push	%rbp
1842	push	%r12
1843	push	%r13
1844	push	%r14
1845	push	%r15
1846	pushfq
1847	sub	\$64,%rsp
1848
1849	mov	120($context),%rax	# pull context->Rax
1850	mov	248($context),%rbx	# pull context->Rip
1851
1852	mov	8($disp),%rsi		# disp->ImageBase
1853	mov	56($disp),%r11		# disp->HandlerData
1854
1855	mov	0(%r11),%r10d		# HandlerData[0]
1856	lea	(%rsi,%r10),%r10	# prologue label
1857	cmp	%r10,%rbx		# context->Rip<prologue label
1858	jb	.Lin_prologue
1859
1860	mov	152($context),%rax	# pull context->Rsp
1861
1862	mov	4(%r11),%r10d		# HandlerData[1]
1863	lea	(%rsi,%r10),%r10	# epilogue label
1864	cmp	%r10,%rbx		# context->Rip>=epilogue label
1865	jae	.Lin_prologue
1866___
1867$code.=<<___ if ($avx>1);
1868	lea	.Lavx2_shortcut(%rip),%r10
1869	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
1870	jb	.Lnot_in_avx2
1871
1872	and	\$-256*$SZ,%rax
1873	add	\$`2*$SZ*($rounds-8)`,%rax
1874.Lnot_in_avx2:
1875___
1876$code.=<<___;
1877	mov	%rax,%rsi		# put aside Rsp
1878	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
1879
1880	mov	-8(%rax),%rbx
1881	mov	-16(%rax),%rbp
1882	mov	-24(%rax),%r12
1883	mov	-32(%rax),%r13
1884	mov	-40(%rax),%r14
1885	mov	-48(%rax),%r15
1886	mov	%rbx,144($context)	# restore context->Rbx
1887	mov	%rbp,160($context)	# restore context->Rbp
1888	mov	%r12,216($context)	# restore context->R12
1889	mov	%r13,224($context)	# restore context->R13
1890	mov	%r14,232($context)	# restore context->R14
1891	mov	%r15,240($context)	# restore context->R15
1892
1893	lea	.Lepilogue(%rip),%r10
1894	cmp	%r10,%rbx
1895	jb	.Lin_prologue		# non-AVX code
1896
1897	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
1898	lea	512($context),%rdi	# &context.Xmm6
1899	mov	\$`$SZ==4?8:12`,%ecx
1900	.long	0xa548f3fc		# cld; rep movsq
1901
1902.Lin_prologue:
1903	mov	8(%rax),%rdi
1904	mov	16(%rax),%rsi
1905	mov	%rax,152($context)	# restore context->Rsp
1906	mov	%rsi,168($context)	# restore context->Rsi
1907	mov	%rdi,176($context)	# restore context->Rdi
1908
1909	mov	40($disp),%rdi		# disp->ContextRecord
1910	mov	$context,%rsi		# context
1911	mov	\$154,%ecx		# sizeof(CONTEXT)
1912	.long	0xa548f3fc		# cld; rep movsq
1913
1914	mov	$disp,%rsi
1915	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1916	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1917	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1918	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1919	mov	40(%rsi),%r10		# disp->ContextRecord
1920	lea	56(%rsi),%r11		# &disp->HandlerData
1921	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1922	mov	%r10,32(%rsp)		# arg5
1923	mov	%r11,40(%rsp)		# arg6
1924	mov	%r12,48(%rsp)		# arg7
1925	mov	%rcx,56(%rsp)		# arg8, (NULL)
1926	call	*__imp_RtlVirtualUnwind(%rip)
1927
1928	mov	\$1,%eax		# ExceptionContinueSearch
1929	add	\$64,%rsp
1930	popfq
1931	pop	%r15
1932	pop	%r14
1933	pop	%r13
1934	pop	%r12
1935	pop	%rbp
1936	pop	%rbx
1937	pop	%rdi
1938	pop	%rsi
1939	ret
1940.size	se_handler,.-se_handler
1941___
1942
1943$code.=<<___ if ($SZ==4 && $shaext);
1944.type	shaext_handler,\@abi-omnipotent
1945.align	16
1946shaext_handler:
1947	push	%rsi
1948	push	%rdi
1949	push	%rbx
1950	push	%rbp
1951	push	%r12
1952	push	%r13
1953	push	%r14
1954	push	%r15
1955	pushfq
1956	sub	\$64,%rsp
1957
1958	mov	120($context),%rax	# pull context->Rax
1959	mov	248($context),%rbx	# pull context->Rip
1960
1961	lea	.Lprologue_shaext(%rip),%r10
1962	cmp	%r10,%rbx		# context->Rip<.Lprologue
1963	jb	.Lin_prologue
1964
1965	lea	.Lepilogue_shaext(%rip),%r10
1966	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
1967	jae	.Lin_prologue
1968
1969	lea	-8-5*16(%rax),%rsi
1970	lea	512($context),%rdi	# &context.Xmm6
1971	mov	\$10,%ecx
1972	.long	0xa548f3fc		# cld; rep movsq
1973
1974	jmp	.Lin_prologue
1975.size	shaext_handler,.-shaext_handler
1976___
1977
1978$code.=<<___;
1979.section	.pdata
1980.align	4
1981	.rva	.LSEH_begin_${func}_nohw
1982	.rva	.LSEH_end_${func}_nohw
1983	.rva	.LSEH_info_${func}_nohw
1984___
1985$code.=<<___ if ($SZ==4 && $shaext);
1986	.rva	.LSEH_begin_${func}_hw
1987	.rva	.LSEH_end_${func}_hw
1988	.rva	.LSEH_info_${func}_hw
1989___
1990$code.=<<___ if ($SZ==4);
1991	.rva	.LSEH_begin_${func}_ssse3
1992	.rva	.LSEH_end_${func}_ssse3
1993	.rva	.LSEH_info_${func}_ssse3
1994___
1995$code.=<<___ if ($avx);
1996	.rva	.LSEH_begin_${func}_avx
1997	.rva	.LSEH_end_${func}_avx
1998	.rva	.LSEH_info_${func}_avx
1999___
2000$code.=<<___ if ($avx>1);
2001	.rva	.LSEH_begin_${func}_avx2
2002	.rva	.LSEH_end_${func}_avx2
2003	.rva	.LSEH_info_${func}_avx2
2004___
2005$code.=<<___;
2006.section	.xdata
2007.align	8
2008.LSEH_info_${func}_nohw:
2009	.byte	9,0,0,0
2010	.rva	se_handler
2011	.rva	.Lprologue,.Lepilogue			# HandlerData[]
2012___
2013$code.=<<___ if ($SZ==4 && $shaext);
2014.LSEH_info_${func}_hw:
2015	.byte	9,0,0,0
2016	.rva	shaext_handler
2017___
2018$code.=<<___ if ($SZ==4);
2019.LSEH_info_${func}_ssse3:
2020	.byte	9,0,0,0
2021	.rva	se_handler
2022	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
2023___
2024$code.=<<___ if ($avx);
2025.LSEH_info_${func}_avx:
2026	.byte	9,0,0,0
2027	.rva	se_handler
2028	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
2029___
2030$code.=<<___ if ($avx>1);
2031.LSEH_info_${func}_avx2:
2032	.byte	9,0,0,0
2033	.rva	se_handler
2034	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2035___
2036}
2037
2038sub sha256op38 {
2039    my $instr = shift;
2040    my %opcodelet = (
2041		"sha256rnds2" => 0xcb,
2042  		"sha256msg1"  => 0xcc,
2043		"sha256msg2"  => 0xcd	);
2044
2045    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2046      my @opcode=(0x0f,0x38);
2047	push @opcode,$opcodelet{$instr};
2048	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
2049	return ".byte\t".join(',',@opcode);
2050    } else {
2051	return $instr."\t".$_[0];
2052    }
2053}
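
# For instance, the "sha256rnds2	$ABEF,$CDGH" lines above reach the
# assembler as
#
#	.byte	0x0f,0x38,0xcb,0xd1
#
# (ModR/M 0xd1 = 0xc0 | src 1 | dst 2<<3 for %xmm1,%xmm2), which keeps the
# module usable with assemblers that predate the SHA extension mnemonics.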
2054
2055foreach (split("\n",$code)) {
2056	s/\`([^\`]*)\`/eval $1/geo;
2057
2058	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2059
2060	print $_,"\n";
2061}
2062close STDOUT or die "error closing STDOUT: $!";
2063