1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9#
10# SHA256 block transform for x86. September 2007.
11#
12# Performance improvement over compiler generated code varies from
13# 10% to 40% [see below]. Not very impressive on some µ-archs, but
14# it's 5 times smaller and optimizes the number of writes.
15#
16# May 2012.
17#
18# Optimization including two of Pavel Semjanov's ideas, alternative
19# Maj and full unroll, resulted in ~20-25% improvement on most CPUs,
20# ~7% on Pentium, ~40% on Atom. As the fully unrolled loop body is almost
21# 15x larger, 8KB vs. 560B, it is engaged only for longer inputs. But not
22# on P4, where it kills performance, nor on Sandy Bridge, where the folded
23# loop is approximately as fast...
24#
25# June 2012.
26#
27# Add AMD XOP-specific code path, >30% improvement on Bulldozer over
28# May version, >60% over original. Add AVX+shrd code path, >25%
29# improvement on Sandy Bridge over May version, 60% over original.
30#
31# May 2013.
32#
33# Replace AMD XOP code path with SSSE3 to cover more processors.
34# (Biggest improvement coefficient is on upcoming Atom Silvermont,
35# not shown.) Add AVX+BMI code path.
36#
37# March 2014.
38#
39# Add support for Intel SHA Extensions.
40#
41# Performance in clock cycles per processed byte (less is better):
42#
43#		gcc	icc	x86 asm(*)	SIMD	x86_64 asm(**)
44# Pentium	46	57	40/38		-	-
45# PIII		36	33	27/24		-	-
46# P4		41	38	28		-	17.3
47# AMD K8	27	25	19/15.5		-	14.9
48# Core2		26	23	18/15.6		14.3	13.8
49# Westmere	27	-	19/15.7		13.4	12.3
50# Sandy Bridge	25	-	15.9		12.4	11.6
51# Ivy Bridge	24	-	15.0		11.4	10.3
52# Haswell	22	-	13.9		9.46	7.80
53# Skylake	20	-	14.9		9.50	7.70
54# Bulldozer	36	-	27/22		17.0	13.6
55# VIA Nano	36	-	25/22		16.8	16.5
56# Atom		50	-	30/25		21.9	18.9
57# Silvermont	40	-	34/31		22.9	20.6
58# Goldmont	29	-	20		16.3(***)
59#
60# (*)	numbers after slash are for unrolled loop, where applicable;
61# (**)	x86_64 assembly performance is presented for reference
62#	purposes, results are best-available;
63# (***)	SHAEXT result is 4.1, strangely enough better than 64-bit one;
64
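# The round code below uses the standard SHA-256 functions with one twist:
# since Maj(a,b,c) = Ch(a^b,c,b), the value b^c is carried from round to round
# ("magic" in the comments below), so Maj costs a single AND+XOR per round.
# As a minimal reference sketch of the functions being scheduled (illustrative
# helpers only, never called by the generator):
sub ref_ror32	{ my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff; }
sub ref_Sigma0	{ my $a=shift; ref_ror32($a,2)^ref_ror32($a,13)^ref_ror32($a,22); }
sub ref_Sigma1	{ my $e=shift; ref_ror32($e,6)^ref_ror32($e,11)^ref_ror32($e,25); }
sub ref_sigma0	{ my $x=shift; ref_ror32($x,7)^ref_ror32($x,18)^($x>>3); }
sub ref_sigma1	{ my $x=shift; ref_ror32($x,17)^ref_ror32($x,19)^($x>>10); }
sub ref_Ch	{ my ($e,$f,$g)=@_; ($e&$f)^((~$e&0xffffffff)&$g); }
sub ref_Maj	{ my ($a,$b,$c)=@_; ref_Ch($a^$b,$c,$b); }	# == (a&b)^(a&c)^(b&c)
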
65$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
66push(@INC,"${dir}","${dir}../../../perlasm");
67require "x86asm.pl";
68
69$output=pop;
70open STDOUT,">$output";
71
72&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386");
73
74$xmm=$avx=0;
75for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); }
76
77# In upstream, this is controlled by shelling out to the compiler to check
78# versions, but BoringSSL is intended to be used with pre-generated perlasm
79# output, so this isn't useful anyway.
80#
81# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2.
82$avx = 1;
83
84$avx = 0 unless ($xmm);
85
86$shaext=$xmm;	### set to zero if compiling for 1.0.1
87
88# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
89# been tested.
90$shaext = 0;
91
92$unroll_after = 64*4;	# If pre-evicted from L1P cache, the first spin of
93			# the fully unrolled loop was measured to run about
94			# 3-4x slower. If the slowdown coefficient is N and
95			# the unrolled loop is m times faster, you break
96			# even at (N-1)/(m-1) blocks. Then it needs to be
97			# adjusted for the probability of code being evicted,
98			# code size/cache size=1/4. Typical m is 1.15...
99
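# One way to read the break-even estimate above (an interpretation added here,
# not part of the original note): with N ~ 3.5 and m ~ 1.15, (N-1)/(m-1) comes
# to ~17 blocks if eviction were certain; scaling by the ~1/4 eviction
# probability gives ~4 blocks, i.e. 4*64 = 256 bytes, which is the 64*4
# threshold chosen above.
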
100$A="eax";
101$E="edx";
102$T="ebx";
103$Aoff=&DWP(4,"esp");
104$Boff=&DWP(8,"esp");
105$Coff=&DWP(12,"esp");
106$Doff=&DWP(16,"esp");
107$Eoff=&DWP(20,"esp");
108$Foff=&DWP(24,"esp");
109$Goff=&DWP(28,"esp");
110$Hoff=&DWP(32,"esp");
111$Xoff=&DWP(36,"esp");
112$K256="ebp";
113
114sub BODY_16_63() {
115	&mov	($T,"ecx");			# "ecx" is preloaded
116	 &mov	("esi",&DWP(4*(9+15+16-14),"esp"));
117	&ror	("ecx",18-7);
118	 &mov	("edi","esi");
119	&ror	("esi",19-17);
120	 &xor	("ecx",$T);
121	 &shr	($T,3);
122	&ror	("ecx",7);
123	 &xor	("esi","edi");
124	 &xor	($T,"ecx");			# T = sigma0(X[-15])
125	&ror	("esi",17);
126	 &add	($T,&DWP(4*(9+15+16),"esp"));	# T += X[-16]
127	&shr	("edi",10);
128	 &add	($T,&DWP(4*(9+15+16-9),"esp"));	# T += X[-7]
129	#&xor	("edi","esi")			# sigma1(X[-2])
130	# &add	($T,"edi");			# T += sigma1(X[-2])
131	# &mov	(&DWP(4*(9+15),"esp"),$T);	# save X[0]
132
133	&BODY_00_15(1);
134}
135sub BODY_00_15() {
136    my $in_16_63=shift;
137
138	&mov	("ecx",$E);
139	 &xor	("edi","esi")			if ($in_16_63);	# sigma1(X[-2])
140	 &mov	("esi",$Foff);
141	&ror	("ecx",25-11);
142	 &add	($T,"edi")			if ($in_16_63);	# T += sigma1(X[-2])
143	 &mov	("edi",$Goff);
144	&xor	("ecx",$E);
145	 &xor	("esi","edi");
146	 &mov	($T,&DWP(4*(9+15),"esp"))	if (!$in_16_63);
147	 &mov	(&DWP(4*(9+15),"esp"),$T)	if ($in_16_63);	# save X[0]
148	&ror	("ecx",11-6);
149	 &and	("esi",$E);
150	 &mov	($Eoff,$E);		# modulo-scheduled
151	&xor	($E,"ecx");
152	 &add	($T,$Hoff);		# T += h
153	 &xor	("esi","edi");		# Ch(e,f,g)
154	&ror	($E,6);			# Sigma1(e)
155	 &mov	("ecx",$A);
156	 &add	($T,"esi");		# T += Ch(e,f,g)
157
158	&ror	("ecx",22-13);
159	 &add	($T,$E);		# T += Sigma1(e)
160	 &mov	("edi",$Boff);
161	&xor	("ecx",$A);
162	 &mov	($Aoff,$A);		# modulo-scheduled
163	 &lea	("esp",&DWP(-4,"esp"));
164	&ror	("ecx",13-2);
165	 &mov	("esi",&DWP(0,$K256));
166	&xor	("ecx",$A);
167	 &mov	($E,$Eoff);		# e in next iteration, d in this one
168	 &xor	($A,"edi");		# a ^= b
169	&ror	("ecx",2);		# Sigma0(a)
170
171	 &add	($T,"esi");		# T+= K[i]
172	 &mov	(&DWP(0,"esp"),$A);	# (b^c) in next round
173	&add	($E,$T);		# d += T
174	 &and	($A,&DWP(4,"esp"));	# a &= (b^c)
175	&add	($T,"ecx");		# T += Sigma0(a)
176	 &xor	($A,"edi");		# h = Maj(a,b,c) = Ch(a^b,c,b)
177	 &mov	("ecx",&DWP(4*(9+15+16-1),"esp"))	if ($in_16_63);	# preload T
178	&add	($K256,4);
179	 &add	($A,$T);		# h += T
180}
181
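# For cross-checking the heavily interleaved code above, a straight-line
# reference of one round (hypothetical helper, never called by the generator;
# it relies on the ref_* sketches defined near the top of this file):
sub ref_round {
    my ($st,$w,$k)=@_;		# $st = [a,b,c,d,e,f,g,h], $w = X[i], $k = K256[i]
    my ($a,$b,$c,$d,$e,$f,$g,$h)=@$st;
    my $T1=($h+ref_Sigma1($e)+ref_Ch($e,$f,$g)+$k+$w)&0xffffffff;
    my $T2=(ref_Sigma0($a)+ref_Maj($a,$b,$c))&0xffffffff;
    @$st=(($T1+$T2)&0xffffffff,$a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);
}
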
182&external_label("OPENSSL_ia32cap_P")		if (!$i386);
183
184&function_begin("sha256_block_data_order");
185	&mov	("esi",wparam(0));	# ctx
186	&mov	("edi",wparam(1));	# inp
187	&mov	("eax",wparam(2));	# num
188	&mov	("ebx","esp");		# saved sp
189
190	&call	(&label("pic_point"));	# make it PIC!
191&set_label("pic_point");
192	&blindpop($K256);
193	&lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));
194
195	&sub	("esp",16);
196	&and	("esp",-64);
197
198	&shl	("eax",6);
199	&add	("eax","edi");
200	&mov	(&DWP(0,"esp"),"esi");	# ctx
201	&mov	(&DWP(4,"esp"),"edi");	# inp
202	&mov	(&DWP(8,"esp"),"eax");	# inp+num*128
203	&mov	(&DWP(12,"esp"),"ebx");	# saved sp
204						if (!$i386 && $xmm) {
205	&picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256"));
206	&mov	("ecx",&DWP(0,"edx"));
207	&mov	("ebx",&DWP(4,"edx"));
208	&test	("ecx",1<<20);		# check for P4
209	&jnz	(&label("loop"));
210	&mov	("edx",&DWP(8,"edx"))	if ($xmm);
211	&test	("ecx",1<<24);		# check for FXSR
212	&jz	($unroll_after?&label("no_xmm"):&label("loop"));
213	&and	("ecx",1<<30);		# mask "Intel CPU" bit
214	&and	("ebx",1<<28|1<<9);	# mask AVX and SSSE3 bits
215	&test	("edx",1<<29)		if ($shaext);	# check for SHA
216	&jnz	(&label("shaext"))	if ($shaext);
217	&or	("ecx","ebx");
218	&and	("ecx",1<<28|1<<30);
219	&cmp	("ecx",1<<28|1<<30);
220					if ($xmm) {
221	&je	(&label("AVX"))		if ($avx);
222	&test	("ebx",1<<9);		# check for SSSE3
223	&jnz	(&label("SSSE3"));
224					} else {
225	&je	(&label("loop_shrd"));
226					}
227						if ($unroll_after) {
228&set_label("no_xmm");
229	&sub	("eax","edi");
230	&cmp	("eax",$unroll_after);
231	&jae	(&label("unrolled"));
232						} }
233	&jmp	(&label("loop"));
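
# Dispatch summary (descriptive comment added for readability; the checks
# above are only emitted for non-386, SSE2-enabled builds): P4 always takes
# the compact loop; without FXSR the choice between the compact and fully
# unrolled loops is made purely on input length; the SHA-extension path (when
# enabled) takes precedence; the AVX paths require both the AVX bit and the
# "Intel CPU" bit, with AVX_BMI additionally requiring BMI1+BMI2; otherwise
# SSSE3 is used when available, falling back to the scalar loops.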
234
235sub COMPACT_LOOP() {
236my $suffix=shift;
237
238&set_label("loop$suffix",$suffix?32:16);
239    # copy input block to stack reversing byte and dword order
240    for($i=0;$i<4;$i++) {
241	&mov	("eax",&DWP($i*16+0,"edi"));
242	&mov	("ebx",&DWP($i*16+4,"edi"));
243	&mov	("ecx",&DWP($i*16+8,"edi"));
244	&bswap	("eax");
245	&mov	("edx",&DWP($i*16+12,"edi"));
246	&bswap	("ebx");
247	&push	("eax");
248	&bswap	("ecx");
249	&push	("ebx");
250	&bswap	("edx");
251	&push	("ecx");
252	&push	("edx");
253    }
254	&add	("edi",64);
255	&lea	("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H
256	&mov	(&DWP(4*(9+16)+4,"esp"),"edi");
257
258	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
259	&mov	($A,&DWP(0,"esi"));
260	&mov	("ebx",&DWP(4,"esi"));
261	&mov	("ecx",&DWP(8,"esi"));
262	&mov	("edi",&DWP(12,"esi"));
263	# &mov	($Aoff,$A);
264	&mov	($Boff,"ebx");
265	&xor	("ebx","ecx");
266	&mov	($Coff,"ecx");
267	&mov	($Doff,"edi");
268	&mov	(&DWP(0,"esp"),"ebx");	# magic
269	&mov	($E,&DWP(16,"esi"));
270	&mov	("ebx",&DWP(20,"esi"));
271	&mov	("ecx",&DWP(24,"esi"));
272	&mov	("edi",&DWP(28,"esi"));
273	# &mov	($Eoff,$E);
274	&mov	($Foff,"ebx");
275	&mov	($Goff,"ecx");
276	&mov	($Hoff,"edi");
277
278&set_label("00_15$suffix",16);
279
280	&BODY_00_15();
281
282	&cmp	("esi",0xc19bf174);
283	&jne	(&label("00_15$suffix"));
284
285	&mov	("ecx",&DWP(4*(9+15+16-1),"esp"));	# preloaded in BODY_00_15(1)
286	&jmp	(&label("16_63$suffix"));
287
288&set_label("16_63$suffix",16);
289
290	&BODY_16_63();
291
292	&cmp	("esi",0xc67178f2);
293	&jne	(&label("16_63$suffix"));
294
295	&mov	("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx
296	# &mov	($A,$Aoff);
297	&mov	("ebx",$Boff);
298	# &mov	("edi",$Coff);
299	&mov	("ecx",$Doff);
300	&add	($A,&DWP(0,"esi"));
301	&add	("ebx",&DWP(4,"esi"));
302	&add	("edi",&DWP(8,"esi"));
303	&add	("ecx",&DWP(12,"esi"));
304	&mov	(&DWP(0,"esi"),$A);
305	&mov	(&DWP(4,"esi"),"ebx");
306	&mov	(&DWP(8,"esi"),"edi");
307	&mov	(&DWP(12,"esi"),"ecx");
308	# &mov	($E,$Eoff);
309	&mov	("eax",$Foff);
310	&mov	("ebx",$Goff);
311	&mov	("ecx",$Hoff);
312	&mov	("edi",&DWP(4*(9+16+64)+4,"esp"));#inp
313	&add	($E,&DWP(16,"esi"));
314	&add	("eax",&DWP(20,"esi"));
315	&add	("ebx",&DWP(24,"esi"));
316	&add	("ecx",&DWP(28,"esi"));
317	&mov	(&DWP(16,"esi"),$E);
318	&mov	(&DWP(20,"esi"),"eax");
319	&mov	(&DWP(24,"esi"),"ebx");
320	&mov	(&DWP(28,"esi"),"ecx");
321
322	&lea	("esp",&DWP(4*(9+16+64),"esp"));# destroy frame
323	&sub	($K256,4*64);			# rewind K
324
325	&cmp	("edi",&DWP(8,"esp"));		# are we done yet?
326	&jb	(&label("loop$suffix"));
327}
328	&COMPACT_LOOP();
329	&mov	("esp",&DWP(12,"esp"));		# restore sp
330&function_end_A();
331						if (!$i386 && !$xmm) {
332	# ~20% improvement on Sandy Bridge
333	local *ror = sub { &shrd(@_[0],@_) };
334	&COMPACT_LOOP("_shrd");
335	&mov	("esp",&DWP(12,"esp"));		# restore sp
336&function_end_A();
337						}
338
339&set_label("K256",64);	# Yes! I keep it in the code segment!
340@K256=(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
341	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
342	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
343	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
344	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
345	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
346	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
347	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
348	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
349	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
350	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
351	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
352	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
353	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
354	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
355	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2	);
356&data_word(@K256);
357&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f);	# byte swap mask
358&asciz("SHA256 block transform for x86, CRYPTOGAMS by <appro\@openssl.org>");
359
360($a,$b,$c,$d,$e,$f,$g,$h)=(0..7);	# offsets
361sub off { &DWP(4*(((shift)-$i)&7),"esp"); }
362
363if (!$i386 && $unroll_after) {
364my @AH=($A,$K256);
365
366&set_label("unrolled",16);
367	&lea	("esp",&DWP(-96,"esp"));
368	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
369	&mov	($AH[0],&DWP(0,"esi"));
370	&mov	($AH[1],&DWP(4,"esi"));
371	&mov	("ecx",&DWP(8,"esi"));
372	&mov	("ebx",&DWP(12,"esi"));
373	#&mov	(&DWP(0,"esp"),$AH[0]);
374	&mov	(&DWP(4,"esp"),$AH[1]);
375	&xor	($AH[1],"ecx");		# magic
376	&mov	(&DWP(8,"esp"),"ecx");
377	&mov	(&DWP(12,"esp"),"ebx");
378	&mov	($E,&DWP(16,"esi"));
379	&mov	("ebx",&DWP(20,"esi"));
380	&mov	("ecx",&DWP(24,"esi"));
381	&mov	("esi",&DWP(28,"esi"));
382	#&mov	(&DWP(16,"esp"),$E);
383	&mov	(&DWP(20,"esp"),"ebx");
384	&mov	(&DWP(24,"esp"),"ecx");
385	&mov	(&DWP(28,"esp"),"esi");
386	&jmp	(&label("grand_loop"));
387
388&set_label("grand_loop",16);
389    # copy input block to stack reversing byte order
390    for($i=0;$i<5;$i++) {
391	&mov	("ebx",&DWP(12*$i+0,"edi"));
392	&mov	("ecx",&DWP(12*$i+4,"edi"));
393	&bswap	("ebx");
394	&mov	("esi",&DWP(12*$i+8,"edi"));
395	&bswap	("ecx");
396	&mov	(&DWP(32+12*$i+0,"esp"),"ebx");
397	&bswap	("esi");
398	&mov	(&DWP(32+12*$i+4,"esp"),"ecx");
399	&mov	(&DWP(32+12*$i+8,"esp"),"esi");
400    }
401	&mov	("ebx",&DWP($i*12,"edi"));
402	&add	("edi",64);
403	&bswap	("ebx");
404	&mov	(&DWP(96+4,"esp"),"edi");
405	&mov	(&DWP(32+12*$i,"esp"),"ebx");
406
407    my ($t1,$t2) = ("ecx","esi");
408
409    for ($i=0;$i<64;$i++) {
410
411      if ($i>=16) {
412	&mov	($T,$t1);			# $t1 is preloaded
413	# &mov	($t2,&DWP(32+4*(($i+14)&15),"esp"));
414	&ror	($t1,18-7);
415	 &mov	("edi",$t2);
416	&ror	($t2,19-17);
417	 &xor	($t1,$T);
418	 &shr	($T,3);
419	&ror	($t1,7);
420	 &xor	($t2,"edi");
421	 &xor	($T,$t1);			# T = sigma0(X[-15])
422	&ror	($t2,17);
423	 &add	($T,&DWP(32+4*($i&15),"esp"));	# T += X[-16]
424	&shr	("edi",10);
425	 &add	($T,&DWP(32+4*(($i+9)&15),"esp"));	# T += X[-7]
426	#&xor	("edi",$t2)			# sigma1(X[-2])
427	# &add	($T,"edi");			# T += sigma1(X[-2])
428	# &mov	(&DWP(4*(9+15),"esp"),$T);	# save X[0]
429      }
430	&mov	($t1,$E);
431	 &xor	("edi",$t2)			if ($i>=16);	# sigma1(X[-2])
432	 &mov	($t2,&off($f));
433	&ror	($E,25-11);
434	 &add	($T,"edi")			if ($i>=16);	# T += sigma1(X[-2])
435	 &mov	("edi",&off($g));
436	&xor	($E,$t1);
437	 &mov	($T,&DWP(32+4*($i&15),"esp"))	if ($i<16);	# X[i]
438	 &mov	(&DWP(32+4*($i&15),"esp"),$T)	if ($i>=16 && $i<62);	# save X[0]
439	 &xor	($t2,"edi");
440	&ror	($E,11-6);
441	 &and	($t2,$t1);
442	 &mov	(&off($e),$t1);		# save $E, modulo-scheduled
443	&xor	($E,$t1);
444	 &add	($T,&off($h));		# T += h
445	 &xor	("edi",$t2);		# Ch(e,f,g)
446	&ror	($E,6);			# Sigma1(e)
447	 &mov	($t1,$AH[0]);
448	 &add	($T,"edi");		# T += Ch(e,f,g)
449
450	&ror	($t1,22-13);
451	 &mov	($t2,$AH[0]);
452	 &mov	("edi",&off($b));
453	&xor	($t1,$AH[0]);
454	 &mov	(&off($a),$AH[0]);	# save $A, modulo-scheduled
455	 &xor	($AH[0],"edi");		# a ^= b, (b^c) in next round
456	&ror	($t1,13-2);
457	 &and	($AH[1],$AH[0]);	# (b^c) &= (a^b)
458	 &lea	($E,&DWP(@K256[$i],$T,$E));	# T += Sigma1(e)+K[i]
459	&xor	($t1,$t2);
460	 &xor	($AH[1],"edi");		# h = Maj(a,b,c) = Ch(a^b,c,b)
461	 &mov	($t2,&DWP(32+4*(($i+2)&15),"esp"))	if ($i>=15 && $i<63);
462	&ror	($t1,2);		# Sigma0(a)
463
464	 &add	($AH[1],$E);		# h += T
465	 &add	($E,&off($d));		# d += T
466	&add	($AH[1],$t1);		# h += Sigma0(a)
467	 &mov	($t1,&DWP(32+4*(($i+15)&15),"esp"))	if ($i>=15 && $i<63);
468
469	@AH = reverse(@AH);		# rotate(a,h)
470	($t1,$t2) = ($t2,$t1);		# rotate(t1,t2)
471    }
472	&mov	("esi",&DWP(96,"esp"));	#ctx
473					#&mov	($AH[0],&DWP(0,"esp"));
474	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
475					#&mov	("edi", &DWP(8,"esp"));
476	&mov	("ecx",&DWP(12,"esp"));
477	&add	($AH[0],&DWP(0,"esi"));
478	&add	($AH[1],&DWP(4,"esi"));
479	&add	("edi",&DWP(8,"esi"));
480	&add	("ecx",&DWP(12,"esi"));
481	&mov	(&DWP(0,"esi"),$AH[0]);
482	&mov	(&DWP(4,"esi"),$AH[1]);
483	&mov	(&DWP(8,"esi"),"edi");
484	&mov	(&DWP(12,"esi"),"ecx");
485	 #&mov	(&DWP(0,"esp"),$AH[0]);
486	 &mov	(&DWP(4,"esp"),$AH[1]);
487	 &xor	($AH[1],"edi");		# magic
488	 &mov	(&DWP(8,"esp"),"edi");
489	 &mov	(&DWP(12,"esp"),"ecx");
490	#&mov	($E,&DWP(16,"esp"));
491	&mov	("edi",&DWP(20,"esp"));
492	&mov	("ebx",&DWP(24,"esp"));
493	&mov	("ecx",&DWP(28,"esp"));
494	&add	($E,&DWP(16,"esi"));
495	&add	("edi",&DWP(20,"esi"));
496	&add	("ebx",&DWP(24,"esi"));
497	&add	("ecx",&DWP(28,"esi"));
498	&mov	(&DWP(16,"esi"),$E);
499	&mov	(&DWP(20,"esi"),"edi");
500	&mov	(&DWP(24,"esi"),"ebx");
501	&mov	(&DWP(28,"esi"),"ecx");
502	 #&mov	(&DWP(16,"esp"),$E);
503	 &mov	(&DWP(20,"esp"),"edi");
504	&mov	("edi",&DWP(96+4,"esp"));	# inp
505	 &mov	(&DWP(24,"esp"),"ebx");
506	 &mov	(&DWP(28,"esp"),"ecx");
507
508	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
509	&jb	(&label("grand_loop"));
510
511	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
512&function_end_A();
513}
514						if (!$i386 && $xmm) {{{
515if ($shaext) {
516######################################################################
517# Intel SHA Extensions implementation of SHA256 update function.
518#
519my ($ctx,$inp,$end)=("esi","edi","eax");
520my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7));
521my @MSG=map("xmm$_",(3..6));
522
523sub sha256op38 {
524 my ($opcodelet,$dst,$src)=@_;
525    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
526    {	&data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
527}
528sub sha256rnds2	{ sha256op38(0xcb,@_); }
529sub sha256msg1	{ sha256op38(0xcc,@_); }
530sub sha256msg2	{ sha256op38(0xcd,@_); }
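
# (Note added for readability.) The three wrappers above hand-assemble the
# SHA-NI instructions as raw "0f 38 cb/cc/cd" opcode bytes plus a register-to-
# register ModR/M byte (0xc0|dst<<3|src), so no assembler support for the SHA
# extension is required. sha256rnds2 also consumes xmm0 as an implicit third
# operand, which is why $Wi is pinned to xmm0 above.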
531
532&set_label("shaext",32);
533	&sub		("esp",32);
534
535	&movdqu		($ABEF,&QWP(0,$ctx));		# DCBA
536	&lea		($K256,&DWP(0x80,$K256));
537	&movdqu		($CDGH,&QWP(16,$ctx));		# HGFE
538	&movdqa		($TMP,&QWP(0x100-0x80,$K256));	# byte swap mask
539
540	&pshufd		($Wi,$ABEF,0x1b);		# ABCD
541	&pshufd		($ABEF,$ABEF,0xb1);		# CDAB
542	&pshufd		($CDGH,$CDGH,0x1b);		# EFGH
543	&palignr	($ABEF,$CDGH,8);		# ABEF
544	&punpcklqdq	($CDGH,$Wi);			# CDGH
545	&jmp		(&label("loop_shaext"));
546
547&set_label("loop_shaext",16);
548	&movdqu		(@MSG[0],&QWP(0,$inp));
549	&movdqu		(@MSG[1],&QWP(0x10,$inp));
550	&movdqu		(@MSG[2],&QWP(0x20,$inp));
551	&pshufb		(@MSG[0],$TMP);
552	&movdqu		(@MSG[3],&QWP(0x30,$inp));
553	&movdqa		(&QWP(16,"esp"),$CDGH);		# offload
554
555	&movdqa		($Wi,&QWP(0*16-0x80,$K256));
556	&paddd		($Wi,@MSG[0]);
557	&pshufb		(@MSG[1],$TMP);
558	&sha256rnds2	($CDGH,$ABEF);			# 0-3
559	&pshufd		($Wi,$Wi,0x0e);
560	&nop		();
561	&movdqa		(&QWP(0,"esp"),$ABEF);		# offload
562	&sha256rnds2	($ABEF,$CDGH);
563
564	&movdqa		($Wi,&QWP(1*16-0x80,$K256));
565	&paddd		($Wi,@MSG[1]);
566	&pshufb		(@MSG[2],$TMP);
567	&sha256rnds2	($CDGH,$ABEF);			# 4-7
568	&pshufd		($Wi,$Wi,0x0e);
569	&lea		($inp,&DWP(0x40,$inp));
570	&sha256msg1	(@MSG[0],@MSG[1]);
571	&sha256rnds2	($ABEF,$CDGH);
572
573	&movdqa		($Wi,&QWP(2*16-0x80,$K256));
574	&paddd		($Wi,@MSG[2]);
575	&pshufb		(@MSG[3],$TMP);
576	&sha256rnds2	($CDGH,$ABEF);			# 8-11
577	&pshufd		($Wi,$Wi,0x0e);
578	&movdqa		($TMP,@MSG[3]);
579	&palignr	($TMP,@MSG[2],4);
580	&nop		();
581	&paddd		(@MSG[0],$TMP);
582	&sha256msg1	(@MSG[1],@MSG[2]);
583	&sha256rnds2	($ABEF,$CDGH);
584
585	&movdqa		($Wi,&QWP(3*16-0x80,$K256));
586	&paddd		($Wi,@MSG[3]);
587	&sha256msg2	(@MSG[0],@MSG[3]);
588	&sha256rnds2	($CDGH,$ABEF);			# 12-15
589	&pshufd		($Wi,$Wi,0x0e);
590	&movdqa		($TMP,@MSG[0]);
591	&palignr	($TMP,@MSG[3],4);
592	&nop		();
593	&paddd		(@MSG[1],$TMP);
594	&sha256msg1	(@MSG[2],@MSG[3]);
595	&sha256rnds2	($ABEF,$CDGH);
596
597for($i=4;$i<16-3;$i++) {
598	&movdqa		($Wi,&QWP($i*16-0x80,$K256));
599	&paddd		($Wi,@MSG[0]);
600	&sha256msg2	(@MSG[1],@MSG[0]);
601	&sha256rnds2	($CDGH,$ABEF);			# 16-19...
602	&pshufd		($Wi,$Wi,0x0e);
603	&movdqa		($TMP,@MSG[1]);
604	&palignr	($TMP,@MSG[0],4);
605	&nop		();
606	&paddd		(@MSG[2],$TMP);
607	&sha256msg1	(@MSG[3],@MSG[0]);
608	&sha256rnds2	($ABEF,$CDGH);
609
610	push(@MSG,shift(@MSG));
611}
612	&movdqa		($Wi,&QWP(13*16-0x80,$K256));
613	&paddd		($Wi,@MSG[0]);
614	&sha256msg2	(@MSG[1],@MSG[0]);
615	&sha256rnds2	($CDGH,$ABEF);			# 52-55
616	&pshufd		($Wi,$Wi,0x0e);
617	&movdqa		($TMP,@MSG[1]);
618	&palignr	($TMP,@MSG[0],4);
619	&sha256rnds2	($ABEF,$CDGH);
620	&paddd		(@MSG[2],$TMP);
621
622	&movdqa		($Wi,&QWP(14*16-0x80,$K256));
623	&paddd		($Wi,@MSG[1]);
624	&sha256rnds2	($CDGH,$ABEF);			# 56-59
625	&pshufd		($Wi,$Wi,0x0e);
626	&sha256msg2	(@MSG[2],@MSG[1]);
627	&movdqa		($TMP,&QWP(0x100-0x80,$K256));	# byte swap mask
628	&sha256rnds2	($ABEF,$CDGH);
629
630	&movdqa		($Wi,&QWP(15*16-0x80,$K256));
631	&paddd		($Wi,@MSG[2]);
632	&nop		();
633	&sha256rnds2	($CDGH,$ABEF);			# 60-63
634	&pshufd		($Wi,$Wi,0x0e);
635	&cmp		($end,$inp);
636	&nop		();
637	&sha256rnds2	($ABEF,$CDGH);
638
639	&paddd		($CDGH,&QWP(16,"esp"));
640	&paddd		($ABEF,&QWP(0,"esp"));
641	&jnz		(&label("loop_shaext"));
642
643	&pshufd		($CDGH,$CDGH,0xb1);		# DCHG
644	&pshufd		($TMP,$ABEF,0x1b);		# FEBA
645	&pshufd		($ABEF,$ABEF,0xb1);		# BAFE
646	&punpckhqdq	($ABEF,$CDGH);			# DCBA
647	&palignr	($CDGH,$TMP,8);			# HGFE
648
649	&mov		("esp",&DWP(32+12,"esp"));
650	&movdqu		(&QWP(0,$ctx),$ABEF);
651	&movdqu		(&QWP(16,$ctx),$CDGH);
652&function_end_A();
653}
654
655my @X = map("xmm$_",(0..3));
656my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7));
657my @AH = ($A,$T);
658
659&set_label("SSSE3",32);
660	&lea	("esp",&DWP(-96,"esp"));
661	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
662	&mov	($AH[0],&DWP(0,"esi"));
663	&mov	($AH[1],&DWP(4,"esi"));
664	&mov	("ecx",&DWP(8,"esi"));
665	&mov	("edi",&DWP(12,"esi"));
666	#&mov	(&DWP(0,"esp"),$AH[0]);
667	&mov	(&DWP(4,"esp"),$AH[1]);
668	&xor	($AH[1],"ecx");			# magic
669	&mov	(&DWP(8,"esp"),"ecx");
670	&mov	(&DWP(12,"esp"),"edi");
671	&mov	($E,&DWP(16,"esi"));
672	&mov	("edi",&DWP(20,"esi"));
673	&mov	("ecx",&DWP(24,"esi"));
674	&mov	("esi",&DWP(28,"esi"));
675	#&mov	(&DWP(16,"esp"),$E);
676	&mov	(&DWP(20,"esp"),"edi");
677	&mov	("edi",&DWP(96+4,"esp"));	# inp
678	&mov	(&DWP(24,"esp"),"ecx");
679	&mov	(&DWP(28,"esp"),"esi");
680	&movdqa	($t3,&QWP(256,$K256));
681	&jmp	(&label("grand_ssse3"));
682
683&set_label("grand_ssse3",16);
684	# load input, reverse byte order, add K256[0..15], save to stack
685	&movdqu	(@X[0],&QWP(0,"edi"));
686	&movdqu	(@X[1],&QWP(16,"edi"));
687	&movdqu	(@X[2],&QWP(32,"edi"));
688	&movdqu	(@X[3],&QWP(48,"edi"));
689	&add	("edi",64);
690	&pshufb	(@X[0],$t3);
691	&mov	(&DWP(96+4,"esp"),"edi");
692	&pshufb	(@X[1],$t3);
693	&movdqa	($t0,&QWP(0,$K256));
694	&pshufb	(@X[2],$t3);
695	&movdqa	($t1,&QWP(16,$K256));
696	&paddd	($t0,@X[0]);
697	&pshufb	(@X[3],$t3);
698	&movdqa	($t2,&QWP(32,$K256));
699	&paddd	($t1,@X[1]);
700	&movdqa	($t3,&QWP(48,$K256));
701	&movdqa	(&QWP(32+0,"esp"),$t0);
702	&paddd	($t2,@X[2]);
703	&movdqa	(&QWP(32+16,"esp"),$t1);
704	&paddd	($t3,@X[3]);
705	&movdqa	(&QWP(32+32,"esp"),$t2);
706	&movdqa	(&QWP(32+48,"esp"),$t3);
707	&jmp	(&label("ssse3_00_47"));
708
709&set_label("ssse3_00_47",16);
710	&add		($K256,64);
711
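# (Note added for readability.) The vector schedule below computes four new
# X[] words per call. SSE has no packed 32-bit rotate, so sigma0 is pieced
# together from psrld/pslld/pxor, and sigma1 uses duplicated dwords with
# 64-bit psrlq shifts (shifting {x,x} right by n leaves ror(x,n) in the low
# half). sigma1 is applied in two halves because X[16..17] must exist before
# sigma1(X[16..17]) can contribute to X[18..19].
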
712sub SSSE3_00_47 () {
713my $j = shift;
714my $body = shift;
715my @X = @_;
716my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions
717
718	  eval(shift(@insns));
719	&movdqa		($t0,@X[1]);
720	  eval(shift(@insns));			# @
721	  eval(shift(@insns));
722	&movdqa		($t3,@X[3]);
723	  eval(shift(@insns));
724	  eval(shift(@insns));
725	&palignr	($t0,@X[0],4);		# X[1..4]
726	  eval(shift(@insns));
727	  eval(shift(@insns));			# @
728	  eval(shift(@insns));
729	 &palignr	($t3,@X[2],4);		# X[9..12]
730	  eval(shift(@insns));
731	  eval(shift(@insns));
732	  eval(shift(@insns));
733	&movdqa		($t1,$t0);
734	  eval(shift(@insns));			# @
735	  eval(shift(@insns));
736	&movdqa		($t2,$t0);
737	  eval(shift(@insns));
738	  eval(shift(@insns));
739	&psrld		($t0,3);
740	  eval(shift(@insns));
741	  eval(shift(@insns));			# @
742	 &paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
743	  eval(shift(@insns));
744	  eval(shift(@insns));
745	&psrld		($t2,7);
746	  eval(shift(@insns));
747	  eval(shift(@insns));
748	  eval(shift(@insns));			# @
749	  eval(shift(@insns));
750	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
751	  eval(shift(@insns));
752	  eval(shift(@insns));
753	&pslld		($t1,32-18);
754	  eval(shift(@insns));
755	  eval(shift(@insns));			# @
756	&pxor		($t0,$t2);
757	  eval(shift(@insns));
758	  eval(shift(@insns));
759	&psrld		($t2,18-7);
760	  eval(shift(@insns));
761	  eval(shift(@insns));
762	  eval(shift(@insns));			# @
763	&pxor		($t0,$t1);
764	  eval(shift(@insns));
765	  eval(shift(@insns));
766	&pslld		($t1,18-7);
767	  eval(shift(@insns));
768	  eval(shift(@insns));
769	  eval(shift(@insns));			# @
770	&pxor		($t0,$t2);
771	  eval(shift(@insns));
772	  eval(shift(@insns));
773	 &movdqa	($t2,$t3);
774	  eval(shift(@insns));
775	  eval(shift(@insns));
776	  eval(shift(@insns));			# @
777	&pxor		($t0,$t1);		# sigma0(X[1..4])
778	  eval(shift(@insns));
779	  eval(shift(@insns));
780	 &psrld		($t3,10);
781	  eval(shift(@insns));
782	  eval(shift(@insns));
783	  eval(shift(@insns));			# @
784	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
785	  eval(shift(@insns));
786	  eval(shift(@insns));
787	 &psrlq		($t2,17);
788	  eval(shift(@insns));
789	  eval(shift(@insns));
790	  eval(shift(@insns));			# @
791	 &pxor		($t3,$t2);
792	  eval(shift(@insns));
793	  eval(shift(@insns));
794	 &psrlq		($t2,19-17);
795	  eval(shift(@insns));
796	  eval(shift(@insns));
797	  eval(shift(@insns));			# @
798	 &pxor		($t3,$t2);
799	  eval(shift(@insns));
800	  eval(shift(@insns));
801	 &pshufd	($t3,$t3,0b10000000);
802	  eval(shift(@insns));
803	  eval(shift(@insns));
804	  eval(shift(@insns));			# @
805	  eval(shift(@insns));
806	  eval(shift(@insns));
807	  eval(shift(@insns));
808	  eval(shift(@insns));
809	  eval(shift(@insns));			# @
810	  eval(shift(@insns));
811	 &psrldq	($t3,8);
812	  eval(shift(@insns));
813	  eval(shift(@insns));
814	  eval(shift(@insns));
815	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
816	  eval(shift(@insns));			# @
817	  eval(shift(@insns));
818	  eval(shift(@insns));
819	  eval(shift(@insns));
820	  eval(shift(@insns));
821	  eval(shift(@insns));			# @
822	  eval(shift(@insns));
823	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
824	  eval(shift(@insns));
825	  eval(shift(@insns));
826	  eval(shift(@insns));
827	 &movdqa	($t2,$t3);
828	  eval(shift(@insns));			# @
829	 &psrld		($t3,10);
830	  eval(shift(@insns));
831	 &psrlq		($t2,17);
832	  eval(shift(@insns));
833	  eval(shift(@insns));
834	  eval(shift(@insns));
835	  eval(shift(@insns));			# @
836	 &pxor		($t3,$t2);
837	  eval(shift(@insns));
838	  eval(shift(@insns));
839	 &psrlq		($t2,19-17);
840	  eval(shift(@insns));
841	  eval(shift(@insns));
842	  eval(shift(@insns));			# @
843	 &pxor		($t3,$t2);
844	  eval(shift(@insns));
845	  eval(shift(@insns));
846	  eval(shift(@insns));
847	 &pshufd	($t3,$t3,0b00001000);
848	  eval(shift(@insns));
849	  eval(shift(@insns));			# @
850	&movdqa		($t2,&QWP(16*$j,$K256));
851	  eval(shift(@insns));
852	  eval(shift(@insns));
853	 &pslldq	($t3,8);
854	  eval(shift(@insns));
855	  eval(shift(@insns));
856	  eval(shift(@insns));			# @
857	  eval(shift(@insns));
858	  eval(shift(@insns));
859	  eval(shift(@insns));
860	  eval(shift(@insns));
861	  eval(shift(@insns));			# @
862	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
863	  eval(shift(@insns));
864	  eval(shift(@insns));
865	  eval(shift(@insns));
866	  eval(shift(@insns));
867	&paddd		($t2,@X[0]);
868	  eval(shift(@insns));			# @
869
870	foreach (@insns) { eval; }		# remaining instructions
871
872	&movdqa		(&QWP(32+16*$j,"esp"),$t2);
873}
874
875sub body_00_15 () {
876	(
877	'&mov	("ecx",$E);',
878	'&ror	($E,25-11);',
879	 '&mov	("esi",&off($f));',
880	'&xor	($E,"ecx");',
881	 '&mov	("edi",&off($g));',
882	 '&xor	("esi","edi");',
883	'&ror	($E,11-6);',
884	 '&and	("esi","ecx");',
885	 '&mov	(&off($e),"ecx");',	# save $E, modulo-scheduled
886	'&xor	($E,"ecx");',
887	 '&xor	("edi","esi");',	# Ch(e,f,g)
888	'&ror	($E,6);',		# T = Sigma1(e)
889	 '&mov	("ecx",$AH[0]);',
890	 '&add	($E,"edi");',		# T += Ch(e,f,g)
891	 '&mov	("edi",&off($b));',
892	'&mov	("esi",$AH[0]);',
893
894	'&ror	("ecx",22-13);',
895	 '&mov	(&off($a),$AH[0]);',	# save $A, modulo-scheduled
896	'&xor	("ecx",$AH[0]);',
897	 '&xor	($AH[0],"edi");',	# a ^= b, (b^c) in next round
898	 '&add	($E,&off($h));',	# T += h
899	'&ror	("ecx",13-2);',
900	 '&and	($AH[1],$AH[0]);',	# (b^c) &= (a^b)
901	'&xor	("ecx","esi");',
902	 '&add	($E,&DWP(32+4*($i&15),"esp"));',	# T += K[i]+X[i]
903	 '&xor	($AH[1],"edi");',	# h = Maj(a,b,c) = Ch(a^b,c,b)
904	'&ror	("ecx",2);',		# Sigma0(a)
905
906	 '&add	($AH[1],$E);',		# h += T
907	 '&add	($E,&off($d));',	# d += T
908	'&add	($AH[1],"ecx");'.	# h += Sigma0(a)
909
910	'@AH = reverse(@AH); $i++;'	# rotate(a,h)
911	);
912}
913
914    for ($i=0,$j=0; $j<4; $j++) {
915	&SSSE3_00_47($j,\&body_00_15,@X);
916	push(@X,shift(@X));		# rotate(@X)
917    }
918	&cmp	(&DWP(16*$j,$K256),0x00010203);
919	&jne	(&label("ssse3_00_47"));
920
921    for ($i=0; $i<16; ) {
922	foreach(body_00_15()) { eval; }
923    }
924
925	&mov	("esi",&DWP(96,"esp"));	#ctx
926					#&mov	($AH[0],&DWP(0,"esp"));
927	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
928					#&mov	("edi", &DWP(8,"esp"));
929	&mov	("ecx",&DWP(12,"esp"));
930	&add	($AH[0],&DWP(0,"esi"));
931	&add	($AH[1],&DWP(4,"esi"));
932	&add	("edi",&DWP(8,"esi"));
933	&add	("ecx",&DWP(12,"esi"));
934	&mov	(&DWP(0,"esi"),$AH[0]);
935	&mov	(&DWP(4,"esi"),$AH[1]);
936	&mov	(&DWP(8,"esi"),"edi");
937	&mov	(&DWP(12,"esi"),"ecx");
938	 #&mov	(&DWP(0,"esp"),$AH[0]);
939	 &mov	(&DWP(4,"esp"),$AH[1]);
940	 &xor	($AH[1],"edi");			# magic
941	 &mov	(&DWP(8,"esp"),"edi");
942	 &mov	(&DWP(12,"esp"),"ecx");
943	#&mov	($E,&DWP(16,"esp"));
944	&mov	("edi",&DWP(20,"esp"));
945	&mov	("ecx",&DWP(24,"esp"));
946	&add	($E,&DWP(16,"esi"));
947	&add	("edi",&DWP(20,"esi"));
948	&add	("ecx",&DWP(24,"esi"));
949	&mov	(&DWP(16,"esi"),$E);
950	&mov	(&DWP(20,"esi"),"edi");
951	 &mov	(&DWP(20,"esp"),"edi");
952	&mov	("edi",&DWP(28,"esp"));
953	&mov	(&DWP(24,"esi"),"ecx");
954	 #&mov	(&DWP(16,"esp"),$E);
955	&add	("edi",&DWP(28,"esi"));
956	 &mov	(&DWP(24,"esp"),"ecx");
957	&mov	(&DWP(28,"esi"),"edi");
958	 &mov	(&DWP(28,"esp"),"edi");
959	&mov	("edi",&DWP(96+4,"esp"));	# inp
960
961	&movdqa	($t3,&QWP(64,$K256));
962	&sub	($K256,3*64);			# rewind K
963	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
964	&jb	(&label("grand_ssse3"));
965
966	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
967&function_end_A();
968						if ($avx) {
969&set_label("AVX",32);
970						if ($avx>1) {
971	&and	("edx",1<<8|1<<3);		# check for BMI2+BMI1
972	&cmp	("edx",1<<8|1<<3);
973	&je	(&label("AVX_BMI"));
974						}
975	&lea	("esp",&DWP(-96,"esp"));
976	&vzeroall	();
977	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
978	&mov	($AH[0],&DWP(0,"esi"));
979	&mov	($AH[1],&DWP(4,"esi"));
980	&mov	("ecx",&DWP(8,"esi"));
981	&mov	("edi",&DWP(12,"esi"));
982	#&mov	(&DWP(0,"esp"),$AH[0]);
983	&mov	(&DWP(4,"esp"),$AH[1]);
984	&xor	($AH[1],"ecx");			# magic
985	&mov	(&DWP(8,"esp"),"ecx");
986	&mov	(&DWP(12,"esp"),"edi");
987	&mov	($E,&DWP(16,"esi"));
988	&mov	("edi",&DWP(20,"esi"));
989	&mov	("ecx",&DWP(24,"esi"));
990	&mov	("esi",&DWP(28,"esi"));
991	#&mov	(&DWP(16,"esp"),$E);
992	&mov	(&DWP(20,"esp"),"edi");
993	&mov	("edi",&DWP(96+4,"esp"));	# inp
994	&mov	(&DWP(24,"esp"),"ecx");
995	&mov	(&DWP(28,"esp"),"esi");
996	&vmovdqa	($t3,&QWP(256,$K256));
997	&jmp	(&label("grand_avx"));
998
999&set_label("grand_avx",32);
1000	# load input, reverse byte order, add K256[0..15], save to stack
1001	&vmovdqu	(@X[0],&QWP(0,"edi"));
1002	&vmovdqu	(@X[1],&QWP(16,"edi"));
1003	&vmovdqu	(@X[2],&QWP(32,"edi"));
1004	&vmovdqu	(@X[3],&QWP(48,"edi"));
1005	&add		("edi",64);
1006	&vpshufb	(@X[0],@X[0],$t3);
1007	&mov		(&DWP(96+4,"esp"),"edi");
1008	&vpshufb	(@X[1],@X[1],$t3);
1009	&vpshufb	(@X[2],@X[2],$t3);
1010	&vpaddd		($t0,@X[0],&QWP(0,$K256));
1011	&vpshufb	(@X[3],@X[3],$t3);
1012	&vpaddd		($t1,@X[1],&QWP(16,$K256));
1013	&vpaddd		($t2,@X[2],&QWP(32,$K256));
1014	&vpaddd		($t3,@X[3],&QWP(48,$K256));
1015	&vmovdqa	(&QWP(32+0,"esp"),$t0);
1016	&vmovdqa	(&QWP(32+16,"esp"),$t1);
1017	&vmovdqa	(&QWP(32+32,"esp"),$t2);
1018	&vmovdqa	(&QWP(32+48,"esp"),$t3);
1019	&jmp		(&label("avx_00_47"));
1020
1021&set_label("avx_00_47",16);
1022	&add		($K256,64);
1023
1024sub Xupdate_AVX () {
1025	(
1026	'&vpalignr	($t0,@X[1],@X[0],4);',	# X[1..4]
1027	 '&vpalignr	($t3,@X[3],@X[2],4);',	# X[9..12]
1028	'&vpsrld	($t2,$t0,7);',
1029	 '&vpaddd	(@X[0],@X[0],$t3);',	# X[0..3] += X[9..12]
1030	'&vpsrld	($t3,$t0,3);',
1031	'&vpslld	($t1,$t0,14);',
1032	'&vpxor		($t0,$t3,$t2);',
1033	 '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
1034	'&vpsrld	($t2,$t2,18-7);',
1035	'&vpxor		($t0,$t0,$t1);',
1036	'&vpslld	($t1,$t1,25-14);',
1037	'&vpxor		($t0,$t0,$t2);',
1038	 '&vpsrld	($t2,$t3,10);',
1039	'&vpxor		($t0,$t0,$t1);',	# sigma0(X[1..4])
1040	 '&vpsrlq	($t1,$t3,17);',
1041	'&vpaddd	(@X[0],@X[0],$t0);',	# X[0..3] += sigma0(X[1..4])
1042	 '&vpxor	($t2,$t2,$t1);',
1043	 '&vpsrlq	($t3,$t3,19);',
1044	 '&vpxor	($t2,$t2,$t3);',	# sigma1(X[14..15])
1045	 '&vpshufd	($t3,$t2,0b10000100);',
1046	'&vpsrldq	($t3,$t3,8);',
1047	'&vpaddd	(@X[0],@X[0],$t3);',	# X[0..1] += sigma1(X[14..15])
1048	 '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
1049	 '&vpsrld	($t2,$t3,10);',
1050	 '&vpsrlq	($t1,$t3,17);',
1051	 '&vpxor	($t2,$t2,$t1);',
1052	 '&vpsrlq	($t3,$t3,19);',
1053	 '&vpxor	($t2,$t2,$t3);',	# sigma1(X[16..17])
1054	 '&vpshufd	($t3,$t2,0b11101000);',
1055	'&vpslldq	($t3,$t3,8);',
1056	'&vpaddd	(@X[0],@X[0],$t3);'	# X[2..3] += sigma1(X[16..17])
1057	);
1058}
1059
1060local *ror = sub { &shrd(@_[0],@_) };
1061sub AVX_00_47 () {
1062my $j = shift;
1063my $body = shift;
1064my @X = @_;
1065my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions
1066my $insn;
1067
1068	foreach (Xupdate_AVX()) {		# 31 instructions
1069	    eval;
1070	    eval(shift(@insns));
1071	    eval(shift(@insns));
1072	    eval($insn = shift(@insns));
1073	    eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/);
1074	}
1075	&vpaddd		($t2,@X[0],&QWP(16*$j,$K256));
1076	foreach (@insns) { eval; }		# remaining instructions
1077	&vmovdqa	(&QWP(32+16*$j,"esp"),$t2);
1078}
1079
1080    for ($i=0,$j=0; $j<4; $j++) {
1081	&AVX_00_47($j,\&body_00_15,@X);
1082	push(@X,shift(@X));		# rotate(@X)
1083    }
1084	&cmp	(&DWP(16*$j,$K256),0x00010203);
1085	&jne	(&label("avx_00_47"));
1086
1087    for ($i=0; $i<16; ) {
1088	foreach(body_00_15()) { eval; }
1089    }
1090
1091	&mov	("esi",&DWP(96,"esp"));	#ctx
1092					#&mov	($AH[0],&DWP(0,"esp"));
1093	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
1094					#&mov	("edi", &DWP(8,"esp"));
1095	&mov	("ecx",&DWP(12,"esp"));
1096	&add	($AH[0],&DWP(0,"esi"));
1097	&add	($AH[1],&DWP(4,"esi"));
1098	&add	("edi",&DWP(8,"esi"));
1099	&add	("ecx",&DWP(12,"esi"));
1100	&mov	(&DWP(0,"esi"),$AH[0]);
1101	&mov	(&DWP(4,"esi"),$AH[1]);
1102	&mov	(&DWP(8,"esi"),"edi");
1103	&mov	(&DWP(12,"esi"),"ecx");
1104	 #&mov	(&DWP(0,"esp"),$AH[0]);
1105	 &mov	(&DWP(4,"esp"),$AH[1]);
1106	 &xor	($AH[1],"edi");			# magic
1107	 &mov	(&DWP(8,"esp"),"edi");
1108	 &mov	(&DWP(12,"esp"),"ecx");
1109	#&mov	($E,&DWP(16,"esp"));
1110	&mov	("edi",&DWP(20,"esp"));
1111	&mov	("ecx",&DWP(24,"esp"));
1112	&add	($E,&DWP(16,"esi"));
1113	&add	("edi",&DWP(20,"esi"));
1114	&add	("ecx",&DWP(24,"esi"));
1115	&mov	(&DWP(16,"esi"),$E);
1116	&mov	(&DWP(20,"esi"),"edi");
1117	 &mov	(&DWP(20,"esp"),"edi");
1118	&mov	("edi",&DWP(28,"esp"));
1119	&mov	(&DWP(24,"esi"),"ecx");
1120	 #&mov	(&DWP(16,"esp"),$E);
1121	&add	("edi",&DWP(28,"esi"));
1122	 &mov	(&DWP(24,"esp"),"ecx");
1123	&mov	(&DWP(28,"esi"),"edi");
1124	 &mov	(&DWP(28,"esp"),"edi");
1125	&mov	("edi",&DWP(96+4,"esp"));	# inp
1126
1127	&vmovdqa	($t3,&QWP(64,$K256));
1128	&sub	($K256,3*64);			# rewind K
1129	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
1130	&jb	(&label("grand_avx"));
1131
1132	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
1133	&vzeroall	();
1134&function_end_A();
1135						if ($avx>1) {
1136sub bodyx_00_15 () {			# +10%
1137	(
1138	'&rorx	("ecx",$E,6)',
1139	'&rorx	("esi",$E,11)',
1140	 '&mov	(&off($e),$E)',		# save $E, modulo-scheduled
1141	'&rorx	("edi",$E,25)',
1142	'&xor	("ecx","esi")',
1143	 '&andn	("esi",$E,&off($g))',
1144	'&xor	("ecx","edi")',		# Sigma1(e)
1145	 '&and	($E,&off($f))',
1146	 '&mov	(&off($a),$AH[0]);',	# save $A, modulo-scheduled
1147	 '&or	($E,"esi")',		# T = Ch(e,f,g)
1148
1149	'&rorx	("edi",$AH[0],2)',
1150	'&rorx	("esi",$AH[0],13)',
1151	 '&lea	($E,&DWP(0,$E,"ecx"))',	# T += Sigma1(e)
1152	'&rorx	("ecx",$AH[0],22)',
1153	'&xor	("esi","edi")',
1154	 '&mov	("edi",&off($b))',
1155	'&xor	("ecx","esi")',		# Sigma0(a)
1156
1157	 '&xor	($AH[0],"edi")',	# a ^= b, (b^c) in next round
1158	 '&add	($E,&off($h))',		# T += h
1159	 '&and	($AH[1],$AH[0])',	# (b^c) &= (a^b)
1160	 '&add	($E,&DWP(32+4*($i&15),"esp"))',	# T += K[i]+X[i]
1161	 '&xor	($AH[1],"edi")',	# h = Maj(a,b,c) = Ch(a^b,c,b)
1162
1163	 '&add	("ecx",$E)',		# h += T
1164	 '&add	($E,&off($d))',		# d += T
1165	'&lea	($AH[1],&DWP(0,$AH[1],"ecx"));'.	# h += Sigma0(a)
1166
1167	'@AH = reverse(@AH); $i++;'	# rotate(a,h)
1168	);
1169}
1170
1171&set_label("AVX_BMI",32);
1172	&lea	("esp",&DWP(-96,"esp"));
1173	&vzeroall	();
1174	# copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
1175	&mov	($AH[0],&DWP(0,"esi"));
1176	&mov	($AH[1],&DWP(4,"esi"));
1177	&mov	("ecx",&DWP(8,"esi"));
1178	&mov	("edi",&DWP(12,"esi"));
1179	#&mov	(&DWP(0,"esp"),$AH[0]);
1180	&mov	(&DWP(4,"esp"),$AH[1]);
1181	&xor	($AH[1],"ecx");			# magic
1182	&mov	(&DWP(8,"esp"),"ecx");
1183	&mov	(&DWP(12,"esp"),"edi");
1184	&mov	($E,&DWP(16,"esi"));
1185	&mov	("edi",&DWP(20,"esi"));
1186	&mov	("ecx",&DWP(24,"esi"));
1187	&mov	("esi",&DWP(28,"esi"));
1188	#&mov	(&DWP(16,"esp"),$E);
1189	&mov	(&DWP(20,"esp"),"edi");
1190	&mov	("edi",&DWP(96+4,"esp"));	# inp
1191	&mov	(&DWP(24,"esp"),"ecx");
1192	&mov	(&DWP(28,"esp"),"esi");
1193	&vmovdqa	($t3,&QWP(256,$K256));
1194	&jmp	(&label("grand_avx_bmi"));
1195
1196&set_label("grand_avx_bmi",32);
1197	# load input, reverse byte order, add K256[0..15], save to stack
1198	&vmovdqu	(@X[0],&QWP(0,"edi"));
1199	&vmovdqu	(@X[1],&QWP(16,"edi"));
1200	&vmovdqu	(@X[2],&QWP(32,"edi"));
1201	&vmovdqu	(@X[3],&QWP(48,"edi"));
1202	&add		("edi",64);
1203	&vpshufb	(@X[0],@X[0],$t3);
1204	&mov		(&DWP(96+4,"esp"),"edi");
1205	&vpshufb	(@X[1],@X[1],$t3);
1206	&vpshufb	(@X[2],@X[2],$t3);
1207	&vpaddd		($t0,@X[0],&QWP(0,$K256));
1208	&vpshufb	(@X[3],@X[3],$t3);
1209	&vpaddd		($t1,@X[1],&QWP(16,$K256));
1210	&vpaddd		($t2,@X[2],&QWP(32,$K256));
1211	&vpaddd		($t3,@X[3],&QWP(48,$K256));
1212	&vmovdqa	(&QWP(32+0,"esp"),$t0);
1213	&vmovdqa	(&QWP(32+16,"esp"),$t1);
1214	&vmovdqa	(&QWP(32+32,"esp"),$t2);
1215	&vmovdqa	(&QWP(32+48,"esp"),$t3);
1216	&jmp		(&label("avx_bmi_00_47"));
1217
1218&set_label("avx_bmi_00_47",16);
1219	&add		($K256,64);
1220
1221    for ($i=0,$j=0; $j<4; $j++) {
1222	&AVX_00_47($j,\&bodyx_00_15,@X);
1223	push(@X,shift(@X));		# rotate(@X)
1224    }
1225	&cmp	(&DWP(16*$j,$K256),0x00010203);
1226	&jne	(&label("avx_bmi_00_47"));
1227
1228    for ($i=0; $i<16; ) {
1229	foreach(bodyx_00_15()) { eval; }
1230    }
1231
1232	&mov	("esi",&DWP(96,"esp"));	#ctx
1233					#&mov	($AH[0],&DWP(0,"esp"));
1234	&xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
1235					#&mov	("edi", &DWP(8,"esp"));
1236	&mov	("ecx",&DWP(12,"esp"));
1237	&add	($AH[0],&DWP(0,"esi"));
1238	&add	($AH[1],&DWP(4,"esi"));
1239	&add	("edi",&DWP(8,"esi"));
1240	&add	("ecx",&DWP(12,"esi"));
1241	&mov	(&DWP(0,"esi"),$AH[0]);
1242	&mov	(&DWP(4,"esi"),$AH[1]);
1243	&mov	(&DWP(8,"esi"),"edi");
1244	&mov	(&DWP(12,"esi"),"ecx");
1245	 #&mov	(&DWP(0,"esp"),$AH[0]);
1246	 &mov	(&DWP(4,"esp"),$AH[1]);
1247	 &xor	($AH[1],"edi");			# magic
1248	 &mov	(&DWP(8,"esp"),"edi");
1249	 &mov	(&DWP(12,"esp"),"ecx");
1250	#&mov	($E,&DWP(16,"esp"));
1251	&mov	("edi",&DWP(20,"esp"));
1252	&mov	("ecx",&DWP(24,"esp"));
1253	&add	($E,&DWP(16,"esi"));
1254	&add	("edi",&DWP(20,"esi"));
1255	&add	("ecx",&DWP(24,"esi"));
1256	&mov	(&DWP(16,"esi"),$E);
1257	&mov	(&DWP(20,"esi"),"edi");
1258	 &mov	(&DWP(20,"esp"),"edi");
1259	&mov	("edi",&DWP(28,"esp"));
1260	&mov	(&DWP(24,"esi"),"ecx");
1261	 #&mov	(&DWP(16,"esp"),$E);
1262	&add	("edi",&DWP(28,"esi"));
1263	 &mov	(&DWP(24,"esp"),"ecx");
1264	&mov	(&DWP(28,"esi"),"edi");
1265	 &mov	(&DWP(28,"esp"),"edi");
1266	&mov	("edi",&DWP(96+4,"esp"));	# inp
1267
1268	&vmovdqa	($t3,&QWP(64,$K256));
1269	&sub	($K256,3*64);			# rewind K
1270	&cmp	("edi",&DWP(96+8,"esp"));	# are we done yet?
1271	&jb	(&label("grand_avx_bmi"));
1272
1273	&mov	("esp",&DWP(96+12,"esp"));	# restore sp
1274	&vzeroall	();
1275&function_end_A();
1276						}
1277						}
1278						}}}
1279&function_end_B("sha256_block_data_order");
1280
1281&asm_finish();
1282
1283close STDOUT;
1284