• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# October 2005
11#
12# This is a "teaser" code, as it can be improved in several ways...
13# First of all non-SSE2 path should be implemented (yes, for now it
14# performs Montgomery multiplication/convolution only on SSE2-capable
15# CPUs such as P4, others fall down to original code). Then inner loop
16# can be unrolled and modulo-scheduled to improve ILP and possibly
17# moved to 128-bit XMM register bank (though it would require input
18# rearrangement and/or increase bus bandwidth utilization). Dedicated
19# squaring procedure should give further performance improvement...
20# Yet, for being draft, the code improves rsa512 *sign* benchmark by
21# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
22
23# December 2006
24#
25# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
26# Integer-only code [being equipped with dedicated squaring procedure]
27# gives ~40% on rsa512 sign benchmark...
28
# Locate the directory this script lives in so the perlasm framework can
# be found relative to it.  When $0 carries no directory component the
# prefix is empty, i.e. the search is relative to the current directory.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file.  Redirect STDOUT
# to it and fail loudly if that is not possible: a failed open would
# otherwise leave STDOUT closed and silently discard the generated code.
# The name is taken literally (three-argument open).  Without any
# argument STDOUT is left untouched; note that a three-argument open
# with an undefined name would create an anonymous temporary file.
$output = pop;
if (defined($output)) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}

# First remaining argument selects the assembler flavour.
&asm_init($ARGV[0],$0);

# The SSE2 code path is generated only when the compiler flags passed on
# the command line contain -DOPENSSL_IA32_SSE2.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);
42
# Emit bn_mul_mont.  The argument slots are read through wparam():
# 0=rp, 1=ap, 2=bp, 3=np, 4=n0, 5=num.  The generated routine leaves
# with eax=0 when num<4 and with eax=1 after the result was stored.
43&function_begin("bn_mul_mont");
44
# Symbolic register names used by the SSE2 path and the common tail.
45$i="edx";
46$j="ecx";
47$ap="esi";	$tp="esi";		# overlapping variables!!!
48$rp="edi";	$bp="edi";		# overlapping variables!!!
49$np="ebp";
50$num="ebx";
51
# Fixed part of the stack frame; the temporary vector tp[] starts at
# $frame bytes above the new stack pointer.
52$_num=&DWP(4*0,"esp");			# stack top layout
53$_rp=&DWP(4*1,"esp");
54$_ap=&DWP(4*2,"esp");
55$_bp=&DWP(4*3,"esp");
56$_np=&DWP(4*4,"esp");
57$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
58$_sp=&DWP(4*6,"esp");
59$_bpend=&DWP(4*7,"esp");
60$frame=32;				# size of above frame rounded up to 16n
61
# Return value is preset to 0; inputs shorter than 4 words are not
# handled here and go straight to the exit.
62	&xor	("eax","eax");
63	&mov	("edi",&wparam(5));	# int num
64	&cmp	("edi",4);
65	&jl	(&label("just_leave"));
66
67	&lea	("esi",&wparam(0));	# put aside pointer to argument block
# NOTE(review): lea yields the address of the ap argument slot rather
# than the ap pointer value itself -- confirm that this is what the
# 2K-window arithmetic below is meant to operate on.
68	&lea	("edx",&wparam(1));	# load ap
69	&add	("edi",2);		# extra two words on top of tp
70	&neg	("edi");
71	&lea	("ebp",&DWP(-$frame,"esp","edi",4));	# future alloca($frame+4*(num+2))
72	&neg	("edi");
73
74	# minimize cache contention by arranging 2K window between stack
75	# pointer and ap argument [np is also position sensitive vector,
76	# but it's assumed to be near ap, as it's allocated at ~same
77	# time].
78	&mov	("eax","ebp");
79	&sub	("eax","edx");
80	&and	("eax",2047);
81	&sub	("ebp","eax");		# this aligns sp and ap modulo 2048
82
83	&xor	("edx","ebp");
84	&and	("edx",2048);
85	&xor	("edx",2048);
86	&sub	("ebp","edx");		# this splits them apart modulo 4096
87
88	&and	("ebp",-64);		# align to cache line
89
90	# An OS-agnostic version of __chkstk.
91	#
92	# Some OSes (Windows) insist on stack being "wired" to
93	# physical memory in strictly sequential manner, i.e. if stack
94	# allocation spans two pages, then reference to farmost one can
95	# be punishable by SEGV. But page walking can do good even on
96	# other OSes, because it guarantees that villain thread hits
97	# the guard page before it can make damage to innocent one...
#
# ebp holds the final stack pointer.  esp is first moved to ebp plus
# the page-aligned distance and then lowered 4096 bytes at a time, one
# word being read at every step, until it is no longer above ebp.
98	&mov	("eax","esp");
99	&sub	("eax","ebp");
100	&and	("eax",-4096);
101	&mov	("edx","esp");		# saved stack pointer!
102	&lea	("esp",&DWP(0,"ebp","eax"));
103	&mov	("eax",&DWP(0,"esp"));
104	&cmp	("esp","ebp");
105	&ja	(&label("page_walk"));
106	&jmp	(&label("page_walk_done"));
107
108&set_label("page_walk",16);
109	&lea	("esp",&DWP(-4096,"esp"));
110	&mov	("eax",&DWP(0,"esp"));
111	&cmp	("esp","ebp");
112	&ja	(&label("page_walk"));
113&set_label("page_walk_done");
114
# esi still points at the caller's argument block (set up above).
115	################################# load argument block...
116	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
117	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
118	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
119	&mov	("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
120	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
121	#&mov	("edi",&DWP(5*4,"esi"));# int num
122
123	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
124	&mov	($_rp,"eax");		# ... save a copy of argument block
125	&mov	($_ap,"ebx");
126	&mov	($_bp,"ecx");
127	&mov	($_np,"ebp");
128	&mov	($_n0,"esi");
# edi still holds num+2 from the frame size computation, hence the -3.
129	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
130	#&mov	($_num,$num);		# redundant as $num is not reused
131	&mov	($_sp,"edx");		# saved stack pointer!
132
# SSE2 code path.  It is emitted only when $sse2 is set at generation
# time, and at run time it is entered only if bit 26 of the first
# OPENSSL_ia32cap_P word is set; otherwise control continues at the
# non_sse2 label emitted at the end of this block.
133if($sse2) {
134$acc0="mm0";	# mmx register bank layout
135$acc1="mm1";
136$car0="mm2";
137$car1="mm3";
138$mul0="mm4";
139$mul1="mm5";
140$temp="mm6";
141$mask="mm7";
142
143	&picmeup("eax","OPENSSL_ia32cap_P");
144	&bt	(&DWP(0,"eax"),26);
145	&jnc	(&label("non_sse2"));
146
147	&mov	("eax",-1);
148	&movd	($mask,"eax");		# mask 32 lower bits
149
150	&mov	($ap,$_ap);		# load input pointers
151	&mov	($bp,$_bp);
152	&mov	($np,$_np);
153
154	&xor	($i,$i);		# i=0
155	&xor	($j,$j);		# j=0
156
# First pass (i=0): tp[] is being created, so there is nothing to add
# from a previous iteration.  $car0 carries the ap[]*bp[0] chain,
# $car1 the np[]*m1 reduction chain.
157	&movd	($mul0,&DWP(0,$bp));		# bp[0]
158	&movd	($mul1,&DWP(0,$ap));		# ap[0]
159	&movd	($car1,&DWP(0,$np));		# np[0]
160
161	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
162	&movq	($car0,$mul1);
163	&movq	($acc0,$mul1);			# I wish movd worked for
164	&pand	($acc0,$mask);			# inter-register transfers
165
166	&pmuludq($mul1,$_n0q);			# *=n0
167
168	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
169	&paddq	($car1,$acc0);
170
171	&movd	($acc1,&DWP(4,$np));		# np[1]
172	&movd	($acc0,&DWP(4,$ap));		# ap[1]
173
174	&psrlq	($car0,32);
175	&psrlq	($car1,32);
176
177	&inc	($j);				# j++
178&set_label("1st",16);
179	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
180	&pmuludq($acc1,$mul1);			# np[j]*m1
181	&paddq	($car0,$acc0);			# +=c0
182	&paddq	($car1,$acc1);			# +=c1
183
184	&movq	($acc0,$car0);
185	&pand	($acc0,$mask);
186	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
187	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
188	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
189	&psrlq	($car0,32);
190	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
191	&psrlq	($car1,32);
192
193	&lea	($j,&DWP(1,$j));
194	&cmp	($j,$num);
195	&jl	(&label("1st"));
196
# Last word of the first pass; operands were preloaded by the final
# loop iteration above.
197	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
198	&pmuludq($acc1,$mul1);			# np[num-1]*m1
199	&paddq	($car0,$acc0);			# +=c0
200	&paddq	($car1,$acc1);			# +=c1
201
202	&movq	($acc0,$car0);
203	&pand	($acc0,$mask);
204	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
205	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
206
207	&psrlq	($car0,32);
208	&psrlq	($car1,32);
209
210	&paddq	($car1,$car0);
211	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
212
213	&inc	($i);				# i++
# Remaining passes: same structure as the first one, except that the
# words of tp[] left by the previous pass are added in.
214&set_label("outer");
215	&xor	($j,$j);			# j=0
216
217	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
218	&movd	($mul1,&DWP(0,$ap));		# ap[0]
219	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
220	&movd	($car1,&DWP(0,$np));		# np[0]
221	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
222
223	&paddq	($mul1,$temp);			# +=tp[0]
224	&movq	($acc0,$mul1);
225	&movq	($car0,$mul1);
226	&pand	($acc0,$mask);
227
228	&pmuludq($mul1,$_n0q);			# *=n0
229
230	&pmuludq($car1,$mul1);
231	&paddq	($car1,$acc0);
232
233	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
234	&movd	($acc1,&DWP(4,$np));		# np[1]
235	&movd	($acc0,&DWP(4,$ap));		# ap[1]
236
237	&psrlq	($car0,32);
238	&psrlq	($car1,32);
239	&paddq	($car0,$temp);			# +=tp[1]
240
241	&inc	($j);				# j++
# $num doubles as the down-counter of the inner loop; it is restored
# from $j right after the loop.
242	&dec	($num);
243&set_label("inner");
244	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
245	&pmuludq($acc1,$mul1);			# np[j]*m1
246	&paddq	($car0,$acc0);			# +=c0
247	&paddq	($car1,$acc1);			# +=c1
248
249	&movq	($acc0,$car0);
250	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
251	&pand	($acc0,$mask);
252	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
253	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
254	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
255	&psrlq	($car0,32);
256	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
257	&psrlq	($car1,32);
258	&paddq	($car0,$temp);			# +=tp[j+1]
259
# The branch below tests the zero flag produced by dec; the lea in
# between is used for the increment because it leaves flags alone.
260	&dec	($num);
261	&lea	($j,&DWP(1,$j));		# j++
262	&jnz	(&label("inner"));
263
264	&mov	($num,$j);			# restore $num (=num-1)
265	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
266	&pmuludq($acc1,$mul1);			# np[num-1]*m1
267	&paddq	($car0,$acc0);			# +=c0
268	&paddq	($car1,$acc1);			# +=c1
269
270	&movq	($acc0,$car0);
271	&pand	($acc0,$mask);
272	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
273	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
274	&psrlq	($car0,32);
275	&psrlq	($car1,32);
276
277	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
278	&paddq	($car1,$car0);
279	&paddq	($car1,$temp);
280	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
281
# $num holds num-1 here, so the outer loop runs while i<=num-1.
282	&lea	($i,&DWP(1,$i));		# i++
283	&cmp	($i,$num);
284	&jle	(&label("outer"));
285
286	&emms	();				# done with mmx bank
287	&jmp	(&label("common_tail"));
288
289&set_label("non_sse2",16);
290}
291
# Integer-only code path.  The first branch is disabled at generation
# time ("if (0)"); it would restore the stack pointer and leave with
# eax=0.  The else branch emits the general multiplication loops plus a
# dedicated squaring procedure (bn_sqr_mont).
292if (0) {
293	&mov	("esp",$_sp);
294	&xor	("eax","eax");	# signal "not fast enough [yet]"
295	&jmp	(&label("just_leave"));
296	# While the below code provides competitive performance for
297	# all key lengths on modern Intel cores, it's still more
298	# than 10% slower for 4096-bit key elsewhere:-( "Competitive"
299	# means compared to the original integer-only assembler.
300	# 512-bit RSA sign is better by ~40%, but that's about all
301	# one can say about all CPUs...
302} else {
303$inp="esi";	# integer path uses these registers differently
304$word="edi";
305$carry="ebp";
306
# $num holds num-1 on entry, so $carry below starts out as num.
307	&mov	($inp,$_ap);
308	&lea	($carry,&DWP(1,$num));
309	&mov	($word,$_bp);
310	&xor	($j,$j);				# j=0
311	&mov	("edx",$inp);
312	&and	($carry,1);				# see if num is even
313	&sub	("edx",$word);				# see if ap==bp
314	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
315	&or	($carry,"edx");
316	&mov	($word,&DWP(0,$word));			# bp[0]
# The flags tested here come from the "or" above (the mov in between
# does not change them): the squaring procedure is used only when
# ap==bp and num is even.
317	&jz	(&label("bn_sqr_mont"));
318	&mov	($_bpend,"eax");
319	&mov	("eax",&DWP(0,$inp));
320	&xor	("edx","edx");
321
# tp[] = ap[]*bp[0]
322&set_label("mull",16);
323	&mov	($carry,"edx");
324	&mul	($word);				# ap[j]*bp[0]
325	&add	($carry,"eax");
326	&lea	($j,&DWP(1,$j));
327	&adc	("edx",0);
328	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
329	&cmp	($j,$num);
330	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
331	&jl	(&label("mull"));
332
# Last word of the multiplication; the instructions indented by one
# extra space already prepare the reduction pass (m=n0*tp[0], $inp=np).
333	&mov	($carry,"edx");
334	&mul	($word);				# ap[num-1]*bp[0]
335	 &mov	($word,$_n0);
336	&add	("eax",$carry);
337	 &mov	($inp,$_np);
338	&adc	("edx",0);
339	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
340
341	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
342	&xor	($j,$j);
343	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
344	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
345
# The low word of np[0]*m+tp[0] is only needed for its carry; eax is
# reloaded with np[1] right away.
346	&mov	("eax",&DWP(0,$inp));			# np[0]
347	&mul	($word);				# np[0]*m
348	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
349	&mov	("eax",&DWP(4,$inp));			# np[1]
350	&adc	("edx",0);
351	&inc	($j);
352
353	&jmp	(&label("2ndmadd"));
354
# tp[] += ap[]*bp[i] for i>0
355&set_label("1stmadd",16);
356	&mov	($carry,"edx");
357	&mul	($word);				# ap[j]*bp[i]
358	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
359	&lea	($j,&DWP(1,$j));
360	&adc	("edx",0);
361	&add	($carry,"eax");
362	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
363	&adc	("edx",0);
364	&cmp	($j,$num);
365	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
366	&jl	(&label("1stmadd"));
367
368	&mov	($carry,"edx");
369	&mul	($word);				# ap[num-1]*bp[i]
370	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
371	 &mov	($word,$_n0);
372	&adc	("edx",0);
373	 &mov	($inp,$_np);
374	&add	($carry,"eax");
375	&adc	("edx",0);
376	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
377
378	&xor	($j,$j);
379	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
380	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
381	&adc	($j,0);
382	 &mov	("eax",&DWP(0,$inp));			# np[0]
383	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
384	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
385
386	&mul	($word);				# np[0]*m
387	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
388	&mov	("eax",&DWP(4,$inp));			# np[1]
389	&adc	("edx",0);
390	&mov	($j,1);
391
# tp[] = (tp[] + np[]*m) shifted down by one word
392&set_label("2ndmadd",16);
393	&mov	($carry,"edx");
394	&mul	($word);				# np[j]*m
395	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
396	&lea	($j,&DWP(1,$j));
397	&adc	("edx",0);
398	&add	($carry,"eax");
399	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
400	&adc	("edx",0);
401	&cmp	($j,$num);
402	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
403	&jl	(&label("2ndmadd"));
404
405	&mov	($carry,"edx");
406	&mul	($word);				# np[j]*m
407	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
408	&adc	("edx",0);
409	&add	($carry,"eax");
410	&adc	("edx",0);
411	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
412
# Fold the top words and advance the saved bp pointer; the loop over
# bp[] ends when that pointer reaches the &bp[num] stored in $_bpend.
413	&xor	("eax","eax");
414	 &mov	($j,$_bp);				# &bp[i]
415	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
416	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
417	 &lea	($j,&DWP(4,$j));
418	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
419	 &cmp	($j,$_bpend);
420	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
421	&je	(&label("common_tail"));
422
423	&mov	($word,&DWP(0,$j));			# bp[i+1]
424	&mov	($inp,$_ap);
425	&mov	($_bp,$j);				# &bp[++i]
426	&xor	($j,$j);
427	&xor	("edx","edx");
428	&mov	("eax",&DWP(0,$inp));
429	&jmp	(&label("1stmadd"));
430
# Dedicated squaring procedure.  $sbit shares a register with $num,
# which is why $num is spilled to $_num first; the $_bp slot is reused
# to hold the index i.
431&set_label("bn_sqr_mont",16);
432$sbit=$num;
433	&mov	($_num,$num);
434	&mov	($_bp,$j);				# i=0
435
436	&mov	("eax",$word);				# ap[0]
437	&mul	($word);				# ap[0]*ap[0]
438	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
439	&mov	($sbit,"edx");
440	&shr	("edx",1);
441	&and	($sbit,1);
442	&inc	($j);
# tp[j] = 2*(ap[j]*ap[0]) with the bit shifted out of the previous
# word ($sbit) shifted in at the bottom.
443&set_label("sqr",16);
444	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
445	&mov	($carry,"edx");
446	&mul	($word);				# ap[j]*ap[0]
447	&add	("eax",$carry);
448	&lea	($j,&DWP(1,$j));
449	&adc	("edx",0);
450	&lea	($carry,&DWP(0,$sbit,"eax",2));
451	&shr	("eax",31);
452	&cmp	($j,$_num);
453	&mov	($sbit,"eax");
454	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
455	&jl	(&label("sqr"));
456
457	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
458	&mov	($carry,"edx");
459	&mul	($word);				# ap[num-1]*ap[0]
460	&add	("eax",$carry);
461	 &mov	($word,$_n0);
462	&adc	("edx",0);
463	 &mov	($inp,$_np);
464	&lea	($carry,&DWP(0,$sbit,"eax",2));
465	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
466	&shr	("eax",31);
467	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
468
469	&lea	($carry,&DWP(0,"eax","edx",2));
470	 &mov	("eax",&DWP(0,$inp));			# np[0]
471	&shr	("edx",31);
472	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
473	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
474
475	&mul	($word);				# np[0]*m
476	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
477	&mov	($num,$j);				# $num is valid again
478	&adc	("edx",0);
479	&mov	("eax",&DWP(4,$inp));			# np[1]
480	&mov	($j,1);
481
# Reduction loop of the squaring procedure, unrolled by two: each
# iteration handles np[j] and np[j+1].
482&set_label("3rdmadd",16);
483	&mov	($carry,"edx");
484	&mul	($word);				# np[j]*m
485	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
486	&adc	("edx",0);
487	&add	($carry,"eax");
488	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
489	&adc	("edx",0);
490	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
491
492	&mov	($carry,"edx");
493	&mul	($word);				# np[j+1]*m
494	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
495	&lea	($j,&DWP(2,$j));
496	&adc	("edx",0);
497	&add	($carry,"eax");
498	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
499	&adc	("edx",0);
500	&cmp	($j,$num);
501	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
502	&jl	(&label("3rdmadd"));
503
504	&mov	($carry,"edx");
505	&mul	($word);				# np[j]*m
506	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
507	&adc	("edx",0);
508	&add	($carry,"eax");
509	&adc	("edx",0);
510	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
511
# Fold the top words; the squaring is complete once the index i kept
# in the $_bp slot equals $num.
512	&mov	($j,$_bp);				# i
513	&xor	("eax","eax");
514	&mov	($inp,$_ap);
515	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
516	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
517	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
518	&cmp	($j,$num);
519	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
520	&je	(&label("common_tail"));
521
# Next row: add the square term ap[i]*ap[i] first, then the doubled
# cross products in the sqradd loop.
522	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
523	&lea	($j,&DWP(1,$j));
524	&mov	("eax",$word);
525	&mov	($_bp,$j);				# ++i
526	&mul	($word);				# ap[i]*ap[i]
527	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
528	&adc	("edx",0);
529	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
530	&xor	($carry,$carry);
# The je below uses the flags of this cmp; lea does not modify them.
531	&cmp	($j,$num);
532	&lea	($j,&DWP(1,$j));
533	&je	(&label("sqrlast"));
534
535	&mov	($sbit,"edx");				# zaps $num
536	&shr	("edx",1);
537	&and	($sbit,1);
538&set_label("sqradd",16);
539	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
540	&mov	($carry,"edx");
541	&mul	($word);				# ap[j]*ap[i]
542	&add	("eax",$carry);
543	&lea	($carry,&DWP(0,"eax","eax"));
544	&adc	("edx",0);
545	&shr	("eax",31);
546	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
547	&lea	($j,&DWP(1,$j));
548	&adc	("eax",0);
549	&add	($carry,$sbit);
550	&adc	("eax",0);
551	&cmp	($j,$_num);
552	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
553	&mov	($sbit,"eax");
554	&jle	(&label("sqradd"));
555
556	&mov	($carry,"edx");
557	&add	("edx","edx");
558	&shr	($carry,31);
559	&add	("edx",$sbit);
560	&adc	($carry,0);
561&set_label("sqrlast");
562	&mov	($word,$_n0);
563	&mov	($inp,$_np);
564	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
565
566	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
567	&mov	("eax",&DWP(0,$inp));			# np[0]
568	&adc	($carry,0);
569	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
570	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
571
572	&mul	($word);				# np[0]*m
573	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
574	&lea	($num,&DWP(-1,$j));			# rebuild $num from $j
575	&adc	("edx",0);
576	&mov	($j,1);
577	&mov	("eax",&DWP(4,$inp));			# np[1]
578
579	&jmp	(&label("3rdmadd"));
580}
581
# Common epilogue shared by all code paths: compute rp[]=tp[]-np[],
# choose between tp[] and that difference with a borrow-derived mask
# rather than a branch, copy the chosen vector to rp[] while
# overwriting tp[], then restore the stack pointer and return 1.
# $num is expected to hold num-1 on entry.
582&set_label("common_tail",16);
583	&mov	($np,$_np);			# load modulus pointer
584	&mov	($rp,$_rp);			# load result pointer
585	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
586
587	&mov	("eax",&DWP(0,$tp));		# tp[0]
588	&mov	($j,$num);			# j=num-1
589	&xor	($i,$i);			# i=0 and clear CF!
590
# The borrow lives in CF across iterations, so the loop body may only
# use instructions that leave CF alone (mov, lea, dec); the loop
# condition is taken from the flags set by dec.
591&set_label("sub",16);
592	&sbb	("eax",&DWP(0,$np,$i,4));
593	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
594	&dec	($j);				# doesn't affect CF!
595	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
596	&lea	($i,&DWP(1,$i));		# i++
597	&jge	(&label("sub"));
598
# eax now holds tp[num]; after subtracting the final borrow it serves
# as the selection mask for the and/not/and/or sequence below.
599	&sbb	("eax",0);			# handle upmost overflow bit
600	&and	($tp,"eax");
601	&not	("eax");
602	&mov	($np,$rp);
603	&and	($np,"eax");
604	&or	($tp,$np);			# tp=carry?tp:rp
605
606&set_label("copy",16);				# copy or in-place refresh
607	&mov	("eax",&DWP(0,$tp,$num,4));
608	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
609	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
610	&dec	($num);
611	&jge	(&label("copy"));
612
613	&mov	("esp",$_sp);		# pull saved stack pointer
614	&mov	("eax",1);
615&set_label("just_leave");
616&function_end("bn_mul_mont");
617
# Identification string embedded in the generated module.
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();

# Output is buffered, so write errors (disk full, I/O error) may only
# surface when the handle is closed.  Fail the build in that case
# instead of leaving a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
623