#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives only a
# modest 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs
# more than twice (>2x) as fast. The most common case, rsa1024 sign, is
# improved by a respectable 50%. It remains to be seen whether loop
# unrolling and a dedicated squaring routine can provide further
# improvement...

# July 2011.
#
# Add a dedicated squaring procedure. The performance improvement varies
# from platform to platform, but on average it is ~5%/15%/25%/33% for
# 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. The average performance improvement, compared to
# the *initial* 2005 version of this module, is ~0%/30%/40%/45% for
# 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
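
# For reference, what the code below computes: bn_mul_mont() produces the
# Montgomery product r = a*b*R^-1 mod n with R = 2^(64*num). A minimal,
# hypothetical Perl model of that contract (Math::BigInt inputs, not used
# anywhere by this module) might look as follows:
#
#	use Math::BigInt;
#	sub mont_mul_ref {
#		my ($a, $b, $n, $num) = @_;
#		my $R  = Math::BigInt->new(1)->blsft(64*$num);
#		my $Ri = $R->copy->bmodinv($n);		# R^-1 mod n
#		return $a->copy->bmul($b)->bmul($Ri)->bmod($n);
#	}
#
# The word-by-word assembly interleaves the reduction with the
# multiplication, so the full double-width product never has to be stored
# (the dedicated squaring path is the exception: it squares first and
# reduces afterwards).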

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

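# A hypothetical standalone invocation (the OpenSSL build normally drives
# this script and supplies both arguments) would look like:
#
#	perl x86_64-mont.pl elf x86_64-mont.s
#
# where the first argument selects the perlasm flavour (e.g. elf, macosx,
# mingw64, nasm) and the second names the output file written by the
# translator located below.
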
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output";

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

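# The main loops below implement the contract word by word: for every word
# bp[i], the row ap[]*bp[i] is accumulated into the temporary tp[] together
# with m1*np[], where m1 = tp[0]*n0 mod 2^64 is chosen so that the lowest
# word of tp[] cancels and the window can advance by one word. A rough,
# hypothetical Perl rendering (add_mul(), mul_lo() and shift_down() are
# assumed word-level helpers with carry propagation, not part of this
# module):
#
#	for my $i (0 .. $num-1) {
#		add_mul(\@tp, $bp[$i], \@ap);	# tp += bp[i]*ap
#		my $m1 = mul_lo($tp[0], $n0);	# "tp[0]"*n0
#		add_mul(\@tp, $m1, \@np);	# tp += m1*np, tp[0] becomes 0
#		shift_down(\@tp);		# drop the zeroed low word
#	}
#	# one conditional subtraction of np[] follows (.Lsub/.Lcopy below)
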
60$code=<<___;
61.text
62
63.globl	bn_mul_mont
64.type	bn_mul_mont,\@function,6
65.align	16
66bn_mul_mont:
67	test	\$3,${num}d
68	jnz	.Lmul_enter
69	cmp	\$8,${num}d
70	jb	.Lmul_enter
71	cmp	$ap,$bp
72	jne	.Lmul4x_enter
73	jmp	.Lsqr4x_enter
74
75.align	16
76.Lmul_enter:
77	push	%rbx
78	push	%rbp
79	push	%r12
80	push	%r13
81	push	%r14
82	push	%r15
83
84	mov	${num}d,${num}d
85	lea	2($num),%r10
86	mov	%rsp,%r11
87	neg	%r10
88	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
89	and	\$-1024,%rsp		# minimize TLB usage
90
91	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
92.Lmul_body:
93	mov	$bp,%r12		# reassign $bp
94___
95		$bp="%r12";
96$code.=<<___;
97	mov	($n0),$n0		# pull n0[0] value
98	mov	($bp),$m0		# m0=bp[0]
99	mov	($ap),%rax
100
101	xor	$i,$i			# i=0
102	xor	$j,$j			# j=0
103
104	mov	$n0,$m1
105	mulq	$m0			# ap[0]*bp[0]
106	mov	%rax,$lo0
107	mov	($np),%rax
108
109	imulq	$lo0,$m1		# "tp[0]"*n0
110	mov	%rdx,$hi0
111
112	mulq	$m1			# np[0]*m1
113	add	%rax,$lo0		# discarded
114	mov	8($ap),%rax
115	adc	\$0,%rdx
116	mov	%rdx,$hi1
117
118	lea	1($j),$j		# j++
119	jmp	.L1st_enter
120
121.align	16
122.L1st:
123	add	%rax,$hi1
124	mov	($ap,$j,8),%rax
125	adc	\$0,%rdx
126	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
127	mov	$lo0,$hi0
128	adc	\$0,%rdx
129	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
130	mov	%rdx,$hi1
131
132.L1st_enter:
133	mulq	$m0			# ap[j]*bp[0]
134	add	%rax,$hi0
135	mov	($np,$j,8),%rax
136	adc	\$0,%rdx
137	lea	1($j),$j		# j++
138	mov	%rdx,$lo0
139
140	mulq	$m1			# np[j]*m1
141	cmp	$num,$j
142	jne	.L1st
143
144	add	%rax,$hi1
145	mov	($ap),%rax		# ap[0]
146	adc	\$0,%rdx
147	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
148	adc	\$0,%rdx
149	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
150	mov	%rdx,$hi1
151	mov	$lo0,$hi0
152
153	xor	%rdx,%rdx
154	add	$hi0,$hi1
155	adc	\$0,%rdx
156	mov	$hi1,-8(%rsp,$num,8)
157	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
158
159	lea	1($i),$i		# i++
160	jmp	.Louter
161.align	16
162.Louter:
163	mov	($bp,$i,8),$m0		# m0=bp[i]
164	xor	$j,$j			# j=0
165	mov	$n0,$m1
166	mov	(%rsp),$lo0
167	mulq	$m0			# ap[0]*bp[i]
168	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
169	mov	($np),%rax
170	adc	\$0,%rdx
171
172	imulq	$lo0,$m1		# tp[0]*n0
173	mov	%rdx,$hi0
174
175	mulq	$m1			# np[0]*m1
176	add	%rax,$lo0		# discarded
177	mov	8($ap),%rax
178	adc	\$0,%rdx
179	mov	8(%rsp),$lo0		# tp[1]
180	mov	%rdx,$hi1
181
182	lea	1($j),$j		# j++
183	jmp	.Linner_enter
184
185.align	16
186.Linner:
187	add	%rax,$hi1
188	mov	($ap,$j,8),%rax
189	adc	\$0,%rdx
190	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
191	mov	(%rsp,$j,8),$lo0
192	adc	\$0,%rdx
193	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
194	mov	%rdx,$hi1
195
196.Linner_enter:
197	mulq	$m0			# ap[j]*bp[i]
198	add	%rax,$hi0
199	mov	($np,$j,8),%rax
200	adc	\$0,%rdx
201	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
202	mov	%rdx,$hi0
203	adc	\$0,$hi0
204	lea	1($j),$j		# j++
205
206	mulq	$m1			# np[j]*m1
207	cmp	$num,$j
208	jne	.Linner
209
210	add	%rax,$hi1
211	mov	($ap),%rax		# ap[0]
212	adc	\$0,%rdx
213	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
214	mov	(%rsp,$j,8),$lo0
215	adc	\$0,%rdx
216	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
217	mov	%rdx,$hi1
218
219	xor	%rdx,%rdx
220	add	$hi0,$hi1
221	adc	\$0,%rdx
222	add	$lo0,$hi1		# pull upmost overflow bit
223	adc	\$0,%rdx
224	mov	$hi1,-8(%rsp,$num,8)
225	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
226
227	lea	1($i),$i		# i++
228	cmp	$num,$i
229	jl	.Louter
230
231	xor	$i,$i			# i=0 and clear CF!
232	mov	(%rsp),%rax		# tp[0]
233	lea	(%rsp),$ap		# borrow ap for tp
234	mov	$num,$j			# j=num
235	jmp	.Lsub
236.align	16
237.Lsub:	sbb	($np,$i,8),%rax
238	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
239	mov	8($ap,$i,8),%rax	# tp[i+1]
240	lea	1($i),$i		# i++
241	dec	$j			# doesn't affect CF!
242	jnz	.Lsub
243
244	sbb	\$0,%rax		# handle upmost overflow bit
245	xor	$i,$i
246	and	%rax,$ap
247	not	%rax
248	mov	$rp,$np
249	and	%rax,$np
250	mov	$num,$j			# j=num
251	or	$np,$ap			# ap=borrow?tp:rp
252.align	16
253.Lcopy:					# copy or in-place refresh
254	mov	($ap,$i,8),%rax
255	mov	$i,(%rsp,$i,8)		# zap temporary vector
256	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
257	lea	1($i),$i
258	sub	\$1,$j
259	jnz	.Lcopy
260
261	mov	8(%rsp,$num,8),%rsi	# restore %rsp
262	mov	\$1,%rax
263	mov	(%rsi),%r15
264	mov	8(%rsi),%r14
265	mov	16(%rsi),%r13
266	mov	24(%rsi),%r12
267	mov	32(%rsi),%rbp
268	mov	40(%rsi),%rbx
269	lea	48(%rsi),%rsp
270.Lmul_epilogue:
271	ret
272.size	bn_mul_mont,.-bn_mul_mont
273___
274{{{
275my @A=("%r10","%r11");
276my @N=("%r13","%rdi");
277$code.=<<___;
278.type	bn_mul4x_mont,\@function,6
279.align	16
280bn_mul4x_mont:
281.Lmul4x_enter:
282	push	%rbx
283	push	%rbp
284	push	%r12
285	push	%r13
286	push	%r14
287	push	%r15
288
289	mov	${num}d,${num}d
290	lea	4($num),%r10
291	mov	%rsp,%r11
292	neg	%r10
293	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
294	and	\$-1024,%rsp		# minimize TLB usage
295
296	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
297.Lmul4x_body:
298	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
299	mov	%rdx,%r12		# reassign $bp
300___
301		$bp="%r12";
302$code.=<<___;
303	mov	($n0),$n0		# pull n0[0] value
304	mov	($bp),$m0		# m0=bp[0]
305	mov	($ap),%rax
306
307	xor	$i,$i			# i=0
308	xor	$j,$j			# j=0
309
310	mov	$n0,$m1
311	mulq	$m0			# ap[0]*bp[0]
312	mov	%rax,$A[0]
313	mov	($np),%rax
314
315	imulq	$A[0],$m1		# "tp[0]"*n0
316	mov	%rdx,$A[1]
317
318	mulq	$m1			# np[0]*m1
319	add	%rax,$A[0]		# discarded
320	mov	8($ap),%rax
321	adc	\$0,%rdx
322	mov	%rdx,$N[1]
323
324	mulq	$m0
325	add	%rax,$A[1]
326	mov	8($np),%rax
327	adc	\$0,%rdx
328	mov	%rdx,$A[0]
329
330	mulq	$m1
331	add	%rax,$N[1]
332	mov	16($ap),%rax
333	adc	\$0,%rdx
334	add	$A[1],$N[1]
335	lea	4($j),$j		# j++
336	adc	\$0,%rdx
337	mov	$N[1],(%rsp)
338	mov	%rdx,$N[0]
339	jmp	.L1st4x
340.align	16
341.L1st4x:
342	mulq	$m0			# ap[j]*bp[0]
343	add	%rax,$A[0]
344	mov	-16($np,$j,8),%rax
345	adc	\$0,%rdx
346	mov	%rdx,$A[1]
347
348	mulq	$m1			# np[j]*m1
349	add	%rax,$N[0]
350	mov	-8($ap,$j,8),%rax
351	adc	\$0,%rdx
352	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
353	adc	\$0,%rdx
354	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
355	mov	%rdx,$N[1]
356
357	mulq	$m0			# ap[j]*bp[0]
358	add	%rax,$A[1]
359	mov	-8($np,$j,8),%rax
360	adc	\$0,%rdx
361	mov	%rdx,$A[0]
362
363	mulq	$m1			# np[j]*m1
364	add	%rax,$N[1]
365	mov	($ap,$j,8),%rax
366	adc	\$0,%rdx
367	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
368	adc	\$0,%rdx
369	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
370	mov	%rdx,$N[0]
371
372	mulq	$m0			# ap[j]*bp[0]
373	add	%rax,$A[0]
374	mov	($np,$j,8),%rax
375	adc	\$0,%rdx
376	mov	%rdx,$A[1]
377
378	mulq	$m1			# np[j]*m1
379	add	%rax,$N[0]
380	mov	8($ap,$j,8),%rax
381	adc	\$0,%rdx
382	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
383	adc	\$0,%rdx
384	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
385	mov	%rdx,$N[1]
386
387	mulq	$m0			# ap[j]*bp[0]
388	add	%rax,$A[1]
389	mov	8($np,$j,8),%rax
390	adc	\$0,%rdx
391	lea	4($j),$j		# j++
392	mov	%rdx,$A[0]
393
394	mulq	$m1			# np[j]*m1
395	add	%rax,$N[1]
396	mov	-16($ap,$j,8),%rax
397	adc	\$0,%rdx
398	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
399	adc	\$0,%rdx
400	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
401	mov	%rdx,$N[0]
402	cmp	$num,$j
403	jl	.L1st4x
404
405	mulq	$m0			# ap[j]*bp[0]
406	add	%rax,$A[0]
407	mov	-16($np,$j,8),%rax
408	adc	\$0,%rdx
409	mov	%rdx,$A[1]
410
411	mulq	$m1			# np[j]*m1
412	add	%rax,$N[0]
413	mov	-8($ap,$j,8),%rax
414	adc	\$0,%rdx
415	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
416	adc	\$0,%rdx
417	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
418	mov	%rdx,$N[1]
419
420	mulq	$m0			# ap[j]*bp[0]
421	add	%rax,$A[1]
422	mov	-8($np,$j,8),%rax
423	adc	\$0,%rdx
424	mov	%rdx,$A[0]
425
426	mulq	$m1			# np[j]*m1
427	add	%rax,$N[1]
428	mov	($ap),%rax		# ap[0]
429	adc	\$0,%rdx
430	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
431	adc	\$0,%rdx
432	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
433	mov	%rdx,$N[0]
434
435	xor	$N[1],$N[1]
436	add	$A[0],$N[0]
437	adc	\$0,$N[1]
438	mov	$N[0],-8(%rsp,$j,8)
439	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
440
441	lea	1($i),$i		# i++
442.align	4
443.Louter4x:
444	mov	($bp,$i,8),$m0		# m0=bp[i]
445	xor	$j,$j			# j=0
446	mov	(%rsp),$A[0]
447	mov	$n0,$m1
448	mulq	$m0			# ap[0]*bp[i]
449	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
450	mov	($np),%rax
451	adc	\$0,%rdx
452
453	imulq	$A[0],$m1		# tp[0]*n0
454	mov	%rdx,$A[1]
455
456	mulq	$m1			# np[0]*m1
457	add	%rax,$A[0]		# "$N[0]", discarded
458	mov	8($ap),%rax
459	adc	\$0,%rdx
460	mov	%rdx,$N[1]
461
462	mulq	$m0			# ap[j]*bp[i]
463	add	%rax,$A[1]
464	mov	8($np),%rax
465	adc	\$0,%rdx
466	add	8(%rsp),$A[1]		# +tp[1]
467	adc	\$0,%rdx
468	mov	%rdx,$A[0]
469
470	mulq	$m1			# np[j]*m1
471	add	%rax,$N[1]
472	mov	16($ap),%rax
473	adc	\$0,%rdx
474	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
475	lea	4($j),$j		# j+=2
476	adc	\$0,%rdx
477	mov	$N[1],(%rsp)		# tp[j-1]
478	mov	%rdx,$N[0]
479	jmp	.Linner4x
480.align	16
481.Linner4x:
482	mulq	$m0			# ap[j]*bp[i]
483	add	%rax,$A[0]
484	mov	-16($np,$j,8),%rax
485	adc	\$0,%rdx
486	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
487	adc	\$0,%rdx
488	mov	%rdx,$A[1]
489
490	mulq	$m1			# np[j]*m1
491	add	%rax,$N[0]
492	mov	-8($ap,$j,8),%rax
493	adc	\$0,%rdx
494	add	$A[0],$N[0]
495	adc	\$0,%rdx
496	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
497	mov	%rdx,$N[1]
498
499	mulq	$m0			# ap[j]*bp[i]
500	add	%rax,$A[1]
501	mov	-8($np,$j,8),%rax
502	adc	\$0,%rdx
503	add	-8(%rsp,$j,8),$A[1]
504	adc	\$0,%rdx
505	mov	%rdx,$A[0]
506
507	mulq	$m1			# np[j]*m1
508	add	%rax,$N[1]
509	mov	($ap,$j,8),%rax
510	adc	\$0,%rdx
511	add	$A[1],$N[1]
512	adc	\$0,%rdx
513	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
514	mov	%rdx,$N[0]
515
516	mulq	$m0			# ap[j]*bp[i]
517	add	%rax,$A[0]
518	mov	($np,$j,8),%rax
519	adc	\$0,%rdx
520	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
521	adc	\$0,%rdx
522	mov	%rdx,$A[1]
523
524	mulq	$m1			# np[j]*m1
525	add	%rax,$N[0]
526	mov	8($ap,$j,8),%rax
527	adc	\$0,%rdx
528	add	$A[0],$N[0]
529	adc	\$0,%rdx
530	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
531	mov	%rdx,$N[1]
532
533	mulq	$m0			# ap[j]*bp[i]
534	add	%rax,$A[1]
535	mov	8($np,$j,8),%rax
536	adc	\$0,%rdx
537	add	8(%rsp,$j,8),$A[1]
538	adc	\$0,%rdx
539	lea	4($j),$j		# j++
540	mov	%rdx,$A[0]
541
542	mulq	$m1			# np[j]*m1
543	add	%rax,$N[1]
544	mov	-16($ap,$j,8),%rax
545	adc	\$0,%rdx
546	add	$A[1],$N[1]
547	adc	\$0,%rdx
548	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
549	mov	%rdx,$N[0]
550	cmp	$num,$j
551	jl	.Linner4x
552
553	mulq	$m0			# ap[j]*bp[i]
554	add	%rax,$A[0]
555	mov	-16($np,$j,8),%rax
556	adc	\$0,%rdx
557	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
558	adc	\$0,%rdx
559	mov	%rdx,$A[1]
560
561	mulq	$m1			# np[j]*m1
562	add	%rax,$N[0]
563	mov	-8($ap,$j,8),%rax
564	adc	\$0,%rdx
565	add	$A[0],$N[0]
566	adc	\$0,%rdx
567	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
568	mov	%rdx,$N[1]
569
570	mulq	$m0			# ap[j]*bp[i]
571	add	%rax,$A[1]
572	mov	-8($np,$j,8),%rax
573	adc	\$0,%rdx
574	add	-8(%rsp,$j,8),$A[1]
575	adc	\$0,%rdx
576	lea	1($i),$i		# i++
577	mov	%rdx,$A[0]
578
579	mulq	$m1			# np[j]*m1
580	add	%rax,$N[1]
581	mov	($ap),%rax		# ap[0]
582	adc	\$0,%rdx
583	add	$A[1],$N[1]
584	adc	\$0,%rdx
585	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
586	mov	%rdx,$N[0]
587
588	xor	$N[1],$N[1]
589	add	$A[0],$N[0]
590	adc	\$0,$N[1]
591	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
592	adc	\$0,$N[1]
593	mov	$N[0],-8(%rsp,$j,8)
594	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
595
596	cmp	$num,$i
597	jl	.Louter4x
598___
599{
600my @ri=("%rax","%rdx",$m0,$m1);
601$code.=<<___;
602	mov	16(%rsp,$num,8),$rp	# restore $rp
603	mov	0(%rsp),@ri[0]		# tp[0]
604	pxor	%xmm0,%xmm0
605	mov	8(%rsp),@ri[1]		# tp[1]
606	shr	\$2,$num		# num/=4
607	lea	(%rsp),$ap		# borrow ap for tp
608	xor	$i,$i			# i=0 and clear CF!
609
610	sub	0($np),@ri[0]
611	mov	16($ap),@ri[2]		# tp[2]
612	mov	24($ap),@ri[3]		# tp[3]
613	sbb	8($np),@ri[1]
614	lea	-1($num),$j		# j=num/4-1
615	jmp	.Lsub4x
616.align	16
617.Lsub4x:
618	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
619	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
620	sbb	16($np,$i,8),@ri[2]
621	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
622	mov	40($ap,$i,8),@ri[1]
623	sbb	24($np,$i,8),@ri[3]
624	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
625	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
626	sbb	32($np,$i,8),@ri[0]
627	mov	48($ap,$i,8),@ri[2]
628	mov	56($ap,$i,8),@ri[3]
629	sbb	40($np,$i,8),@ri[1]
630	lea	4($i),$i		# i++
631	dec	$j			# doesn't affect CF!
632	jnz	.Lsub4x
633
634	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
635	mov	32($ap,$i,8),@ri[0]	# load overflow bit
636	sbb	16($np,$i,8),@ri[2]
637	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
638	sbb	24($np,$i,8),@ri[3]
639	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
640
641	sbb	\$0,@ri[0]		# handle upmost overflow bit
642	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
643	xor	$i,$i			# i=0
644	and	@ri[0],$ap
645	not	@ri[0]
646	mov	$rp,$np
647	and	@ri[0],$np
648	lea	-1($num),$j
649	or	$np,$ap			# ap=borrow?tp:rp
650
651	movdqu	($ap),%xmm1
652	movdqa	%xmm0,(%rsp)
653	movdqu	%xmm1,($rp)
654	jmp	.Lcopy4x
655.align	16
656.Lcopy4x:					# copy or in-place refresh
657	movdqu	16($ap,$i),%xmm2
658	movdqu	32($ap,$i),%xmm1
659	movdqa	%xmm0,16(%rsp,$i)
660	movdqu	%xmm2,16($rp,$i)
661	movdqa	%xmm0,32(%rsp,$i)
662	movdqu	%xmm1,32($rp,$i)
663	lea	32($i),$i
664	dec	$j
665	jnz	.Lcopy4x
666
667	shl	\$2,$num
668	movdqu	16($ap,$i),%xmm2
669	movdqa	%xmm0,16(%rsp,$i)
670	movdqu	%xmm2,16($rp,$i)
671___
672}
673$code.=<<___;
674	mov	8(%rsp,$num,8),%rsi	# restore %rsp
675	mov	\$1,%rax
676	mov	(%rsi),%r15
677	mov	8(%rsi),%r14
678	mov	16(%rsi),%r13
679	mov	24(%rsi),%r12
680	mov	32(%rsi),%rbp
681	mov	40(%rsi),%rbx
682	lea	48(%rsi),%rsp
683.Lmul4x_epilogue:
684	ret
685.size	bn_mul4x_mont,.-bn_mul4x_mont
686___
687}}}
688{{{
689######################################################################
690# void bn_sqr4x_mont(
691my $rptr="%rdi";	# const BN_ULONG *rptr,
692my $aptr="%rsi";	# const BN_ULONG *aptr,
693my $bptr="%rdx";	# not used
694my $nptr="%rcx";	# const BN_ULONG *nptr,
695my $n0  ="%r8";		# const BN_ULONG *n0);
696my $num ="%r9";		# int num, has to be divisible by 4 and
697			# not less than 8
698
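# bn_sqr4x_mont computes the Montgomery square, i.e. bn_mul_mont with
# ap==bp, but splits the work as described in the "Squaring part" comments
# below: first the plain square of a[] is formed in t[] (cross products
# only, then shifted left by one bit with the a[i]*a[i] diagonal folded in),
# and only then is t[] reduced word by word. A rough, hypothetical Perl
# sketch of the squaring split (add_mul() and shift_left_1() are assumed
# word-level helpers, not part of this module):
#
#	for my $i (0 .. $num-1) {
#		for my $j ($i+1 .. $num-1) {
#			add_mul(\@t, $i+$j, $a[$i], $a[$j]);	# skip a[i]*a[i]
#		}
#	}
#	shift_left_1(\@t);				# double the cross products
#	for my $i (0 .. $num-1) {
#		add_mul(\@t, 2*$i, $a[$i], $a[$i]);	# accumulate the diagonal
#	}
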
699my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
700my @A0=("%r10","%r11");
701my @A1=("%r12","%r13");
702my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
703
704$code.=<<___;
705.type	bn_sqr4x_mont,\@function,6
706.align	16
707bn_sqr4x_mont:
708.Lsqr4x_enter:
709	push	%rbx
710	push	%rbp
711	push	%r12
712	push	%r13
713	push	%r14
714	push	%r15
715
716	shl	\$3,${num}d		# convert $num to bytes
717	xor	%r10,%r10
718	mov	%rsp,%r11		# put aside %rsp
719	sub	$num,%r10		# -$num
720	mov	($n0),$n0		# *n0
721	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
722	and	\$-1024,%rsp		# minimize TLB usage
723	##############################################################
724	# Stack layout
725	#
726	# +0	saved $num, used in reduction section
727	# +8	&t[2*$num], used in reduction section
728	# +32	saved $rptr
729	# +40	saved $nptr
730	# +48	saved *n0
731	# +56	saved %rsp
732	# +64	t[2*$num]
733	#
734	mov	$rptr,32(%rsp)		# save $rptr
735	mov	$nptr,40(%rsp)
736	mov	$n0,  48(%rsp)
737	mov	%r11, 56(%rsp)		# save original %rsp
738.Lsqr4x_body:
739	##############################################################
740	# Squaring part:
741	#
742	# a) multiply-n-add everything but a[i]*a[i];
743	# b) shift result of a) by 1 to the left and accumulate
744	#    a[i]*a[i] products;
745	#
746	lea	32(%r10),$i		# $i=-($num-32)
747	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
748
749	mov	$num,$j			# $j=$num
750
751					# comments apply to $num==8 case
752	mov	-32($aptr,$i),$a0	# a[0]
753	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
754	mov	-24($aptr,$i),%rax	# a[1]
755	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
756	mov	-16($aptr,$i),$ai	# a[2]
757	mov	%rax,$a1
758
759	mul	$a0			# a[1]*a[0]
760	mov	%rax,$A0[0]		# a[1]*a[0]
761	 mov	$ai,%rax		# a[2]
762	mov	%rdx,$A0[1]
763	mov	$A0[0],-24($tptr,$i)	# t[1]
764
765	xor	$A0[0],$A0[0]
766	mul	$a0			# a[2]*a[0]
767	add	%rax,$A0[1]
768	 mov	$ai,%rax
769	adc	%rdx,$A0[0]
770	mov	$A0[1],-16($tptr,$i)	# t[2]
771
772	lea	-16($i),$j		# j=-16
773
774
775	 mov	8($aptr,$j),$ai		# a[3]
776	mul	$a1			# a[2]*a[1]
777	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
778	 mov	$ai,%rax
779	mov	%rdx,$A1[1]
780
781	xor	$A0[1],$A0[1]
782	add	$A1[0],$A0[0]
783	 lea	16($j),$j
784	adc	\$0,$A0[1]
785	mul	$a0			# a[3]*a[0]
786	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
787	 mov	$ai,%rax
788	adc	%rdx,$A0[1]
789	mov	$A0[0],-8($tptr,$j)	# t[3]
790	jmp	.Lsqr4x_1st
791
792.align	16
793.Lsqr4x_1st:
794	 mov	($aptr,$j),$ai		# a[4]
795	xor	$A1[0],$A1[0]
796	mul	$a1			# a[3]*a[1]
797	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
798	 mov	$ai,%rax
799	adc	%rdx,$A1[0]
800
801	xor	$A0[0],$A0[0]
802	add	$A1[1],$A0[1]
803	adc	\$0,$A0[0]
804	mul	$a0			# a[4]*a[0]
805	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
806	 mov	$ai,%rax		# a[3]
807	adc	%rdx,$A0[0]
808	mov	$A0[1],($tptr,$j)	# t[4]
809
810
811	 mov	8($aptr,$j),$ai		# a[5]
812	xor	$A1[1],$A1[1]
813	mul	$a1			# a[4]*a[3]
814	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
815	 mov	$ai,%rax
816	adc	%rdx,$A1[1]
817
818	xor	$A0[1],$A0[1]
819	add	$A1[0],$A0[0]
820	adc	\$0,$A0[1]
821	mul	$a0			# a[5]*a[2]
822	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
823	 mov	$ai,%rax
824	adc	%rdx,$A0[1]
825	mov	$A0[0],8($tptr,$j)	# t[5]
826
827	 mov	16($aptr,$j),$ai	# a[6]
828	xor	$A1[0],$A1[0]
829	mul	$a1			# a[5]*a[3]
830	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
831	 mov	$ai,%rax
832	adc	%rdx,$A1[0]
833
834	xor	$A0[0],$A0[0]
835	add	$A1[1],$A0[1]
836	adc	\$0,$A0[0]
837	mul	$a0			# a[6]*a[2]
838	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
839	 mov	$ai,%rax		# a[3]
840	adc	%rdx,$A0[0]
841	mov	$A0[1],16($tptr,$j)	# t[6]
842
843
844	 mov	24($aptr,$j),$ai	# a[7]
845	xor	$A1[1],$A1[1]
846	mul	$a1			# a[6]*a[5]
847	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
848	 mov	$ai,%rax
849	adc	%rdx,$A1[1]
850
851	xor	$A0[1],$A0[1]
852	add	$A1[0],$A0[0]
853	 lea	32($j),$j
854	adc	\$0,$A0[1]
855	mul	$a0			# a[7]*a[4]
856	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
857	 mov	$ai,%rax
858	adc	%rdx,$A0[1]
859	mov	$A0[0],-8($tptr,$j)	# t[7]
860
861	cmp	\$0,$j
862	jne	.Lsqr4x_1st
863
864	xor	$A1[0],$A1[0]
865	add	$A0[1],$A1[1]
866	adc	\$0,$A1[0]
867	mul	$a1			# a[7]*a[5]
868	add	%rax,$A1[1]
869	adc	%rdx,$A1[0]
870
871	mov	$A1[1],($tptr)		# t[8]
872	lea	16($i),$i
873	mov	$A1[0],8($tptr)		# t[9]
874	jmp	.Lsqr4x_outer
875
876.align	16
877.Lsqr4x_outer:				# comments apply to $num==6 case
878	mov	-32($aptr,$i),$a0	# a[0]
879	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
880	mov	-24($aptr,$i),%rax	# a[1]
881	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
882	mov	-16($aptr,$i),$ai	# a[2]
883	mov	%rax,$a1
884
885	mov	-24($tptr,$i),$A0[0]	# t[1]
886	xor	$A0[1],$A0[1]
887	mul	$a0			# a[1]*a[0]
888	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
889	 mov	$ai,%rax		# a[2]
890	adc	%rdx,$A0[1]
891	mov	$A0[0],-24($tptr,$i)	# t[1]
892
893	xor	$A0[0],$A0[0]
894	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
895	adc	\$0,$A0[0]
896	mul	$a0			# a[2]*a[0]
897	add	%rax,$A0[1]
898	 mov	$ai,%rax
899	adc	%rdx,$A0[0]
900	mov	$A0[1],-16($tptr,$i)	# t[2]
901
902	lea	-16($i),$j		# j=-16
903	xor	$A1[0],$A1[0]
904
905
906	 mov	8($aptr,$j),$ai		# a[3]
907	xor	$A1[1],$A1[1]
908	add	8($tptr,$j),$A1[0]
909	adc	\$0,$A1[1]
910	mul	$a1			# a[2]*a[1]
911	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
912	 mov	$ai,%rax
913	adc	%rdx,$A1[1]
914
915	xor	$A0[1],$A0[1]
916	add	$A1[0],$A0[0]
917	adc	\$0,$A0[1]
918	mul	$a0			# a[3]*a[0]
919	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
920	 mov	$ai,%rax
921	adc	%rdx,$A0[1]
922	mov	$A0[0],8($tptr,$j)	# t[3]
923
924	lea	16($j),$j
925	jmp	.Lsqr4x_inner
926
927.align	16
928.Lsqr4x_inner:
929	 mov	($aptr,$j),$ai		# a[4]
930	xor	$A1[0],$A1[0]
931	add	($tptr,$j),$A1[1]
932	adc	\$0,$A1[0]
933	mul	$a1			# a[3]*a[1]
934	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
935	 mov	$ai,%rax
936	adc	%rdx,$A1[0]
937
938	xor	$A0[0],$A0[0]
939	add	$A1[1],$A0[1]
940	adc	\$0,$A0[0]
941	mul	$a0			# a[4]*a[0]
942	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
943	 mov	$ai,%rax		# a[3]
944	adc	%rdx,$A0[0]
945	mov	$A0[1],($tptr,$j)	# t[4]
946
947	 mov	8($aptr,$j),$ai		# a[5]
948	xor	$A1[1],$A1[1]
949	add	8($tptr,$j),$A1[0]
950	adc	\$0,$A1[1]
951	mul	$a1			# a[4]*a[3]
952	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
953	 mov	$ai,%rax
954	adc	%rdx,$A1[1]
955
956	xor	$A0[1],$A0[1]
957	add	$A1[0],$A0[0]
958	lea	16($j),$j		# j++
959	adc	\$0,$A0[1]
960	mul	$a0			# a[5]*a[2]
961	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
962	 mov	$ai,%rax
963	adc	%rdx,$A0[1]
964	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
965
966	cmp	\$0,$j
967	jne	.Lsqr4x_inner
968
969	xor	$A1[0],$A1[0]
970	add	$A0[1],$A1[1]
971	adc	\$0,$A1[0]
972	mul	$a1			# a[5]*a[3]
973	add	%rax,$A1[1]
974	adc	%rdx,$A1[0]
975
976	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
977	mov	$A1[0],8($tptr)		# t[7], "preloaded t[3]" below
978
979	add	\$16,$i
980	jnz	.Lsqr4x_outer
981
982					# comments apply to $num==4 case
983	mov	-32($aptr),$a0		# a[0]
984	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
985	mov	-24($aptr),%rax		# a[1]
986	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
987	mov	-16($aptr),$ai		# a[2]
988	mov	%rax,$a1
989
990	xor	$A0[1],$A0[1]
991	mul	$a0			# a[1]*a[0]
992	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
993	 mov	$ai,%rax		# a[2]
994	adc	%rdx,$A0[1]
995	mov	$A0[0],-24($tptr)	# t[1]
996
997	xor	$A0[0],$A0[0]
998	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
999	adc	\$0,$A0[0]
1000	mul	$a0			# a[2]*a[0]
1001	add	%rax,$A0[1]
1002	 mov	$ai,%rax
1003	adc	%rdx,$A0[0]
1004	mov	$A0[1],-16($tptr)	# t[2]
1005
1006	 mov	-8($aptr),$ai		# a[3]
1007	mul	$a1			# a[2]*a[1]
1008	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1009	 mov	$ai,%rax
1010	adc	\$0,%rdx
1011
1012	xor	$A0[1],$A0[1]
1013	add	$A1[0],$A0[0]
1014	 mov	%rdx,$A1[1]
1015	adc	\$0,$A0[1]
1016	mul	$a0			# a[3]*a[0]
1017	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1018	 mov	$ai,%rax
1019	adc	%rdx,$A0[1]
1020	mov	$A0[0],-8($tptr)	# t[3]
1021
1022	xor	$A1[0],$A1[0]
1023	add	$A0[1],$A1[1]
1024	adc	\$0,$A1[0]
1025	mul	$a1			# a[3]*a[1]
1026	add	%rax,$A1[1]
1027	 mov	-16($aptr),%rax		# a[2]
1028	adc	%rdx,$A1[0]
1029
1030	mov	$A1[1],($tptr)		# t[4]
1031	mov	$A1[0],8($tptr)		# t[5]
1032
1033	mul	$ai			# a[2]*a[3]
1034___
1035{
1036my ($shift,$carry)=($a0,$a1);
1037my @S=(@A1,$ai,$n0);
1038$code.=<<___;
1039	 add	\$16,$i
1040	 xor	$shift,$shift
1041	 sub	$num,$i			# $i=16-$num
1042	 xor	$carry,$carry
1043
1044	add	$A1[0],%rax		# t[5]
1045	adc	\$0,%rdx
1046	mov	%rax,8($tptr)		# t[5]
1047	mov	%rdx,16($tptr)		# t[6]
1048	mov	$carry,24($tptr)	# t[7]
1049
1050	 mov	-16($aptr,$i),%rax	# a[0]
1051	lea	64(%rsp,$num,2),$tptr
1052	 xor	$A0[0],$A0[0]		# t[0]
1053	 mov	-24($tptr,$i,2),$A0[1]	# t[1]
1054
1055	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1056	shr	\$63,$A0[0]
1057	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1058	shr	\$63,$A0[1]
1059	or	$A0[0],$S[1]		# | t[2*i]>>63
1060	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1061	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1062	mul	%rax			# a[i]*a[i]
1063	neg	$carry			# mov $carry,cf
1064	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1065	adc	%rax,$S[0]
1066	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1067	mov	$S[0],-32($tptr,$i,2)
1068	adc	%rdx,$S[1]
1069
1070	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1071	 mov	$S[1],-24($tptr,$i,2)
1072	 sbb	$carry,$carry		# mov cf,$carry
1073	shr	\$63,$A0[0]
1074	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1075	shr	\$63,$A0[1]
1076	or	$A0[0],$S[3]		# | t[2*i]>>63
1077	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1078	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1079	mul	%rax			# a[i]*a[i]
1080	neg	$carry			# mov $carry,cf
1081	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1082	adc	%rax,$S[2]
1083	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1084	mov	$S[2],-16($tptr,$i,2)
1085	adc	%rdx,$S[3]
1086	lea	16($i),$i
1087	mov	$S[3],-40($tptr,$i,2)
1088	sbb	$carry,$carry		# mov cf,$carry
1089	jmp	.Lsqr4x_shift_n_add
1090
1091.align	16
1092.Lsqr4x_shift_n_add:
1093	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1094	shr	\$63,$A0[0]
1095	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1096	shr	\$63,$A0[1]
1097	or	$A0[0],$S[1]		# | t[2*i]>>63
1098	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1099	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1100	mul	%rax			# a[i]*a[i]
1101	neg	$carry			# mov $carry,cf
1102	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1103	adc	%rax,$S[0]
1104	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1105	mov	$S[0],-32($tptr,$i,2)
1106	adc	%rdx,$S[1]
1107
1108	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1109	 mov	$S[1],-24($tptr,$i,2)
1110	 sbb	$carry,$carry		# mov cf,$carry
1111	shr	\$63,$A0[0]
1112	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1113	shr	\$63,$A0[1]
1114	or	$A0[0],$S[3]		# | t[2*i]>>63
1115	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1116	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1117	mul	%rax			# a[i]*a[i]
1118	neg	$carry			# mov $carry,cf
1119	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1120	adc	%rax,$S[2]
1121	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1122	mov	$S[2],-16($tptr,$i,2)
1123	adc	%rdx,$S[3]
1124
1125	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1126	 mov	$S[3],-8($tptr,$i,2)
1127	 sbb	$carry,$carry		# mov cf,$carry
1128	shr	\$63,$A0[0]
1129	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1130	shr	\$63,$A0[1]
1131	or	$A0[0],$S[1]		# | t[2*i]>>63
1132	 mov	16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1133	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1134	mul	%rax			# a[i]*a[i]
1135	neg	$carry			# mov $carry,cf
1136	 mov	24($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1137	adc	%rax,$S[0]
1138	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
1139	mov	$S[0],0($tptr,$i,2)
1140	adc	%rdx,$S[1]
1141
1142	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1143	 mov	$S[1],8($tptr,$i,2)
1144	 sbb	$carry,$carry		# mov cf,$carry
1145	shr	\$63,$A0[0]
1146	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1147	shr	\$63,$A0[1]
1148	or	$A0[0],$S[3]		# | t[2*i]>>63
1149	 mov	32($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
1150	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1151	mul	%rax			# a[i]*a[i]
1152	neg	$carry			# mov $carry,cf
1153	 mov	40($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
1154	adc	%rax,$S[2]
1155	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
1156	mov	$S[2],16($tptr,$i,2)
1157	adc	%rdx,$S[3]
1158	mov	$S[3],24($tptr,$i,2)
1159	sbb	$carry,$carry		# mov cf,$carry
1160	add	\$32,$i
1161	jnz	.Lsqr4x_shift_n_add
1162
1163	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1164	shr	\$63,$A0[0]
1165	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1166	shr	\$63,$A0[1]
1167	or	$A0[0],$S[1]		# | t[2*i]>>63
1168	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1169	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1170	mul	%rax			# a[i]*a[i]
1171	neg	$carry			# mov $carry,cf
1172	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1173	adc	%rax,$S[0]
1174	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
1175	mov	$S[0],-32($tptr)
1176	adc	%rdx,$S[1]
1177
1178	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
1179	 mov	$S[1],-24($tptr)
1180	 sbb	$carry,$carry		# mov cf,$carry
1181	shr	\$63,$A0[0]
1182	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1183	shr	\$63,$A0[1]
1184	or	$A0[0],$S[3]		# | t[2*i]>>63
1185	mul	%rax			# a[i]*a[i]
1186	neg	$carry			# mov $carry,cf
1187	adc	%rax,$S[2]
1188	adc	%rdx,$S[3]
1189	mov	$S[2],-16($tptr)
1190	mov	$S[3],-8($tptr)
1191___
1192}
1193##############################################################
1194# Montgomery reduction part, "word-by-word" algorithm.
1195#
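# In outline (a hypothetical reference model, not the code emitted below):
# with n0 = -n[0]^-1 mod 2^64, as provided by the caller, each round picks
# a factor m that zeroes the lowest remaining word of t[], adds m*n[]
# shifted to that position, and advances the window by one word; the upper
# half of t[] then holds the result up to one final conditional subtraction
# of n[] (.Lsqr4x_sub below).
#
#	for my $i (0 .. $num-1) {
#		my $m = mul_lo($t[$i], $n0);	# Montgomery factor for word $i
#		add_mul(\@t, $i, $m, \@n);	# t += m*n << (64*$i), t[$i] -> 0
#	}
#
# mul_lo() and add_mul() are assumed word-level helpers, as above.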
1196{
1197my ($topbit,$nptr)=("%rbp",$aptr);
1198my ($m0,$m1)=($a0,$a1);
1199my @Ni=("%rbx","%r9");
1200$code.=<<___;
1201	mov	40(%rsp),$nptr		# restore $nptr
1202	mov	48(%rsp),$n0		# restore *n0
1203	xor	$j,$j
1204	mov	$num,0(%rsp)		# save $num
1205	sub	$num,$j			# $j=-$num
1206	 mov	64(%rsp),$A0[0]		# t[0]		# modsched #
1207	 mov	$n0,$m0			#		# modsched #
1208	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
1209	lea	64(%rsp,$num),$tptr	# end of t[] window
1210	mov	%rax,8(%rsp)		# save end of t[] buffer
1211	lea	($nptr,$num),$nptr	# end of n[] buffer
1212	xor	$topbit,$topbit		# $topbit=0
1213
1214	mov	0($nptr,$j),%rax	# n[0]		# modsched #
1215	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
1216	 imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
1217	 mov	%rax,$Ni[0]		#		# modsched #
1218	jmp	.Lsqr4x_mont_outer
1219
1220.align	16
1221.Lsqr4x_mont_outer:
1222	xor	$A0[1],$A0[1]
1223	mul	$m0			# n[0]*m0
1224	add	%rax,$A0[0]		# n[0]*m0+t[0]
1225	 mov	$Ni[1],%rax
1226	adc	%rdx,$A0[1]
1227	mov	$n0,$m1
1228
1229	xor	$A0[0],$A0[0]
1230	add	8($tptr,$j),$A0[1]
1231	adc	\$0,$A0[0]
1232	mul	$m0			# n[1]*m0
1233	add	%rax,$A0[1]		# n[1]*m0+t[1]
1234	 mov	$Ni[0],%rax
1235	adc	%rdx,$A0[0]
1236
1237	imulq	$A0[1],$m1
1238
1239	mov	16($nptr,$j),$Ni[0]	# n[2]
1240	xor	$A1[1],$A1[1]
1241	add	$A0[1],$A1[0]
1242	adc	\$0,$A1[1]
1243	mul	$m1			# n[0]*m1
1244	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
1245	 mov	$Ni[0],%rax
1246	adc	%rdx,$A1[1]
1247	mov	$A1[0],8($tptr,$j)	# "t[1]"
1248
1249	xor	$A0[1],$A0[1]
1250	add	16($tptr,$j),$A0[0]
1251	adc	\$0,$A0[1]
1252	mul	$m0			# n[2]*m0
1253	add	%rax,$A0[0]		# n[2]*m0+t[2]
1254	 mov	$Ni[1],%rax
1255	adc	%rdx,$A0[1]
1256
1257	mov	24($nptr,$j),$Ni[1]	# n[3]
1258	xor	$A1[0],$A1[0]
1259	add	$A0[0],$A1[1]
1260	adc	\$0,$A1[0]
1261	mul	$m1			# n[1]*m1
1262	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
1263	 mov	$Ni[1],%rax
1264	adc	%rdx,$A1[0]
1265	mov	$A1[1],16($tptr,$j)	# "t[2]"
1266
1267	xor	$A0[0],$A0[0]
1268	add	24($tptr,$j),$A0[1]
1269	lea	32($j),$j
1270	adc	\$0,$A0[0]
1271	mul	$m0			# n[3]*m0
1272	add	%rax,$A0[1]		# n[3]*m0+t[3]
1273	 mov	$Ni[0],%rax
1274	adc	%rdx,$A0[0]
1275	jmp	.Lsqr4x_mont_inner
1276
1277.align	16
1278.Lsqr4x_mont_inner:
1279	mov	($nptr,$j),$Ni[0]	# n[4]
1280	xor	$A1[1],$A1[1]
1281	add	$A0[1],$A1[0]
1282	adc	\$0,$A1[1]
1283	mul	$m1			# n[2]*m1
1284	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
1285	 mov	$Ni[0],%rax
1286	adc	%rdx,$A1[1]
1287	mov	$A1[0],-8($tptr,$j)	# "t[3]"
1288
1289	xor	$A0[1],$A0[1]
1290	add	($tptr,$j),$A0[0]
1291	adc	\$0,$A0[1]
1292	mul	$m0			# n[4]*m0
1293	add	%rax,$A0[0]		# n[4]*m0+t[4]
1294	 mov	$Ni[1],%rax
1295	adc	%rdx,$A0[1]
1296
1297	mov	8($nptr,$j),$Ni[1]	# n[5]
1298	xor	$A1[0],$A1[0]
1299	add	$A0[0],$A1[1]
1300	adc	\$0,$A1[0]
1301	mul	$m1			# n[3]*m1
1302	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
1303	 mov	$Ni[1],%rax
1304	adc	%rdx,$A1[0]
1305	mov	$A1[1],($tptr,$j)	# "t[4]"
1306
1307	xor	$A0[0],$A0[0]
1308	add	8($tptr,$j),$A0[1]
1309	adc	\$0,$A0[0]
1310	mul	$m0			# n[5]*m0
1311	add	%rax,$A0[1]		# n[5]*m0+t[5]
1312	 mov	$Ni[0],%rax
1313	adc	%rdx,$A0[0]
1314
1315
1316	mov	16($nptr,$j),$Ni[0]	# n[6]
1317	xor	$A1[1],$A1[1]
1318	add	$A0[1],$A1[0]
1319	adc	\$0,$A1[1]
1320	mul	$m1			# n[4]*m1
1321	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
1322	 mov	$Ni[0],%rax
1323	adc	%rdx,$A1[1]
1324	mov	$A1[0],8($tptr,$j)	# "t[5]"
1325
1326	xor	$A0[1],$A0[1]
1327	add	16($tptr,$j),$A0[0]
1328	adc	\$0,$A0[1]
1329	mul	$m0			# n[6]*m0
1330	add	%rax,$A0[0]		# n[6]*m0+t[6]
1331	 mov	$Ni[1],%rax
1332	adc	%rdx,$A0[1]
1333
1334	mov	24($nptr,$j),$Ni[1]	# n[7]
1335	xor	$A1[0],$A1[0]
1336	add	$A0[0],$A1[1]
1337	adc	\$0,$A1[0]
1338	mul	$m1			# n[5]*m1
1339	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
1340	 mov	$Ni[1],%rax
1341	adc	%rdx,$A1[0]
1342	mov	$A1[1],16($tptr,$j)	# "t[6]"
1343
1344	xor	$A0[0],$A0[0]
1345	add	24($tptr,$j),$A0[1]
1346	lea	32($j),$j
1347	adc	\$0,$A0[0]
1348	mul	$m0			# n[7]*m0
1349	add	%rax,$A0[1]		# n[7]*m0+t[7]
1350	 mov	$Ni[0],%rax
1351	adc	%rdx,$A0[0]
1352	cmp	\$0,$j
1353	jne	.Lsqr4x_mont_inner
1354
1355	 sub	0(%rsp),$j		# $j=-$num	# modsched #
1356	 mov	$n0,$m0			#		# modsched #
1357
1358	xor	$A1[1],$A1[1]
1359	add	$A0[1],$A1[0]
1360	adc	\$0,$A1[1]
1361	mul	$m1			# n[6]*m1
1362	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
1363	mov	$Ni[1],%rax
1364	adc	%rdx,$A1[1]
1365	mov	$A1[0],-8($tptr)	# "t[7]"
1366
1367	xor	$A0[1],$A0[1]
1368	add	($tptr),$A0[0]		# +t[8]
1369	adc	\$0,$A0[1]
1370	 mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
1371	add	$topbit,$A0[0]
1372	adc	\$0,$A0[1]
1373
1374	 imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
1375	xor	$A1[0],$A1[0]
1376	 mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
1377	add	$A0[0],$A1[1]
1378	 mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
1379	adc	\$0,$A1[0]
1380	mul	$m1			# n[7]*m1
1381	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
1382	 mov	$Ni[0],%rax		#		# modsched #
1383	adc	%rdx,$A1[0]
1384	mov	$A1[1],($tptr)		# "t[8]"
1385
1386	xor	$topbit,$topbit
1387	add	8($tptr),$A1[0]		# +t[9]
1388	adc	$topbit,$topbit
1389	add	$A0[1],$A1[0]
1390	lea	16($tptr),$tptr		# "t[$num]>>128"
1391	adc	\$0,$topbit
1392	mov	$A1[0],-8($tptr)	# "t[9]"
1393	cmp	8(%rsp),$tptr		# are we done?
1394	jb	.Lsqr4x_mont_outer
1395
1396	mov	0(%rsp),$num		# restore $num
1397	mov	$topbit,($tptr)		# save $topbit
1398___
1399}
1400##############################################################
1401# Post-condition, 4x unrolled copy from bn_mul_mont
1402#
1403{
1404my ($tptr,$nptr)=("%rbx",$aptr);
1405my @ri=("%rax","%rdx","%r10","%r11");
1406$code.=<<___;
1407	mov	64(%rsp,$num),@ri[0]	# tp[0]
1408	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
1409	mov	40(%rsp),$nptr		# restore $nptr
1410	shr	\$5,$num		# num/4
1411	mov	8($tptr),@ri[1]		# t[1]
1412	xor	$i,$i			# i=0 and clear CF!
1413
1414	mov	32(%rsp),$rptr		# restore $rptr
1415	sub	0($nptr),@ri[0]
1416	mov	16($tptr),@ri[2]	# t[2]
1417	mov	24($tptr),@ri[3]	# t[3]
1418	sbb	8($nptr),@ri[1]
1419	lea	-1($num),$j		# j=num/4-1
1420	jmp	.Lsqr4x_sub
1421.align	16
1422.Lsqr4x_sub:
1423	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1424	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1425	sbb	16($nptr,$i,8),@ri[2]
1426	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
1427	mov	40($tptr,$i,8),@ri[1]
1428	sbb	24($nptr,$i,8),@ri[3]
1429	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1430	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1431	sbb	32($nptr,$i,8),@ri[0]
1432	mov	48($tptr,$i,8),@ri[2]
1433	mov	56($tptr,$i,8),@ri[3]
1434	sbb	40($nptr,$i,8),@ri[1]
1435	lea	4($i),$i		# i++
1436	dec	$j			# doesn't affect CF!
1437	jnz	.Lsqr4x_sub
1438
1439	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1440	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
1441	sbb	16($nptr,$i,8),@ri[2]
1442	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1443	sbb	24($nptr,$i,8),@ri[3]
1444	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1445
1446	sbb	\$0,@ri[0]		# handle upmost overflow bit
1447	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
1448	xor	$i,$i			# i=0
1449	and	@ri[0],$tptr
1450	not	@ri[0]
1451	mov	$rptr,$nptr
1452	and	@ri[0],$nptr
1453	lea	-1($num),$j
1454	or	$nptr,$tptr		# tp=borrow?tp:rp
1455
1456	pxor	%xmm0,%xmm0
1457	lea	64(%rsp,$num,8),$nptr
1458	movdqu	($tptr),%xmm1
1459	lea	($nptr,$num,8),$nptr
1460	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
1461	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
1462	movdqu	%xmm1,($rptr)
1463	jmp	.Lsqr4x_copy
1464.align	16
1465.Lsqr4x_copy:				# copy or in-place refresh
1466	movdqu	16($tptr,$i),%xmm2
1467	movdqu	32($tptr,$i),%xmm1
1468	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
1469	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
1470	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
1471	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
1472	movdqu	%xmm2,16($rptr,$i)
1473	movdqu	%xmm1,32($rptr,$i)
1474	lea	32($i),$i
1475	dec	$j
1476	jnz	.Lsqr4x_copy
1477
1478	movdqu	16($tptr,$i),%xmm2
1479	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
1480	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
1481	movdqu	%xmm2,16($rptr,$i)
1482___
1483}
1484$code.=<<___;
1485	mov	56(%rsp),%rsi		# restore %rsp
1486	mov	\$1,%rax
1487	mov	0(%rsi),%r15
1488	mov	8(%rsi),%r14
1489	mov	16(%rsi),%r13
1490	mov	24(%rsi),%r12
1491	mov	32(%rsi),%rbp
1492	mov	40(%rsi),%rbx
1493	lea	48(%rsi),%rsp
1494.Lsqr4x_epilogue:
1495	ret
1496.size	bn_sqr4x_mont,.-bn_sqr4x_mont
1497___
1498}}}
1499$code.=<<___;
1500.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1501.align	16
1502___
1503
1504# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1505#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1506if ($win64) {
1507$rec="%rcx";
1508$frame="%rdx";
1509$context="%r8";
1510$disp="%r9";
1511
1512$code.=<<___;
1513.extern	__imp_RtlVirtualUnwind
1514.type	mul_handler,\@abi-omnipotent
1515.align	16
1516mul_handler:
1517	push	%rsi
1518	push	%rdi
1519	push	%rbx
1520	push	%rbp
1521	push	%r12
1522	push	%r13
1523	push	%r14
1524	push	%r15
1525	pushfq
1526	sub	\$64,%rsp
1527
1528	mov	120($context),%rax	# pull context->Rax
1529	mov	248($context),%rbx	# pull context->Rip
1530
1531	mov	8($disp),%rsi		# disp->ImageBase
1532	mov	56($disp),%r11		# disp->HandlerData
1533
1534	mov	0(%r11),%r10d		# HandlerData[0]
1535	lea	(%rsi,%r10),%r10	# end of prologue label
1536	cmp	%r10,%rbx		# context->Rip<end of prologue label
1537	jb	.Lcommon_seh_tail
1538
1539	mov	152($context),%rax	# pull context->Rsp
1540
1541	mov	4(%r11),%r10d		# HandlerData[1]
1542	lea	(%rsi,%r10),%r10	# epilogue label
1543	cmp	%r10,%rbx		# context->Rip>=epilogue label
1544	jae	.Lcommon_seh_tail
1545
1546	mov	192($context),%r10	# pull $num
1547	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
1548	lea	48(%rax),%rax
1549
1550	mov	-8(%rax),%rbx
1551	mov	-16(%rax),%rbp
1552	mov	-24(%rax),%r12
1553	mov	-32(%rax),%r13
1554	mov	-40(%rax),%r14
1555	mov	-48(%rax),%r15
1556	mov	%rbx,144($context)	# restore context->Rbx
1557	mov	%rbp,160($context)	# restore context->Rbp
1558	mov	%r12,216($context)	# restore context->R12
1559	mov	%r13,224($context)	# restore context->R13
1560	mov	%r14,232($context)	# restore context->R14
1561	mov	%r15,240($context)	# restore context->R15
1562
1563	jmp	.Lcommon_seh_tail
1564.size	mul_handler,.-mul_handler
1565
1566.type	sqr_handler,\@abi-omnipotent
1567.align	16
1568sqr_handler:
1569	push	%rsi
1570	push	%rdi
1571	push	%rbx
1572	push	%rbp
1573	push	%r12
1574	push	%r13
1575	push	%r14
1576	push	%r15
1577	pushfq
1578	sub	\$64,%rsp
1579
1580	mov	120($context),%rax	# pull context->Rax
1581	mov	248($context),%rbx	# pull context->Rip
1582
1583	lea	.Lsqr4x_body(%rip),%r10
1584	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
1585	jb	.Lcommon_seh_tail
1586
1587	mov	152($context),%rax	# pull context->Rsp
1588
1589	lea	.Lsqr4x_epilogue(%rip),%r10
1590	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
1591	jae	.Lcommon_seh_tail
1592
1593	mov	56(%rax),%rax		# pull saved stack pointer
1594	lea	48(%rax),%rax
1595
1596	mov	-8(%rax),%rbx
1597	mov	-16(%rax),%rbp
1598	mov	-24(%rax),%r12
1599	mov	-32(%rax),%r13
1600	mov	-40(%rax),%r14
1601	mov	-48(%rax),%r15
1602	mov	%rbx,144($context)	# restore context->Rbx
1603	mov	%rbp,160($context)	# restore context->Rbp
1604	mov	%r12,216($context)	# restore context->R12
1605	mov	%r13,224($context)	# restore context->R13
1606	mov	%r14,232($context)	# restore context->R14
1607	mov	%r15,240($context)	# restore context->R15
1608
1609.Lcommon_seh_tail:
1610	mov	8(%rax),%rdi
1611	mov	16(%rax),%rsi
1612	mov	%rax,152($context)	# restore context->Rsp
1613	mov	%rsi,168($context)	# restore context->Rsi
1614	mov	%rdi,176($context)	# restore context->Rdi
1615
1616	mov	40($disp),%rdi		# disp->ContextRecord
1617	mov	$context,%rsi		# context
1618	mov	\$154,%ecx		# sizeof(CONTEXT)
1619	.long	0xa548f3fc		# cld; rep movsq
1620
1621	mov	$disp,%rsi
1622	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1623	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1624	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1625	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1626	mov	40(%rsi),%r10		# disp->ContextRecord
1627	lea	56(%rsi),%r11		# &disp->HandlerData
1628	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1629	mov	%r10,32(%rsp)		# arg5
1630	mov	%r11,40(%rsp)		# arg6
1631	mov	%r12,48(%rsp)		# arg7
1632	mov	%rcx,56(%rsp)		# arg8, (NULL)
1633	call	*__imp_RtlVirtualUnwind(%rip)
1634
1635	mov	\$1,%eax		# ExceptionContinueSearch
1636	add	\$64,%rsp
1637	popfq
1638	pop	%r15
1639	pop	%r14
1640	pop	%r13
1641	pop	%r12
1642	pop	%rbp
1643	pop	%rbx
1644	pop	%rdi
1645	pop	%rsi
1646	ret
1647.size	sqr_handler,.-sqr_handler
1648
1649.section	.pdata
1650.align	4
1651	.rva	.LSEH_begin_bn_mul_mont
1652	.rva	.LSEH_end_bn_mul_mont
1653	.rva	.LSEH_info_bn_mul_mont
1654
1655	.rva	.LSEH_begin_bn_mul4x_mont
1656	.rva	.LSEH_end_bn_mul4x_mont
1657	.rva	.LSEH_info_bn_mul4x_mont
1658
1659	.rva	.LSEH_begin_bn_sqr4x_mont
1660	.rva	.LSEH_end_bn_sqr4x_mont
1661	.rva	.LSEH_info_bn_sqr4x_mont
1662
1663.section	.xdata
1664.align	8
1665.LSEH_info_bn_mul_mont:
1666	.byte	9,0,0,0
1667	.rva	mul_handler
1668	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
1669.LSEH_info_bn_mul4x_mont:
1670	.byte	9,0,0,0
1671	.rva	mul_handler
1672	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
1673.LSEH_info_bn_sqr4x_mont:
1674	.byte	9,0,0,0
1675	.rva	sqr_handler
1676___
1677}
1678
1679print $code;
1680close STDOUT;
1681