#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
# than twice as fast. The most common case, rsa1024 sign, is improved
# by a respectable 50%. It remains to be seen whether loop unrolling
# and a dedicated squaring routine can provide further improvement...

# July 2011.
#
# Add a dedicated squaring procedure. The performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that they
# are "fallen through" for input lengths of 8, which is critical for
# 1024-bit RSA *sign*. The average performance improvement in comparison
# to the *initial* 2005 version of this module is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in the squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually the same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

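# Reference model (an illustrative sketch; nothing in this script calls
# it): bn_mul_mont and friends compute the Montgomery product
# a*b*2^(-64*num) mod n for num-limb inputs. The sub below uses
# Math::BigInt in place of the assembly's 64x64->128-bit multiplies but
# performs the same word-level reduction, one limb per iteration; the
# routines emitted by this script interleave the multiplication and the
# reduction limb by limb instead of multiplying first.
use Math::BigInt;
sub mont_mul_ref {
	my ($a, $b, $n, $num) = @_;	# Math::BigInt values; $num = limb count
	my $w  = Math::BigInt->bone->blsft(64);		# limb base, 2^64
	my $n0 = $w->copy->bsub($n->copy->bmodinv($w));	# -n^-1 mod 2^64
	my $t  = $a->copy->bmul($b);
	for (1 .. $num) {
		# pick m so that t + m*n is divisible by 2^64 ...
		my $m = $t->copy->bmod($w)->bmul($n0)->bmod($w);
		$t->badd($m->bmul($n))->brsft(64);	# ... making this shift exact
	}
	$t->bsub($n) if $t->bcmp($n) >= 0;	# conditional final subtraction
	return $t;				# a*b*2^(-64*num) mod n
}
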
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
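
# Typical invocation, following the usual perlasm convention (the exact
# flavour strings come from the build system, not from this file):
#
#	perl x86_64-mont.pl elf  x86_64-mont.S		# GNU as / Linux
#	perl x86_64-mont.pl nasm x86_64-mont.asm	# Windows
#
# Everything printed to STDOUT below is rewritten into the requested
# syntax by x86_64-xlate.pl.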

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Set $addx to one once build problems are resolved.
$addx = 0;

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	mov	8(%r11),%r11d
___
$code.=<<___;
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# An OS-agnostic version of __chkstk.
	#
	# Some OSes (Windows) insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a
	# stack allocation spans two pages, then a reference to the
	# farther one can be punished with a SEGV. But page walking does
	# good even on other OSes, because it guarantees that a villain
	# thread hits the guard page before it can damage an innocent
	# one...
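	#
	# E.g., for a 32KB frame the code below probes one word in each
	# of the eight 4096-byte pages between the old and the new stack
	# pointer, in order, so a guard page is always hit first.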
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
.align	16
.Lsub:
	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
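	# %rax is now 0 if tp>=np (the difference already written to rp
	# is the result) or all ones if the subtraction borrowed (tp must
	# be kept). The and/not/or sequence below computes
	# ap = borrow?tp:rp without a secret-dependent branch; .Lcopy then
	# rewrites rp from the selected source while zapping the
	# temporary vector.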
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi,8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	ret
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
___
{{{
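
# bn_mul4x_mont is the same algorithm with the inner loops unrolled to
# process four limbs per iteration. bn_mul_mont dispatches here when num
# is a multiple of 4 and at least 8, unless the operands are equal and
# num is also a multiple of 8, in which case the squaring path is taken.
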
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
.cfi_startproc
	mov	${num}d,${num}d
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
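	# %r11d was loaded above (under $addx) with word 8 of
	# OPENSSL_ia32cap_P, i.e. CPUID(7).EBX: bit 8 (0x100) is BMI2/MULX
	# and bit 19 (0x80000) is ADX (ADCX/ADOX); the MULX path needs both.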
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
	neg	$num			# restore
	and	\$-1024,%r10		# minimize TLB usage

	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=4
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j+=4
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jb	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	lea	-4($num),$j
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$j			# j=num/4-1
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i+=4
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-4($num),$j
	or	$np,$ap			# ap=borrow?tp:rp
	shr	\$2,$j			# j=num/4-1

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:					# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
.cfi_def_cfa	%rsi, 8
	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	ret
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
{{{
######################################################################
# void bn_sqr8x_mont(
my $rptr="%rdi";	# const BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# not used
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");

$code.=<<___	if ($addx);
.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
___
$code.=<<___;
.extern	bn_sqr8x_internal		# see x86_64-mont5 module

.type	bn_sqr8x_mont,\@function,6
.align	32
bn_sqr8x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lsqr8x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lsqr8x_prologue:

	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10		# 4*$num
	neg	$num

	##############################################################
	# Ensure that the stack frame doesn't alias with $aptr modulo
	# 4096. This is done to allow the memory disambiguation logic
	# to do its job.
	#
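	# (If the frame and $aptr shared the same offset within a 4KiB
	# page, the CPU's memory disambiguation logic could falsely
	# predict that stores to the frame alias loads from $aptr and
	# serialize them; keeping the distance a non-multiple of 4096
	# avoids that.)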
	lea	-64(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lsqr8x_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	mov	$num,%r10
	neg	$num

	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
.Lsqr8x_body:

	movq	$nptr, %xmm2		# save pointer to modulus
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	%r10, %xmm3		# -$num
___
$code.=<<___ if ($addx);
	leaq	OPENSSL_ia32cap_P(%rip),%rax
	mov	8(%rax),%eax
	and	\$0x80100,%eax
	cmp	\$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %rcx	-8*num
					# %r8	end of tp[2*num]
	lea	(%r8,%rcx),%rbx
	mov	%rcx,$num
	mov	%rcx,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
___
$code.=<<___;
	call	bn_sqr8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %r8	-8*num
					# %rdi	end of tp[2*num]
	lea	(%rdi,$num),%rbx
	mov	$num,%rcx
	mov	$num,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	mov	8*0(%rbx),%r12
	mov	8*1(%rbx),%r13
	mov	8*2(%rbx),%r14
	mov	8*3(%rbx),%r15
	lea	8*4(%rbx),%rbx
	sbb	8*0(%rbp),%r12
	sbb	8*1(%rbp),%r13
	sbb	8*2(%rbp),%r14
	sbb	8*3(%rbp),%r15
	lea	8*4(%rbp),%rbp
	mov	%r12,8*0($rptr)
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr
	inc	%rcx			# preserves %cf
	jnz	.Lsqr8x_sub

	sbb	\$0,%rax		# top-most carry
	lea	(%rbx,$num),%rbx	# rewind
	lea	($rptr,$num),$rptr	# rewind

	movq	%rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lsqr8x_cond_copy

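	# Constant-time select: %xmm1 is all zeros if the subtracted
	# value now in rp is the correct result, or all ones if the
	# original value in tp must be kept. pcmpeqd derives the
	# complementary mask in %xmm0, so the pand/por sequence below
	# merges the right source into rp without branching on secret
	# data; the movdqa stores also wipe the temporary area.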
.align	32
.Lsqr8x_cond_copy:
	movdqa	16*0(%rbx),%xmm2
	movdqa	16*1(%rbx),%xmm3
	lea	16*2(%rbx),%rbx
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2(%rbx)	# zero tp
	movdqa	%xmm0,-16*1(%rbx)
	movdqa	%xmm0,-16*2(%rbx,%rdx)
	movdqa	%xmm0,-16*1(%rbx,%rdx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	add	\$32,$num
	jnz	.Lsqr8x_cond_copy

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr8x_epilogue:
	ret
.cfi_endproc
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}

if ($addx) {{{
my $bp="%rdx";	# original value

$code.=<<___;
.type	bn_mulx4x_mont,\@function,6
.align	32
bn_mulx4x_mont:
.cfi_startproc
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
	and	\$-128,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	lea	($bp,$num),%r10
	##############################################################
	# Stack layout
	# +0	num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	saved n0
	# +32	saved rp
	# +40	saved %rsp
	# +48	inner counter
	# +56
	# +64	tmp[num+1]
	#
	mov	$num,0(%rsp)		# save $num
	shr	\$5,$num
	mov	%r10,16(%rsp)		# end of b[num]
	sub	\$1,$num
	mov	$n0, 24(%rsp)		# save *n0
	mov	$rp, 32(%rsp)		# save $rp
	mov	%rax,40(%rsp)		# save original %rsp
.cfi_cfa_expression	%rsp+40,deref,+8
	mov	$num,48(%rsp)		# inner counter
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
___
my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
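
# The MULX path keeps two independent carry chains in flight: MULX does
# not touch the flags at all, ADCX propagates a carry through CF only
# and ADOX through OF only, so the a[j]*b[i] accumulation and the
# n[j]*m1 reduction below can be interleaved without serializing on a
# single carry flag.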
$code.=<<___;
	lea	8($bp),$bptr
	mov	($bp),%rdx		# b[0], $bp==%rdx actually
	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	add	%rax,%r11
	mov	$bptr,8(%rsp)		# off-load &b[i]
	mulx	2*8($aptr),%r12,%r13	# ...
	adc	%r14,%r12
	adc	\$0,%r13

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	 mov	$mi,%rdx
	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 .byte	0x67,0x67
	 mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	add	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	mov	($bptr),%rdx		# b[i]
	lea	8($bptr),$bptr		# b++
	sub	$num,$aptr		# rewind $aptr
	mov	%r15,($tptr)		# save top-most carry
	lea	64+4*8(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr

	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	adox	-2*8($tptr),%r12
	adcx	$zero,%r13
	adox	$zero,%r13

	mov	$bptr,8(%rsp)		# off-load &b[i]
	mov	$mi,%r15
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	 mov	$mi,%rdx
	adcx	%rax,%r13
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	lea	4*8($aptr),$aptr
	adox	$zero,%r14

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	lea	4*8($nptr),$nptr
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-4*8($tptr)
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jne	.Lmulx4x_outer

	lea	64(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr
	neg	%r15
	mov	$num,%rdx
	shr	\$3+2,$num		# %cf=0
	mov	32(%rsp),$rptr		# restore rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	mov	8*0($tptr),%r11
	mov	8*1($tptr),%r12
	mov	8*2($tptr),%r13
	mov	8*3($tptr),%r14
	lea	8*4($tptr),$tptr
	sbb	8*0($nptr),%r11
	sbb	8*1($nptr),%r12
	sbb	8*2($nptr),%r13
	sbb	8*3($nptr),%r14
	lea	8*4($nptr),$nptr
	mov	%r11,8*0($rptr)
	mov	%r12,8*1($rptr)
	mov	%r13,8*2($rptr)
	mov	%r14,8*3($rptr)
	lea	8*4($rptr),$rptr
	dec	$num			# preserves %cf
	jnz	.Lmulx4x_sub

	sbb	\$0,%r15		# top-most carry
	lea	64(%rsp),$tptr
	sub	%rdx,$rptr		# rewind

	movq	%r15,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
.cfi_def_cfa	%rsi,8
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	movdqa	16*0($tptr),%xmm2
	movdqa	16*1($tptr),%xmm3
	lea	16*2($tptr),$tptr
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2($tptr)	# zero tp
	movdqa	%xmm0,-16*1($tptr)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	sub	\$32,%rdx
	jnz	.Lmulx4x_cond_copy

	mov	%rdx,($tptr)

	mov	\$1,%rax
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	ret
.cfi_endproc
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
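
# mul_handler and sqr_handler recover the saved original stack pointer
# for a frame interrupted anywhere between its prologue and epilogue
# labels (listed in the HandlerData tables below), restore the
# callee-saved registers from it, and hand the updated CONTEXT back to
# RtlVirtualUnwind so that unwinding can continue.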

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_prologue
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# body label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue		# HandlerData[]
.align	8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
___
}

print $code;
close STDOUT or die "error closing STDOUT: $!";