1#! /usr/bin/env perl
2# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# October 2005.
18#
19# Montgomery multiplication routine for x86_64. While it gives a modest
20# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
21# than twice (>2x) as fast. The most common case, rsa1024 sign, is improved
22# by a respectable 50%. It remains to be seen whether loop unrolling and a
23# dedicated squaring routine can provide further improvement...
24
25# July 2011.
26#
27# Add dedicated squaring procedure. Performance improvement varies
28# from platform to platform, but on average it's ~5%/15%/25%/33%
29# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
30
31# August 2011.
32#
33# Unroll and modulo-schedule the inner loops in such a manner that they
34# are "fallen through" for input lengths of 8, which is critical for
35# 1024-bit RSA *sign*. Average performance improvement in comparison
36# to the *initial* 2005 version of this module is ~0%/30%/40%/45%
37# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
38
39# June 2013.
40#
41# Optimize reduction in squaring procedure and improve 1024+-bit RSA
42# sign performance by 10-16% on Intel Sandy Bridge and later
43# (virtually same on non-Intel processors).
44
45# August 2013.
46#
47# Add MULX/ADOX/ADCX code path.
48
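# (MULX computes a full 64x64->128-bit product without touching the flags, while
# ADCX and ADOX add with carry through CF and OF respectively. Keeping two
# independent carry chains lets the mulx4x path below interleave the
# multiply-accumulate chain with the Montgomery reduction chain.)
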
49$flavour = shift;
50$output  = shift;
51if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
52
53$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
54
55$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
56( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
57( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
58die "can't locate x86_64-xlate.pl";
59
60open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
61*STDOUT=*OUT;
62
63# In upstream, this is controlled by shelling out to the compiler to check
64# versions, but BoringSSL is intended to be used with pre-generated perlasm
65# output, so this isn't useful anyway.
66$addx = 1;
67
68# int bn_mul_mont(
69$rp="%rdi";	# BN_ULONG *rp,
70$ap="%rsi";	# const BN_ULONG *ap,
71$bp="%rdx";	# const BN_ULONG *bp,
72$np="%rcx";	# const BN_ULONG *np,
73$n0="%r8";	# const BN_ULONG *n0,
74# TODO(davidben): The code below treats $num as an int, but C passes in a
75# size_t.
76$num="%r9";	# size_t num);
77$lo0="%r10";
78$hi0="%r11";
79$hi1="%r13";
80$i="%r14";
81$j="%r15";
82$m0="%rbx";
83$m1="%rbp";
84
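# For reference only: a rough Perl sketch (never called, and not part of the
# generated assembly) of the word-level algorithm the loops below implement,
# rp = ap*bp*2^(-64*num) mod np, assuming n0 here is the single word n0[0],
# i.e. -np^(-1) mod 2^64 as set up by the caller. The real code keeps tp[] on
# the stack and ends with a constant-time conditional subtraction instead of
# returning the possibly unreduced value.
use Math::BigInt;

sub _mont_mul_reference {
	my ($ap, $bp, $np, $n0, $num) = @_;	# array refs of 64-bit words (LSW first), n0 word, word count
	my $mask = Math::BigInt->new(2)->bpow(64)->bsub(1);
	my @tp = map { Math::BigInt->bzero() } (0 .. $num);	# tp[$num] is the overflow word
	for my $i (0 .. $num - 1) {
		# m1 = (tp[0] + ap[0]*bp[i]) * n0 mod 2^64 makes the running sum
		# divisible by 2^64, so it can be shifted down by one word.
		my $lo0 = Math::BigInt->new($ap->[0])->bmul($bp->[$i])->badd($tp[0]);
		my $m1  = $lo0->copy()->bmul($n0)->band($mask);
		my $hi1 = $lo0->badd(Math::BigInt->new($np->[0])->bmul($m1))->brsft(64);
		for my $j (1 .. $num - 1) {
			my $t = Math::BigInt->new($ap->[$j])->bmul($bp->[$i])
				    ->badd(Math::BigInt->new($np->[$j])->bmul($m1))
				    ->badd($tp[$j])->badd($hi1);
			$tp[$j - 1] = $t->copy()->band($mask);
			$hi1        = $t->brsft(64);
		}
		$hi1->badd($tp[$num]);			# pull upmost overflow bit
		$tp[$num - 1] = $hi1->copy()->band($mask);
		$tp[$num]     = $hi1->brsft(64);
	}
	# tp is now congruent to ap*bp*2^(-64*num) mod np and less than 2*np;
	# the assembly finishes with a constant-time conditional subtraction.
	return \@tp;
}
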
85$code=<<___;
86.text
87
88.extern	OPENSSL_ia32cap_P
89
90.globl	bn_mul_mont
91.type	bn_mul_mont,\@function,6
92.align	16
93bn_mul_mont:
94.cfi_startproc
95	mov	${num}d,${num}d
96	mov	%rsp,%rax
97.cfi_def_cfa_register	%rax
98	test	\$3,${num}d
99	jnz	.Lmul_enter
100	cmp	\$8,${num}d
101	jb	.Lmul_enter
102___
103$code.=<<___ if ($addx);
104	leaq	OPENSSL_ia32cap_P(%rip),%r11
105	mov	8(%r11),%r11d
106___
107$code.=<<___;
108	cmp	$ap,$bp
109	jne	.Lmul4x_enter
110	test	\$7,${num}d
111	jz	.Lsqr8x_enter
112	jmp	.Lmul4x_enter
113
114.align	16
115.Lmul_enter:
116	push	%rbx
117.cfi_push	%rbx
118	push	%rbp
119.cfi_push	%rbp
120	push	%r12
121.cfi_push	%r12
122	push	%r13
123.cfi_push	%r13
124	push	%r14
125.cfi_push	%r14
126	push	%r15
127.cfi_push	%r15
128
129	neg	$num
130	mov	%rsp,%r11
131	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
132	neg	$num			# restore $num
133	and	\$-1024,%r10		# minimize TLB usage
134
135	# An OS-agnostic version of __chkstk.
136	#
137	# Some OSes (Windows) insist on the stack being "wired" to
138	# physical memory in a strictly sequential manner, i.e. if a stack
139	# allocation spans two pages, then a reference to the farther one
140	# can be punished with a SEGV. But page walking can do good even on
141	# other OSes, because it guarantees that an offending thread hits
142	# the guard page before it can do damage to an innocent one...
143	sub	%r10,%r11
144	and	\$-4096,%r11
145	lea	(%r10,%r11),%rsp
146	mov	(%rsp),%r11
147	cmp	%r10,%rsp
148	ja	.Lmul_page_walk
149	jmp	.Lmul_page_walk_done
150
151.align	16
152.Lmul_page_walk:
153	lea	-4096(%rsp),%rsp
154	mov	(%rsp),%r11
155	cmp	%r10,%rsp
156	ja	.Lmul_page_walk
157.Lmul_page_walk_done:
158
159	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
160.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
161.Lmul_body:
162	mov	$bp,%r12		# reassign $bp
163___
164		$bp="%r12";
165$code.=<<___;
166	mov	($n0),$n0		# pull n0[0] value
167	mov	($bp),$m0		# m0=bp[0]
168	mov	($ap),%rax
169
170	xor	$i,$i			# i=0
171	xor	$j,$j			# j=0
172
173	mov	$n0,$m1
174	mulq	$m0			# ap[0]*bp[0]
175	mov	%rax,$lo0
176	mov	($np),%rax
177
178	imulq	$lo0,$m1		# "tp[0]"*n0
179	mov	%rdx,$hi0
180
181	mulq	$m1			# np[0]*m1
182	add	%rax,$lo0		# discarded
183	mov	8($ap),%rax
184	adc	\$0,%rdx
185	mov	%rdx,$hi1
186
187	lea	1($j),$j		# j++
188	jmp	.L1st_enter
189
190.align	16
191.L1st:
192	add	%rax,$hi1
193	mov	($ap,$j,8),%rax
194	adc	\$0,%rdx
195	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
196	mov	$lo0,$hi0
197	adc	\$0,%rdx
198	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
199	mov	%rdx,$hi1
200
201.L1st_enter:
202	mulq	$m0			# ap[j]*bp[0]
203	add	%rax,$hi0
204	mov	($np,$j,8),%rax
205	adc	\$0,%rdx
206	lea	1($j),$j		# j++
207	mov	%rdx,$lo0
208
209	mulq	$m1			# np[j]*m1
210	cmp	$num,$j
211	jne	.L1st
212
213	add	%rax,$hi1
214	mov	($ap),%rax		# ap[0]
215	adc	\$0,%rdx
216	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
217	adc	\$0,%rdx
218	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
219	mov	%rdx,$hi1
220	mov	$lo0,$hi0
221
222	xor	%rdx,%rdx
223	add	$hi0,$hi1
224	adc	\$0,%rdx
225	mov	$hi1,-8(%rsp,$num,8)
226	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
227
228	lea	1($i),$i		# i++
229	jmp	.Louter
230.align	16
231.Louter:
232	mov	($bp,$i,8),$m0		# m0=bp[i]
233	xor	$j,$j			# j=0
234	mov	$n0,$m1
235	mov	(%rsp),$lo0
236	mulq	$m0			# ap[0]*bp[i]
237	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
238	mov	($np),%rax
239	adc	\$0,%rdx
240
241	imulq	$lo0,$m1		# tp[0]*n0
242	mov	%rdx,$hi0
243
244	mulq	$m1			# np[0]*m1
245	add	%rax,$lo0		# discarded
246	mov	8($ap),%rax
247	adc	\$0,%rdx
248	mov	8(%rsp),$lo0		# tp[1]
249	mov	%rdx,$hi1
250
251	lea	1($j),$j		# j++
252	jmp	.Linner_enter
253
254.align	16
255.Linner:
256	add	%rax,$hi1
257	mov	($ap,$j,8),%rax
258	adc	\$0,%rdx
259	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
260	mov	(%rsp,$j,8),$lo0
261	adc	\$0,%rdx
262	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
263	mov	%rdx,$hi1
264
265.Linner_enter:
266	mulq	$m0			# ap[j]*bp[i]
267	add	%rax,$hi0
268	mov	($np,$j,8),%rax
269	adc	\$0,%rdx
270	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
271	mov	%rdx,$hi0
272	adc	\$0,$hi0
273	lea	1($j),$j		# j++
274
275	mulq	$m1			# np[j]*m1
276	cmp	$num,$j
277	jne	.Linner
278
279	add	%rax,$hi1
280	mov	($ap),%rax		# ap[0]
281	adc	\$0,%rdx
282	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
283	mov	(%rsp,$j,8),$lo0
284	adc	\$0,%rdx
285	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
286	mov	%rdx,$hi1
287
288	xor	%rdx,%rdx
289	add	$hi0,$hi1
290	adc	\$0,%rdx
291	add	$lo0,$hi1		# pull upmost overflow bit
292	adc	\$0,%rdx
293	mov	$hi1,-8(%rsp,$num,8)
294	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
295
296	lea	1($i),$i		# i++
297	cmp	$num,$i
298	jb	.Louter
299
300	xor	$i,$i			# i=0 and clear CF!
301	mov	(%rsp),%rax		# tp[0]
302	mov	$num,$j			# j=num
303
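	# Final reduction: subtract the modulus from tp with borrow propagation,
	# writing the difference to rp. Together with the overflow word tp[num],
	# the final borrow tells whether tp was already smaller than np.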
304.align	16
305.Lsub:	sbb	($np,$i,8),%rax
306	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
307	mov	8(%rsp,$i,8),%rax	# tp[i+1]
308	lea	1($i),$i		# i++
309	dec	$j			# doesn't affect CF!
310	jnz	.Lsub
311
312	sbb	\$0,%rax		# handle upmost overflow bit
313	mov	\$-1,%rbx
314	xor	%rax,%rbx		# not %rax
315	xor	$i,$i
316	mov	$num,$j			# j=num
317
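	# After the sbb above, %rax is 0 if tp >= np (keep the difference already
	# in rp) and all-ones if tp < np (keep tp); %rbx holds its complement.
	# The masked or below selects one of the two without a secret-dependent
	# branch, and the temporary vector is overwritten as it is read.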
318.Lcopy:					# conditional copy
319	mov	($rp,$i,8),%rcx
320	mov	(%rsp,$i,8),%rdx
321	and	%rbx,%rcx
322	and	%rax,%rdx
323	mov	$num,(%rsp,$i,8)	# zap temporary vector
324	or	%rcx,%rdx
325	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
326	lea	1($i),$i
327	sub	\$1,$j
328	jnz	.Lcopy
329
330	mov	8(%rsp,$num,8),%rsi	# restore %rsp
331.cfi_def_cfa	%rsi,8
332	mov	\$1,%rax
333	mov	-48(%rsi),%r15
334.cfi_restore	%r15
335	mov	-40(%rsi),%r14
336.cfi_restore	%r14
337	mov	-32(%rsi),%r13
338.cfi_restore	%r13
339	mov	-24(%rsi),%r12
340.cfi_restore	%r12
341	mov	-16(%rsi),%rbp
342.cfi_restore	%rbp
343	mov	-8(%rsi),%rbx
344.cfi_restore	%rbx
345	lea	(%rsi),%rsp
346.cfi_def_cfa_register	%rsp
347.Lmul_epilogue:
348	ret
349.cfi_endproc
350.size	bn_mul_mont,.-bn_mul_mont
351___
352{{{
353my @A=("%r10","%r11");
354my @N=("%r13","%rdi");
355$code.=<<___;
356.type	bn_mul4x_mont,\@function,6
357.align	16
358bn_mul4x_mont:
359.cfi_startproc
360	mov	${num}d,${num}d
361	mov	%rsp,%rax
362.cfi_def_cfa_register	%rax
363.Lmul4x_enter:
364___
365$code.=<<___ if ($addx);
366	and	\$0x80100,%r11d
367	cmp	\$0x80100,%r11d
368	je	.Lmulx4x_enter
369___
370$code.=<<___;
371	push	%rbx
372.cfi_push	%rbx
373	push	%rbp
374.cfi_push	%rbp
375	push	%r12
376.cfi_push	%r12
377	push	%r13
378.cfi_push	%r13
379	push	%r14
380.cfi_push	%r14
381	push	%r15
382.cfi_push	%r15
383
384	neg	$num
385	mov	%rsp,%r11
386	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
387	neg	$num			# restore
388	and	\$-1024,%r10		# minimize TLB usage
389
390	sub	%r10,%r11
391	and	\$-4096,%r11
392	lea	(%r10,%r11),%rsp
393	mov	(%rsp),%r11
394	cmp	%r10,%rsp
395	ja	.Lmul4x_page_walk
396	jmp	.Lmul4x_page_walk_done
397
398.Lmul4x_page_walk:
399	lea	-4096(%rsp),%rsp
400	mov	(%rsp),%r11
401	cmp	%r10,%rsp
402	ja	.Lmul4x_page_walk
403.Lmul4x_page_walk_done:
404
405	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
406.cfi_cfa_expression	%rsp+8,$num,8,mul,plus,deref,+8
407.Lmul4x_body:
408	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
409	mov	%rdx,%r12		# reassign $bp
410___
411		$bp="%r12";
412$code.=<<___;
413	mov	($n0),$n0		# pull n0[0] value
414	mov	($bp),$m0		# m0=bp[0]
415	mov	($ap),%rax
416
417	xor	$i,$i			# i=0
418	xor	$j,$j			# j=0
419
420	mov	$n0,$m1
421	mulq	$m0			# ap[0]*bp[0]
422	mov	%rax,$A[0]
423	mov	($np),%rax
424
425	imulq	$A[0],$m1		# "tp[0]"*n0
426	mov	%rdx,$A[1]
427
428	mulq	$m1			# np[0]*m1
429	add	%rax,$A[0]		# discarded
430	mov	8($ap),%rax
431	adc	\$0,%rdx
432	mov	%rdx,$N[1]
433
434	mulq	$m0
435	add	%rax,$A[1]
436	mov	8($np),%rax
437	adc	\$0,%rdx
438	mov	%rdx,$A[0]
439
440	mulq	$m1
441	add	%rax,$N[1]
442	mov	16($ap),%rax
443	adc	\$0,%rdx
444	add	$A[1],$N[1]
445	lea	4($j),$j		# j++
446	adc	\$0,%rdx
447	mov	$N[1],(%rsp)
448	mov	%rdx,$N[0]
449	jmp	.L1st4x
450.align	16
451.L1st4x:
452	mulq	$m0			# ap[j]*bp[0]
453	add	%rax,$A[0]
454	mov	-16($np,$j,8),%rax
455	adc	\$0,%rdx
456	mov	%rdx,$A[1]
457
458	mulq	$m1			# np[j]*m1
459	add	%rax,$N[0]
460	mov	-8($ap,$j,8),%rax
461	adc	\$0,%rdx
462	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
463	adc	\$0,%rdx
464	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
465	mov	%rdx,$N[1]
466
467	mulq	$m0			# ap[j]*bp[0]
468	add	%rax,$A[1]
469	mov	-8($np,$j,8),%rax
470	adc	\$0,%rdx
471	mov	%rdx,$A[0]
472
473	mulq	$m1			# np[j]*m1
474	add	%rax,$N[1]
475	mov	($ap,$j,8),%rax
476	adc	\$0,%rdx
477	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
478	adc	\$0,%rdx
479	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
480	mov	%rdx,$N[0]
481
482	mulq	$m0			# ap[j]*bp[0]
483	add	%rax,$A[0]
484	mov	($np,$j,8),%rax
485	adc	\$0,%rdx
486	mov	%rdx,$A[1]
487
488	mulq	$m1			# np[j]*m1
489	add	%rax,$N[0]
490	mov	8($ap,$j,8),%rax
491	adc	\$0,%rdx
492	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
493	adc	\$0,%rdx
494	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
495	mov	%rdx,$N[1]
496
497	mulq	$m0			# ap[j]*bp[0]
498	add	%rax,$A[1]
499	mov	8($np,$j,8),%rax
500	adc	\$0,%rdx
501	lea	4($j),$j		# j++
502	mov	%rdx,$A[0]
503
504	mulq	$m1			# np[j]*m1
505	add	%rax,$N[1]
506	mov	-16($ap,$j,8),%rax
507	adc	\$0,%rdx
508	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
509	adc	\$0,%rdx
510	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
511	mov	%rdx,$N[0]
512	cmp	$num,$j
513	jb	.L1st4x
514
515	mulq	$m0			# ap[j]*bp[0]
516	add	%rax,$A[0]
517	mov	-16($np,$j,8),%rax
518	adc	\$0,%rdx
519	mov	%rdx,$A[1]
520
521	mulq	$m1			# np[j]*m1
522	add	%rax,$N[0]
523	mov	-8($ap,$j,8),%rax
524	adc	\$0,%rdx
525	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
526	adc	\$0,%rdx
527	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
528	mov	%rdx,$N[1]
529
530	mulq	$m0			# ap[j]*bp[0]
531	add	%rax,$A[1]
532	mov	-8($np,$j,8),%rax
533	adc	\$0,%rdx
534	mov	%rdx,$A[0]
535
536	mulq	$m1			# np[j]*m1
537	add	%rax,$N[1]
538	mov	($ap),%rax		# ap[0]
539	adc	\$0,%rdx
540	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
541	adc	\$0,%rdx
542	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
543	mov	%rdx,$N[0]
544
545	xor	$N[1],$N[1]
546	add	$A[0],$N[0]
547	adc	\$0,$N[1]
548	mov	$N[0],-8(%rsp,$j,8)
549	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
550
551	lea	1($i),$i		# i++
552.align	4
553.Louter4x:
554	mov	($bp,$i,8),$m0		# m0=bp[i]
555	xor	$j,$j			# j=0
556	mov	(%rsp),$A[0]
557	mov	$n0,$m1
558	mulq	$m0			# ap[0]*bp[i]
559	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
560	mov	($np),%rax
561	adc	\$0,%rdx
562
563	imulq	$A[0],$m1		# tp[0]*n0
564	mov	%rdx,$A[1]
565
566	mulq	$m1			# np[0]*m1
567	add	%rax,$A[0]		# "$N[0]", discarded
568	mov	8($ap),%rax
569	adc	\$0,%rdx
570	mov	%rdx,$N[1]
571
572	mulq	$m0			# ap[j]*bp[i]
573	add	%rax,$A[1]
574	mov	8($np),%rax
575	adc	\$0,%rdx
576	add	8(%rsp),$A[1]		# +tp[1]
577	adc	\$0,%rdx
578	mov	%rdx,$A[0]
579
580	mulq	$m1			# np[j]*m1
581	add	%rax,$N[1]
582	mov	16($ap),%rax
583	adc	\$0,%rdx
584	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
585	lea	4($j),$j		# j+=2
586	adc	\$0,%rdx
587	mov	$N[1],(%rsp)		# tp[j-1]
588	mov	%rdx,$N[0]
589	jmp	.Linner4x
590.align	16
591.Linner4x:
592	mulq	$m0			# ap[j]*bp[i]
593	add	%rax,$A[0]
594	mov	-16($np,$j,8),%rax
595	adc	\$0,%rdx
596	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
597	adc	\$0,%rdx
598	mov	%rdx,$A[1]
599
600	mulq	$m1			# np[j]*m1
601	add	%rax,$N[0]
602	mov	-8($ap,$j,8),%rax
603	adc	\$0,%rdx
604	add	$A[0],$N[0]
605	adc	\$0,%rdx
606	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
607	mov	%rdx,$N[1]
608
609	mulq	$m0			# ap[j]*bp[i]
610	add	%rax,$A[1]
611	mov	-8($np,$j,8),%rax
612	adc	\$0,%rdx
613	add	-8(%rsp,$j,8),$A[1]
614	adc	\$0,%rdx
615	mov	%rdx,$A[0]
616
617	mulq	$m1			# np[j]*m1
618	add	%rax,$N[1]
619	mov	($ap,$j,8),%rax
620	adc	\$0,%rdx
621	add	$A[1],$N[1]
622	adc	\$0,%rdx
623	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
624	mov	%rdx,$N[0]
625
626	mulq	$m0			# ap[j]*bp[i]
627	add	%rax,$A[0]
628	mov	($np,$j,8),%rax
629	adc	\$0,%rdx
630	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
631	adc	\$0,%rdx
632	mov	%rdx,$A[1]
633
634	mulq	$m1			# np[j]*m1
635	add	%rax,$N[0]
636	mov	8($ap,$j,8),%rax
637	adc	\$0,%rdx
638	add	$A[0],$N[0]
639	adc	\$0,%rdx
640	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
641	mov	%rdx,$N[1]
642
643	mulq	$m0			# ap[j]*bp[i]
644	add	%rax,$A[1]
645	mov	8($np,$j,8),%rax
646	adc	\$0,%rdx
647	add	8(%rsp,$j,8),$A[1]
648	adc	\$0,%rdx
649	lea	4($j),$j		# j++
650	mov	%rdx,$A[0]
651
652	mulq	$m1			# np[j]*m1
653	add	%rax,$N[1]
654	mov	-16($ap,$j,8),%rax
655	adc	\$0,%rdx
656	add	$A[1],$N[1]
657	adc	\$0,%rdx
658	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
659	mov	%rdx,$N[0]
660	cmp	$num,$j
661	jb	.Linner4x
662
663	mulq	$m0			# ap[j]*bp[i]
664	add	%rax,$A[0]
665	mov	-16($np,$j,8),%rax
666	adc	\$0,%rdx
667	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
668	adc	\$0,%rdx
669	mov	%rdx,$A[1]
670
671	mulq	$m1			# np[j]*m1
672	add	%rax,$N[0]
673	mov	-8($ap,$j,8),%rax
674	adc	\$0,%rdx
675	add	$A[0],$N[0]
676	adc	\$0,%rdx
677	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
678	mov	%rdx,$N[1]
679
680	mulq	$m0			# ap[j]*bp[i]
681	add	%rax,$A[1]
682	mov	-8($np,$j,8),%rax
683	adc	\$0,%rdx
684	add	-8(%rsp,$j,8),$A[1]
685	adc	\$0,%rdx
686	lea	1($i),$i		# i++
687	mov	%rdx,$A[0]
688
689	mulq	$m1			# np[j]*m1
690	add	%rax,$N[1]
691	mov	($ap),%rax		# ap[0]
692	adc	\$0,%rdx
693	add	$A[1],$N[1]
694	adc	\$0,%rdx
695	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
696	mov	%rdx,$N[0]
697
698	xor	$N[1],$N[1]
699	add	$A[0],$N[0]
700	adc	\$0,$N[1]
701	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
702	adc	\$0,$N[1]
703	mov	$N[0],-8(%rsp,$j,8)
704	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
705
706	cmp	$num,$i
707	jb	.Louter4x
708___
709{
710my @ri=("%rax","%rdx",$m0,$m1);
711$code.=<<___;
712	mov	16(%rsp,$num,8),$rp	# restore $rp
713	lea	-4($num),$j
714	mov	0(%rsp),@ri[0]		# tp[0]
715	mov	8(%rsp),@ri[1]		# tp[1]
716	shr	\$2,$j			# j=num/4-1
717	lea	(%rsp),$ap		# borrow ap for tp
718	xor	$i,$i			# i=0 and clear CF!
719
720	sub	0($np),@ri[0]
721	mov	16($ap),@ri[2]		# tp[2]
722	mov	24($ap),@ri[3]		# tp[3]
723	sbb	8($np),@ri[1]
724
725.Lsub4x:
726	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
727	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
728	sbb	16($np,$i,8),@ri[2]
729	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
730	mov	40($ap,$i,8),@ri[1]
731	sbb	24($np,$i,8),@ri[3]
732	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
733	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
734	sbb	32($np,$i,8),@ri[0]
735	mov	48($ap,$i,8),@ri[2]
736	mov	56($ap,$i,8),@ri[3]
737	sbb	40($np,$i,8),@ri[1]
738	lea	4($i),$i		# i++
739	dec	$j			# doesn't affect CF!
740	jnz	.Lsub4x
741
742	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
743	mov	32($ap,$i,8),@ri[0]	# load overflow bit
744	sbb	16($np,$i,8),@ri[2]
745	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
746	sbb	24($np,$i,8),@ri[3]
747	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
748
749	sbb	\$0,@ri[0]		# handle upmost overflow bit
750	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
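	# Broadcast the borrow into %xmm4 (all-ones if tp < np, zero otherwise)
	# and put its complement in %xmm5; the SSE2 loop below then selects either
	# tp or the difference already stored in rp, word by word, without a
	# secret-dependent branch.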
751	pxor	%xmm0,%xmm0
752	movq	@ri[0],%xmm4
753	pcmpeqd	%xmm5,%xmm5
754	pshufd	\$0,%xmm4,%xmm4
755	mov	$num,$j
756	pxor	%xmm4,%xmm5
757	shr	\$2,$j			# j=num/4
758	xor	%eax,%eax		# i=0
759
760	jmp	.Lcopy4x
761.align	16
762.Lcopy4x:				# conditional copy
763	movdqa	(%rsp,%rax),%xmm1
764	movdqu	($rp,%rax),%xmm2
765	pand	%xmm4,%xmm1
766	pand	%xmm5,%xmm2
767	movdqa	16(%rsp,%rax),%xmm3
768	movdqa	%xmm0,(%rsp,%rax)
769	por	%xmm2,%xmm1
770	movdqu	16($rp,%rax),%xmm2
771	movdqu	%xmm1,($rp,%rax)
772	pand	%xmm4,%xmm3
773	pand	%xmm5,%xmm2
774	movdqa	%xmm0,16(%rsp,%rax)
775	por	%xmm2,%xmm3
776	movdqu	%xmm3,16($rp,%rax)
777	lea	32(%rax),%rax
778	dec	$j
779	jnz	.Lcopy4x
780___
781}
782$code.=<<___;
783	mov	8(%rsp,$num,8),%rsi	# restore %rsp
784.cfi_def_cfa	%rsi, 8
785	mov	\$1,%rax
786	mov	-48(%rsi),%r15
787.cfi_restore	%r15
788	mov	-40(%rsi),%r14
789.cfi_restore	%r14
790	mov	-32(%rsi),%r13
791.cfi_restore	%r13
792	mov	-24(%rsi),%r12
793.cfi_restore	%r12
794	mov	-16(%rsi),%rbp
795.cfi_restore	%rbp
796	mov	-8(%rsi),%rbx
797.cfi_restore	%rbx
798	lea	(%rsi),%rsp
799.cfi_def_cfa_register	%rsp
800.Lmul4x_epilogue:
801	ret
802.cfi_endproc
803.size	bn_mul4x_mont,.-bn_mul4x_mont
804___
805}}}
806{{{
807######################################################################
808# void bn_sqr8x_mont(
809my $rptr="%rdi";	# const BN_ULONG *rptr,
810my $aptr="%rsi";	# const BN_ULONG *aptr,
811my $bptr="%rdx";	# not used
812my $nptr="%rcx";	# const BN_ULONG *nptr,
813my $n0  ="%r8";		# const BN_ULONG *n0);
814my $num ="%r9";		# int num, has to be divisible by 8
815
816my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
817my @A0=("%r10","%r11");
818my @A1=("%r12","%r13");
819my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
820
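# The squaring path computes the same Montgomery product for the ap == bp case.
# The multiply-and-reduce work is done by bn_sqr8x_internal (or bn_sqrx8x_internal
# on ADX-capable processors) from the x86_64-mont5 module; the wrapper below only
# sets up the stack frame and performs the final constant-time subtraction and
# conditional copy.
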
821$code.=<<___	if ($addx);
822.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
823___
824$code.=<<___;
825.extern	bn_sqr8x_internal		# see x86_64-mont5 module
826
827.type	bn_sqr8x_mont,\@function,6
828.align	32
829bn_sqr8x_mont:
830.cfi_startproc
831	mov	%rsp,%rax
832.cfi_def_cfa_register	%rax
833.Lsqr8x_enter:
834	push	%rbx
835.cfi_push	%rbx
836	push	%rbp
837.cfi_push	%rbp
838	push	%r12
839.cfi_push	%r12
840	push	%r13
841.cfi_push	%r13
842	push	%r14
843.cfi_push	%r14
844	push	%r15
845.cfi_push	%r15
846.Lsqr8x_prologue:
847
848	mov	${num}d,%r10d
849	shl	\$3,${num}d		# convert $num to bytes
850	shl	\$3+2,%r10		# 4*$num
851	neg	$num
852
853	##############################################################
854	# Ensure that the stack frame doesn't alias with $aptr modulo
855	# 4096. This is done to allow the memory disambiguation logic
856	# to do its job.
857	#
858	lea	-64(%rsp,$num,2),%r11
859	mov	%rsp,%rbp
860	mov	($n0),$n0		# *n0
861	sub	$aptr,%r11
862	and	\$4095,%r11
863	cmp	%r11,%r10
864	jb	.Lsqr8x_sp_alt
865	sub	%r11,%rbp		# align with $aptr
866	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
867	jmp	.Lsqr8x_sp_done
868
869.align	32
870.Lsqr8x_sp_alt:
871	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
872	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
873	sub	%r10,%r11
874	mov	\$0,%r10
875	cmovc	%r10,%r11
876	sub	%r11,%rbp
877.Lsqr8x_sp_done:
878	and	\$-64,%rbp
879	mov	%rsp,%r11
880	sub	%rbp,%r11
881	and	\$-4096,%r11
882	lea	(%rbp,%r11),%rsp
883	mov	(%rsp),%r10
884	cmp	%rbp,%rsp
885	ja	.Lsqr8x_page_walk
886	jmp	.Lsqr8x_page_walk_done
887
888.align	16
889.Lsqr8x_page_walk:
890	lea	-4096(%rsp),%rsp
891	mov	(%rsp),%r10
892	cmp	%rbp,%rsp
893	ja	.Lsqr8x_page_walk
894.Lsqr8x_page_walk_done:
895
896	mov	$num,%r10
897	neg	$num
898
899	mov	$n0,  32(%rsp)
900	mov	%rax, 40(%rsp)		# save original %rsp
901.cfi_cfa_expression	%rsp+40,deref,+8
902.Lsqr8x_body:
903
904	movq	$nptr, %xmm2		# save pointer to modulus
905	pxor	%xmm0,%xmm0
906	movq	$rptr,%xmm1		# save $rptr
907	movq	%r10, %xmm3		# -$num
908___
909$code.=<<___ if ($addx);
910	leaq	OPENSSL_ia32cap_P(%rip),%rax
911	mov	8(%rax),%eax
912	and	\$0x80100,%eax
913	cmp	\$0x80100,%eax
914	jne	.Lsqr8x_nox
915
916	call	bn_sqrx8x_internal	# see x86_64-mont5 module
917					# %rax	top-most carry
918					# %rbp	nptr
919					# %rcx	-8*num
920					# %r8	end of tp[2*num]
921	lea	(%r8,%rcx),%rbx
922	mov	%rcx,$num
923	mov	%rcx,%rdx
924	movq	%xmm1,$rptr
925	sar	\$3+2,%rcx		# %cf=0
926	jmp	.Lsqr8x_sub
927
928.align	32
929.Lsqr8x_nox:
930___
931$code.=<<___;
932	call	bn_sqr8x_internal	# see x86_64-mont5 module
933					# %rax	top-most carry
934					# %rbp	nptr
935					# %r8	-8*num
936					# %rdi	end of tp[2*num]
937	lea	(%rdi,$num),%rbx
938	mov	$num,%rcx
939	mov	$num,%rdx
940	movq	%xmm1,$rptr
941	sar	\$3+2,%rcx		# %cf=0
942	jmp	.Lsqr8x_sub
943
944.align	32
945.Lsqr8x_sub:
946	mov	8*0(%rbx),%r12
947	mov	8*1(%rbx),%r13
948	mov	8*2(%rbx),%r14
949	mov	8*3(%rbx),%r15
950	lea	8*4(%rbx),%rbx
951	sbb	8*0(%rbp),%r12
952	sbb	8*1(%rbp),%r13
953	sbb	8*2(%rbp),%r14
954	sbb	8*3(%rbp),%r15
955	lea	8*4(%rbp),%rbp
956	mov	%r12,8*0($rptr)
957	mov	%r13,8*1($rptr)
958	mov	%r14,8*2($rptr)
959	mov	%r15,8*3($rptr)
960	lea	8*4($rptr),$rptr
961	inc	%rcx			# preserves %cf
962	jnz	.Lsqr8x_sub
963
964	sbb	\$0,%rax		# top-most carry
965	lea	(%rbx,$num),%rbx	# rewind
966	lea	($rptr,$num),$rptr	# rewind
967
968	movq	%rax,%xmm1
969	pxor	%xmm0,%xmm0
970	pshufd	\$0,%xmm1,%xmm1
971	mov	40(%rsp),%rsi		# restore %rsp
972.cfi_def_cfa	%rsi,8
973	jmp	.Lsqr8x_cond_copy
974
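	# %xmm1 is the broadcast borrow mask (all-ones means tp < np, so tp is
	# kept); pcmpeqd in the loop derives its complement in %xmm0, and each
	# 16-byte chunk of rp receives either tp or the subtracted value while
	# tp is zeroed as it is consumed.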
975.align	32
976.Lsqr8x_cond_copy:
977	movdqa	16*0(%rbx),%xmm2
978	movdqa	16*1(%rbx),%xmm3
979	lea	16*2(%rbx),%rbx
980	movdqu	16*0($rptr),%xmm4
981	movdqu	16*1($rptr),%xmm5
982	lea	16*2($rptr),$rptr
983	movdqa	%xmm0,-16*2(%rbx)	# zero tp
984	movdqa	%xmm0,-16*1(%rbx)
985	movdqa	%xmm0,-16*2(%rbx,%rdx)
986	movdqa	%xmm0,-16*1(%rbx,%rdx)
987	pcmpeqd	%xmm1,%xmm0
988	pand	%xmm1,%xmm2
989	pand	%xmm1,%xmm3
990	pand	%xmm0,%xmm4
991	pand	%xmm0,%xmm5
992	pxor	%xmm0,%xmm0
993	por	%xmm2,%xmm4
994	por	%xmm3,%xmm5
995	movdqu	%xmm4,-16*2($rptr)
996	movdqu	%xmm5,-16*1($rptr)
997	add	\$32,$num
998	jnz	.Lsqr8x_cond_copy
999
1000	mov	\$1,%rax
1001	mov	-48(%rsi),%r15
1002.cfi_restore	%r15
1003	mov	-40(%rsi),%r14
1004.cfi_restore	%r14
1005	mov	-32(%rsi),%r13
1006.cfi_restore	%r13
1007	mov	-24(%rsi),%r12
1008.cfi_restore	%r12
1009	mov	-16(%rsi),%rbp
1010.cfi_restore	%rbp
1011	mov	-8(%rsi),%rbx
1012.cfi_restore	%rbx
1013	lea	(%rsi),%rsp
1014.cfi_def_cfa_register	%rsp
1015.Lsqr8x_epilogue:
1016	ret
1017.cfi_endproc
1018.size	bn_sqr8x_mont,.-bn_sqr8x_mont
1019___
1020}}}
1021
1022if ($addx) {{{
1023my $bp="%rdx";	# original value
1024
1025$code.=<<___;
1026.type	bn_mulx4x_mont,\@function,6
1027.align	32
1028bn_mulx4x_mont:
1029.cfi_startproc
1030	mov	%rsp,%rax
1031.cfi_def_cfa_register	%rax
1032.Lmulx4x_enter:
1033	push	%rbx
1034.cfi_push	%rbx
1035	push	%rbp
1036.cfi_push	%rbp
1037	push	%r12
1038.cfi_push	%r12
1039	push	%r13
1040.cfi_push	%r13
1041	push	%r14
1042.cfi_push	%r14
1043	push	%r15
1044.cfi_push	%r15
1045.Lmulx4x_prologue:
1046
1047	shl	\$3,${num}d		# convert $num to bytes
1048	xor	%r10,%r10
1049	sub	$num,%r10		# -$num
1050	mov	($n0),$n0		# *n0
1051	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
1052	and	\$-128,%rbp
1053	mov	%rsp,%r11
1054	sub	%rbp,%r11
1055	and	\$-4096,%r11
1056	lea	(%rbp,%r11),%rsp
1057	mov	(%rsp),%r10
1058	cmp	%rbp,%rsp
1059	ja	.Lmulx4x_page_walk
1060	jmp	.Lmulx4x_page_walk_done
1061
1062.align	16
1063.Lmulx4x_page_walk:
1064	lea	-4096(%rsp),%rsp
1065	mov	(%rsp),%r10
1066	cmp	%rbp,%rsp
1067	ja	.Lmulx4x_page_walk
1068.Lmulx4x_page_walk_done:
1069
1070	lea	($bp,$num),%r10
1071	##############################################################
1072	# Stack layout
1073	# +0	num
1074	# +8	off-loaded &b[i]
1075	# +16	end of b[num]
1076	# +24	saved n0
1077	# +32	saved rp
1078	# +40	saved %rsp
1079	# +48	inner counter
1080	# +56
1081	# +64	tmp[num+1]
1082	#
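	# $num is in bytes at this point; the inner counter stored at 48(%rsp)
	# below is num/4-1 in words, i.e. the number of 4-limb passes each inner
	# loop still has to make after its peeled-off first iteration.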
1083	mov	$num,0(%rsp)		# save $num
1084	shr	\$5,$num
1085	mov	%r10,16(%rsp)		# end of b[num]
1086	sub	\$1,$num
1087	mov	$n0, 24(%rsp)		# save *n0
1088	mov	$rp, 32(%rsp)		# save $rp
1089	mov	%rax,40(%rsp)		# save original %rsp
1090.cfi_cfa_expression	%rsp+40,deref,+8
1091	mov	$num,48(%rsp)		# inner counter
1092	jmp	.Lmulx4x_body
1093
1094.align	32
1095.Lmulx4x_body:
1096___
1097my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
1098   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
1099my $rptr=$bptr;
1100$code.=<<___;
1101	lea	8($bp),$bptr
1102	mov	($bp),%rdx		# b[0], $bp==%rdx actually
1103	lea	64+32(%rsp),$tptr
1104	mov	%rdx,$bi
1105
1106	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
1107	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
1108	add	%rax,%r11
1109	mov	$bptr,8(%rsp)		# off-load &b[i]
1110	mulx	2*8($aptr),%r12,%r13	# ...
1111	adc	%r14,%r12
1112	adc	\$0,%r13
1113
1114	mov	$mi,$bptr		# borrow $bptr
1115	imulq	24(%rsp),$mi		# "t[0]"*n0
1116	xor	$zero,$zero		# cf=0, of=0
1117
1118	mulx	3*8($aptr),%rax,%r14
1119	 mov	$mi,%rdx
1120	lea	4*8($aptr),$aptr
1121	adcx	%rax,%r13
1122	adcx	$zero,%r14		# cf=0
1123
1124	mulx	0*8($nptr),%rax,%r10
1125	adcx	%rax,$bptr		# discarded
1126	adox	%r11,%r10
1127	mulx	1*8($nptr),%rax,%r11
1128	adcx	%rax,%r10
1129	adox	%r12,%r11
1130	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
1131	mov	48(%rsp),$bptr		# counter value
1132	mov	%r10,-4*8($tptr)
1133	adcx	%rax,%r11
1134	adox	%r13,%r12
1135	mulx	3*8($nptr),%rax,%r15
1136	 mov	$bi,%rdx
1137	mov	%r11,-3*8($tptr)
1138	adcx	%rax,%r12
1139	adox	$zero,%r15		# of=0
1140	lea	4*8($nptr),$nptr
1141	mov	%r12,-2*8($tptr)
1142
1143	jmp	.Lmulx4x_1st
1144
1145.align	32
1146.Lmulx4x_1st:
1147	adcx	$zero,%r15		# cf=0, modulo-scheduled
1148	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
1149	adcx	%r14,%r10
1150	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
1151	adcx	%rax,%r11
1152	mulx	2*8($aptr),%r12,%rax	# ...
1153	adcx	%r14,%r12
1154	mulx	3*8($aptr),%r13,%r14
1155	 .byte	0x67,0x67
1156	 mov	$mi,%rdx
1157	adcx	%rax,%r13
1158	adcx	$zero,%r14		# cf=0
1159	lea	4*8($aptr),$aptr
1160	lea	4*8($tptr),$tptr
1161
1162	adox	%r15,%r10
1163	mulx	0*8($nptr),%rax,%r15
1164	adcx	%rax,%r10
1165	adox	%r15,%r11
1166	mulx	1*8($nptr),%rax,%r15
1167	adcx	%rax,%r11
1168	adox	%r15,%r12
1169	mulx	2*8($nptr),%rax,%r15
1170	mov	%r10,-5*8($tptr)
1171	adcx	%rax,%r12
1172	mov	%r11,-4*8($tptr)
1173	adox	%r15,%r13
1174	mulx	3*8($nptr),%rax,%r15
1175	 mov	$bi,%rdx
1176	mov	%r12,-3*8($tptr)
1177	adcx	%rax,%r13
1178	adox	$zero,%r15
1179	lea	4*8($nptr),$nptr
1180	mov	%r13,-2*8($tptr)
1181
1182	dec	$bptr			# of=0, pass cf
1183	jnz	.Lmulx4x_1st
1184
1185	mov	0(%rsp),$num		# load num
1186	mov	8(%rsp),$bptr		# re-load &b[i]
1187	adc	$zero,%r15		# modulo-scheduled
1188	add	%r15,%r14
1189	sbb	%r15,%r15		# top-most carry
1190	mov	%r14,-1*8($tptr)
1191	jmp	.Lmulx4x_outer
1192
1193.align	32
1194.Lmulx4x_outer:
1195	mov	($bptr),%rdx		# b[i]
1196	lea	8($bptr),$bptr		# b++
1197	sub	$num,$aptr		# rewind $aptr
1198	mov	%r15,($tptr)		# save top-most carry
1199	lea	64+4*8(%rsp),$tptr
1200	sub	$num,$nptr		# rewind $nptr
1201
1202	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
1203	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
1204	mov	%rdx,$bi
1205	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
1206	adox	-4*8($tptr),$mi
1207	adcx	%r14,%r11
1208	mulx	2*8($aptr),%r15,%r13	# ...
1209	adox	-3*8($tptr),%r11
1210	adcx	%r15,%r12
1211	adox	-2*8($tptr),%r12
1212	adcx	$zero,%r13
1213	adox	$zero,%r13
1214
1215	mov	$bptr,8(%rsp)		# off-load &b[i]
1216	mov	$mi,%r15
1217	imulq	24(%rsp),$mi		# "t[0]"*n0
1218	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
1219
1220	mulx	3*8($aptr),%rax,%r14
1221	 mov	$mi,%rdx
1222	adcx	%rax,%r13
1223	adox	-1*8($tptr),%r13
1224	adcx	$zero,%r14
1225	lea	4*8($aptr),$aptr
1226	adox	$zero,%r14
1227
1228	mulx	0*8($nptr),%rax,%r10
1229	adcx	%rax,%r15		# discarded
1230	adox	%r11,%r10
1231	mulx	1*8($nptr),%rax,%r11
1232	adcx	%rax,%r10
1233	adox	%r12,%r11
1234	mulx	2*8($nptr),%rax,%r12
1235	mov	%r10,-4*8($tptr)
1236	adcx	%rax,%r11
1237	adox	%r13,%r12
1238	mulx	3*8($nptr),%rax,%r15
1239	 mov	$bi,%rdx
1240	mov	%r11,-3*8($tptr)
1241	lea	4*8($nptr),$nptr
1242	adcx	%rax,%r12
1243	adox	$zero,%r15		# of=0
1244	mov	48(%rsp),$bptr		# counter value
1245	mov	%r12,-2*8($tptr)
1246
1247	jmp	.Lmulx4x_inner
1248
1249.align	32
1250.Lmulx4x_inner:
1251	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
1252	adcx	$zero,%r15		# cf=0, modulo-scheduled
1253	adox	%r14,%r10
1254	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
1255	adcx	0*8($tptr),%r10
1256	adox	%rax,%r11
1257	mulx	2*8($aptr),%r12,%rax	# ...
1258	adcx	1*8($tptr),%r11
1259	adox	%r14,%r12
1260	mulx	3*8($aptr),%r13,%r14
1261	 mov	$mi,%rdx
1262	adcx	2*8($tptr),%r12
1263	adox	%rax,%r13
1264	adcx	3*8($tptr),%r13
1265	adox	$zero,%r14		# of=0
1266	lea	4*8($aptr),$aptr
1267	lea	4*8($tptr),$tptr
1268	adcx	$zero,%r14		# cf=0
1269
1270	adox	%r15,%r10
1271	mulx	0*8($nptr),%rax,%r15
1272	adcx	%rax,%r10
1273	adox	%r15,%r11
1274	mulx	1*8($nptr),%rax,%r15
1275	adcx	%rax,%r11
1276	adox	%r15,%r12
1277	mulx	2*8($nptr),%rax,%r15
1278	mov	%r10,-5*8($tptr)
1279	adcx	%rax,%r12
1280	adox	%r15,%r13
1281	mulx	3*8($nptr),%rax,%r15
1282	 mov	$bi,%rdx
1283	mov	%r11,-4*8($tptr)
1284	mov	%r12,-3*8($tptr)
1285	adcx	%rax,%r13
1286	adox	$zero,%r15
1287	lea	4*8($nptr),$nptr
1288	mov	%r13,-2*8($tptr)
1289
1290	dec	$bptr			# of=0, pass cf
1291	jnz	.Lmulx4x_inner
1292
1293	mov	0(%rsp),$num		# load num
1294	mov	8(%rsp),$bptr		# re-load &b[i]
1295	adc	$zero,%r15		# modulo-scheduled
1296	sub	0*8($tptr),$zero	# pull top-most carry
1297	adc	%r15,%r14
1298	sbb	%r15,%r15		# top-most carry
1299	mov	%r14,-1*8($tptr)
1300
1301	cmp	16(%rsp),$bptr
1302	jne	.Lmulx4x_outer
1303
1304	lea	64(%rsp),$tptr
1305	sub	$num,$nptr		# rewind $nptr
1306	neg	%r15
1307	mov	$num,%rdx
1308	shr	\$3+2,$num		# %cf=0
1309	mov	32(%rsp),$rptr		# restore rp
1310	jmp	.Lmulx4x_sub
1311
1312.align	32
1313.Lmulx4x_sub:
1314	mov	8*0($tptr),%r11
1315	mov	8*1($tptr),%r12
1316	mov	8*2($tptr),%r13
1317	mov	8*3($tptr),%r14
1318	lea	8*4($tptr),$tptr
1319	sbb	8*0($nptr),%r11
1320	sbb	8*1($nptr),%r12
1321	sbb	8*2($nptr),%r13
1322	sbb	8*3($nptr),%r14
1323	lea	8*4($nptr),$nptr
1324	mov	%r11,8*0($rptr)
1325	mov	%r12,8*1($rptr)
1326	mov	%r13,8*2($rptr)
1327	mov	%r14,8*3($rptr)
1328	lea	8*4($rptr),$rptr
1329	dec	$num			# preserves %cf
1330	jnz	.Lmulx4x_sub
1331
1332	sbb	\$0,%r15		# top-most carry
1333	lea	64(%rsp),$tptr
1334	sub	%rdx,$rptr		# rewind
1335
1336	movq	%r15,%xmm1
1337	pxor	%xmm0,%xmm0
1338	pshufd	\$0,%xmm1,%xmm1
1339	mov	40(%rsp),%rsi		# restore %rsp
1340.cfi_def_cfa	%rsi,8
1341	jmp	.Lmulx4x_cond_copy
1342
1343.align	32
1344.Lmulx4x_cond_copy:
1345	movdqa	16*0($tptr),%xmm2
1346	movdqa	16*1($tptr),%xmm3
1347	lea	16*2($tptr),$tptr
1348	movdqu	16*0($rptr),%xmm4
1349	movdqu	16*1($rptr),%xmm5
1350	lea	16*2($rptr),$rptr
1351	movdqa	%xmm0,-16*2($tptr)	# zero tp
1352	movdqa	%xmm0,-16*1($tptr)
1353	pcmpeqd	%xmm1,%xmm0
1354	pand	%xmm1,%xmm2
1355	pand	%xmm1,%xmm3
1356	pand	%xmm0,%xmm4
1357	pand	%xmm0,%xmm5
1358	pxor	%xmm0,%xmm0
1359	por	%xmm2,%xmm4
1360	por	%xmm3,%xmm5
1361	movdqu	%xmm4,-16*2($rptr)
1362	movdqu	%xmm5,-16*1($rptr)
1363	sub	\$32,%rdx
1364	jnz	.Lmulx4x_cond_copy
1365
1366	mov	%rdx,($tptr)
1367
1368	mov	\$1,%rax
1369	mov	-48(%rsi),%r15
1370.cfi_restore	%r15
1371	mov	-40(%rsi),%r14
1372.cfi_restore	%r14
1373	mov	-32(%rsi),%r13
1374.cfi_restore	%r13
1375	mov	-24(%rsi),%r12
1376.cfi_restore	%r12
1377	mov	-16(%rsi),%rbp
1378.cfi_restore	%rbp
1379	mov	-8(%rsi),%rbx
1380.cfi_restore	%rbx
1381	lea	(%rsi),%rsp
1382.cfi_def_cfa_register	%rsp
1383.Lmulx4x_epilogue:
1384	ret
1385.cfi_endproc
1386.size	bn_mulx4x_mont,.-bn_mulx4x_mont
1387___
1388}}}
1389$code.=<<___;
1390.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1391.align	16
1392___
1393
1394# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1395#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
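# The handlers below implement Win64 SEH unwinding for the functions above: if
# the faulting RIP lies inside a function body, they recover the caller's stack
# pointer saved in the frame (at tp[num+1] for the mul variants, at 40(%rsp) for
# the sqr8x/mulx4x variants), restore the non-volatile registers stored just
# below it, and let RtlVirtualUnwind continue the unwind.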
1396if ($win64) {
1397$rec="%rcx";
1398$frame="%rdx";
1399$context="%r8";
1400$disp="%r9";
1401
1402$code.=<<___;
1403.extern	__imp_RtlVirtualUnwind
1404.type	mul_handler,\@abi-omnipotent
1405.align	16
1406mul_handler:
1407	push	%rsi
1408	push	%rdi
1409	push	%rbx
1410	push	%rbp
1411	push	%r12
1412	push	%r13
1413	push	%r14
1414	push	%r15
1415	pushfq
1416	sub	\$64,%rsp
1417
1418	mov	120($context),%rax	# pull context->Rax
1419	mov	248($context),%rbx	# pull context->Rip
1420
1421	mov	8($disp),%rsi		# disp->ImageBase
1422	mov	56($disp),%r11		# disp->HandlerData
1423
1424	mov	0(%r11),%r10d		# HandlerData[0]
1425	lea	(%rsi,%r10),%r10	# end of prologue label
1426	cmp	%r10,%rbx		# context->Rip<end of prologue label
1427	jb	.Lcommon_seh_tail
1428
1429	mov	152($context),%rax	# pull context->Rsp
1430
1431	mov	4(%r11),%r10d		# HandlerData[1]
1432	lea	(%rsi,%r10),%r10	# epilogue label
1433	cmp	%r10,%rbx		# context->Rip>=epilogue label
1434	jae	.Lcommon_seh_tail
1435
1436	mov	192($context),%r10	# pull $num
1437	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
1438
1439	jmp	.Lcommon_pop_regs
1440.size	mul_handler,.-mul_handler
1441
1442.type	sqr_handler,\@abi-omnipotent
1443.align	16
1444sqr_handler:
1445	push	%rsi
1446	push	%rdi
1447	push	%rbx
1448	push	%rbp
1449	push	%r12
1450	push	%r13
1451	push	%r14
1452	push	%r15
1453	pushfq
1454	sub	\$64,%rsp
1455
1456	mov	120($context),%rax	# pull context->Rax
1457	mov	248($context),%rbx	# pull context->Rip
1458
1459	mov	8($disp),%rsi		# disp->ImageBase
1460	mov	56($disp),%r11		# disp->HandlerData
1461
1462	mov	0(%r11),%r10d		# HandlerData[0]
1463	lea	(%rsi,%r10),%r10	# end of prologue label
1464	cmp	%r10,%rbx		# context->Rip<.Lsqr_prologue
1465	jb	.Lcommon_seh_tail
1466
1467	mov	4(%r11),%r10d		# HandlerData[1]
1468	lea	(%rsi,%r10),%r10	# body label
1469	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
1470	jb	.Lcommon_pop_regs
1471
1472	mov	152($context),%rax	# pull context->Rsp
1473
1474	mov	8(%r11),%r10d		# HandlerData[2]
1475	lea	(%rsi,%r10),%r10	# epilogue label
1476	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
1477	jae	.Lcommon_seh_tail
1478
1479	mov	40(%rax),%rax		# pull saved stack pointer
1480
1481.Lcommon_pop_regs:
1482	mov	-8(%rax),%rbx
1483	mov	-16(%rax),%rbp
1484	mov	-24(%rax),%r12
1485	mov	-32(%rax),%r13
1486	mov	-40(%rax),%r14
1487	mov	-48(%rax),%r15
1488	mov	%rbx,144($context)	# restore context->Rbx
1489	mov	%rbp,160($context)	# restore context->Rbp
1490	mov	%r12,216($context)	# restore context->R12
1491	mov	%r13,224($context)	# restore context->R13
1492	mov	%r14,232($context)	# restore context->R14
1493	mov	%r15,240($context)	# restore context->R15
1494
1495.Lcommon_seh_tail:
1496	mov	8(%rax),%rdi
1497	mov	16(%rax),%rsi
1498	mov	%rax,152($context)	# restore context->Rsp
1499	mov	%rsi,168($context)	# restore context->Rsi
1500	mov	%rdi,176($context)	# restore context->Rdi
1501
1502	mov	40($disp),%rdi		# disp->ContextRecord
1503	mov	$context,%rsi		# context
1504	mov	\$154,%ecx		# sizeof(CONTEXT)
1505	.long	0xa548f3fc		# cld; rep movsq
1506
1507	mov	$disp,%rsi
1508	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1509	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1510	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1511	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1512	mov	40(%rsi),%r10		# disp->ContextRecord
1513	lea	56(%rsi),%r11		# &disp->HandlerData
1514	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1515	mov	%r10,32(%rsp)		# arg5
1516	mov	%r11,40(%rsp)		# arg6
1517	mov	%r12,48(%rsp)		# arg7
1518	mov	%rcx,56(%rsp)		# arg8, (NULL)
1519	call	*__imp_RtlVirtualUnwind(%rip)
1520
1521	mov	\$1,%eax		# ExceptionContinueSearch
1522	add	\$64,%rsp
1523	popfq
1524	pop	%r15
1525	pop	%r14
1526	pop	%r13
1527	pop	%r12
1528	pop	%rbp
1529	pop	%rbx
1530	pop	%rdi
1531	pop	%rsi
1532	ret
1533.size	sqr_handler,.-sqr_handler
1534
1535.section	.pdata
1536.align	4
1537	.rva	.LSEH_begin_bn_mul_mont
1538	.rva	.LSEH_end_bn_mul_mont
1539	.rva	.LSEH_info_bn_mul_mont
1540
1541	.rva	.LSEH_begin_bn_mul4x_mont
1542	.rva	.LSEH_end_bn_mul4x_mont
1543	.rva	.LSEH_info_bn_mul4x_mont
1544
1545	.rva	.LSEH_begin_bn_sqr8x_mont
1546	.rva	.LSEH_end_bn_sqr8x_mont
1547	.rva	.LSEH_info_bn_sqr8x_mont
1548___
1549$code.=<<___ if ($addx);
1550	.rva	.LSEH_begin_bn_mulx4x_mont
1551	.rva	.LSEH_end_bn_mulx4x_mont
1552	.rva	.LSEH_info_bn_mulx4x_mont
1553___
1554$code.=<<___;
1555.section	.xdata
1556.align	8
1557.LSEH_info_bn_mul_mont:
1558	.byte	9,0,0,0
1559	.rva	mul_handler
1560	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
1561.LSEH_info_bn_mul4x_mont:
1562	.byte	9,0,0,0
1563	.rva	mul_handler
1564	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
1565.LSEH_info_bn_sqr8x_mont:
1566	.byte	9,0,0,0
1567	.rva	sqr_handler
1568	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue		# HandlerData[]
1569.align	8
1570___
1571$code.=<<___ if ($addx);
1572.LSEH_info_bn_mulx4x_mont:
1573	.byte	9,0,0,0
1574	.rva	sqr_handler
1575	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
1576.align	8
1577___
1578}
1579
1580print $code;
1581close STDOUT;
1582