#! /usr/bin/env perl
2# Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the OpenSSL license (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9
10# ====================================================================
11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12# project. The module is, however, dual licensed under OpenSSL and
13# CRYPTOGAMS licenses depending on where you obtain it. For further
14# details see http://www.openssl.org/~appro/cryptogams/.
15# ====================================================================
16
17# October 2005
18#
19# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
20# Because unlike integer multiplier, which simply stalls whole CPU,
21# FPU is fully pipelined and can effectively emit 48 bit partial
22# product every cycle. Why not blended SPARC v9? One can argue that
23# making this module dependent on UltraSPARC VIS extension limits its
24# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
25# implementations from compatibility matrix. But the rest, whole Sun
26# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
27# VIS extension instructions used in this module. This is considered
28# good enough to not care about HAL SPARC64 users [if any] who have
29# integer-only pure SPARCv9 module to "fall down" to.
30
# USI&II cores currently exhibit a uniform 2x improvement [over the pre-
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves by a few percent for shorter keys and worsens by
# a few percent for longer keys. This is because the USIII integer multiplier
35# is >3x faster than USI&II one, which is harder to match [but see
36# TODO list below]. It should also be noted that SPARC64 V features
37# out-of-order execution, which *might* mean that integer multiplier
38# is pipelined, which in turn *might* be impossible to match... On
39# additional note, SPARC64 V implements FP Multiply-Add instruction,
40# which is perfectly usable in this context... In other words, as far
41# as Fujitsu SPARC64 V goes, talk to the author:-)
42
# The implementation imposes the following "non-natural" limitations on
# input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition has no fatal effect: the routine
# simply returns 0 to signal "unsupported input value", so the caller
# gets no performance gain.
49
50# TODO:
51# - modulo-schedule inner loop for better performance (on in-order
52#   execution core such as UltraSPARC this shall result in further
53#   noticeable(!) improvement);
54# - dedicated squaring procedure[?];
55
56######################################################################
57# November 2006
58#
# Modulo-scheduled inner loops make it possible to interleave floating
# point and integer instructions and to minimize Read-After-Write
# penalties. This results in a *further* 20-50% performance improvement
# [depending on key length, more for longer keys] on USI&II cores and
# 30-80% on USIII&IV.
64
# The last command-line argument names the file that receives the
# generated assembly; everything printed later goes there via STDOUT.
$output = pop;

# Three-argument open keeps the file name from being parsed for mode
# characters, and the explicit check reports an unwritable destination
# immediately instead of leaving STDOUT closed until the final close().
# The defined() guard matters: a three-argument open with an undefined
# name would open an anonymous temporary file and silently discard the
# output.  With no argument the script keeps writing to the inherited
# STDOUT.
if (defined $output) {
    open STDOUT, '>', $output
        or die "can't open $output: $!";
}
67
# Name of the generated routine, the symbolic frame size and stack bias
# (emitted as the literal tokens STACK_FRAME and STACK_BIAS; presumably
# defined by the included sparc_arch.h -- confirm there), and the number
# of bytes of local scratch space reserved on top of the frame.
($fname, $frame, $bias, $locals) =
    ("bn_mul_mont_fpu", "STACK_FRAME", "STACK_BIAS", 64);
73
# Integer register map.
#
# To provide for 32-/64-bit ABI duality, integers wider than 32 bits are
# kept in %g1-%g4 and %o0-%o5 only; %l0-%l7 and %i0-%i5 are used
# exclusively for pointers, indexes and other small values.
#
# The arguments of
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
# are mapped to %i0..%i5, in that order.
($rp, $ap, $bp, $np, $n0, $num) = map("%i$_", 0 .. 5);

# Locals, %l0..%l7 in order:
#   $tp            t[num]
#   $ap_l, $ap_h   a[num] and n[num] are smashed to 32-bit words and saved
#   $np_l, $np_h   to these four vectors as double-precision FP values.
#                  This way a bunch of fxtods are eliminated in the second
#                  loop and L1-cache aliasing is minimized.
#   $i, $j         loop indexes
#   $mask          16-bit mask, 0xffff
($tp, $ap_l, $ap_h, $np_l, $np_h, $i, $j, $mask) = map("%l$_", 0 .. 7);

# $n0 is deliberately reassigned(!) to a "64-bit" register; the template
# reads the n0 words through a literal %i4, which is then reused(!) for
# the carry bit.
$n0    = "%g4";
$carry = "%i4";
96
# FP register naming chart
#
#     ..HILO
#       dcba
#   --------
#        LOa
#       LOb
#      LOc
#     LOd
#      HIa
#     HIb
#    HIc
#   HId
#    ..a
#   ..b

# b[i] and the per-iteration n0 product, each transferred to the FPU as
# four 16-bit values a..d.
($ba, $bb, $bc, $bd) = map("%f$_", 0, 2, 4, 6);
($na, $nb, $nc, $nd) = map("%f$_", 8, 10, 12, 14);

# a[j] and n[j] as LO/HI 32-bit words.  The names with a trailing
# underscore are the odd single-precision registers: the template loads a
# 32-bit word into the odd register and zeroes the even one with fzeros
# before converting the pair with fxtod.
($alo, $alo_, $ahi, $ahi_) = map("%f$_", 16 .. 19);
($nlo, $nlo_, $nhi, $nhi_) = map("%f$_", 20 .. 23);

# The "..a"/"..b" sums that are carried into the next loop iteration.
($dota, $dotb) = map("%f$_", 24, 26);

# Partial products: {a,n} x {lo,hi} x 16-bit value {a,b,c,d}.
($aloa, $alob, $aloc, $alod) = map("%f$_", 32, 34, 36, 38);
($ahia, $ahib, $ahic, $ahid) = map("%f$_", 40, 42, 44, 46);
($nloa, $nlob, $nloc, $nlod) = map("%f$_", 48, 50, 52, 54);
($nhia, $nhib, $nhic, $nhid) = map("%f$_", 56, 58, 60, 62);

$ASI_FL16_P = 0xD2;	# magic ASI value to engage 16-bit FP load
125
126$code=<<___;
127#include "sparc_arch.h"
128
129.section	".text",#alloc,#execinstr
130
131.global $fname
132.align  32
133$fname:
134	save	%sp,-$frame-$locals,%sp
135
136	cmp	$num,4
137	bl,a,pn %icc,.Lret
138	clr	%i0
139	andcc	$num,1,%g0		! $num has to be even...
140	bnz,a,pn %icc,.Lret
141	clr	%i0			! signal "unsupported input value"
142
143	srl	$num,1,$num
144	sethi	%hi(0xffff),$mask
145	ld	[%i4+0],$n0		! $n0 reassigned, remember?
146	or	$mask,%lo(0xffff),$mask
147	ld	[%i4+4],%o0
148	sllx	%o0,32,%o0
149	or	%o0,$n0,$n0		! $n0=n0[1].n0[0]
150
151	sll	$num,3,$num		! num*=8
152
153	add	%sp,$bias,%o0		! real top of stack
154	sll	$num,2,%o1
155	add	%o1,$num,%o1		! %o1=num*5
156	sub	%o0,%o1,%o0
157	and	%o0,-2048,%o0		! optimize TLB utilization
158	sub	%o0,$bias,%sp		! alloca(5*num*8)
159
160	rd	%asi,%o7		! save %asi
161	add	%sp,$bias+$frame+$locals,$tp
162	add	$tp,$num,$ap_l
163	add	$ap_l,$num,$ap_l	! [an]p_[lh] point at the vectors' ends !
164	add	$ap_l,$num,$ap_h
165	add	$ap_h,$num,$np_l
166	add	$np_l,$num,$np_h
167
168	wr	%g0,$ASI_FL16_P,%asi	! setup %asi for 16-bit FP loads
169
170	add	$rp,$num,$rp		! readjust input pointers to point
171	add	$ap,$num,$ap		! at the ends too...
172	add	$bp,$num,$bp
173	add	$np,$num,$np
174
175	stx	%o7,[%sp+$bias+$frame+48]	! save %asi
176
177	sub	%g0,$num,$i		! i=-num
178	sub	%g0,$num,$j		! j=-num
179
180	add	$ap,$j,%o3
181	add	$bp,$i,%o4
182
183	ld	[%o3+4],%g1		! bp[0]
184	ld	[%o3+0],%o0
185	ld	[%o4+4],%g5		! ap[0]
186	sllx	%g1,32,%g1
187	ld	[%o4+0],%o1
188	sllx	%g5,32,%g5
189	or	%g1,%o0,%o0
190	or	%g5,%o1,%o1
191
192	add	$np,$j,%o5
193
194	mulx	%o1,%o0,%o0		! ap[0]*bp[0]
195	mulx	$n0,%o0,%o0		! ap[0]*bp[0]*n0
196	stx	%o0,[%sp+$bias+$frame+0]
197
198	ld	[%o3+0],$alo_	! load a[j] as pair of 32-bit words
199	fzeros	$alo
200	ld	[%o3+4],$ahi_
201	fzeros	$ahi
202	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
203	fzeros	$nlo
204	ld	[%o5+4],$nhi_
205	fzeros	$nhi
206
207	! transfer b[i] to FPU as 4x16-bit values
208	ldda	[%o4+2]%asi,$ba
209	fxtod	$alo,$alo
210	ldda	[%o4+0]%asi,$bb
211	fxtod	$ahi,$ahi
212	ldda	[%o4+6]%asi,$bc
213	fxtod	$nlo,$nlo
214	ldda	[%o4+4]%asi,$bd
215	fxtod	$nhi,$nhi
216
217	! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
218	ldda	[%sp+$bias+$frame+6]%asi,$na
219	fxtod	$ba,$ba
220	ldda	[%sp+$bias+$frame+4]%asi,$nb
221	fxtod	$bb,$bb
222	ldda	[%sp+$bias+$frame+2]%asi,$nc
223	fxtod	$bc,$bc
224	ldda	[%sp+$bias+$frame+0]%asi,$nd
225	fxtod	$bd,$bd
226
227	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
228	fxtod	$na,$na
229	std	$ahi,[$ap_h+$j]
230	fxtod	$nb,$nb
231	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
232	fxtod	$nc,$nc
233	std	$nhi,[$np_h+$j]
234	fxtod	$nd,$nd
235
236		fmuld	$alo,$ba,$aloa
237		fmuld	$nlo,$na,$nloa
238		fmuld	$alo,$bb,$alob
239		fmuld	$nlo,$nb,$nlob
240		fmuld	$alo,$bc,$aloc
241	faddd	$aloa,$nloa,$nloa
242		fmuld	$nlo,$nc,$nloc
243		fmuld	$alo,$bd,$alod
244	faddd	$alob,$nlob,$nlob
245		fmuld	$nlo,$nd,$nlod
246		fmuld	$ahi,$ba,$ahia
247	faddd	$aloc,$nloc,$nloc
248		fmuld	$nhi,$na,$nhia
249		fmuld	$ahi,$bb,$ahib
250	faddd	$alod,$nlod,$nlod
251		fmuld	$nhi,$nb,$nhib
252		fmuld	$ahi,$bc,$ahic
253	faddd	$ahia,$nhia,$nhia
254		fmuld	$nhi,$nc,$nhic
255		fmuld	$ahi,$bd,$ahid
256	faddd	$ahib,$nhib,$nhib
257		fmuld	$nhi,$nd,$nhid
258
259	faddd	$ahic,$nhic,$dota	! $nhic
260	faddd	$ahid,$nhid,$dotb	! $nhid
261
262	faddd	$nloc,$nhia,$nloc
263	faddd	$nlod,$nhib,$nlod
264
265	fdtox	$nloa,$nloa
266	fdtox	$nlob,$nlob
267	fdtox	$nloc,$nloc
268	fdtox	$nlod,$nlod
269
270	std	$nloa,[%sp+$bias+$frame+0]
271	add	$j,8,$j
272	std	$nlob,[%sp+$bias+$frame+8]
273	add	$ap,$j,%o4
274	std	$nloc,[%sp+$bias+$frame+16]
275	add	$np,$j,%o5
276	std	$nlod,[%sp+$bias+$frame+24]
277
278	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
279	fzeros	$alo
280	ld	[%o4+4],$ahi_
281	fzeros	$ahi
282	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
283	fzeros	$nlo
284	ld	[%o5+4],$nhi_
285	fzeros	$nhi
286
287	fxtod	$alo,$alo
288	fxtod	$ahi,$ahi
289	fxtod	$nlo,$nlo
290	fxtod	$nhi,$nhi
291
292	ldx	[%sp+$bias+$frame+0],%o0
293		fmuld	$alo,$ba,$aloa
294	ldx	[%sp+$bias+$frame+8],%o1
295		fmuld	$nlo,$na,$nloa
296	ldx	[%sp+$bias+$frame+16],%o2
297		fmuld	$alo,$bb,$alob
298	ldx	[%sp+$bias+$frame+24],%o3
299		fmuld	$nlo,$nb,$nlob
300
301	srlx	%o0,16,%o7
302	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
303		fmuld	$alo,$bc,$aloc
304	add	%o7,%o1,%o1
305	std	$ahi,[$ap_h+$j]
306		faddd	$aloa,$nloa,$nloa
307		fmuld	$nlo,$nc,$nloc
308	srlx	%o1,16,%o7
309	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
310		fmuld	$alo,$bd,$alod
311	add	%o7,%o2,%o2
312	std	$nhi,[$np_h+$j]
313		faddd	$alob,$nlob,$nlob
314		fmuld	$nlo,$nd,$nlod
315	srlx	%o2,16,%o7
316		fmuld	$ahi,$ba,$ahia
317	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
318		faddd	$aloc,$nloc,$nloc
319		fmuld	$nhi,$na,$nhia
320	!and	%o0,$mask,%o0
321	!and	%o1,$mask,%o1
322	!and	%o2,$mask,%o2
323	!sllx	%o1,16,%o1
324	!sllx	%o2,32,%o2
325	!sllx	%o3,48,%o7
326	!or	%o1,%o0,%o0
327	!or	%o2,%o0,%o0
328	!or	%o7,%o0,%o0		! 64-bit result
329	srlx	%o3,16,%g1		! 34-bit carry
330		fmuld	$ahi,$bb,$ahib
331
332	faddd	$alod,$nlod,$nlod
333		fmuld	$nhi,$nb,$nhib
334		fmuld	$ahi,$bc,$ahic
335	faddd	$ahia,$nhia,$nhia
336		fmuld	$nhi,$nc,$nhic
337		fmuld	$ahi,$bd,$ahid
338	faddd	$ahib,$nhib,$nhib
339		fmuld	$nhi,$nd,$nhid
340
341	faddd	$dota,$nloa,$nloa
342	faddd	$dotb,$nlob,$nlob
343	faddd	$ahic,$nhic,$dota	! $nhic
344	faddd	$ahid,$nhid,$dotb	! $nhid
345
346	faddd	$nloc,$nhia,$nloc
347	faddd	$nlod,$nhib,$nlod
348
349	fdtox	$nloa,$nloa
350	fdtox	$nlob,$nlob
351	fdtox	$nloc,$nloc
352	fdtox	$nlod,$nlod
353
354	std	$nloa,[%sp+$bias+$frame+0]
355	std	$nlob,[%sp+$bias+$frame+8]
356	addcc	$j,8,$j
357	std	$nloc,[%sp+$bias+$frame+16]
358	bz,pn	%icc,.L1stskip
359	std	$nlod,[%sp+$bias+$frame+24]
360
361.align	32			! incidentally already aligned !
362.L1st:
363	add	$ap,$j,%o4
364	add	$np,$j,%o5
365	ld	[%o4+0],$alo_	! load a[j] as pair of 32-bit words
366	fzeros	$alo
367	ld	[%o4+4],$ahi_
368	fzeros	$ahi
369	ld	[%o5+0],$nlo_	! load n[j] as pair of 32-bit words
370	fzeros	$nlo
371	ld	[%o5+4],$nhi_
372	fzeros	$nhi
373
374	fxtod	$alo,$alo
375	fxtod	$ahi,$ahi
376	fxtod	$nlo,$nlo
377	fxtod	$nhi,$nhi
378
379	ldx	[%sp+$bias+$frame+0],%o0
380		fmuld	$alo,$ba,$aloa
381	ldx	[%sp+$bias+$frame+8],%o1
382		fmuld	$nlo,$na,$nloa
383	ldx	[%sp+$bias+$frame+16],%o2
384		fmuld	$alo,$bb,$alob
385	ldx	[%sp+$bias+$frame+24],%o3
386		fmuld	$nlo,$nb,$nlob
387
388	srlx	%o0,16,%o7
389	std	$alo,[$ap_l+$j]		! save smashed ap[j] in double format
390		fmuld	$alo,$bc,$aloc
391	add	%o7,%o1,%o1
392	std	$ahi,[$ap_h+$j]
393		faddd	$aloa,$nloa,$nloa
394		fmuld	$nlo,$nc,$nloc
395	srlx	%o1,16,%o7
396	std	$nlo,[$np_l+$j]		! save smashed np[j] in double format
397		fmuld	$alo,$bd,$alod
398	add	%o7,%o2,%o2
399	std	$nhi,[$np_h+$j]
400		faddd	$alob,$nlob,$nlob
401		fmuld	$nlo,$nd,$nlod
402	srlx	%o2,16,%o7
403		fmuld	$ahi,$ba,$ahia
404	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
405	and	%o0,$mask,%o0
406		faddd	$aloc,$nloc,$nloc
407		fmuld	$nhi,$na,$nhia
408	and	%o1,$mask,%o1
409	and	%o2,$mask,%o2
410		fmuld	$ahi,$bb,$ahib
411	sllx	%o1,16,%o1
412		faddd	$alod,$nlod,$nlod
413		fmuld	$nhi,$nb,$nhib
414	sllx	%o2,32,%o2
415		fmuld	$ahi,$bc,$ahic
416	sllx	%o3,48,%o7
417	or	%o1,%o0,%o0
418		faddd	$ahia,$nhia,$nhia
419		fmuld	$nhi,$nc,$nhic
420	or	%o2,%o0,%o0
421		fmuld	$ahi,$bd,$ahid
422	or	%o7,%o0,%o0		! 64-bit result
423		faddd	$ahib,$nhib,$nhib
424		fmuld	$nhi,$nd,$nhid
425	addcc	%g1,%o0,%o0
426		faddd	$dota,$nloa,$nloa
427	srlx	%o3,16,%g1		! 34-bit carry
428		faddd	$dotb,$nlob,$nlob
429	bcs,a	%xcc,.+8
430	add	%g1,1,%g1
431
432	stx	%o0,[$tp]		! tp[j-1]=
433
434	faddd	$ahic,$nhic,$dota	! $nhic
435	faddd	$ahid,$nhid,$dotb	! $nhid
436
437	faddd	$nloc,$nhia,$nloc
438	faddd	$nlod,$nhib,$nlod
439
440	fdtox	$nloa,$nloa
441	fdtox	$nlob,$nlob
442	fdtox	$nloc,$nloc
443	fdtox	$nlod,$nlod
444
445	std	$nloa,[%sp+$bias+$frame+0]
446	std	$nlob,[%sp+$bias+$frame+8]
447	std	$nloc,[%sp+$bias+$frame+16]
448	std	$nlod,[%sp+$bias+$frame+24]
449
450	addcc	$j,8,$j
451	bnz,pt	%icc,.L1st
452	add	$tp,8,$tp
453
454.L1stskip:
455	fdtox	$dota,$dota
456	fdtox	$dotb,$dotb
457
458	ldx	[%sp+$bias+$frame+0],%o0
459	ldx	[%sp+$bias+$frame+8],%o1
460	ldx	[%sp+$bias+$frame+16],%o2
461	ldx	[%sp+$bias+$frame+24],%o3
462
463	srlx	%o0,16,%o7
464	std	$dota,[%sp+$bias+$frame+32]
465	add	%o7,%o1,%o1
466	std	$dotb,[%sp+$bias+$frame+40]
467	srlx	%o1,16,%o7
468	add	%o7,%o2,%o2
469	srlx	%o2,16,%o7
470	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
471	and	%o0,$mask,%o0
472	and	%o1,$mask,%o1
473	and	%o2,$mask,%o2
474	sllx	%o1,16,%o1
475	sllx	%o2,32,%o2
476	sllx	%o3,48,%o7
477	or	%o1,%o0,%o0
478	or	%o2,%o0,%o0
479	or	%o7,%o0,%o0		! 64-bit result
480	ldx	[%sp+$bias+$frame+32],%o4
481	addcc	%g1,%o0,%o0
482	ldx	[%sp+$bias+$frame+40],%o5
483	srlx	%o3,16,%g1		! 34-bit carry
484	bcs,a	%xcc,.+8
485	add	%g1,1,%g1
486
487	stx	%o0,[$tp]		! tp[j-1]=
488	add	$tp,8,$tp
489
490	srlx	%o4,16,%o7
491	add	%o7,%o5,%o5
492	and	%o4,$mask,%o4
493	sllx	%o5,16,%o7
494	or	%o7,%o4,%o4
495	addcc	%g1,%o4,%o4
496	srlx	%o5,48,%g1
497	bcs,a	%xcc,.+8
498	add	%g1,1,%g1
499
500	mov	%g1,$carry
501	stx	%o4,[$tp]		! tp[num-1]=
502
503	ba	.Louter
504	add	$i,8,$i
505.align	32
506.Louter:
507	sub	%g0,$num,$j		! j=-num
508	add	%sp,$bias+$frame+$locals,$tp
509
510	add	$ap,$j,%o3
511	add	$bp,$i,%o4
512
513	ld	[%o3+4],%g1		! bp[i]
514	ld	[%o3+0],%o0
515	ld	[%o4+4],%g5		! ap[0]
516	sllx	%g1,32,%g1
517	ld	[%o4+0],%o1
518	sllx	%g5,32,%g5
519	or	%g1,%o0,%o0
520	or	%g5,%o1,%o1
521
522	ldx	[$tp],%o2		! tp[0]
523	mulx	%o1,%o0,%o0
524	addcc	%o2,%o0,%o0
525	mulx	$n0,%o0,%o0		! (ap[0]*bp[i]+t[0])*n0
526	stx	%o0,[%sp+$bias+$frame+0]
527
528	! transfer b[i] to FPU as 4x16-bit values
529	ldda	[%o4+2]%asi,$ba
530	ldda	[%o4+0]%asi,$bb
531	ldda	[%o4+6]%asi,$bc
532	ldda	[%o4+4]%asi,$bd
533
534	! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
535	ldda	[%sp+$bias+$frame+6]%asi,$na
536	fxtod	$ba,$ba
537	ldda	[%sp+$bias+$frame+4]%asi,$nb
538	fxtod	$bb,$bb
539	ldda	[%sp+$bias+$frame+2]%asi,$nc
540	fxtod	$bc,$bc
541	ldda	[%sp+$bias+$frame+0]%asi,$nd
542	fxtod	$bd,$bd
543	ldd	[$ap_l+$j],$alo		! load a[j] in double format
544	fxtod	$na,$na
545	ldd	[$ap_h+$j],$ahi
546	fxtod	$nb,$nb
547	ldd	[$np_l+$j],$nlo		! load n[j] in double format
548	fxtod	$nc,$nc
549	ldd	[$np_h+$j],$nhi
550	fxtod	$nd,$nd
551
552		fmuld	$alo,$ba,$aloa
553		fmuld	$nlo,$na,$nloa
554		fmuld	$alo,$bb,$alob
555		fmuld	$nlo,$nb,$nlob
556		fmuld	$alo,$bc,$aloc
557	faddd	$aloa,$nloa,$nloa
558		fmuld	$nlo,$nc,$nloc
559		fmuld	$alo,$bd,$alod
560	faddd	$alob,$nlob,$nlob
561		fmuld	$nlo,$nd,$nlod
562		fmuld	$ahi,$ba,$ahia
563	faddd	$aloc,$nloc,$nloc
564		fmuld	$nhi,$na,$nhia
565		fmuld	$ahi,$bb,$ahib
566	faddd	$alod,$nlod,$nlod
567		fmuld	$nhi,$nb,$nhib
568		fmuld	$ahi,$bc,$ahic
569	faddd	$ahia,$nhia,$nhia
570		fmuld	$nhi,$nc,$nhic
571		fmuld	$ahi,$bd,$ahid
572	faddd	$ahib,$nhib,$nhib
573		fmuld	$nhi,$nd,$nhid
574
575	faddd	$ahic,$nhic,$dota	! $nhic
576	faddd	$ahid,$nhid,$dotb	! $nhid
577
578	faddd	$nloc,$nhia,$nloc
579	faddd	$nlod,$nhib,$nlod
580
581	fdtox	$nloa,$nloa
582	fdtox	$nlob,$nlob
583	fdtox	$nloc,$nloc
584	fdtox	$nlod,$nlod
585
586	std	$nloa,[%sp+$bias+$frame+0]
587	std	$nlob,[%sp+$bias+$frame+8]
588	std	$nloc,[%sp+$bias+$frame+16]
589	add	$j,8,$j
590	std	$nlod,[%sp+$bias+$frame+24]
591
592	ldd	[$ap_l+$j],$alo		! load a[j] in double format
593	ldd	[$ap_h+$j],$ahi
594	ldd	[$np_l+$j],$nlo		! load n[j] in double format
595	ldd	[$np_h+$j],$nhi
596
597		fmuld	$alo,$ba,$aloa
598		fmuld	$nlo,$na,$nloa
599		fmuld	$alo,$bb,$alob
600		fmuld	$nlo,$nb,$nlob
601		fmuld	$alo,$bc,$aloc
602	ldx	[%sp+$bias+$frame+0],%o0
603		faddd	$aloa,$nloa,$nloa
604		fmuld	$nlo,$nc,$nloc
605	ldx	[%sp+$bias+$frame+8],%o1
606		fmuld	$alo,$bd,$alod
607	ldx	[%sp+$bias+$frame+16],%o2
608		faddd	$alob,$nlob,$nlob
609		fmuld	$nlo,$nd,$nlod
610	ldx	[%sp+$bias+$frame+24],%o3
611		fmuld	$ahi,$ba,$ahia
612
613	srlx	%o0,16,%o7
614		faddd	$aloc,$nloc,$nloc
615		fmuld	$nhi,$na,$nhia
616	add	%o7,%o1,%o1
617		fmuld	$ahi,$bb,$ahib
618	srlx	%o1,16,%o7
619		faddd	$alod,$nlod,$nlod
620		fmuld	$nhi,$nb,$nhib
621	add	%o7,%o2,%o2
622		fmuld	$ahi,$bc,$ahic
623	srlx	%o2,16,%o7
624		faddd	$ahia,$nhia,$nhia
625		fmuld	$nhi,$nc,$nhic
626	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
627	! why?
628	and	%o0,$mask,%o0
629		fmuld	$ahi,$bd,$ahid
630	and	%o1,$mask,%o1
631	and	%o2,$mask,%o2
632		faddd	$ahib,$nhib,$nhib
633		fmuld	$nhi,$nd,$nhid
634	sllx	%o1,16,%o1
635		faddd	$dota,$nloa,$nloa
636	sllx	%o2,32,%o2
637		faddd	$dotb,$nlob,$nlob
638	sllx	%o3,48,%o7
639	or	%o1,%o0,%o0
640		faddd	$ahic,$nhic,$dota	! $nhic
641	or	%o2,%o0,%o0
642		faddd	$ahid,$nhid,$dotb	! $nhid
643	or	%o7,%o0,%o0		! 64-bit result
644	ldx	[$tp],%o7
645		faddd	$nloc,$nhia,$nloc
646	addcc	%o7,%o0,%o0
647	! end-of-why?
648		faddd	$nlod,$nhib,$nlod
649	srlx	%o3,16,%g1		! 34-bit carry
650		fdtox	$nloa,$nloa
651	bcs,a	%xcc,.+8
652	add	%g1,1,%g1
653
654	fdtox	$nlob,$nlob
655	fdtox	$nloc,$nloc
656	fdtox	$nlod,$nlod
657
658	std	$nloa,[%sp+$bias+$frame+0]
659	std	$nlob,[%sp+$bias+$frame+8]
660	addcc	$j,8,$j
661	std	$nloc,[%sp+$bias+$frame+16]
662	bz,pn	%icc,.Linnerskip
663	std	$nlod,[%sp+$bias+$frame+24]
664
665	ba	.Linner
666	nop
667.align	32
668.Linner:
669	ldd	[$ap_l+$j],$alo		! load a[j] in double format
670	ldd	[$ap_h+$j],$ahi
671	ldd	[$np_l+$j],$nlo		! load n[j] in double format
672	ldd	[$np_h+$j],$nhi
673
674		fmuld	$alo,$ba,$aloa
675		fmuld	$nlo,$na,$nloa
676		fmuld	$alo,$bb,$alob
677		fmuld	$nlo,$nb,$nlob
678		fmuld	$alo,$bc,$aloc
679	ldx	[%sp+$bias+$frame+0],%o0
680		faddd	$aloa,$nloa,$nloa
681		fmuld	$nlo,$nc,$nloc
682	ldx	[%sp+$bias+$frame+8],%o1
683		fmuld	$alo,$bd,$alod
684	ldx	[%sp+$bias+$frame+16],%o2
685		faddd	$alob,$nlob,$nlob
686		fmuld	$nlo,$nd,$nlod
687	ldx	[%sp+$bias+$frame+24],%o3
688		fmuld	$ahi,$ba,$ahia
689
690	srlx	%o0,16,%o7
691		faddd	$aloc,$nloc,$nloc
692		fmuld	$nhi,$na,$nhia
693	add	%o7,%o1,%o1
694		fmuld	$ahi,$bb,$ahib
695	srlx	%o1,16,%o7
696		faddd	$alod,$nlod,$nlod
697		fmuld	$nhi,$nb,$nhib
698	add	%o7,%o2,%o2
699		fmuld	$ahi,$bc,$ahic
700	srlx	%o2,16,%o7
701		faddd	$ahia,$nhia,$nhia
702		fmuld	$nhi,$nc,$nhic
703	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
704	and	%o0,$mask,%o0
705		fmuld	$ahi,$bd,$ahid
706	and	%o1,$mask,%o1
707	and	%o2,$mask,%o2
708		faddd	$ahib,$nhib,$nhib
709		fmuld	$nhi,$nd,$nhid
710	sllx	%o1,16,%o1
711		faddd	$dota,$nloa,$nloa
712	sllx	%o2,32,%o2
713		faddd	$dotb,$nlob,$nlob
714	sllx	%o3,48,%o7
715	or	%o1,%o0,%o0
716		faddd	$ahic,$nhic,$dota	! $nhic
717	or	%o2,%o0,%o0
718		faddd	$ahid,$nhid,$dotb	! $nhid
719	or	%o7,%o0,%o0		! 64-bit result
720		faddd	$nloc,$nhia,$nloc
721	addcc	%g1,%o0,%o0
722	ldx	[$tp+8],%o7		! tp[j]
723		faddd	$nlod,$nhib,$nlod
724	srlx	%o3,16,%g1		! 34-bit carry
725		fdtox	$nloa,$nloa
726	bcs,a	%xcc,.+8
727	add	%g1,1,%g1
728		fdtox	$nlob,$nlob
729	addcc	%o7,%o0,%o0
730		fdtox	$nloc,$nloc
731	bcs,a	%xcc,.+8
732	add	%g1,1,%g1
733
734	stx	%o0,[$tp]		! tp[j-1]
735		fdtox	$nlod,$nlod
736
737	std	$nloa,[%sp+$bias+$frame+0]
738	std	$nlob,[%sp+$bias+$frame+8]
739	std	$nloc,[%sp+$bias+$frame+16]
740	addcc	$j,8,$j
741	std	$nlod,[%sp+$bias+$frame+24]
742	bnz,pt	%icc,.Linner
743	add	$tp,8,$tp
744
745.Linnerskip:
746	fdtox	$dota,$dota
747	fdtox	$dotb,$dotb
748
749	ldx	[%sp+$bias+$frame+0],%o0
750	ldx	[%sp+$bias+$frame+8],%o1
751	ldx	[%sp+$bias+$frame+16],%o2
752	ldx	[%sp+$bias+$frame+24],%o3
753
754	srlx	%o0,16,%o7
755	std	$dota,[%sp+$bias+$frame+32]
756	add	%o7,%o1,%o1
757	std	$dotb,[%sp+$bias+$frame+40]
758	srlx	%o1,16,%o7
759	add	%o7,%o2,%o2
760	srlx	%o2,16,%o7
761	add	%o7,%o3,%o3		! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
762	and	%o0,$mask,%o0
763	and	%o1,$mask,%o1
764	and	%o2,$mask,%o2
765	sllx	%o1,16,%o1
766	sllx	%o2,32,%o2
767	sllx	%o3,48,%o7
768	or	%o1,%o0,%o0
769	or	%o2,%o0,%o0
770	ldx	[%sp+$bias+$frame+32],%o4
771	or	%o7,%o0,%o0		! 64-bit result
772	ldx	[%sp+$bias+$frame+40],%o5
773	addcc	%g1,%o0,%o0
774	ldx	[$tp+8],%o7		! tp[j]
775	srlx	%o3,16,%g1		! 34-bit carry
776	bcs,a	%xcc,.+8
777	add	%g1,1,%g1
778
779	addcc	%o7,%o0,%o0
780	bcs,a	%xcc,.+8
781	add	%g1,1,%g1
782
783	stx	%o0,[$tp]		! tp[j-1]
784	add	$tp,8,$tp
785
786	srlx	%o4,16,%o7
787	add	%o7,%o5,%o5
788	and	%o4,$mask,%o4
789	sllx	%o5,16,%o7
790	or	%o7,%o4,%o4
791	addcc	%g1,%o4,%o4
792	srlx	%o5,48,%g1
793	bcs,a	%xcc,.+8
794	add	%g1,1,%g1
795
796	addcc	$carry,%o4,%o4
797	stx	%o4,[$tp]		! tp[num-1]
798	mov	%g1,$carry
799	bcs,a	%xcc,.+8
800	add	$carry,1,$carry
801
802	addcc	$i,8,$i
803	bnz	%icc,.Louter
804	nop
805
806	add	$tp,8,$tp		! adjust tp to point at the end
807	orn	%g0,%g0,%g4
808	sub	%g0,$num,%o7		! n=-num
809	ba	.Lsub
810	subcc	%g0,%g0,%g0		! clear %icc.c
811
812.align	32
813.Lsub:
814	ldx	[$tp+%o7],%o0
815	add	$np,%o7,%g1
816	ld	[%g1+0],%o2
817	ld	[%g1+4],%o3
818	srlx	%o0,32,%o1
819	subccc	%o0,%o2,%o2
820	add	$rp,%o7,%g1
821	subccc	%o1,%o3,%o3
822	st	%o2,[%g1+0]
823	add	%o7,8,%o7
824	brnz,pt	%o7,.Lsub
825	st	%o3,[%g1+4]
826	subc	$carry,0,%g4
827	sub	%g0,$num,%o7		! n=-num
828	ba	.Lcopy
829	nop
830
831.align	32
832.Lcopy:
833	ldx	[$tp+%o7],%o0
834	add	$rp,%o7,%g1
835	ld	[%g1+0],%o2
836	ld	[%g1+4],%o3
837	stx	%g0,[$tp+%o7]
838	and	%o0,%g4,%o0
839	srlx	%o0,32,%o1
840	andn	%o2,%g4,%o2
841	andn	%o3,%g4,%o3
842	or	%o2,%o0,%o0
843	or	%o3,%o1,%o1
844	st	%o0,[%g1+0]
845	add	%o7,8,%o7
846	brnz,pt	%o7,.Lcopy
847	st	%o1,[%g1+4]
848	sub	%g0,$num,%o7		! n=-num
849
850.Lzap:
851	stx	%g0,[$ap_l+%o7]
852	stx	%g0,[$ap_h+%o7]
853	stx	%g0,[$np_l+%o7]
854	stx	%g0,[$np_h+%o7]
855	add	%o7,8,%o7
856	brnz,pt	%o7,.Lzap
857	nop
858
859	ldx	[%sp+$bias+$frame+48],%o7
860	wr	%g0,%o7,%asi		! restore %asi
861
862	mov	1,%i0
863.Lret:
864	ret
865	restore
866.type   $fname,#function
867.size	$fname,(.-$fname)
868.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
869.align	32
870___
871
# Replace every `...` span in the template with the result of evaluating
# its contents as Perl.
$code =~ s{\`([^\`]*)\`}{eval($1)}gem;

# The substitution below makes it possible to assemble without demanding
# VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. This
# is acceptable because VIS capability is detected at run-time now and
# this routine is not called on a CPU that cannot execute it. Do note that
# fzeros is not the only VIS dependency! Another dependency is implicit
# and is just _a_ numerical value loaded to the %asi register, which the
# assembler can't recognize as VIS specific...
#
# Each "fzeros %fN" becomes a raw .word: the base opcode with N placed in
# the field starting at bit 25, and the original mnemonic kept as a
# trailing comment.
$code =~ s{fzeros\s+%f([0-9]+)}{
	my $rd = $1;
	sprintf(".word\t0x%x\t! fzeros %%f%d", 0x81b00c20 | ($rd << 25), $rd);
}gem;

print $code;
# Closing flushes the buffered output; a write error surfaces here.
close STDOUT or die "error closing STDOUT: $!";
888