• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# December 2007
11
12# The reason for undertaken effort is basically following. Even though
13# Power 6 CPU operates at incredible 4.7GHz clock frequency, its PKI
14# performance was observed to be less than impressive, essentially as
15# fast as 1.8GHz PPC970, or 2.6 times(!) slower than one would hope.
16# Well, it's not surprising that IBM had to make some sacrifices to
17# boost the clock frequency that much, but no overall improvement?
18# Having observed how much difference did switching to FPU make on
19# UltraSPARC, playing same stunt on Power 6 appeared appropriate...
20# Unfortunately the resulting performance improvement is not as
21# impressive, ~30%, and in absolute terms is still very far from what
22# one would expect from 4.7GHz CPU. There is a chance that I'm doing
23# something wrong, but in the lack of assembler level micro-profiling
24# data or at least decent platform guide I can't tell... Or better
25# results might be achieved with VMX... Anyway, this module provides
26# *worse* performance on other PowerPC implementations, ~40-15% slower
27# on PPC970 depending on key length and ~40% slower on Power 5 for all
28# key lengths. As it's obviously inappropriate as "best all-round"
29# alternative, it has to be complemented with run-time CPU family
30# detection. Oh! It should also be noted that unlike other PowerPC
31# implementation IALU ppc-mont.pl module performs *suboptimaly* on
32# >=1024-bit key lengths on Power 6. It should also be noted that
33# *everything* said so far applies to 64-bit builds! As far as 32-bit
34# application executed on 64-bit CPU goes, this module is likely to
35# become preferred choice, because it's easy to adapt it for such
36# case and *is* faster than 32-bit ppc-mont.pl on *all* processors.
37
38# February 2008
39
40# Micro-profiling assisted optimization results in ~15% improvement
41# over original ppc64-mont.pl version, or overall ~50% improvement
42# over ppc.pl module on Power 6. If compared to ppc-mont.pl on same
43# Power 6 CPU, this module is 5-150% faster depending on key length,
44# [hereafter] more for longer keys. But if compared to ppc-mont.pl
45# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
46# in absolute terms, but it's apparently the way Power 6 is...
47
48# December 2009
49
50# Adapted for 32-bit build this module delivers 25-120%, yes, more
51# than *twice* for longer keys, performance improvement over 32-bit
52# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
53# even 64-bit integer operations and the trouble is that most PPC
54# operating systems don't preserve upper halves of general purpose
55# registers upon 32-bit signal delivery. They do preserve them upon
56# context switch, but not signalling:-( This means that asynchronous
57# signals have to be blocked upon entry to this subroutine. Signal
58# masking (and of course complementary unmasking) has quite an impact
59# on performance, naturally larger for shorter keys. It's so severe
60# that 512-bit key performance can be as low as 1/3 of expected one.
61# This is why this routine can be engaged for longer key operations
62# only on these OSes, see crypto/ppccap.c for further details. MacOS X
63# is an exception from this and doesn't require signal masking, and
64# that's where above improvement coefficients were collected. For
65# others alternative would be to break dependence on upper halves of
66# GPRs by sticking to 32-bit integer operations...
67
68$flavour = shift;
69
70if ($flavour =~ /32/) {
71	$SIZE_T=4;
72	$RZONE=	224;
73	$fname=	"bn_mul_mont_fpu64";
74
75	$STUX=	"stwux";	# store indexed and update
76	$PUSH=	"stw";
77	$POP=	"lwz";
78} elsif ($flavour =~ /64/) {
79	$SIZE_T=8;
80	$RZONE=	288;
81	$fname=	"bn_mul_mont_fpu64";
82
83	# same as above, but 64-bit mnemonics...
84	$STUX=	"stdux";	# store indexed and update
85	$PUSH=	"std";
86	$POP=	"ld";
87} else { die "nonsense $flavour"; }
88
89$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
90( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
91( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
92die "can't locate ppc-xlate.pl";
93
94open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
95
96$FRAME=64;	# padded frame header
97$TRANSFER=16*8;
98
99$carry="r0";
100$sp="r1";
101$toc="r2";
102$rp="r3";	$ovf="r3";
103$ap="r4";
104$bp="r5";
105$np="r6";
106$n0="r7";
107$num="r8";
108$rp="r9";	# $rp is reassigned
109$tp="r10";
110$j="r11";
111$i="r12";
112# non-volatile registers
113$nap_d="r22";	# interleaved ap and np in double format
114$a0="r23";	# ap[0]
115$t0="r24";	# temporary registers
116$t1="r25";
117$t2="r26";
118$t3="r27";
119$t4="r28";
120$t5="r29";
121$t6="r30";
122$t7="r31";
123
124# PPC offers enough register bank capacity to unroll inner loops twice
125#
126#     ..A3A2A1A0
127#           dcba
128#    -----------
129#            A0a
130#           A0b
131#          A0c
132#         A0d
133#          A1a
134#         A1b
135#        A1c
136#       A1d
137#        A2a
138#       A2b
139#      A2c
140#     A2d
141#      A3a
142#     A3b
143#    A3c
144#   A3d
145#    ..a
146#   ..b
147#
148$ba="f0";	$bb="f1";	$bc="f2";	$bd="f3";
149$na="f4";	$nb="f5";	$nc="f6";	$nd="f7";
150$dota="f8";	$dotb="f9";
151$A0="f10";	$A1="f11";	$A2="f12";	$A3="f13";
152$N0="f20";	$N1="f21";	$N2="f22";	$N3="f23";
153$T0a="f24";	$T0b="f25";
154$T1a="f26";	$T1b="f27";
155$T2a="f28";	$T2b="f29";
156$T3a="f30";	$T3b="f31";
157
158# sp----------->+-------------------------------+
159#		| saved sp			|
160#		+-------------------------------+
161#		.				.
162#   +64		+-------------------------------+
163#		| 16 gpr<->fpr transfer zone	|
164#		.				.
165#		.				.
166#   +16*8	+-------------------------------+
167#		| __int64 tmp[-1]		|
168#		+-------------------------------+
169#		| __int64 tmp[num]		|
170#		.				.
171#		.				.
172#		.				.
173#   +(num+1)*8	+-------------------------------+
174#		| padding to 64 byte boundary	|
175#		.				.
176#   +X		+-------------------------------+
177#		| double nap_d[4*num]		|
178#		.				.
179#		.				.
180#		.				.
181#		+-------------------------------+
182#		.				.
183#   -12*size_t	+-------------------------------+
184#		| 10 saved gpr, r22-r31		|
185#		.				.
186#		.				.
187#   -12*8	+-------------------------------+
188#		| 12 saved fpr, f20-f31		|
189#		.				.
190#		.				.
191#		+-------------------------------+
192
193$code=<<___;
194.machine "any"
195.text
196
197.globl	.$fname
198.align	5
199.$fname:
200	cmpwi	$num,`3*8/$SIZE_T`
201	mr	$rp,r3		; $rp is reassigned
202	li	r3,0		; possible "not handled" return code
203	bltlr-
204	andi.	r0,$num,`16/$SIZE_T-1`		; $num has to be "even"
205	bnelr-
206
207	slwi	$num,$num,`log($SIZE_T)/log(2)`	; num*=sizeof(BN_LONG)
208	li	$i,-4096
209	slwi	$tp,$num,2	; place for {an}p_{lh}[num], i.e. 4*num
210	add	$tp,$tp,$num	; place for tp[num+1]
211	addi	$tp,$tp,`$FRAME+$TRANSFER+8+64+$RZONE`
212	subf	$tp,$tp,$sp	; $sp-$tp
213	and	$tp,$tp,$i	; minimize TLB usage
214	subf	$tp,$sp,$tp	; $tp-$sp
215	mr	$i,$sp
216	$STUX	$sp,$sp,$tp	; alloca
217
218	$PUSH	r22,`-12*8-10*$SIZE_T`($i)
219	$PUSH	r23,`-12*8-9*$SIZE_T`($i)
220	$PUSH	r24,`-12*8-8*$SIZE_T`($i)
221	$PUSH	r25,`-12*8-7*$SIZE_T`($i)
222	$PUSH	r26,`-12*8-6*$SIZE_T`($i)
223	$PUSH	r27,`-12*8-5*$SIZE_T`($i)
224	$PUSH	r28,`-12*8-4*$SIZE_T`($i)
225	$PUSH	r29,`-12*8-3*$SIZE_T`($i)
226	$PUSH	r30,`-12*8-2*$SIZE_T`($i)
227	$PUSH	r31,`-12*8-1*$SIZE_T`($i)
228	stfd	f20,`-12*8`($i)
229	stfd	f21,`-11*8`($i)
230	stfd	f22,`-10*8`($i)
231	stfd	f23,`-9*8`($i)
232	stfd	f24,`-8*8`($i)
233	stfd	f25,`-7*8`($i)
234	stfd	f26,`-6*8`($i)
235	stfd	f27,`-5*8`($i)
236	stfd	f28,`-4*8`($i)
237	stfd	f29,`-3*8`($i)
238	stfd	f30,`-2*8`($i)
239	stfd	f31,`-1*8`($i)
240___
241$code.=<<___ if ($SIZE_T==8);
242	ld	$a0,0($ap)	; pull ap[0] value
243	ld	$n0,0($n0)	; pull n0[0] value
244	ld	$t3,0($bp)	; bp[0]
245___
246$code.=<<___ if ($SIZE_T==4);
247	mr	$t1,$n0
248	lwz	$a0,0($ap)	; pull ap[0,1] value
249	lwz	$t0,4($ap)
250	lwz	$n0,0($t1)	; pull n0[0,1] value
251	lwz	$t1,4($t1)
252	lwz	$t3,0($bp)	; bp[0,1]
253	lwz	$t2,4($bp)
254	insrdi	$a0,$t0,32,0
255	insrdi	$n0,$t1,32,0
256	insrdi	$t3,$t2,32,0
257___
258$code.=<<___;
259	addi	$tp,$sp,`$FRAME+$TRANSFER+8+64`
260	li	$i,-64
261	add	$nap_d,$tp,$num
262	and	$nap_d,$nap_d,$i	; align to 64 bytes
263
264	mulld	$t7,$a0,$t3	; ap[0]*bp[0]
265	; nap_d is off by 1, because it's used with stfdu/lfdu
266	addi	$nap_d,$nap_d,-8
267	srwi	$j,$num,`3+1`	; counter register, num/2
268	mulld	$t7,$t7,$n0	; tp[0]*n0
269	addi	$j,$j,-1
270	addi	$tp,$sp,`$FRAME+$TRANSFER-8`
271	li	$carry,0
272	mtctr	$j
273
274	; transfer bp[0] to FPU as 4x16-bit values
275	extrdi	$t0,$t3,16,48
276	extrdi	$t1,$t3,16,32
277	extrdi	$t2,$t3,16,16
278	extrdi	$t3,$t3,16,0
279	std	$t0,`$FRAME+0`($sp)
280	std	$t1,`$FRAME+8`($sp)
281	std	$t2,`$FRAME+16`($sp)
282	std	$t3,`$FRAME+24`($sp)
283	; transfer (ap[0]*bp[0])*n0 to FPU as 4x16-bit values
284	extrdi	$t4,$t7,16,48
285	extrdi	$t5,$t7,16,32
286	extrdi	$t6,$t7,16,16
287	extrdi	$t7,$t7,16,0
288	std	$t4,`$FRAME+32`($sp)
289	std	$t5,`$FRAME+40`($sp)
290	std	$t6,`$FRAME+48`($sp)
291	std	$t7,`$FRAME+56`($sp)
292___
293$code.=<<___ if ($SIZE_T==8);
294	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
295	lwz	$t1,0($ap)
296	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
297	lwz	$t3,8($ap)
298	lwz	$t4,4($np)		; load n[j] as 32-bit word pair
299	lwz	$t5,0($np)
300	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
301	lwz	$t7,8($np)
302___
303$code.=<<___ if ($SIZE_T==4);
304	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
305	lwz	$t1,4($ap)
306	lwz	$t2,8($ap)
307	lwz	$t3,12($ap)
308	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
309	lwz	$t5,4($np)
310	lwz	$t6,8($np)
311	lwz	$t7,12($np)
312___
313$code.=<<___;
314	lfd	$ba,`$FRAME+0`($sp)
315	lfd	$bb,`$FRAME+8`($sp)
316	lfd	$bc,`$FRAME+16`($sp)
317	lfd	$bd,`$FRAME+24`($sp)
318	lfd	$na,`$FRAME+32`($sp)
319	lfd	$nb,`$FRAME+40`($sp)
320	lfd	$nc,`$FRAME+48`($sp)
321	lfd	$nd,`$FRAME+56`($sp)
322	std	$t0,`$FRAME+64`($sp)
323	std	$t1,`$FRAME+72`($sp)
324	std	$t2,`$FRAME+80`($sp)
325	std	$t3,`$FRAME+88`($sp)
326	std	$t4,`$FRAME+96`($sp)
327	std	$t5,`$FRAME+104`($sp)
328	std	$t6,`$FRAME+112`($sp)
329	std	$t7,`$FRAME+120`($sp)
330	fcfid	$ba,$ba
331	fcfid	$bb,$bb
332	fcfid	$bc,$bc
333	fcfid	$bd,$bd
334	fcfid	$na,$na
335	fcfid	$nb,$nb
336	fcfid	$nc,$nc
337	fcfid	$nd,$nd
338
339	lfd	$A0,`$FRAME+64`($sp)
340	lfd	$A1,`$FRAME+72`($sp)
341	lfd	$A2,`$FRAME+80`($sp)
342	lfd	$A3,`$FRAME+88`($sp)
343	lfd	$N0,`$FRAME+96`($sp)
344	lfd	$N1,`$FRAME+104`($sp)
345	lfd	$N2,`$FRAME+112`($sp)
346	lfd	$N3,`$FRAME+120`($sp)
347	fcfid	$A0,$A0
348	fcfid	$A1,$A1
349	fcfid	$A2,$A2
350	fcfid	$A3,$A3
351	fcfid	$N0,$N0
352	fcfid	$N1,$N1
353	fcfid	$N2,$N2
354	fcfid	$N3,$N3
355	addi	$ap,$ap,16
356	addi	$np,$np,16
357
358	fmul	$T1a,$A1,$ba
359	fmul	$T1b,$A1,$bb
360	stfd	$A0,8($nap_d)		; save a[j] in double format
361	stfd	$A1,16($nap_d)
362	fmul	$T2a,$A2,$ba
363	fmul	$T2b,$A2,$bb
364	stfd	$A2,24($nap_d)		; save a[j+1] in double format
365	stfd	$A3,32($nap_d)
366	fmul	$T3a,$A3,$ba
367	fmul	$T3b,$A3,$bb
368	stfd	$N0,40($nap_d)		; save n[j] in double format
369	stfd	$N1,48($nap_d)
370	fmul	$T0a,$A0,$ba
371	fmul	$T0b,$A0,$bb
372	stfd	$N2,56($nap_d)		; save n[j+1] in double format
373	stfdu	$N3,64($nap_d)
374
375	fmadd	$T1a,$A0,$bc,$T1a
376	fmadd	$T1b,$A0,$bd,$T1b
377	fmadd	$T2a,$A1,$bc,$T2a
378	fmadd	$T2b,$A1,$bd,$T2b
379	fmadd	$T3a,$A2,$bc,$T3a
380	fmadd	$T3b,$A2,$bd,$T3b
381	fmul	$dota,$A3,$bc
382	fmul	$dotb,$A3,$bd
383
384	fmadd	$T1a,$N1,$na,$T1a
385	fmadd	$T1b,$N1,$nb,$T1b
386	fmadd	$T2a,$N2,$na,$T2a
387	fmadd	$T2b,$N2,$nb,$T2b
388	fmadd	$T3a,$N3,$na,$T3a
389	fmadd	$T3b,$N3,$nb,$T3b
390	fmadd	$T0a,$N0,$na,$T0a
391	fmadd	$T0b,$N0,$nb,$T0b
392
393	fmadd	$T1a,$N0,$nc,$T1a
394	fmadd	$T1b,$N0,$nd,$T1b
395	fmadd	$T2a,$N1,$nc,$T2a
396	fmadd	$T2b,$N1,$nd,$T2b
397	fmadd	$T3a,$N2,$nc,$T3a
398	fmadd	$T3b,$N2,$nd,$T3b
399	fmadd	$dota,$N3,$nc,$dota
400	fmadd	$dotb,$N3,$nd,$dotb
401
402	fctid	$T0a,$T0a
403	fctid	$T0b,$T0b
404	fctid	$T1a,$T1a
405	fctid	$T1b,$T1b
406	fctid	$T2a,$T2a
407	fctid	$T2b,$T2b
408	fctid	$T3a,$T3a
409	fctid	$T3b,$T3b
410
411	stfd	$T0a,`$FRAME+0`($sp)
412	stfd	$T0b,`$FRAME+8`($sp)
413	stfd	$T1a,`$FRAME+16`($sp)
414	stfd	$T1b,`$FRAME+24`($sp)
415	stfd	$T2a,`$FRAME+32`($sp)
416	stfd	$T2b,`$FRAME+40`($sp)
417	stfd	$T3a,`$FRAME+48`($sp)
418	stfd	$T3b,`$FRAME+56`($sp)
419
420.align	5
421L1st:
422___
423$code.=<<___ if ($SIZE_T==8);
424	lwz	$t0,4($ap)		; load a[j] as 32-bit word pair
425	lwz	$t1,0($ap)
426	lwz	$t2,12($ap)		; load a[j+1] as 32-bit word pair
427	lwz	$t3,8($ap)
428	lwz	$t4,4($np)		; load n[j] as 32-bit word pair
429	lwz	$t5,0($np)
430	lwz	$t6,12($np)		; load n[j+1] as 32-bit word pair
431	lwz	$t7,8($np)
432___
433$code.=<<___ if ($SIZE_T==4);
434	lwz	$t0,0($ap)		; load a[j..j+3] as 32-bit word pairs
435	lwz	$t1,4($ap)
436	lwz	$t2,8($ap)
437	lwz	$t3,12($ap)
438	lwz	$t4,0($np)		; load n[j..j+3] as 32-bit word pairs
439	lwz	$t5,4($np)
440	lwz	$t6,8($np)
441	lwz	$t7,12($np)
442___
443$code.=<<___;
444	std	$t0,`$FRAME+64`($sp)
445	std	$t1,`$FRAME+72`($sp)
446	std	$t2,`$FRAME+80`($sp)
447	std	$t3,`$FRAME+88`($sp)
448	std	$t4,`$FRAME+96`($sp)
449	std	$t5,`$FRAME+104`($sp)
450	std	$t6,`$FRAME+112`($sp)
451	std	$t7,`$FRAME+120`($sp)
452	ld	$t0,`$FRAME+0`($sp)
453	ld	$t1,`$FRAME+8`($sp)
454	ld	$t2,`$FRAME+16`($sp)
455	ld	$t3,`$FRAME+24`($sp)
456	ld	$t4,`$FRAME+32`($sp)
457	ld	$t5,`$FRAME+40`($sp)
458	ld	$t6,`$FRAME+48`($sp)
459	ld	$t7,`$FRAME+56`($sp)
460	lfd	$A0,`$FRAME+64`($sp)
461	lfd	$A1,`$FRAME+72`($sp)
462	lfd	$A2,`$FRAME+80`($sp)
463	lfd	$A3,`$FRAME+88`($sp)
464	lfd	$N0,`$FRAME+96`($sp)
465	lfd	$N1,`$FRAME+104`($sp)
466	lfd	$N2,`$FRAME+112`($sp)
467	lfd	$N3,`$FRAME+120`($sp)
468	fcfid	$A0,$A0
469	fcfid	$A1,$A1
470	fcfid	$A2,$A2
471	fcfid	$A3,$A3
472	fcfid	$N0,$N0
473	fcfid	$N1,$N1
474	fcfid	$N2,$N2
475	fcfid	$N3,$N3
476	addi	$ap,$ap,16
477	addi	$np,$np,16
478
479	fmul	$T1a,$A1,$ba
480	fmul	$T1b,$A1,$bb
481	fmul	$T2a,$A2,$ba
482	fmul	$T2b,$A2,$bb
483	stfd	$A0,8($nap_d)		; save a[j] in double format
484	stfd	$A1,16($nap_d)
485	fmul	$T3a,$A3,$ba
486	fmul	$T3b,$A3,$bb
487	fmadd	$T0a,$A0,$ba,$dota
488	fmadd	$T0b,$A0,$bb,$dotb
489	stfd	$A2,24($nap_d)		; save a[j+1] in double format
490	stfd	$A3,32($nap_d)
491
492	fmadd	$T1a,$A0,$bc,$T1a
493	fmadd	$T1b,$A0,$bd,$T1b
494	fmadd	$T2a,$A1,$bc,$T2a
495	fmadd	$T2b,$A1,$bd,$T2b
496	stfd	$N0,40($nap_d)		; save n[j] in double format
497	stfd	$N1,48($nap_d)
498	fmadd	$T3a,$A2,$bc,$T3a
499	fmadd	$T3b,$A2,$bd,$T3b
500	 add	$t0,$t0,$carry		; can not overflow
501	fmul	$dota,$A3,$bc
502	fmul	$dotb,$A3,$bd
503	stfd	$N2,56($nap_d)		; save n[j+1] in double format
504	stfdu	$N3,64($nap_d)
505	 srdi	$carry,$t0,16
506	 add	$t1,$t1,$carry
507	 srdi	$carry,$t1,16
508
509	fmadd	$T1a,$N1,$na,$T1a
510	fmadd	$T1b,$N1,$nb,$T1b
511	 insrdi	$t0,$t1,16,32
512	fmadd	$T2a,$N2,$na,$T2a
513	fmadd	$T2b,$N2,$nb,$T2b
514	 add	$t2,$t2,$carry
515	fmadd	$T3a,$N3,$na,$T3a
516	fmadd	$T3b,$N3,$nb,$T3b
517	 srdi	$carry,$t2,16
518	fmadd	$T0a,$N0,$na,$T0a
519	fmadd	$T0b,$N0,$nb,$T0b
520	 insrdi	$t0,$t2,16,16
521	 add	$t3,$t3,$carry
522	 srdi	$carry,$t3,16
523
524	fmadd	$T1a,$N0,$nc,$T1a
525	fmadd	$T1b,$N0,$nd,$T1b
526	 insrdi	$t0,$t3,16,0		; 0..63 bits
527	fmadd	$T2a,$N1,$nc,$T2a
528	fmadd	$T2b,$N1,$nd,$T2b
529	 add	$t4,$t4,$carry
530	fmadd	$T3a,$N2,$nc,$T3a
531	fmadd	$T3b,$N2,$nd,$T3b
532	 srdi	$carry,$t4,16
533	fmadd	$dota,$N3,$nc,$dota
534	fmadd	$dotb,$N3,$nd,$dotb
535	 add	$t5,$t5,$carry
536	 srdi	$carry,$t5,16
537	 insrdi	$t4,$t5,16,32
538
539	fctid	$T0a,$T0a
540	fctid	$T0b,$T0b
541	 add	$t6,$t6,$carry
542	fctid	$T1a,$T1a
543	fctid	$T1b,$T1b
544	 srdi	$carry,$t6,16
545	fctid	$T2a,$T2a
546	fctid	$T2b,$T2b
547	 insrdi	$t4,$t6,16,16
548	fctid	$T3a,$T3a
549	fctid	$T3b,$T3b
550	 add	$t7,$t7,$carry
551	 insrdi	$t4,$t7,16,0		; 64..127 bits
552	 srdi	$carry,$t7,16		; upper 33 bits
553
554	stfd	$T0a,`$FRAME+0`($sp)
555	stfd	$T0b,`$FRAME+8`($sp)
556	stfd	$T1a,`$FRAME+16`($sp)
557	stfd	$T1b,`$FRAME+24`($sp)
558	stfd	$T2a,`$FRAME+32`($sp)
559	stfd	$T2b,`$FRAME+40`($sp)
560	stfd	$T3a,`$FRAME+48`($sp)
561	stfd	$T3b,`$FRAME+56`($sp)
562	 std	$t0,8($tp)		; tp[j-1]
563	 stdu	$t4,16($tp)		; tp[j]
564	bdnz-	L1st
565
566	fctid	$dota,$dota
567	fctid	$dotb,$dotb
568
569	ld	$t0,`$FRAME+0`($sp)
570	ld	$t1,`$FRAME+8`($sp)
571	ld	$t2,`$FRAME+16`($sp)
572	ld	$t3,`$FRAME+24`($sp)
573	ld	$t4,`$FRAME+32`($sp)
574	ld	$t5,`$FRAME+40`($sp)
575	ld	$t6,`$FRAME+48`($sp)
576	ld	$t7,`$FRAME+56`($sp)
577	stfd	$dota,`$FRAME+64`($sp)
578	stfd	$dotb,`$FRAME+72`($sp)
579
580	add	$t0,$t0,$carry		; can not overflow
581	srdi	$carry,$t0,16
582	add	$t1,$t1,$carry
583	srdi	$carry,$t1,16
584	insrdi	$t0,$t1,16,32
585	add	$t2,$t2,$carry
586	srdi	$carry,$t2,16
587	insrdi	$t0,$t2,16,16
588	add	$t3,$t3,$carry
589	srdi	$carry,$t3,16
590	insrdi	$t0,$t3,16,0		; 0..63 bits
591	add	$t4,$t4,$carry
592	srdi	$carry,$t4,16
593	add	$t5,$t5,$carry
594	srdi	$carry,$t5,16
595	insrdi	$t4,$t5,16,32
596	add	$t6,$t6,$carry
597	srdi	$carry,$t6,16
598	insrdi	$t4,$t6,16,16
599	add	$t7,$t7,$carry
600	insrdi	$t4,$t7,16,0		; 64..127 bits
601	srdi	$carry,$t7,16		; upper 33 bits
602	ld	$t6,`$FRAME+64`($sp)
603	ld	$t7,`$FRAME+72`($sp)
604
605	std	$t0,8($tp)		; tp[j-1]
606	stdu	$t4,16($tp)		; tp[j]
607
608	add	$t6,$t6,$carry		; can not overflow
609	srdi	$carry,$t6,16
610	add	$t7,$t7,$carry
611	insrdi	$t6,$t7,48,0
612	srdi	$ovf,$t7,48
613	std	$t6,8($tp)		; tp[num-1]
614
615	slwi	$t7,$num,2
616	subf	$nap_d,$t7,$nap_d	; rewind pointer
617
618	li	$i,8			; i=1
619.align	5
620Louter:
621___
622$code.=<<___ if ($SIZE_T==8);
623	ldx	$t3,$bp,$i	; bp[i]
624___
625$code.=<<___ if ($SIZE_T==4);
626	add	$t0,$bp,$i
627	lwz	$t3,0($t0)		; bp[i,i+1]
628	lwz	$t0,4($t0)
629	insrdi	$t3,$t0,32,0
630___
631$code.=<<___;
632	ld	$t6,`$FRAME+$TRANSFER+8`($sp)	; tp[0]
633	mulld	$t7,$a0,$t3	; ap[0]*bp[i]
634
635	addi	$tp,$sp,`$FRAME+$TRANSFER`
636	add	$t7,$t7,$t6	; ap[0]*bp[i]+tp[0]
637	li	$carry,0
638	mulld	$t7,$t7,$n0	; tp[0]*n0
639	mtctr	$j
640
641	; transfer bp[i] to FPU as 4x16-bit values
642	extrdi	$t0,$t3,16,48
643	extrdi	$t1,$t3,16,32
644	extrdi	$t2,$t3,16,16
645	extrdi	$t3,$t3,16,0
646	std	$t0,`$FRAME+0`($sp)
647	std	$t1,`$FRAME+8`($sp)
648	std	$t2,`$FRAME+16`($sp)
649	std	$t3,`$FRAME+24`($sp)
650	; transfer (ap[0]*bp[i]+tp[0])*n0 to FPU as 4x16-bit values
651	extrdi	$t4,$t7,16,48
652	extrdi	$t5,$t7,16,32
653	extrdi	$t6,$t7,16,16
654	extrdi	$t7,$t7,16,0
655	std	$t4,`$FRAME+32`($sp)
656	std	$t5,`$FRAME+40`($sp)
657	std	$t6,`$FRAME+48`($sp)
658	std	$t7,`$FRAME+56`($sp)
659
660	lfd	$A0,8($nap_d)		; load a[j] in double format
661	lfd	$A1,16($nap_d)
662	lfd	$A2,24($nap_d)		; load a[j+1] in double format
663	lfd	$A3,32($nap_d)
664	lfd	$N0,40($nap_d)		; load n[j] in double format
665	lfd	$N1,48($nap_d)
666	lfd	$N2,56($nap_d)		; load n[j+1] in double format
667	lfdu	$N3,64($nap_d)
668
669	lfd	$ba,`$FRAME+0`($sp)
670	lfd	$bb,`$FRAME+8`($sp)
671	lfd	$bc,`$FRAME+16`($sp)
672	lfd	$bd,`$FRAME+24`($sp)
673	lfd	$na,`$FRAME+32`($sp)
674	lfd	$nb,`$FRAME+40`($sp)
675	lfd	$nc,`$FRAME+48`($sp)
676	lfd	$nd,`$FRAME+56`($sp)
677
678	fcfid	$ba,$ba
679	fcfid	$bb,$bb
680	fcfid	$bc,$bc
681	fcfid	$bd,$bd
682	fcfid	$na,$na
683	fcfid	$nb,$nb
684	fcfid	$nc,$nc
685	fcfid	$nd,$nd
686
687	fmul	$T1a,$A1,$ba
688	fmul	$T1b,$A1,$bb
689	fmul	$T2a,$A2,$ba
690	fmul	$T2b,$A2,$bb
691	fmul	$T3a,$A3,$ba
692	fmul	$T3b,$A3,$bb
693	fmul	$T0a,$A0,$ba
694	fmul	$T0b,$A0,$bb
695
696	fmadd	$T1a,$A0,$bc,$T1a
697	fmadd	$T1b,$A0,$bd,$T1b
698	fmadd	$T2a,$A1,$bc,$T2a
699	fmadd	$T2b,$A1,$bd,$T2b
700	fmadd	$T3a,$A2,$bc,$T3a
701	fmadd	$T3b,$A2,$bd,$T3b
702	fmul	$dota,$A3,$bc
703	fmul	$dotb,$A3,$bd
704
705	fmadd	$T1a,$N1,$na,$T1a
706	fmadd	$T1b,$N1,$nb,$T1b
707	 lfd	$A0,8($nap_d)		; load a[j] in double format
708	 lfd	$A1,16($nap_d)
709	fmadd	$T2a,$N2,$na,$T2a
710	fmadd	$T2b,$N2,$nb,$T2b
711	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
712	 lfd	$A3,32($nap_d)
713	fmadd	$T3a,$N3,$na,$T3a
714	fmadd	$T3b,$N3,$nb,$T3b
715	fmadd	$T0a,$N0,$na,$T0a
716	fmadd	$T0b,$N0,$nb,$T0b
717
718	fmadd	$T1a,$N0,$nc,$T1a
719	fmadd	$T1b,$N0,$nd,$T1b
720	fmadd	$T2a,$N1,$nc,$T2a
721	fmadd	$T2b,$N1,$nd,$T2b
722	fmadd	$T3a,$N2,$nc,$T3a
723	fmadd	$T3b,$N2,$nd,$T3b
724	fmadd	$dota,$N3,$nc,$dota
725	fmadd	$dotb,$N3,$nd,$dotb
726
727	fctid	$T0a,$T0a
728	fctid	$T0b,$T0b
729	fctid	$T1a,$T1a
730	fctid	$T1b,$T1b
731	fctid	$T2a,$T2a
732	fctid	$T2b,$T2b
733	fctid	$T3a,$T3a
734	fctid	$T3b,$T3b
735
736	stfd	$T0a,`$FRAME+0`($sp)
737	stfd	$T0b,`$FRAME+8`($sp)
738	stfd	$T1a,`$FRAME+16`($sp)
739	stfd	$T1b,`$FRAME+24`($sp)
740	stfd	$T2a,`$FRAME+32`($sp)
741	stfd	$T2b,`$FRAME+40`($sp)
742	stfd	$T3a,`$FRAME+48`($sp)
743	stfd	$T3b,`$FRAME+56`($sp)
744
745.align	5
746Linner:
747	fmul	$T1a,$A1,$ba
748	fmul	$T1b,$A1,$bb
749	fmul	$T2a,$A2,$ba
750	fmul	$T2b,$A2,$bb
751	lfd	$N0,40($nap_d)		; load n[j] in double format
752	lfd	$N1,48($nap_d)
753	fmul	$T3a,$A3,$ba
754	fmul	$T3b,$A3,$bb
755	fmadd	$T0a,$A0,$ba,$dota
756	fmadd	$T0b,$A0,$bb,$dotb
757	lfd	$N2,56($nap_d)		; load n[j+1] in double format
758	lfdu	$N3,64($nap_d)
759
760	fmadd	$T1a,$A0,$bc,$T1a
761	fmadd	$T1b,$A0,$bd,$T1b
762	fmadd	$T2a,$A1,$bc,$T2a
763	fmadd	$T2b,$A1,$bd,$T2b
764	 lfd	$A0,8($nap_d)		; load a[j] in double format
765	 lfd	$A1,16($nap_d)
766	fmadd	$T3a,$A2,$bc,$T3a
767	fmadd	$T3b,$A2,$bd,$T3b
768	fmul	$dota,$A3,$bc
769	fmul	$dotb,$A3,$bd
770	 lfd	$A2,24($nap_d)		; load a[j+1] in double format
771	 lfd	$A3,32($nap_d)
772
773	fmadd	$T1a,$N1,$na,$T1a
774	fmadd	$T1b,$N1,$nb,$T1b
775	 ld	$t0,`$FRAME+0`($sp)
776	 ld	$t1,`$FRAME+8`($sp)
777	fmadd	$T2a,$N2,$na,$T2a
778	fmadd	$T2b,$N2,$nb,$T2b
779	 ld	$t2,`$FRAME+16`($sp)
780	 ld	$t3,`$FRAME+24`($sp)
781	fmadd	$T3a,$N3,$na,$T3a
782	fmadd	$T3b,$N3,$nb,$T3b
783	 add	$t0,$t0,$carry		; can not overflow
784	 ld	$t4,`$FRAME+32`($sp)
785	 ld	$t5,`$FRAME+40`($sp)
786	fmadd	$T0a,$N0,$na,$T0a
787	fmadd	$T0b,$N0,$nb,$T0b
788	 srdi	$carry,$t0,16
789	 add	$t1,$t1,$carry
790	 srdi	$carry,$t1,16
791	 ld	$t6,`$FRAME+48`($sp)
792	 ld	$t7,`$FRAME+56`($sp)
793
794	fmadd	$T1a,$N0,$nc,$T1a
795	fmadd	$T1b,$N0,$nd,$T1b
796	 insrdi	$t0,$t1,16,32
797	 ld	$t1,8($tp)		; tp[j]
798	fmadd	$T2a,$N1,$nc,$T2a
799	fmadd	$T2b,$N1,$nd,$T2b
800	 add	$t2,$t2,$carry
801	fmadd	$T3a,$N2,$nc,$T3a
802	fmadd	$T3b,$N2,$nd,$T3b
803	 srdi	$carry,$t2,16
804	 insrdi	$t0,$t2,16,16
805	fmadd	$dota,$N3,$nc,$dota
806	fmadd	$dotb,$N3,$nd,$dotb
807	 add	$t3,$t3,$carry
808	 ldu	$t2,16($tp)		; tp[j+1]
809	 srdi	$carry,$t3,16
810	 insrdi	$t0,$t3,16,0		; 0..63 bits
811	 add	$t4,$t4,$carry
812
813	fctid	$T0a,$T0a
814	fctid	$T0b,$T0b
815	 srdi	$carry,$t4,16
816	fctid	$T1a,$T1a
817	fctid	$T1b,$T1b
818	 add	$t5,$t5,$carry
819	fctid	$T2a,$T2a
820	fctid	$T2b,$T2b
821	 srdi	$carry,$t5,16
822	 insrdi	$t4,$t5,16,32
823	fctid	$T3a,$T3a
824	fctid	$T3b,$T3b
825	 add	$t6,$t6,$carry
826	 srdi	$carry,$t6,16
827	 insrdi	$t4,$t6,16,16
828
829	stfd	$T0a,`$FRAME+0`($sp)
830	stfd	$T0b,`$FRAME+8`($sp)
831	 add	$t7,$t7,$carry
832	 addc	$t3,$t0,$t1
833___
834$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
835	extrdi	$t0,$t0,32,0
836	extrdi	$t1,$t1,32,0
837	adde	$t0,$t0,$t1
838___
839$code.=<<___;
840	stfd	$T1a,`$FRAME+16`($sp)
841	stfd	$T1b,`$FRAME+24`($sp)
842	 insrdi	$t4,$t7,16,0		; 64..127 bits
843	 srdi	$carry,$t7,16		; upper 33 bits
844	stfd	$T2a,`$FRAME+32`($sp)
845	stfd	$T2b,`$FRAME+40`($sp)
846	 adde	$t5,$t4,$t2
847___
848$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
849	extrdi	$t4,$t4,32,0
850	extrdi	$t2,$t2,32,0
851	adde	$t4,$t4,$t2
852___
853$code.=<<___;
854	stfd	$T3a,`$FRAME+48`($sp)
855	stfd	$T3b,`$FRAME+56`($sp)
856	 addze	$carry,$carry
857	 std	$t3,-16($tp)		; tp[j-1]
858	 std	$t5,-8($tp)		; tp[j]
859	bdnz-	Linner
860
861	fctid	$dota,$dota
862	fctid	$dotb,$dotb
863	ld	$t0,`$FRAME+0`($sp)
864	ld	$t1,`$FRAME+8`($sp)
865	ld	$t2,`$FRAME+16`($sp)
866	ld	$t3,`$FRAME+24`($sp)
867	ld	$t4,`$FRAME+32`($sp)
868	ld	$t5,`$FRAME+40`($sp)
869	ld	$t6,`$FRAME+48`($sp)
870	ld	$t7,`$FRAME+56`($sp)
871	stfd	$dota,`$FRAME+64`($sp)
872	stfd	$dotb,`$FRAME+72`($sp)
873
874	add	$t0,$t0,$carry		; can not overflow
875	srdi	$carry,$t0,16
876	add	$t1,$t1,$carry
877	srdi	$carry,$t1,16
878	insrdi	$t0,$t1,16,32
879	add	$t2,$t2,$carry
880	ld	$t1,8($tp)		; tp[j]
881	srdi	$carry,$t2,16
882	insrdi	$t0,$t2,16,16
883	add	$t3,$t3,$carry
884	ldu	$t2,16($tp)		; tp[j+1]
885	srdi	$carry,$t3,16
886	insrdi	$t0,$t3,16,0		; 0..63 bits
887	add	$t4,$t4,$carry
888	srdi	$carry,$t4,16
889	add	$t5,$t5,$carry
890	srdi	$carry,$t5,16
891	insrdi	$t4,$t5,16,32
892	add	$t6,$t6,$carry
893	srdi	$carry,$t6,16
894	insrdi	$t4,$t6,16,16
895	add	$t7,$t7,$carry
896	insrdi	$t4,$t7,16,0		; 64..127 bits
897	srdi	$carry,$t7,16		; upper 33 bits
898	ld	$t6,`$FRAME+64`($sp)
899	ld	$t7,`$FRAME+72`($sp)
900
901	addc	$t3,$t0,$t1
902___
903$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
904	extrdi	$t0,$t0,32,0
905	extrdi	$t1,$t1,32,0
906	adde	$t0,$t0,$t1
907___
908$code.=<<___;
909	adde	$t5,$t4,$t2
910___
911$code.=<<___ if ($SIZE_T==4);		# adjust XER[CA]
912	extrdi	$t4,$t4,32,0
913	extrdi	$t2,$t2,32,0
914	adde	$t4,$t4,$t2
915___
916$code.=<<___;
917	addze	$carry,$carry
918
919	std	$t3,-16($tp)		; tp[j-1]
920	std	$t5,-8($tp)		; tp[j]
921
922	add	$carry,$carry,$ovf	; comsume upmost overflow
923	add	$t6,$t6,$carry		; can not overflow
924	srdi	$carry,$t6,16
925	add	$t7,$t7,$carry
926	insrdi	$t6,$t7,48,0
927	srdi	$ovf,$t7,48
928	std	$t6,0($tp)		; tp[num-1]
929
930	slwi	$t7,$num,2
931	addi	$i,$i,8
932	subf	$nap_d,$t7,$nap_d	; rewind pointer
933	cmpw	$i,$num
934	blt-	Louter
935___
936
937$code.=<<___ if ($SIZE_T==8);
938	subf	$np,$num,$np	; rewind np
939	addi	$j,$j,1		; restore counter
940	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
941	addi	$tp,$sp,`$FRAME+$TRANSFER+8`
942	addi	$t4,$sp,`$FRAME+$TRANSFER+16`
943	addi	$t5,$np,8
944	addi	$t6,$rp,8
945	mtctr	$j
946
947.align	4
948Lsub:	ldx	$t0,$tp,$i
949	ldx	$t1,$np,$i
950	ldx	$t2,$t4,$i
951	ldx	$t3,$t5,$i
952	subfe	$t0,$t1,$t0	; tp[j]-np[j]
953	subfe	$t2,$t3,$t2	; tp[j+1]-np[j+1]
954	stdx	$t0,$rp,$i
955	stdx	$t2,$t6,$i
956	addi	$i,$i,16
957	bdnz-	Lsub
958
959	li	$i,0
960	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
961	and	$ap,$tp,$ovf
962	andc	$np,$rp,$ovf
963	or	$ap,$ap,$np	; ap=borrow?tp:rp
964	addi	$t7,$ap,8
965	mtctr	$j
966
967.align	4
968Lcopy:				; copy or in-place refresh
969	ldx	$t0,$ap,$i
970	ldx	$t1,$t7,$i
971	std	$i,8($nap_d)	; zap nap_d
972	std	$i,16($nap_d)
973	std	$i,24($nap_d)
974	std	$i,32($nap_d)
975	std	$i,40($nap_d)
976	std	$i,48($nap_d)
977	std	$i,56($nap_d)
978	stdu	$i,64($nap_d)
979	stdx	$t0,$rp,$i
980	stdx	$t1,$t6,$i
981	stdx	$i,$tp,$i	; zap tp at once
982	stdx	$i,$t4,$i
983	addi	$i,$i,16
984	bdnz-	Lcopy
985___
986$code.=<<___ if ($SIZE_T==4);
987	subf	$np,$num,$np	; rewind np
988	addi	$j,$j,1		; restore counter
989	subfc	$i,$i,$i	; j=0 and "clear" XER[CA]
990	addi	$tp,$sp,`$FRAME+$TRANSFER`
991	addi	$np,$np,-4
992	addi	$rp,$rp,-4
993	addi	$ap,$sp,`$FRAME+$TRANSFER+4`
994	mtctr	$j
995
996.align	4
997Lsub:	ld	$t0,8($tp)	; load tp[j..j+3] in 64-bit word order
998	ldu	$t2,16($tp)
999	lwz	$t4,4($np)	; load np[j..j+3] in 32-bit word order
1000	lwz	$t5,8($np)
1001	lwz	$t6,12($np)
1002	lwzu	$t7,16($np)
1003	extrdi	$t1,$t0,32,0
1004	extrdi	$t3,$t2,32,0
1005	subfe	$t4,$t4,$t0	; tp[j]-np[j]
1006	 stw	$t0,4($ap)	; save tp[j..j+3] in 32-bit word order
1007	subfe	$t5,$t5,$t1	; tp[j+1]-np[j+1]
1008	 stw	$t1,8($ap)
1009	subfe	$t6,$t6,$t2	; tp[j+2]-np[j+2]
1010	 stw	$t2,12($ap)
1011	subfe	$t7,$t7,$t3	; tp[j+3]-np[j+3]
1012	 stwu	$t3,16($ap)
1013	stw	$t4,4($rp)
1014	stw	$t5,8($rp)
1015	stw	$t6,12($rp)
1016	stwu	$t7,16($rp)
1017	bdnz-	Lsub
1018
1019	li	$i,0
1020	subfe	$ovf,$i,$ovf	; handle upmost overflow bit
1021	addi	$tp,$sp,`$FRAME+$TRANSFER+4`
1022	subf	$rp,$num,$rp	; rewind rp
1023	and	$ap,$tp,$ovf
1024	andc	$np,$rp,$ovf
1025	or	$ap,$ap,$np	; ap=borrow?tp:rp
1026	addi	$tp,$sp,`$FRAME+$TRANSFER`
1027	mtctr	$j
1028
1029.align	4
1030Lcopy:				; copy or in-place refresh
1031	lwz	$t0,4($ap)
1032	lwz	$t1,8($ap)
1033	lwz	$t2,12($ap)
1034	lwzu	$t3,16($ap)
1035	std	$i,8($nap_d)	; zap nap_d
1036	std	$i,16($nap_d)
1037	std	$i,24($nap_d)
1038	std	$i,32($nap_d)
1039	std	$i,40($nap_d)
1040	std	$i,48($nap_d)
1041	std	$i,56($nap_d)
1042	stdu	$i,64($nap_d)
1043	stw	$t0,4($rp)
1044	stw	$t1,8($rp)
1045	stw	$t2,12($rp)
1046	stwu	$t3,16($rp)
1047	std	$i,8($tp)	; zap tp at once
1048	stdu	$i,16($tp)
1049	bdnz-	Lcopy
1050___
1051
1052$code.=<<___;
1053	$POP	$i,0($sp)
1054	li	r3,1	; signal "handled"
1055	$POP	r22,`-12*8-10*$SIZE_T`($i)
1056	$POP	r23,`-12*8-9*$SIZE_T`($i)
1057	$POP	r24,`-12*8-8*$SIZE_T`($i)
1058	$POP	r25,`-12*8-7*$SIZE_T`($i)
1059	$POP	r26,`-12*8-6*$SIZE_T`($i)
1060	$POP	r27,`-12*8-5*$SIZE_T`($i)
1061	$POP	r28,`-12*8-4*$SIZE_T`($i)
1062	$POP	r29,`-12*8-3*$SIZE_T`($i)
1063	$POP	r30,`-12*8-2*$SIZE_T`($i)
1064	$POP	r31,`-12*8-1*$SIZE_T`($i)
1065	lfd	f20,`-12*8`($i)
1066	lfd	f21,`-11*8`($i)
1067	lfd	f22,`-10*8`($i)
1068	lfd	f23,`-9*8`($i)
1069	lfd	f24,`-8*8`($i)
1070	lfd	f25,`-7*8`($i)
1071	lfd	f26,`-6*8`($i)
1072	lfd	f27,`-5*8`($i)
1073	lfd	f28,`-4*8`($i)
1074	lfd	f29,`-3*8`($i)
1075	lfd	f30,`-2*8`($i)
1076	lfd	f31,`-1*8`($i)
1077	mr	$sp,$i
1078	blr
1079	.long	0
1080	.byte	0,12,4,0,0x8c,10,6,0
1081	.long	0
1082
1083.asciz  "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
1084___
1085
1086$code =~ s/\`([^\`]*)\`/eval $1/gem;
1087print $code;
1088close STDOUT;
1089