• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1;
2; PA-RISC 2.0 implementation of bn_asm code, based on the
3; 64-bit version of the code.  This code is effectively the
4; same as the 64-bit version except the register model is
5; slightly different given all values must be 32-bit between
6; function calls.  Thus the 64-bit return values are returned
7; in %ret0 and %ret1 vs just %ret0 as is done in 64-bit
8;
9;
10; This code is approximately 2x faster than the C version
11; for RSA/DSA.
12;
13; See http://devresource.hp.com/  for more details on the PA-RISC
14; architecture.  Also see the book "PA-RISC 2.0 Architecture"
15; by Gerry Kane for information on the instruction set architecture.
16;
17; Code written by Chris Ruemmler (with some help from the HP C
18; compiler).
19;
20; The code compiles with HP's assembler
21;
22
; Assembler setup for the code subspace that holds every routine below.
; NOTE(review): ".level 2.0N" is presumably the PA-RISC 2.0 narrow (32-bit
; runtime) mode that the file header describes -- confirm against the HP
; assembler manual.
23	.level	2.0N
24	.space	$TEXT$
25	.subspa	$CODE$,QUAD=0,ALIGN=8,ACCESS=0x2c,CODE_ONLY
26
27;
28; Global Register definitions used for the routines.
29;
30; Some information about HP's runtime architecture for 32-bits.
31;
32; "Caller save" means the calling function must save the register
33; if it wants the register to be preserved.
34; "Callee save" means if a function uses the register, it must save
35; the value before using it.
36;
37; For the floating point registers
38;
39;    "caller save" registers: fr4-fr11, fr22-fr31
40;    "callee save" registers: fr12-fr21
41;    "special" registers: fr0-fr3 (status and exception registers)
42;
43; For the integer registers
44;     value zero             :  r0
45;     "caller save" registers: r1,r19-r26
46;     "callee save" registers: r3-r18
47;     return register        :  r2  (rp)
48;     return values          ; r28,r29  (ret0,ret1)
49;     Stack pointer          ; r30  (sp)
50;     millicode return ptr   ; r31  (also a caller save register)
51
52
53;
54; Arguments to the routines
55;
; Aliases for the incoming argument registers.  %r24 is named twice: it is
; b_ptr in bn_add_words/bn_sub_words and num in the multiply/square routines.
56r_ptr       .reg %r26    ; arg0: result pointer
57a_ptr       .reg %r25    ; arg1: first source pointer
58b_ptr       .reg %r24    ; arg2: second source pointer (add/sub routines)
59num         .reg %r24    ; arg2: word count (mul_add/mul/sqr routines)
60n           .reg %r23    ; arg3: word count (add/sub routines)
61
62;
63; Note that the "w" argument for bn_mul_add_words and bn_mul_words
64; is passed on the stack at a delta of -56 from the top of stack
65; as the routine is entered.
66;
67
68;
69; Globals used in some routines
70;
71
; top_overflow shares %r23 with n; the multiply routines load it with 1 << 32.
72top_overflow .reg %r23
73high_mask    .reg %r22    ; value 0xffffffff80000000L
74
75
76;------------------------------------------------------------------------------
77;
78; bn_mul_add_words
79;
80;BN_ULONG bn_mul_add_words(BN_ULONG *r_ptr, BN_ULONG *a_ptr,
81;								int num, BN_ULONG w)
82;
83; arg0 = r_ptr
84; arg1 = a_ptr
85; arg2 = num
86; -56(sp) =  w
87;
88; Local register definitions
89;
90
; Floating point registers that receive the 64-bit XMPYU partial products.
; The "_1" names belong to the second word of the 2x unrolled loops.
91fm1          .reg %fr22
92fm           .reg %fr23
93ht_temp      .reg %fr24
94ht_temp_1    .reg %fr25
95lt_temp      .reg %fr26
96lt_temp_1    .reg %fr27
97fm1_1        .reg %fr28
98fm_1         .reg %fr29
99
; fw holds the 64-bit multiplier w; fw_h/fw_l are its upper/lower 32 bits.
100fw_h         .reg %fr7L
101fw_l         .reg %fr7R
102fw           .reg %fr7
103
; t_float_0 holds the current 64-bit source word; fht_0/flt_0 are its halves.
104fht_0        .reg %fr8L
105flt_0        .reg %fr8R
106t_float_0    .reg %fr8
107
; t_float_1 holds the second source word of an unrolled iteration.
108fht_1        .reg %fr9L
109flt_1        .reg %fr9R
110t_float_1    .reg %fr9
111
; Integer work registers.  %r3-%r9 fall in the callee-save range listed in
; the register summary at the top of this file, so each routine spills the
; ones it uses before touching them; the others are caller-save.
112tmp_0        .reg %r31
113tmp_1        .reg %r21
114m_0          .reg %r20
115m_1          .reg %r19
116ht_0         .reg %r1
117ht_1         .reg %r3
118lt_0         .reg %r4
119lt_1         .reg %r5
120m1_0         .reg %r6
121m1_1         .reg %r7
122rp_val       .reg %r8
123rp_val_1     .reg %r9
124
; bn_mul_add_words: for each of the num 64-bit words, rp[i] += ap[i] * w,
; carrying a 64-bit value from word to word in %ret1.  Every 64x64 product
; is assembled from four 32x32 XMPYU partial products, which are moved from
; the FP registers to the integer registers through scratch slots at
; -8..-64 below the bumped stack pointer.  On exit the upper 32 bits of the
; final carry are copied into %ret0; %ret1 keeps the full value.
125bn_mul_add_words
126	.export	bn_mul_add_words,entry,NO_RELOCATION,LONG_RETURN
127	.proc
128	.callinfo frame=128
129    .entry
130	.align 64
131
132    STD     %r3,0(%sp)          ; save r3
133    STD     %r4,8(%sp)          ; save r4
134	NOP                         ; Needed to make the loop 16-byte aligned
135	NOP                         ; needed to make the loop 16-byte aligned
136
137    STD     %r5,16(%sp)         ; save r5
138	NOP
139    STD     %r6,24(%sp)         ; save r6
140    STD     %r7,32(%sp)         ; save r7
141
142    STD     %r8,40(%sp)         ; save r8
143    STD     %r9,48(%sp)         ; save r9
144    COPY    %r0,%ret1           ; return 0 by default
145    DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
146
147    CMPIB,>= 0,num,bn_mul_add_words_exit  ; if (num <= 0) then exit
148	LDO     128(%sp),%sp        ; bump stack (delay slot: runs on both paths)
149
150	;
151	; The loop is unrolled twice, so if there is only 1 number
152    ; then go straight to the cleanup code.
153	;
154	CMPIB,= 1,num,bn_mul_add_words_single_top
155	FLDD    -184(%sp),fw        ; (-56-128) load up w into fw (fw_h/fw_l)
156
157	;
158	; This loop is unrolled 2 times (64-byte aligned as well)
159	;
160	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
161    ; two 32-bit multiplies can be issued per cycle.
162    ;
163bn_mul_add_words_unroll2
164
165    FLDD    0(a_ptr),t_float_0       ; load up 64-bit value (fr8) ht(L)/lt(R)
166    FLDD    8(a_ptr),t_float_1       ; load up 64-bit value (fr9) ht(L)/lt(R)
167    LDD     0(r_ptr),rp_val          ; rp[0]
168    LDD     8(r_ptr),rp_val_1        ; rp[1]
169
170    XMPYU   fht_0,fw_l,fm1           ; m1[0] = fht_0*fw_l
171    XMPYU   fht_1,fw_l,fm1_1         ; m1[1] = fht_1*fw_l
172    FSTD    fm1,-16(%sp)             ; -16(sp) = m1[0]
173    FSTD    fm1_1,-48(%sp)           ; -48(sp) = m1[1]
174
175    XMPYU   flt_0,fw_h,fm            ; m[0] = flt_0*fw_h
176    XMPYU   flt_1,fw_h,fm_1          ; m[1] = flt_1*fw_h
177    FSTD    fm,-8(%sp)               ; -8(sp) = m[0]
178    FSTD    fm_1,-40(%sp)            ; -40(sp) = m[1]
179
180    XMPYU   fht_0,fw_h,ht_temp       ; ht_temp   = fht_0*fw_h
181    XMPYU   fht_1,fw_h,ht_temp_1     ; ht_temp_1 = fht_1*fw_h
182    FSTD    ht_temp,-24(%sp)         ; -24(sp)   = ht_temp
183    FSTD    ht_temp_1,-56(%sp)       ; -56(sp)   = ht_temp_1
184
185    XMPYU   flt_0,fw_l,lt_temp       ; lt_temp   = flt_0*fw_l
186    XMPYU   flt_1,fw_l,lt_temp_1     ; lt_temp_1 = flt_1*fw_l
187    FSTD    lt_temp,-32(%sp)         ; -32(sp) = lt_temp
188    FSTD    lt_temp_1,-64(%sp)       ; -64(sp) = lt_temp_1
189
190    LDD     -8(%sp),m_0              ; m[0]
191    LDD     -40(%sp),m_1             ; m[1]
192    LDD     -16(%sp),m1_0            ; m1[0]
193    LDD     -48(%sp),m1_1            ; m1[1]
194
195    LDD     -24(%sp),ht_0            ; ht[0]
196    LDD     -56(%sp),ht_1            ; ht[1]
197    ADD,L   m1_0,m_0,tmp_0           ; tmp_0 = m[0] + m1[0];
198    ADD,L   m1_1,m_1,tmp_1           ; tmp_1 = m[1] + m1[1];
199
200    LDD     -32(%sp),lt_0
201    LDD     -64(%sp),lt_1
202    CMPCLR,*>>= tmp_0,m1_0, %r0      ; if the sum wrapped (tmp_0 < m1[0])
203    ADD,L   ht_0,top_overflow,ht_0   ; ht[0] += (1<<32)
204
205    CMPCLR,*>>= tmp_1,m1_1,%r0       ; if the sum wrapped (tmp_1 < m1[1])
206    ADD,L   ht_1,top_overflow,ht_1   ; ht[1] += (1<<32)
207    EXTRD,U tmp_0,31,32,m_0          ; m[0]>>32
208    DEPD,Z  tmp_0,31,32,m1_0         ; m1[0] = m[0]<<32
209
210    EXTRD,U tmp_1,31,32,m_1          ; m[1]>>32
211    DEPD,Z  tmp_1,31,32,m1_1         ; m1[1] = m[1]<<32
212    ADD,L   ht_0,m_0,ht_0            ; ht[0]+= (m[0]>>32)
213    ADD,L   ht_1,m_1,ht_1            ; ht[1]+= (m[1]>>32)
214
215    ADD     lt_0,m1_0,lt_0           ; lt[0] = lt[0]+m1[0];
216	ADD,DC  ht_0,%r0,ht_0            ; ht[0] += carry out of the ADD above
217    ADD     lt_1,m1_1,lt_1           ; lt[1] = lt[1]+m1[1];
218    ADD,DC  ht_1,%r0,ht_1            ; ht[1] += carry
219
220    ADD    %ret1,lt_0,lt_0           ; lt[0] = lt[0] + c;
221	ADD,DC  ht_0,%r0,ht_0            ; ht[0] += carry
222    ADD     lt_0,rp_val,lt_0         ; lt[0] = lt[0]+rp[0]
223    ADD,DC  ht_0,%r0,ht_0            ; ht[0] += carry
224
225	LDO    -2(num),num               ; num = num - 2;
226    ADD     ht_0,lt_1,lt_1           ; lt[1] = lt[1] + ht_0 (c);
227    ADD,DC  ht_1,%r0,ht_1            ; ht[1] += carry
228    STD     lt_0,0(r_ptr)            ; rp[0] = lt[0]
229
230    ADD     lt_1,rp_val_1,lt_1       ; lt[1] = lt[1]+rp[1]
231    ADD,DC  ht_1,%r0,%ret1           ; c = ht[1] + carry (carry for next word)
232    LDO     16(a_ptr),a_ptr          ; a_ptr += 2
233
234    STD     lt_1,8(r_ptr)            ; rp[1] = lt[1]
235	CMPIB,<= 2,num,bn_mul_add_words_unroll2 ; go again if more to do
236    LDO     16(r_ptr),r_ptr          ; r_ptr += 2 (delay slot)
237
238    CMPIB,=,N 0,num,bn_mul_add_words_exit ; are we done, or cleanup last one
239
240	;
241	; Handle the one remaining word.  Also entered directly when num == 1.
242	;
243bn_mul_add_words_single_top
244    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8) ht(L)/lt(R)
245    LDD     0(r_ptr),rp_val           ; rp[0]
246    LDO     8(a_ptr),a_ptr            ; a_ptr++
247    XMPYU   fht_0,fw_l,fm1            ; m1 = ht*fw_l
248    FSTD    fm1,-16(%sp)              ; -16(sp) = m1
249    XMPYU   flt_0,fw_h,fm             ; m = lt*fw_h
250    FSTD    fm,-8(%sp)                ; -8(sp) = m
251    XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = ht*fw_h
252    FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht
253    XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = lt*fw_l
254    FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt
255
256    LDD     -8(%sp),m_0
257    LDD    -16(%sp),m1_0              ; m1 = temp1
258    ADD,L   m_0,m1_0,tmp_0            ; tmp_0 = m + m1;
259    LDD     -24(%sp),ht_0
260    LDD     -32(%sp),lt_0
261
262    CMPCLR,*>>= tmp_0,m1_0,%r0        ; if the sum wrapped (tmp_0 < m1)
263    ADD,L   ht_0,top_overflow,ht_0    ; ht += (1<<32)
264
265    EXTRD,U tmp_0,31,32,m_0           ; m>>32
266    DEPD,Z  tmp_0,31,32,m1_0          ; m1 = m<<32
267
268    ADD,L   ht_0,m_0,ht_0             ; ht+= (m>>32)
269    ADD     lt_0,m1_0,tmp_0           ; tmp_0 = lt+m1;
270    ADD,DC  ht_0,%r0,ht_0             ; ht += carry
271    ADD     %ret1,tmp_0,lt_0          ; lt = lt + c;
272    ADD,DC  ht_0,%r0,ht_0             ; ht += carry
273    ADD     lt_0,rp_val,lt_0          ; lt = lt+rp[0]
274    ADD,DC  ht_0,%r0,%ret1            ; c = ht + carry (value returned)
275    STD     lt_0,0(r_ptr)             ; rp[0] = lt
276
277bn_mul_add_words_exit
278    .EXIT
279
280    EXTRD,U %ret1,31,32,%ret0         ; for 32-bit, return in ret0/ret1
281    LDD     -80(%sp),%r9              ; restore r9
282    LDD     -88(%sp),%r8              ; restore r8
283    LDD     -96(%sp),%r7              ; restore r7
284    LDD     -104(%sp),%r6             ; restore r6
285    LDD     -112(%sp),%r5             ; restore r5
286    LDD     -120(%sp),%r4             ; restore r4
287    BVE     (%rp)                     ; return; the delay slot pops the frame
288    LDD,MB  -128(%sp),%r3             ; sp -= 128, then restore r3
289	.PROCEND	;in=23,24,25,26,29;out=28;
290
291;----------------------------------------------------------------------------
292;
293;BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
294;
295; arg0 = rp
296; arg1 = ap
297; arg2 = num
298; w on stack at -56(sp)
299
; bn_mul_words: for each of the num 64-bit words, rp[i] = ap[i] * w + carry,
; with the 64-bit carry held in %ret1 between words.  The structure matches
; bn_mul_add_words except that the old rp[i] is not read or added, so only
; %r3-%r7 need to be saved.  On exit the upper 32 bits of the carry are
; copied into %ret0.
300bn_mul_words
301	.proc
302	.callinfo frame=128
303    .entry
304	.EXPORT	bn_mul_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
305	.align 64
306
307    STD     %r3,0(%sp)          ; save r3
308    STD     %r4,8(%sp)          ; save r4
309	NOP
310    STD     %r5,16(%sp)         ; save r5
311
312    STD     %r6,24(%sp)         ; save r6
313    STD     %r7,32(%sp)         ; save r7
314    COPY    %r0,%ret1           ; return 0 by default
315    DEPDI,Z 1,31,1,top_overflow ; top_overflow = 1 << 32
316
317    CMPIB,>= 0,num,bn_mul_words_exit ; if (num <= 0) then exit
318	LDO     128(%sp),%sp    ; bump stack (delay slot: runs on both paths)
319
320	;
321	; See if only 1 word to do, thus just do cleanup
322	;
323	CMPIB,= 1,num,bn_mul_words_single_top
324	FLDD    -184(%sp),fw        ; (-56-128) load up w into fw (fw_h/fw_l)
325
326	;
327	; This loop is unrolled 2 times (64-byte aligned as well)
328	;
329	; PA-RISC 2.0 chips have two fully pipelined multipliers, thus
330    ; two 32-bit multiplies can be issued per cycle.
331    ;
332bn_mul_words_unroll2
333
334    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8) ht(L)/lt(R)
335    FLDD    8(a_ptr),t_float_1        ; load up 64-bit value (fr9) ht(L)/lt(R)
336    XMPYU   fht_0,fw_l,fm1            ; m1[0] = fht_0*fw_l
337    XMPYU   fht_1,fw_l,fm1_1          ; m1[1] = fht_1*fw_l
338
339    FSTD    fm1,-16(%sp)              ; -16(sp) = m1[0]
340    FSTD    fm1_1,-48(%sp)            ; -48(sp) = m1[1]
341    XMPYU   flt_0,fw_h,fm             ; m[0] = flt_0*fw_h
342    XMPYU   flt_1,fw_h,fm_1           ; m[1] = flt_1*fw_h
343
344    FSTD    fm,-8(%sp)                ; -8(sp) = m[0]
345    FSTD    fm_1,-40(%sp)             ; -40(sp) = m[1]
346    XMPYU   fht_0,fw_h,ht_temp        ; ht_temp   = fht_0*fw_h
347    XMPYU   fht_1,fw_h,ht_temp_1      ; ht_temp_1 = fht_1*fw_h
348
349    FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht[0]
350    FSTD    ht_temp_1,-56(%sp)        ; -56(sp) = ht[1]
351    XMPYU   flt_0,fw_l,lt_temp        ; lt_temp   = flt_0*fw_l
352    XMPYU   flt_1,fw_l,lt_temp_1      ; lt_temp_1 = flt_1*fw_l
353
354    FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt[0]
355    FSTD    lt_temp_1,-64(%sp)        ; -64(sp) = lt[1]
356    LDD     -8(%sp),m_0
357    LDD     -40(%sp),m_1
358
359    LDD    -16(%sp),m1_0
360    LDD    -48(%sp),m1_1
361    LDD     -24(%sp),ht_0
362    LDD     -56(%sp),ht_1
363
364    ADD,L   m1_0,m_0,tmp_0            ; tmp_0 = m[0] + m1[0];
365    ADD,L   m1_1,m_1,tmp_1            ; tmp_1 = m[1] + m1[1];
366    LDD     -32(%sp),lt_0
367    LDD     -64(%sp),lt_1
368
369    CMPCLR,*>>= tmp_0,m1_0, %r0       ; if the sum wrapped (tmp_0 < m1[0])
370    ADD,L   ht_0,top_overflow,ht_0    ; ht[0] += (1<<32)
371    CMPCLR,*>>= tmp_1,m1_1,%r0        ; if the sum wrapped (tmp_1 < m1[1])
372    ADD,L   ht_1,top_overflow,ht_1    ; ht[1] += (1<<32)
373
374    EXTRD,U tmp_0,31,32,m_0           ; m[0]>>32
375    DEPD,Z  tmp_0,31,32,m1_0          ; m1[0] = m[0]<<32
376    EXTRD,U tmp_1,31,32,m_1           ; m[1]>>32
377    DEPD,Z  tmp_1,31,32,m1_1          ; m1[1] = m[1]<<32
378
379    ADD,L   ht_0,m_0,ht_0             ; ht[0] += (m[0]>>32)
380    ADD,L   ht_1,m_1,ht_1             ; ht[1] += (m[1]>>32)
381    ADD     lt_0,m1_0,lt_0            ; lt[0] = lt[0]+m1[0];
382	ADD,DC  ht_0,%r0,ht_0             ; ht[0] += carry out of the ADD above
383
384    ADD     lt_1,m1_1,lt_1            ; lt[1] = lt[1]+m1[1];
385    ADD,DC  ht_1,%r0,ht_1             ; ht[1] += carry
386    ADD    %ret1,lt_0,lt_0            ; lt[0] = lt[0] + c (ret1);
387	ADD,DC  ht_0,%r0,ht_0             ; ht[0] += carry
388
389    ADD     ht_0,lt_1,lt_1            ; lt[1] = lt[1] + c (ht_0)
390    ADD,DC  ht_1,%r0,ht_1             ; ht[1] += carry
391    STD     lt_0,0(r_ptr)             ; rp[0] = lt[0]
392    STD     lt_1,8(r_ptr)             ; rp[1] = lt[1]
393
394	COPY    ht_1,%ret1                ; carry = ht[1]
395	LDO    -2(num),num                ; num = num - 2;
396    LDO     16(a_ptr),a_ptr           ; ap += 2
397	CMPIB,<= 2,num,bn_mul_words_unroll2
398    LDO     16(r_ptr),r_ptr           ; rp += 2 (delay slot)
399
400    CMPIB,=,N 0,num,bn_mul_words_exit ; are we done?
401
402	;
403	; Handle the one remaining word.  Also entered directly when num == 1.
404	;
405bn_mul_words_single_top
406    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8) ht(L)/lt(R)
407
408    XMPYU   fht_0,fw_l,fm1            ; m1 = ht*fw_l
409    FSTD    fm1,-16(%sp)              ; -16(sp) = m1
410    XMPYU   flt_0,fw_h,fm             ; m = lt*fw_h
411    FSTD    fm,-8(%sp)                ; -8(sp) = m
412    XMPYU   fht_0,fw_h,ht_temp        ; ht_temp = ht*fw_h
413    FSTD    ht_temp,-24(%sp)          ; -24(sp) = ht
414    XMPYU   flt_0,fw_l,lt_temp        ; lt_temp = lt*fw_l
415    FSTD    lt_temp,-32(%sp)          ; -32(sp) = lt
416
417    LDD     -8(%sp),m_0
418    LDD    -16(%sp),m1_0
419    ADD,L   m_0,m1_0,tmp_0            ; tmp_0 = m + m1;
420    LDD     -24(%sp),ht_0
421    LDD     -32(%sp),lt_0
422
423    CMPCLR,*>>= tmp_0,m1_0,%r0        ; if the sum wrapped (tmp_0 < m1)
424    ADD,L   ht_0,top_overflow,ht_0    ; ht += (1<<32)
425
426    EXTRD,U tmp_0,31,32,m_0           ; m>>32
427    DEPD,Z  tmp_0,31,32,m1_0          ; m1 = m<<32
428
429    ADD,L   ht_0,m_0,ht_0             ; ht+= (m>>32)
430    ADD     lt_0,m1_0,lt_0            ; lt= lt+m1;
431    ADD,DC  ht_0,%r0,ht_0             ; ht += carry
432
433    ADD     %ret1,lt_0,lt_0           ; lt = lt + c;
434    ADD,DC  ht_0,%r0,ht_0             ; ht += carry
435
436    COPY    ht_0,%ret1                ; copy carry
437    STD     lt_0,0(r_ptr)             ; rp[0] = lt
438
439bn_mul_words_exit
440    .EXIT
441    EXTRD,U %ret1,31,32,%ret0           ; for 32-bit, return in ret0/ret1
442    LDD     -96(%sp),%r7              ; restore r7
443    LDD     -104(%sp),%r6             ; restore r6
444    LDD     -112(%sp),%r5             ; restore r5
445    LDD     -120(%sp),%r4             ; restore r4
446    BVE     (%rp)                     ; return; the delay slot pops the frame
447    LDD,MB  -128(%sp),%r3             ; sp -= 128, then restore r3
448	.PROCEND
449
450;----------------------------------------------------------------------------
451;
452;void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
453;
454; arg0 = rp
455; arg1 = ap
456; arg2 = num
457;
458
; bn_sqr_words: for each of the num 64-bit words a = ht:lt in ap[], store the
; 128-bit square as two words, rp[2i] (low) and rp[2i+1] (high).  The square
; is ht*ht<<64 + 2*(ht*lt)<<32 + lt*lt.  With m = ht*lt, the cross term
; contributes m<<33 to the low word and m>>31 to the high word; high_mask
; selects the bits of m that end up in the high word.
459bn_sqr_words
460	.proc
461	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
462	.EXPORT	bn_sqr_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
463    .entry
464	.align 64
465
466    STD     %r3,0(%sp)          ; save r3
467    STD     %r4,8(%sp)          ; save r4
468	NOP
469    STD     %r5,16(%sp)         ; save r5
470
471    CMPIB,>= 0,num,bn_sqr_words_exit ; if (num <= 0) then exit
472	LDO     128(%sp),%sp       ; bump stack (delay slot: runs on both paths)
473
474	;
475	; If only 1, then go straight to cleanup
476	;
477	CMPIB,= 1,num,bn_sqr_words_single_top
478    DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
479
480	;
481	; This loop is unrolled 2 times (64-byte aligned as well)
482	;
483
484bn_sqr_words_unroll2
485    FLDD    0(a_ptr),t_float_0        ; a[0]
486    FLDD    8(a_ptr),t_float_1        ; a[1]
487    XMPYU   fht_0,flt_0,fm            ; m[0]
488    XMPYU   fht_1,flt_1,fm_1          ; m[1]
489
490    FSTD    fm,-24(%sp)               ; store m[0]
491    FSTD    fm_1,-56(%sp)             ; store m[1]
492    XMPYU   flt_0,flt_0,lt_temp       ; lt[0]
493    XMPYU   flt_1,flt_1,lt_temp_1     ; lt[1]
494
495    FSTD    lt_temp,-16(%sp)          ; store lt[0]
496    FSTD    lt_temp_1,-48(%sp)        ; store lt[1]
497    XMPYU   fht_0,fht_0,ht_temp       ; ht[0]
498    XMPYU   fht_1,fht_1,ht_temp_1     ; ht[1]
499
500    FSTD    ht_temp,-8(%sp)           ; store ht[0]
501    FSTD    ht_temp_1,-40(%sp)        ; store ht[1]
502    LDD     -24(%sp),m_0
503    LDD     -56(%sp),m_1
504
505    AND     m_0,high_mask,tmp_0       ; m[0] & Mask
506    AND     m_1,high_mask,tmp_1       ; m[1] & Mask
507    DEPD,Z  m_0,30,31,m_0             ; m[0] << 32+1
508    DEPD,Z  m_1,30,31,m_1             ; m[1] << 32+1
509
510    LDD     -16(%sp),lt_0
511    LDD     -48(%sp),lt_1
512    EXTRD,U tmp_0,32,33,tmp_0         ; tmp_0 = m[0]&Mask >> 32-1
513    EXTRD,U tmp_1,32,33,tmp_1         ; tmp_1 = m[1]&Mask >> 32-1
514
515    LDD     -8(%sp),ht_0
516    LDD     -40(%sp),ht_1
517    ADD,L   ht_0,tmp_0,ht_0           ; ht[0] += tmp_0
518    ADD,L   ht_1,tmp_1,ht_1           ; ht[1] += tmp_1
519
520    ADD     lt_0,m_0,lt_0             ; lt[0] = lt[0]+m[0]
521    ADD,DC  ht_0,%r0,ht_0             ; ht[0] += carry
522    STD     lt_0,0(r_ptr)             ; rp[0] = lt[0]
523    STD     ht_0,8(r_ptr)             ; rp[1] = ht[0]
524
525    ADD     lt_1,m_1,lt_1             ; lt[1] = lt[1]+m[1]
526    ADD,DC  ht_1,%r0,ht_1             ; ht[1] += carry
527    STD     lt_1,16(r_ptr)            ; rp[2] = lt[1]
528    STD     ht_1,24(r_ptr)            ; rp[3] = ht[1]
529
530	LDO    -2(num),num                ; num = num - 2;
531    LDO     16(a_ptr),a_ptr           ; ap += 2
532	CMPIB,<= 2,num,bn_sqr_words_unroll2
533    LDO     32(r_ptr),r_ptr           ; rp += 4 (delay slot)
534
535    CMPIB,=,N 0,num,bn_sqr_words_exit ; are we done?
536
537	;
538	; Handle the one remaining word.  Also entered directly when num == 1.
539	;
540bn_sqr_words_single_top
541    FLDD    0(a_ptr),t_float_0        ; load up 64-bit value (fr8) ht(L)/lt(R)
542
543    XMPYU   fht_0,flt_0,fm            ; m
544    FSTD    fm,-24(%sp)               ; store m
545
546    XMPYU   flt_0,flt_0,lt_temp       ; lt
547    FSTD    lt_temp,-16(%sp)          ; store lt
548
549    XMPYU   fht_0,fht_0,ht_temp       ; ht
550    FSTD    ht_temp,-8(%sp)           ; store ht
551
552    LDD     -24(%sp),m_0              ; load m
553    AND     m_0,high_mask,tmp_0       ; m & Mask
554    DEPD,Z  m_0,30,31,m_0             ; m << 32+1
555    LDD     -16(%sp),lt_0             ; lt
556
557    LDD     -8(%sp),ht_0              ; ht
558    EXTRD,U tmp_0,32,33,tmp_0         ; tmp_0 = m&Mask >> 32-1
559    ADD     m_0,lt_0,lt_0             ; lt = lt+m
560    ADD,L   ht_0,tmp_0,ht_0           ; ht += tmp_0 (the ADD,DC below still
                                      ; consumes the carry of the ADD above)
561    ADD,DC  ht_0,%r0,ht_0             ; ht += carry
562
563    STD     lt_0,0(r_ptr)             ; rp[0] = lt
564    STD     ht_0,8(r_ptr)             ; rp[1] = ht
565
566bn_sqr_words_exit
567    .EXIT
568    LDD     -112(%sp),%r5       ; restore r5
569    LDD     -120(%sp),%r4       ; restore r4
570    BVE     (%rp)               ; return; the delay slot pops the frame
571    LDD,MB  -128(%sp),%r3       ; sp -= 128, then restore r3
572	.PROCEND	;in=23,24,25,26,29;out=28;
573
574
575;----------------------------------------------------------------------------
576;
577;BN_ULONG bn_add_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
578;
579; arg0 = rp
580; arg1 = ap
581; arg2 = bp
582; arg3 = n
583
; Scratch registers for bn_add_words (all in the caller-save range).
584t  .reg %r22    ; a[i], then a[i] + carry-in
585b  .reg %r21    ; b[i]
586l  .reg %r20    ; sum that is stored to r[i]
587
; bn_add_words: r[i] = a[i] + b[i] + c for i in [0,n), where c is the carry
; out of the previous word (0 for the first).  The carry is kept in %ret1;
; it is rebuilt for each word from the carry-outs of the two ADDs.  Uses only
; caller-save registers, so no stack frame is needed.  On exit the upper 32
; bits of %ret1 are copied into %ret0.
588bn_add_words
589	.proc
590    .entry
591	.callinfo
592	.EXPORT	bn_add_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
593	.align 64
594
595    CMPIB,>= 0,n,bn_add_words_exit ; if (n <= 0) then exit
596    COPY    %r0,%ret1           ; return 0 by default
597
598	;
599	; If 2 or more numbers do the loop
600	;
601	CMPIB,= 1,n,bn_add_words_single_top
602	NOP
603
604	;
605	; This loop is unrolled 2 times (64-byte aligned as well)
606	;
607bn_add_words_unroll2
608	LDD     0(a_ptr),t
609	LDD     0(b_ptr),b
610	ADD     t,%ret1,t                    ; t = t+c;
611	ADD,DC  %r0,%r0,%ret1                ; set c to carry
612	ADD     t,b,l                        ; l = t + b[0]
613	ADD,DC  %ret1,%r0,%ret1              ; c+= carry
614	STD     l,0(r_ptr)                   ; r[0] = l
615
616	LDD     8(a_ptr),t
617	LDD     8(b_ptr),b
618	ADD     t,%ret1,t                     ; t = t+c;
619	ADD,DC  %r0,%r0,%ret1                 ; set c to carry
620	ADD     t,b,l                         ; l = t + b[1]
621	ADD,DC  %ret1,%r0,%ret1               ; c+= carry
622	STD     l,8(r_ptr)                    ; r[1] = l
623
624	LDO     -2(n),n                       ; n = n - 2
625	LDO     16(a_ptr),a_ptr               ; a += 2
626	LDO     16(b_ptr),b_ptr               ; b += 2
627
628	CMPIB,<= 2,n,bn_add_words_unroll2
629	LDO     16(r_ptr),r_ptr               ; r += 2 (delay slot)
630
631    CMPIB,=,N 0,n,bn_add_words_exit ; are we done?
632
; Handle the one remaining word.  Also entered directly when n == 1.
633bn_add_words_single_top
634	LDD     0(a_ptr),t
635	LDD     0(b_ptr),b
636
637	ADD     t,%ret1,t                 ; t = t+c;
638	ADD,DC  %r0,%r0,%ret1             ; set c to carry (could use CMPCLR??)
639	ADD     t,b,l                     ; l = t + b[0]
640	ADD,DC  %ret1,%r0,%ret1           ; c+= carry
641	STD     l,0(r_ptr)                ; r[0] = l
642
643bn_add_words_exit
644    .EXIT
645    BVE     (%rp)                       ; return; the delay slot sets %ret0
646    EXTRD,U %ret1,31,32,%ret0           ; for 32-bit, return in ret0/ret1
647	.PROCEND	;in=23,24,25,26,29;out=28;
648
649;----------------------------------------------------------------------------
650;
651;BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
652;
653; arg0 = rp
654; arg1 = ap
655; arg2 = bp
656; arg3 = n
657
; Scratch registers for bn_sub_words (all in the caller-save range).
658t1       .reg %r22    ; a[i]
659t2       .reg %r21    ; b[i]
660sub_tmp1 .reg %r20    ; a[i] - b[i] - borrow, stored to r[i]
661sub_tmp2 .reg %r19    ; candidate borrow: 0 if a[i] > b[i], else 1
662
663
; bn_sub_words: r[i] = a[i] - b[i] - c for i in [0,n), where c (in %ret1) is
; the borrow from the previous word (0 for the first).  After each word the
; borrow becomes 0 if a[i] > b[i], 1 if a[i] < b[i], and is left unchanged
; if a[i] == b[i].  Uses only caller-save registers, so no stack frame is
; needed.  On exit the upper 32 bits of %ret1 are copied into %ret0.
664bn_sub_words
665	.proc
666	.callinfo
667	.EXPORT	bn_sub_words,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
668    .entry
669	.align 64
670
671    CMPIB,>=  0,n,bn_sub_words_exit ; if (n <= 0) then exit
672    COPY    %r0,%ret1           ; return 0 by default
673
674	;
675	; If 2 or more numbers do the loop
676	;
677	CMPIB,= 1,n,bn_sub_words_single_top
678	NOP
679
680	;
681	; This loop is unrolled 2 times (64-byte aligned as well)
682	;
683bn_sub_words_unroll2
684	LDD     0(a_ptr),t1
685	LDD     0(b_ptr),t2
686	SUB     t1,t2,sub_tmp1           ; t3 = t1-t2;
687	SUB     sub_tmp1,%ret1,sub_tmp1  ; t3 = t3- c;
688
689	CMPCLR,*>> t1,t2,sub_tmp2        ; sub_tmp2 = 0; skip next if t1 > t2
690	LDO      1(%r0),sub_tmp2         ; otherwise sub_tmp2 = 1
691
692	CMPCLR,*= t1,t2,%r0              ; if t1 == t2 keep the old borrow
693	COPY    sub_tmp2,%ret1           ; else c = sub_tmp2
694	STD     sub_tmp1,0(r_ptr)        ; r[0] = t3
695
696	LDD     8(a_ptr),t1
697	LDD     8(b_ptr),t2
698	SUB     t1,t2,sub_tmp1            ; t3 = t1-t2;
699	SUB     sub_tmp1,%ret1,sub_tmp1   ; t3 = t3- c;
700	CMPCLR,*>> t1,t2,sub_tmp2         ; sub_tmp2 = 0; skip next if t1 > t2
701	LDO      1(%r0),sub_tmp2          ; otherwise sub_tmp2 = 1
702
703	CMPCLR,*= t1,t2,%r0               ; if t1 == t2 keep the old borrow
704	COPY    sub_tmp2,%ret1            ; else c = sub_tmp2
705	STD     sub_tmp1,8(r_ptr)         ; r[1] = t3
706
707	LDO     -2(n),n                   ; n = n - 2
708	LDO     16(a_ptr),a_ptr           ; a += 2
709	LDO     16(b_ptr),b_ptr           ; b += 2
710
711	CMPIB,<= 2,n,bn_sub_words_unroll2
712	LDO     16(r_ptr),r_ptr           ; r += 2 (delay slot)
713
714    CMPIB,=,N 0,n,bn_sub_words_exit ; are we done?
715
; Handle the one remaining word.  Also entered directly when n == 1.
716bn_sub_words_single_top
717	LDD     0(a_ptr),t1
718	LDD     0(b_ptr),t2
719	SUB     t1,t2,sub_tmp1            ; t3 = t1-t2;
720	SUB     sub_tmp1,%ret1,sub_tmp1   ; t3 = t3- c;
721	CMPCLR,*>> t1,t2,sub_tmp2         ; sub_tmp2 = 0; skip next if t1 > t2
722	LDO      1(%r0),sub_tmp2          ; otherwise sub_tmp2 = 1
723
724	CMPCLR,*= t1,t2,%r0               ; if t1 == t2 keep the old borrow
725	COPY    sub_tmp2,%ret1            ; else c = sub_tmp2
726
727	STD     sub_tmp1,0(r_ptr)         ; r[0] = t3
728
729bn_sub_words_exit
730    .EXIT
731    BVE     (%rp)                       ; return; the delay slot sets %ret0
732    EXTRD,U %ret1,31,32,%ret0           ; for 32-bit, return in ret0/ret1
733	.PROCEND	;in=23,24,25,26,29;out=28;
734
735;------------------------------------------------------------------------------
736;
737; unsigned long bn_div_words(unsigned long h, unsigned long l, unsigned long d)
738;
739; arg0/arg1 (%r26 = lower, %r25 = upper 32 bits) = h
740; arg2/arg3 (%r24 = lower, %r23 = upper 32 bits) = l
741; -56(sp) on entry                               = d
742;
743; This is mainly just output from the HP C compiler.
744;
745;------------------------------------------------------------------------------
; bn_div_words: compiler-generated body (see the note above).  From the code
; below: h is rebuilt in %r4 from %r25:%r26, l in %r5 from %r23:%r24, and d is
; loaded into %r3 from the caller's stack area.  d == 0 returns all ones.
; Otherwise BN_num_bits_word(d) is called, abort() is called on the $D2 path,
; and the result is built in %r29 using the $$div2U millicode routine and
; XMPYU multiplies.  NOTE(review): presumably this is the quotient (h:l)/d
; as in the C bn_div_words -- confirm against the C source.
746bn_div_words
747	.PROC
748	.EXPORT	bn_div_words,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,RTNVAL=GR,LONG_RETURN
749	.IMPORT	BN_num_bits_word,CODE
750	;--- not PIC	.IMPORT	__iob,DATA
751	;--- not PIC	.IMPORT	fprintf,CODE
752	.IMPORT	abort,CODE
753	.IMPORT	$$div2U,MILLICODE
754	.CALLINFO CALLER,FRAME=144,ENTRY_GR=%r9,SAVE_RP,ARGS_SAVED,ORDERING_AWARE
755        .ENTRY
        ; Prologue: save rp at -20(sp), store %r3 and bump sp by 192, then
        ; save %r4-%r9.  %r5/%r6 and %r7/%r8 are packed as 32-bit halves of
        ; one doubleword each.
756        STW     %r2,-20(%r30)   ;offset 0x8ec
757        STW,MA  %r3,192(%r30)   ;offset 0x8f0
758        STW     %r4,-188(%r30)  ;offset 0x8f4
759        DEPD    %r5,31,32,%r6   ;offset 0x8f8
760        STD     %r6,-184(%r30)  ;offset 0x8fc
761        DEPD    %r7,31,32,%r8   ;offset 0x900
762        STD     %r8,-176(%r30)  ;offset 0x904
763        STW     %r9,-168(%r30)  ;offset 0x908
        ; Load d (-248 = -56 - 192) and assemble the 64-bit h and l.
764        LDD     -248(%r30),%r3  ;offset 0x90c
765        COPY    %r26,%r4        ;offset 0x910
766        COPY    %r24,%r5        ;offset 0x914
767        DEPD    %r25,31,32,%r4  ;offset 0x918
        ; d != 0: go do the division.  d == 0: return -1 (all ones).
768        CMPB,*<>        %r3,%r0,$0006000C       ;offset 0x91c
769        DEPD    %r23,31,32,%r5  ;offset 0x920
770        MOVIB,TR        -1,%r29,$00060002       ;offset 0x924
771        EXTRD,U %r29,31,32,%r28 ;offset 0x928
772$0006002A
773        LDO     -1(%r29),%r29   ;offset 0x92c
774        SUB     %r23,%r7,%r23   ;offset 0x930
775$00060024
776        SUB     %r4,%r31,%r25   ;offset 0x934
777        AND     %r25,%r19,%r26  ;offset 0x938
778        CMPB,*<>,N      %r0,%r26,$00060046      ;offset 0x93c
779        DEPD,Z  %r25,31,32,%r20 ;offset 0x940
780        OR      %r20,%r24,%r21  ;offset 0x944
781        CMPB,*<<,N      %r21,%r23,$0006002A     ;offset 0x948
782        SUB     %r31,%r2,%r31   ;offset 0x94c
783$00060046
784$0006002E
785        DEPD,Z  %r23,31,32,%r25 ;offset 0x950
786        EXTRD,U %r23,31,32,%r26 ;offset 0x954
787        AND     %r25,%r19,%r24  ;offset 0x958
788        ADD,L   %r31,%r26,%r31  ;offset 0x95c
789        CMPCLR,*>>=     %r5,%r24,%r0    ;offset 0x960
790        LDO     1(%r31),%r31    ;offset 0x964
791$00060032
792        CMPB,*<<=,N     %r31,%r4,$00060036      ;offset 0x968
793        LDO     -1(%r29),%r29   ;offset 0x96c
794        ADD,L   %r4,%r3,%r4     ;offset 0x970
795$00060036
        ; %r8 is the pass counter (set to 2 at 0xa18); at zero, finish at $D0.
796        ADDIB,=,N       -1,%r8,$D0      ;offset 0x974
797        SUB     %r5,%r24,%r28   ;offset 0x978
798$0006003A
799        SUB     %r4,%r31,%r24   ;offset 0x97c
800        SHRPD   %r24,%r28,32,%r4        ;offset 0x980
801        DEPD,Z  %r29,31,32,%r9  ;offset 0x984
802        DEPD,Z  %r28,31,32,%r5  ;offset 0x988
803$0006001C
804        EXTRD,U %r4,31,32,%r31  ;offset 0x98c
805        CMPB,*<>,N      %r31,%r2,$00060020      ;offset 0x990
806        MOVB,TR %r6,%r29,$D1    ;offset 0x994
807        STD     %r29,-152(%r30) ;offset 0x998
808$0006000C
        ; d != 0: call BN_num_bits_word(d).  The 32-bit halves of d, h and l
        ; are parked in %r9, %r8 and %r7 and re-deposited after the call.
809        EXTRD,U %r3,31,32,%r25  ;offset 0x99c
810        COPY    %r3,%r26        ;offset 0x9a0
811        EXTRD,U %r3,31,32,%r9   ;offset 0x9a4
812        EXTRD,U %r4,31,32,%r8   ;offset 0x9a8
813        .CALL   ARGW0=GR,ARGW1=GR,RTNVAL=GR     ;in=25,26;out=28;
814        B,L     BN_num_bits_word,%r2    ;offset 0x9ac
815        EXTRD,U %r5,31,32,%r7   ;offset 0x9b0
816        LDI     64,%r20 ;offset 0x9b4
817        DEPD    %r7,31,32,%r5   ;offset 0x9b8
818        DEPD    %r8,31,32,%r4   ;offset 0x9bc
819        DEPD    %r9,31,32,%r3   ;offset 0x9c0
820        CMPB,=  %r28,%r20,$00060012     ;offset 0x9c4
821        COPY    %r28,%r24       ;offset 0x9c8
822        MTSARCM %r24    ;offset 0x9cc
823        DEPDI,Z -1,%sar,1,%r19  ;offset 0x9d0
824        CMPB,*>>,N      %r4,%r19,$D2    ;offset 0x9d4
825$00060012
826        SUBI    64,%r24,%r31    ;offset 0x9d8
        ; if (h >= d) h -= d
827        CMPCLR,*<<      %r4,%r3,%r0     ;offset 0x9dc
828        SUB     %r4,%r3,%r4     ;offset 0x9e0
829$00060016
830        CMPB,=  %r31,%r0,$0006001A      ;offset 0x9e4
831        COPY    %r0,%r9 ;offset 0x9e8
832        MTSARCM %r31    ;offset 0x9ec
833        DEPD,Z  %r3,%sar,64,%r3 ;offset 0x9f0
834        SUBI    64,%r31,%r26    ;offset 0x9f4
835        MTSAR   %r26    ;offset 0x9f8
836        SHRPD   %r4,%r5,%sar,%r4        ;offset 0x9fc
837        MTSARCM %r31    ;offset 0xa00
838        DEPD,Z  %r5,%sar,64,%r5 ;offset 0xa04
839$0006001A
840        DEPDI,Z -1,31,32,%r19   ;offset 0xa08
841        AND     %r3,%r19,%r29   ;offset 0xa0c
842        EXTRD,U %r29,31,32,%r2  ;offset 0xa10
843        DEPDI,Z -1,63,32,%r6    ;offset 0xa14
844        MOVIB,TR        2,%r8,$0006001C ;offset 0xa18
845        EXTRD,U %r3,63,32,%r7   ;offset 0xa1c
846$D2
        ; Error path: the fprintf call is commented out, only abort() remains.
847        ;--- not PIC	ADDIL   LR'__iob-$global$,%r27,%r1      ;offset 0xa20
848        ;--- not PIC	LDIL    LR'C$7,%r21     ;offset 0xa24
849        ;--- not PIC	LDO     RR'__iob-$global$+32(%r1),%r26  ;offset 0xa28
850        ;--- not PIC	.CALL   ARGW0=GR,ARGW1=GR,ARGW2=GR,RTNVAL=GR    ;in=24,25,26;out=28;
851        ;--- not PIC	B,L     fprintf,%r2     ;offset 0xa2c
852        ;--- not PIC	LDO     RR'C$7(%r21),%r25       ;offset 0xa30
853        .CALL           ;
854        B,L     abort,%r2       ;offset 0xa34
855        NOP             ;offset 0xa38
856        B       $D3     ;offset 0xa3c
857        LDW     -212(%r30),%r2  ;offset 0xa40
858$00060020
        ; Millicode call to $$div2U; its return link is %r31, not %r2.
859        COPY    %r4,%r26        ;offset 0xa44
860        EXTRD,U %r4,31,32,%r25  ;offset 0xa48
861        COPY    %r2,%r24        ;offset 0xa4c
862        .CALL   ;in=23,24,25,26;out=20,21,22,28,29; (MILLICALL)
863        B,L     $$div2U,%r31    ;offset 0xa50
864        EXTRD,U %r2,31,32,%r23  ;offset 0xa54
865        DEPD    %r28,31,32,%r29 ;offset 0xa58
866$00060022
867        STD     %r29,-152(%r30) ;offset 0xa5c
868$D1
        ; Spill the operands to the frame so they can be reloaded into FP
        ; registers for XMPYU, then reload the products as integers.
869        AND     %r5,%r19,%r24   ;offset 0xa60
870        EXTRD,U %r24,31,32,%r24 ;offset 0xa64
871        STW     %r2,-160(%r30)  ;offset 0xa68
872        STW     %r7,-128(%r30)  ;offset 0xa6c
873        FLDD    -152(%r30),%fr4 ;offset 0xa70
874        FLDD    -152(%r30),%fr7 ;offset 0xa74
875        FLDW    -160(%r30),%fr8L        ;offset 0xa78
876        FLDW    -128(%r30),%fr5L        ;offset 0xa7c
877        XMPYU   %fr8L,%fr7L,%fr10       ;offset 0xa80
878        FSTD    %fr10,-136(%r30)        ;offset 0xa84
879        XMPYU   %fr8L,%fr7R,%fr22       ;offset 0xa88
880        FSTD    %fr22,-144(%r30)        ;offset 0xa8c
881        XMPYU   %fr5L,%fr4L,%fr11       ;offset 0xa90
882        XMPYU   %fr5L,%fr4R,%fr23       ;offset 0xa94
883        FSTD    %fr11,-112(%r30)        ;offset 0xa98
884        FSTD    %fr23,-120(%r30)        ;offset 0xa9c
885        LDD     -136(%r30),%r28 ;offset 0xaa0
886        DEPD,Z  %r28,31,32,%r31 ;offset 0xaa4
887        LDD     -144(%r30),%r20 ;offset 0xaa8
888        ADD,L   %r20,%r31,%r31  ;offset 0xaac
889        LDD     -112(%r30),%r22 ;offset 0xab0
890        DEPD,Z  %r22,31,32,%r22 ;offset 0xab4
891        LDD     -120(%r30),%r21 ;offset 0xab8
892        B       $00060024       ;offset 0xabc
893        ADD,L   %r21,%r22,%r23  ;offset 0xac0
894$D0
        ; Combine the two halves of the result and split it into %r28/%r29.
895        OR      %r9,%r29,%r29   ;offset 0xac4
896$00060040
897        EXTRD,U %r29,31,32,%r28 ;offset 0xac8
898$00060002
899$L2
        ; Epilogue: reload rp (-212 = -20 - 192), restore %r4-%r9, return;
        ; the delay slot pops the 192-byte frame and restores %r3.
900        LDW     -212(%r30),%r2  ;offset 0xacc
901$D3
902        LDW     -168(%r30),%r9  ;offset 0xad0
903        LDD     -176(%r30),%r8  ;offset 0xad4
904        EXTRD,U %r8,31,32,%r7   ;offset 0xad8
905        LDD     -184(%r30),%r6  ;offset 0xadc
906        EXTRD,U %r6,31,32,%r5   ;offset 0xae0
907        LDW     -188(%r30),%r4  ;offset 0xae4
908        BVE     (%r2)   ;offset 0xae8
909        .EXIT
910        LDW,MB  -192(%r30),%r3  ;offset 0xaec
911	.PROCEND	;in=23,25;out=28,29;fpin=105,107;
912
913
914
915
916;----------------------------------------------------------------------------
917;
918; Registers to hold 64-bit values to manipulate.  The "L" part
919; of the register corresponds to the upper 32-bits, while the "R"
920; part corresponds to the lower 32-bits
921;
922; Note that b6 and b7 (%fr12 and %fr13) are callee save registers, so
923; any routine that loads them must save them first and restore them after
924;
925;
926; Floating point registers used to hold the operand words while they
927; are manipulated.  These don't collide with ftemp1-4 (defined below) and
928; are all caller save registers
929;
930a0        .reg %fr22
931a0L       .reg %fr22L
932a0R       .reg %fr22R
933
934a1        .reg %fr23
935a1L       .reg %fr23L
936a1R       .reg %fr23R
937
938a2        .reg %fr24
939a2L       .reg %fr24L
940a2R       .reg %fr24R
941
942a3        .reg %fr25
943a3L       .reg %fr25L
944a3R       .reg %fr25R
945
946a4        .reg %fr26
947a4L       .reg %fr26L
948a4R       .reg %fr26R
949
950a5        .reg %fr27
951a5L       .reg %fr27L
952a5R       .reg %fr27R
953
954a6        .reg %fr28
955a6L       .reg %fr28L
956a6R       .reg %fr28R
957
958a7        .reg %fr29
959a7L       .reg %fr29L
960a7R       .reg %fr29R
961
962b0        .reg %fr30
963b0L       .reg %fr30L
964b0R       .reg %fr30R
965
966b1        .reg %fr31
967b1L       .reg %fr31L
968b1R       .reg %fr31R
969
970;
971; Temporary floating point variables, these are all caller save
972; registers.  The *_ADD_C macros use them to hold the 32x32-bit XMPYU products.
973;
974ftemp1    .reg %fr4
975ftemp2    .reg %fr5
976ftemp3    .reg %fr6
977ftemp4    .reg %fr7
978
979;
980; The rest of the B set of registers (b2-b7); b0 and b1 are defined above.
981;
982
983b2        .reg %fr8
984b2L       .reg %fr8L
985b2R       .reg %fr8R
986
987b3        .reg %fr9
988b3L       .reg %fr9L
989b3R       .reg %fr9R
990
991b4        .reg %fr10
992b4L       .reg %fr10L
993b4R       .reg %fr10R
994
995b5        .reg %fr11
996b5L       .reg %fr11L
997b5R       .reg %fr11R
998
999b6        .reg %fr12
1000b6L       .reg %fr12L
1001b6R       .reg %fr12R
1002
1003b7        .reg %fr13
1004b7L       .reg %fr13L
1005b7R       .reg %fr13R
1006
;
; Integer registers.  c1, c2 and c3 are the three accumulator words that
; the comba routines pass to the *_ADD_C macros in rotating order; the
; others are scratch for those macros.  %r3-%r6 (c3, m, lt, ht) are
; callee save registers, so the comba routines store them on entry and
; reload them before returning.
;
1007c1           .reg %r21   ; only reg
1008temp1        .reg %r20   ; only reg
1009temp2        .reg %r19   ; only reg
1010temp3        .reg %r31   ; only reg
1011
1012m1           .reg %r28   ; second cross product (see SQR_ADD_C2 / MUL_ADD_C)
1013c2           .reg %r23
1014high_one     .reg %r1    ; each comba routine sets this to 1 << 32
1015ht           .reg %r6    ; high partial product (callee save)
1016lt           .reg %r5    ; low partial product (callee save)
1017m            .reg %r4    ; middle (cross) product (callee save)
1018c3           .reg %r3    ; callee save
1019
;
; SQR_ADD_C A0L,A0R,C1,C2,C3
;
; Adds the square of one 64-bit word to the three-word accumulator
; C3:C2:C1 (C1 is the least significant word).  A0L and A0R are the
; upper and lower 32-bit halves of the floating point register that
; holds the word.  With m = A0L*A0R, lt = A0R*A0R and ht = A0L*A0L:
;
;     a*a = ht*2^64 + 2*m*2^32 + lt
;
; Requires: high_mask = 0xffffffff80000000 (the comba routines create it
;           with DEPDI,Z before the first use).
; Scratch:  ftemp1-ftemp3, m, lt, ht, temp1-temp3 and the stack slots
;           at -24(%sp), -16(%sp) and -8(%sp).
;
1020SQR_ADD_C  .macro  A0L,A0R,C1,C2,C3
1021    XMPYU   A0L,A0R,ftemp1       ; m  = A0L*A0R
1022    FSTD    ftemp1,-24(%sp)      ; store m
1023
1024    XMPYU   A0R,A0R,ftemp2       ; lt = A0R*A0R
1025    FSTD    ftemp2,-16(%sp)      ; store lt
1026
1027    XMPYU   A0L,A0L,ftemp3       ; ht = A0L*A0L
1028    FSTD    ftemp3,-8(%sp)       ; store ht
1029
1030    LDD     -24(%sp),m           ; load m into an integer register
1031    AND     m,high_mask,temp2    ; m & Mask: the bits that m << 33 shifts out
1032    DEPD,Z  m,30,31,temp3        ; temp3 = m << (32+1), i.e. low word of 2*m*2^32
1033    LDD     -16(%sp),lt          ; lt
1034
1035    LDD     -8(%sp),ht           ; ht
1036    EXTRD,U temp2,32,33,temp1    ; temp1 = (m & Mask) >> (32-1), high word of 2*m*2^32
1037    ADD     temp3,lt,lt          ; lt = lt + (m << 33), sets the carry
1038    ADD,L   ht,temp1,ht          ; ht += temp1 (ADD,L does not disturb the carry)
1039    ADD,DC  ht,%r0,ht            ; ht += carry out of lt
1040
1041    ADD     C1,lt,C1             ; c1 = c1 + lt
1042    ADD,DC  ht,%r0,ht            ; ht += carry out of c1
1043
1044    ADD     C2,ht,C2             ; c2 = c2 + ht
1045    ADD,DC  C3,%r0,C3            ; c3 += carry out of c2
1046.endm
1047
;
; SQR_ADD_C2 A0L,A0R,A1L,A1R,C1,C2,C3
;
; Adds twice the product of two 64-bit words to the three-word
; accumulator C3:C2:C1 (C1 is the least significant word).  A0L/A0R and
; A1L/A1R are the upper/lower 32-bit halves of the two floating point
; registers holding the words.
;
; Requires: high_one = 1 << 32 (the comba routines create it with DEPDI,Z).
; Scratch:  ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the stack
;           slots at -32(%sp), -24(%sp), -16(%sp) and -8(%sp).
;
1048SQR_ADD_C2 .macro  A0L,A0R,A1L,A1R,C1,C2,C3
1049    XMPYU   A0L,A1R,ftemp1          ; m1 = A0L*A1R (cross product)
1050    FSTD    ftemp1,-16(%sp)         ;
1051    XMPYU   A0R,A1L,ftemp2          ; m  = A0R*A1L (cross product)
1052    FSTD    ftemp2,-8(%sp)          ;
1053    XMPYU   A0R,A1R,ftemp3          ; lt = A0R*A1R (low product)
1054    FSTD    ftemp3,-32(%sp)
1055    XMPYU   A0L,A1L,ftemp4          ; ht = A0L*A1L (high product)
1056    FSTD    ftemp4,-24(%sp)         ;
1057
1058    LDD     -8(%sp),m               ; m  (integer register)
1059    LDD     -16(%sp),m1             ; m1 (integer register)
1060    ADD,L   m,m1,m                  ; m = m+m1, may wrap past 64 bits
1061
1062    DEPD,Z  m,31,32,temp3           ; temp3 = (m+m1) << 32
1063    LDD     -24(%sp),ht             ; ht
1064
1065    CMPCLR,*>>= m,m1,%r0            ; if (m < m1) the sum wrapped; else skip next
1066    ADD,L   ht,high_one,ht          ; ht+=high_one (the wrapped bit, worth 1<<32 in ht)
1067
1068    EXTRD,U m,31,32,temp1           ; temp1 = m >> 32
1069    LDD     -32(%sp),lt             ; lt
1070    ADD,L   ht,temp1,ht             ; ht+= m>>32
1071    ADD     lt,temp3,lt             ; lt = lt + (m << 32), sets the carry
1072    ADD,DC  ht,%r0,ht               ; ht += carry out of lt
1073
1074    ADD     ht,ht,ht                ; ht=ht+ht; (double the product, high word)
1075    ADD,DC  C3,%r0,C3               ; add in carry (c3++)
1076
1077    ADD     lt,lt,lt                ; lt=lt+lt; (double the product, low word)
1078    ADD,DC  ht,%r0,ht               ; add in carry (ht++)
1079
1080    ADD     C1,lt,C1                ; c1=c1+lt
1081    ADD,DC,*NUV ht,%r0,ht           ; add in carry (ht++), skip next unless ht wrapped
1082    LDO     1(C3),C3              ; bump c3 if overflow,nullify otherwise
1083
1084    ADD     C2,ht,C2                ; c2 = c2 + ht
1085    ADD,DC  C3,%r0,C3             ; add in carry (c3++)
1086.endm
1087
1088;
1089;void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
1090; arg0 = r_ptr
1091; arg1 = a_ptr
1092;
;
; Squares the eight 64-bit words at a_ptr and stores the sixteen result
; words at r_ptr.  The result is built one output word at a time: every
; SQR_ADD_C / SQR_ADD_C2 below adds one term of that word into the
; three-word accumulator, which is then stored, and the register roles
; rotate (c1,c2,c3 -> c2,c3,c1 -> c3,c1,c2) for the next word.
;
; Stack: %r3-%r6 are stored at 0..24(%sp) and %sp is then advanced by
; 128, so they are reloaded from -128..-104(%sp).  The macros use
; -32(%sp)..-8(%sp) as scratch.
;
1093
1094bn_sqr_comba8
1095	.PROC
1096	.CALLINFO FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1097	.EXPORT	bn_sqr_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1098    .ENTRY
1099	.align 64
1100
1101    STD     %r3,0(%sp)          ; save r3 (c3)
1102    STD     %r4,8(%sp)          ; save r4 (m)
1103    STD     %r5,16(%sp)         ; save r5 (lt)
1104    STD     %r6,24(%sp)         ; save r6 (ht)
1105
1106	;
1107	; Zero out carries
1108	;
1109	COPY     %r0,c1
1110	COPY     %r0,c2
1111	COPY     %r0,c3
1112
1113	LDO      128(%sp),%sp       ; bump stack
1114    DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
1115    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
1116
1117	;
1118	; Load up all of the values we are going to use
1119	;
1120    FLDD     0(a_ptr),a0
1121    FLDD     8(a_ptr),a1
1122    FLDD    16(a_ptr),a2
1123    FLDD    24(a_ptr),a3
1124    FLDD    32(a_ptr),a4
1125    FLDD    40(a_ptr),a5
1126    FLDD    48(a_ptr),a6
1127    FLDD    56(a_ptr),a7
1128
1129	SQR_ADD_C a0L,a0R,c1,c2,c3
1130	STD     c1,0(r_ptr)          ; r[0] = c1;
1131	COPY    %r0,c1
1132
1133	SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1134	STD     c2,8(r_ptr)          ; r[1] = c2;
1135	COPY    %r0,c2
1136
1137	SQR_ADD_C a1L,a1R,c3,c1,c2
1138	SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1139	STD     c3,16(r_ptr)            ; r[2] = c3;
1140	COPY    %r0,c3
1141
1142	SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1143	SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1144	STD     c1,24(r_ptr)           ; r[3] = c1;
1145	COPY    %r0,c1
1146
1147	SQR_ADD_C a2L,a2R,c2,c3,c1
1148	SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1149	SQR_ADD_C2 a4L,a4R,a0L,a0R,c2,c3,c1
1150	STD     c2,32(r_ptr)          ; r[4] = c2;
1151	COPY    %r0,c2
1152
1153	SQR_ADD_C2 a5L,a5R,a0L,a0R,c3,c1,c2
1154	SQR_ADD_C2 a4L,a4R,a1L,a1R,c3,c1,c2
1155	SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1156	STD     c3,40(r_ptr)          ; r[5] = c3;
1157	COPY    %r0,c3
1158
1159	SQR_ADD_C a3L,a3R,c1,c2,c3
1160	SQR_ADD_C2 a4L,a4R,a2L,a2R,c1,c2,c3
1161	SQR_ADD_C2 a5L,a5R,a1L,a1R,c1,c2,c3
1162	SQR_ADD_C2 a6L,a6R,a0L,a0R,c1,c2,c3
1163	STD     c1,48(r_ptr)          ; r[6] = c1;
1164	COPY    %r0,c1
1165
1166	SQR_ADD_C2 a7L,a7R,a0L,a0R,c2,c3,c1
1167	SQR_ADD_C2 a6L,a6R,a1L,a1R,c2,c3,c1
1168	SQR_ADD_C2 a5L,a5R,a2L,a2R,c2,c3,c1
1169	SQR_ADD_C2 a4L,a4R,a3L,a3R,c2,c3,c1
1170	STD     c2,56(r_ptr)          ; r[7] = c2;
1171	COPY    %r0,c2
1172
1173	SQR_ADD_C a4L,a4R,c3,c1,c2
1174	SQR_ADD_C2 a5L,a5R,a3L,a3R,c3,c1,c2
1175	SQR_ADD_C2 a6L,a6R,a2L,a2R,c3,c1,c2
1176	SQR_ADD_C2 a7L,a7R,a1L,a1R,c3,c1,c2
1177	STD     c3,64(r_ptr)          ; r[8] = c3;
1178	COPY    %r0,c3
1179
1180	SQR_ADD_C2 a7L,a7R,a2L,a2R,c1,c2,c3
1181	SQR_ADD_C2 a6L,a6R,a3L,a3R,c1,c2,c3
1182	SQR_ADD_C2 a5L,a5R,a4L,a4R,c1,c2,c3
1183	STD     c1,72(r_ptr)          ; r[9] = c1;
1184	COPY    %r0,c1
1185
1186	SQR_ADD_C a5L,a5R,c2,c3,c1
1187	SQR_ADD_C2 a6L,a6R,a4L,a4R,c2,c3,c1
1188	SQR_ADD_C2 a7L,a7R,a3L,a3R,c2,c3,c1
1189	STD     c2,80(r_ptr)          ; r[10] = c2;
1190	COPY    %r0,c2
1191
1192	SQR_ADD_C2 a7L,a7R,a4L,a4R,c3,c1,c2
1193	SQR_ADD_C2 a6L,a6R,a5L,a5R,c3,c1,c2
1194	STD     c3,88(r_ptr)          ; r[11] = c3;
1195	COPY    %r0,c3
1196
1197	SQR_ADD_C a6L,a6R,c1,c2,c3
1198	SQR_ADD_C2 a7L,a7R,a5L,a5R,c1,c2,c3
1199	STD     c1,96(r_ptr)          ; r[12] = c1;
1200	COPY    %r0,c1
1201
1202	SQR_ADD_C2 a7L,a7R,a6L,a6R,c2,c3,c1
1203	STD     c2,104(r_ptr)         ; r[13] = c2;
1204	COPY    %r0,c2
1205
1206	SQR_ADD_C a7L,a7R,c3,c1,c2
1207	STD     c3, 112(r_ptr)       ; r[14] = c3
1208	STD     c1, 120(r_ptr)       ; r[15] = c1
1209
1210    .EXIT
1211    LDD     -104(%sp),%r6        ; restore r6
1212    LDD     -112(%sp),%r5        ; restore r5
1213    LDD     -120(%sp),%r4        ; restore r4
1214    BVE     (%rp)
1215    LDD,MB  -128(%sp),%r3        ; (delay slot) pop the 128-byte frame, restore r3
1216
1217	.PROCEND
1218
1219;-----------------------------------------------------------------------------
1220;
1221;void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
1222; arg0 = r_ptr
1223; arg1 = a_ptr
1224;
;
; Squares the four 64-bit words at a_ptr and stores the eight result
; words at r_ptr.  The result is built one output word at a time: every
; SQR_ADD_C / SQR_ADD_C2 below adds one term of that word into the
; three-word accumulator, which is then stored, and the register roles
; rotate (c1,c2,c3 -> c2,c3,c1 -> c3,c1,c2) for the next word.
;
; Stack: %r3-%r6 are stored at 0..24(%sp) and %sp is then advanced by
; 128, so they are reloaded from -128..-104(%sp).  The macros use
; -32(%sp)..-8(%sp) as scratch.
;
1225
1226bn_sqr_comba4
1227	.proc
1228	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1229	.EXPORT	bn_sqr_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1230    .entry
1231	.align 64
1232    STD     %r3,0(%sp)          ; save r3 (c3)
1233    STD     %r4,8(%sp)          ; save r4 (m)
1234    STD     %r5,16(%sp)         ; save r5 (lt)
1235    STD     %r6,24(%sp)         ; save r6 (ht)
1236
1237	;
1238	; Zero out carries
1239	;
1240	COPY     %r0,c1
1241	COPY     %r0,c2
1242	COPY     %r0,c3
1243
1244	LDO      128(%sp),%sp       ; bump stack
1245    DEPDI,Z -1,32,33,high_mask   ; Create Mask 0xffffffff80000000L
1246    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
1247
1248	;
1249	; Load up all of the values we are going to use.  The input is only
	; four words long, so only a[0]..a[3] may be read; loading a4-a7
	; here would read 32 bytes past the end of the caller's array.
1250	;
1251    FLDD     0(a_ptr),a0
1252    FLDD     8(a_ptr),a1
1253    FLDD    16(a_ptr),a2
1254    FLDD    24(a_ptr),a3
1259
1260	SQR_ADD_C a0L,a0R,c1,c2,c3
1261
1262	STD     c1,0(r_ptr)          ; r[0] = c1;
1263	COPY    %r0,c1
1264
1265	SQR_ADD_C2 a1L,a1R,a0L,a0R,c2,c3,c1
1266
1267	STD     c2,8(r_ptr)          ; r[1] = c2;
1268	COPY    %r0,c2
1269
1270	SQR_ADD_C a1L,a1R,c3,c1,c2
1271	SQR_ADD_C2 a2L,a2R,a0L,a0R,c3,c1,c2
1272
1273	STD     c3,16(r_ptr)            ; r[2] = c3;
1274	COPY    %r0,c3
1275
1276	SQR_ADD_C2 a3L,a3R,a0L,a0R,c1,c2,c3
1277	SQR_ADD_C2 a2L,a2R,a1L,a1R,c1,c2,c3
1278
1279	STD     c1,24(r_ptr)           ; r[3] = c1;
1280	COPY    %r0,c1
1281
1282	SQR_ADD_C a2L,a2R,c2,c3,c1
1283	SQR_ADD_C2 a3L,a3R,a1L,a1R,c2,c3,c1
1284
1285	STD     c2,32(r_ptr)           ; r[4] = c2;
1286	COPY    %r0,c2
1287
1288	SQR_ADD_C2 a3L,a3R,a2L,a2R,c3,c1,c2
1289	STD     c3,40(r_ptr)           ; r[5] = c3;
1290	COPY    %r0,c3
1291
1292	SQR_ADD_C a3L,a3R,c1,c2,c3
1293	STD     c1,48(r_ptr)           ; r[6] = c1;
1294	STD     c2,56(r_ptr)           ; r[7] = c2;
1295
1296    .EXIT
1297    LDD     -104(%sp),%r6        ; restore r6
1298    LDD     -112(%sp),%r5        ; restore r5
1299    LDD     -120(%sp),%r4        ; restore r4
1300    BVE     (%rp)
1301    LDD,MB  -128(%sp),%r3        ; (delay slot) pop the 128-byte frame, restore r3
1302
1303	.PROCEND
1304
1305
1306;---------------------------------------------------------------------------
1307
;
; MUL_ADD_C A0L,A0R,B0L,B0R,C1,C2,C3
;
; Adds the product of two 64-bit words to the three-word accumulator
; C3:C2:C1 (C1 is the least significant word).  A0L/A0R and B0L/B0R are
; the upper/lower 32-bit halves of the two floating point registers
; holding the words.
;
; Requires: high_one = 1 << 32 (the comba routines create it with DEPDI,Z).
; Scratch:  ftemp1-ftemp4, m, m1, lt, ht, temp1, temp3 and the stack
;           slots at -32(%sp), -24(%sp), -16(%sp) and -8(%sp).
;
1308MUL_ADD_C  .macro  A0L,A0R,B0L,B0R,C1,C2,C3
1309    XMPYU   A0L,B0R,ftemp1        ; m1 = A0L*B0R (cross product)
1310    FSTD    ftemp1,-16(%sp)       ;
1311    XMPYU   A0R,B0L,ftemp2        ; m  = A0R*B0L (cross product)
1312    FSTD    ftemp2,-8(%sp)        ;
1313    XMPYU   A0R,B0R,ftemp3        ; lt = A0R*B0R (low product)
1314    FSTD    ftemp3,-32(%sp)
1315    XMPYU   A0L,B0L,ftemp4        ; ht = A0L*B0L (high product)
1316    FSTD    ftemp4,-24(%sp)       ;
1317
1318    LDD     -8(%sp),m             ; m  (integer register)
1319    LDD     -16(%sp),m1           ; m1 (integer register)
1320    ADD,L   m,m1,m                ; m = m+m1, may wrap past 64 bits
1321
1322    DEPD,Z  m,31,32,temp3         ; temp3 = (m+m1) << 32
1323    LDD     -24(%sp),ht           ; ht
1324
1325    CMPCLR,*>>= m,m1,%r0          ; if (m < m1) the sum wrapped; else skip next
1326    ADD,L   ht,high_one,ht        ; ht+=high_one (the wrapped bit, worth 1<<32 in ht)
1327
1328    EXTRD,U m,31,32,temp1         ; temp1 = m >> 32
1329    LDD     -32(%sp),lt           ; lt
1330    ADD,L   ht,temp1,ht           ; ht+= m>>32
1331    ADD     lt,temp3,lt           ; lt = lt + (m << 32), sets the carry
1332    ADD,DC  ht,%r0,ht             ; ht += carry out of lt
1333
1334    ADD     C1,lt,C1              ; c1=c1+lt
1335    ADD,DC  ht,%r0,ht             ; ht += carry out of c1
1336
1337    ADD     C2,ht,C2              ; c2 = c2 + ht
1338    ADD,DC  C3,%r0,C3             ; add in carry (c3++)
1339.endm
1340
1341
1342;
1343;void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1344; arg0 = r_ptr
1345; arg1 = a_ptr
1346; arg2 = b_ptr
1347;
;
; Multiplies the eight 64-bit words at a_ptr by the eight words at b_ptr
; and stores the sixteen result words at r_ptr.  The result is built one
; output word at a time: every MUL_ADD_C below adds one a[i]*b[j] term
; of that word into the three-word accumulator, which is then stored,
; and the register roles rotate (c1,c2,c3 -> c2,c3,c1 -> c3,c1,c2).
;
; Stack: %r3-%r6 are stored at 0..24(%sp) and %fr12/%fr13 (b6/b7, callee
; save) at 32/40(%sp); %sp is then advanced by 128, so they are reloaded
; from -128..-88(%sp).  MUL_ADD_C uses -32(%sp)..-8(%sp) as scratch.
;
1348
1349bn_mul_comba8
1350	.proc
1351	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1352	.EXPORT	bn_mul_comba8,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1353    .entry
1354	.align 64
1355
1356    STD     %r3,0(%sp)          ; save r3
1357    STD     %r4,8(%sp)          ; save r4
1358    STD     %r5,16(%sp)         ; save r5
1359    STD     %r6,24(%sp)         ; save r6
1360    FSTD    %fr12,32(%sp)       ; save fr12 (b6)
1361    FSTD    %fr13,40(%sp)       ; save fr13 (b7)
1362
1363	;
1364	; Zero out carries
1365	;
1366	COPY     %r0,c1
1367	COPY     %r0,c2
1368	COPY     %r0,c3
1369
1370	LDO      128(%sp),%sp       ; bump stack
1371    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
1372
1373	;
1374	; Load up all of the values we are going to use
1375	;
1376    FLDD      0(a_ptr),a0
1377    FLDD      8(a_ptr),a1
1378    FLDD     16(a_ptr),a2
1379    FLDD     24(a_ptr),a3
1380    FLDD     32(a_ptr),a4
1381    FLDD     40(a_ptr),a5
1382    FLDD     48(a_ptr),a6
1383    FLDD     56(a_ptr),a7
1384
1385    FLDD      0(b_ptr),b0
1386    FLDD      8(b_ptr),b1
1387    FLDD     16(b_ptr),b2
1388    FLDD     24(b_ptr),b3
1389    FLDD     32(b_ptr),b4
1390    FLDD     40(b_ptr),b5
1391    FLDD     48(b_ptr),b6
1392    FLDD     56(b_ptr),b7
1393
1394	MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1395	STD       c1,0(r_ptr)          ; r[0]
1396	COPY      %r0,c1
1397
1398	MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1399	MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1400	STD       c2,8(r_ptr)          ; r[1]
1401	COPY      %r0,c2
1402
1403	MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1404	MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1405	MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1406	STD       c3,16(r_ptr)         ; r[2]
1407	COPY      %r0,c3
1408
1409	MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1410	MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1411	MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1412	MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1413	STD       c1,24(r_ptr)         ; r[3]
1414	COPY      %r0,c1
1415
1416	MUL_ADD_C a4L,a4R,b0L,b0R,c2,c3,c1
1417	MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1418	MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1419	MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1420	MUL_ADD_C a0L,a0R,b4L,b4R,c2,c3,c1
1421	STD       c2,32(r_ptr)         ; r[4]
1422	COPY      %r0,c2
1423
1424	MUL_ADD_C a0L,a0R,b5L,b5R,c3,c1,c2
1425	MUL_ADD_C a1L,a1R,b4L,b4R,c3,c1,c2
1426	MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1427	MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1428	MUL_ADD_C a4L,a4R,b1L,b1R,c3,c1,c2
1429	MUL_ADD_C a5L,a5R,b0L,b0R,c3,c1,c2
1430	STD       c3,40(r_ptr)         ; r[5]
1431	COPY      %r0,c3
1432
1433	MUL_ADD_C a6L,a6R,b0L,b0R,c1,c2,c3
1434	MUL_ADD_C a5L,a5R,b1L,b1R,c1,c2,c3
1435	MUL_ADD_C a4L,a4R,b2L,b2R,c1,c2,c3
1436	MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1437	MUL_ADD_C a2L,a2R,b4L,b4R,c1,c2,c3
1438	MUL_ADD_C a1L,a1R,b5L,b5R,c1,c2,c3
1439	MUL_ADD_C a0L,a0R,b6L,b6R,c1,c2,c3
1440	STD       c1,48(r_ptr)         ; r[6]
1441	COPY      %r0,c1
1442
1443	MUL_ADD_C a0L,a0R,b7L,b7R,c2,c3,c1
1444	MUL_ADD_C a1L,a1R,b6L,b6R,c2,c3,c1
1445	MUL_ADD_C a2L,a2R,b5L,b5R,c2,c3,c1
1446	MUL_ADD_C a3L,a3R,b4L,b4R,c2,c3,c1
1447	MUL_ADD_C a4L,a4R,b3L,b3R,c2,c3,c1
1448	MUL_ADD_C a5L,a5R,b2L,b2R,c2,c3,c1
1449	MUL_ADD_C a6L,a6R,b1L,b1R,c2,c3,c1
1450	MUL_ADD_C a7L,a7R,b0L,b0R,c2,c3,c1
1451	STD       c2,56(r_ptr)         ; r[7]
1452	COPY      %r0,c2
1453
1454	MUL_ADD_C a7L,a7R,b1L,b1R,c3,c1,c2
1455	MUL_ADD_C a6L,a6R,b2L,b2R,c3,c1,c2
1456	MUL_ADD_C a5L,a5R,b3L,b3R,c3,c1,c2
1457	MUL_ADD_C a4L,a4R,b4L,b4R,c3,c1,c2
1458	MUL_ADD_C a3L,a3R,b5L,b5R,c3,c1,c2
1459	MUL_ADD_C a2L,a2R,b6L,b6R,c3,c1,c2
1460	MUL_ADD_C a1L,a1R,b7L,b7R,c3,c1,c2
1461	STD       c3,64(r_ptr)         ; r[8]
1462	COPY      %r0,c3
1463
1464	MUL_ADD_C a2L,a2R,b7L,b7R,c1,c2,c3
1465	MUL_ADD_C a3L,a3R,b6L,b6R,c1,c2,c3
1466	MUL_ADD_C a4L,a4R,b5L,b5R,c1,c2,c3
1467	MUL_ADD_C a5L,a5R,b4L,b4R,c1,c2,c3
1468	MUL_ADD_C a6L,a6R,b3L,b3R,c1,c2,c3
1469	MUL_ADD_C a7L,a7R,b2L,b2R,c1,c2,c3
1470	STD       c1,72(r_ptr)         ; r[9]
1471	COPY      %r0,c1
1472
1473	MUL_ADD_C a7L,a7R,b3L,b3R,c2,c3,c1
1474	MUL_ADD_C a6L,a6R,b4L,b4R,c2,c3,c1
1475	MUL_ADD_C a5L,a5R,b5L,b5R,c2,c3,c1
1476	MUL_ADD_C a4L,a4R,b6L,b6R,c2,c3,c1
1477	MUL_ADD_C a3L,a3R,b7L,b7R,c2,c3,c1
1478	STD       c2,80(r_ptr)         ; r[10]
1479	COPY      %r0,c2
1480
1481	MUL_ADD_C a4L,a4R,b7L,b7R,c3,c1,c2
1482	MUL_ADD_C a5L,a5R,b6L,b6R,c3,c1,c2
1483	MUL_ADD_C a6L,a6R,b5L,b5R,c3,c1,c2
1484	MUL_ADD_C a7L,a7R,b4L,b4R,c3,c1,c2
1485	STD       c3,88(r_ptr)         ; r[11]
1486	COPY      %r0,c3
1487
1488	MUL_ADD_C a7L,a7R,b5L,b5R,c1,c2,c3
1489	MUL_ADD_C a6L,a6R,b6L,b6R,c1,c2,c3
1490	MUL_ADD_C a5L,a5R,b7L,b7R,c1,c2,c3
1491	STD       c1,96(r_ptr)         ; r[12]
1492	COPY      %r0,c1
1493
1494	MUL_ADD_C a6L,a6R,b7L,b7R,c2,c3,c1
1495	MUL_ADD_C a7L,a7R,b6L,b6R,c2,c3,c1
1496	STD       c2,104(r_ptr)        ; r[13]
1497	COPY      %r0,c2
1498
1499	MUL_ADD_C a7L,a7R,b7L,b7R,c3,c1,c2
1500	STD       c3,112(r_ptr)        ; r[14]
1501	STD       c1,120(r_ptr)        ; r[15]
1502
1503    .EXIT
1504    FLDD    -88(%sp),%fr13       ; restore fr13 (b7)
1505    FLDD    -96(%sp),%fr12       ; restore fr12 (b6)
1506    LDD     -104(%sp),%r6        ; restore r6
1507    LDD     -112(%sp),%r5        ; restore r5
1508    LDD     -120(%sp),%r4        ; restore r4
1509    BVE     (%rp)
1510    LDD,MB  -128(%sp),%r3        ; (delay slot) pop the 128-byte frame, restore r3
1511
1512	.PROCEND
1513
1514;-----------------------------------------------------------------------------
1515;
1516;void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
1517; arg0 = r_ptr
1518; arg1 = a_ptr
1519; arg2 = b_ptr
1520;
;
; Multiplies the four 64-bit words at a_ptr by the four words at b_ptr
; and stores the eight result words at r_ptr.  The result is built one
; output word at a time: every MUL_ADD_C below adds one a[i]*b[j] term
; of that word into the three-word accumulator, which is then stored,
; and the register roles rotate (c1,c2,c3 -> c2,c3,c1 -> c3,c1,c2).
;
; Only a0-a3 and b0-b3 (%fr22-%fr25, %fr30, %fr31, %fr8, %fr9) are
; loaded and MUL_ADD_C only writes ftemp1-ftemp4; all of these are
; caller save, so no floating point register has to be preserved here
; (b6/b7, i.e. %fr12/%fr13, are never touched).
;
; Stack: %r3-%r6 are stored at 0..24(%sp) and %sp is then advanced by
; 128, so they are reloaded from -128..-104(%sp).  MUL_ADD_C uses
; -32(%sp)..-8(%sp) as scratch.
;
1521
1522bn_mul_comba4
1523	.proc
1524	.callinfo FRAME=128,ENTRY_GR=%r3,ARGS_SAVED,ORDERING_AWARE
1525	.EXPORT	bn_mul_comba4,ENTRY,PRIV_LEV=3,NO_RELOCATION,LONG_RETURN
1526    .entry
1527	.align 64
1528
1529    STD     %r3,0(%sp)          ; save r3 (c3)
1530    STD     %r4,8(%sp)          ; save r4 (m)
1531    STD     %r5,16(%sp)         ; save r5 (lt)
1532    STD     %r6,24(%sp)         ; save r6 (ht)
1535
1536	;
1537	; Zero out carries
1538	;
1539	COPY     %r0,c1
1540	COPY     %r0,c2
1541	COPY     %r0,c3
1542
1543	LDO      128(%sp),%sp       ; bump stack
1544    DEPDI,Z  1,31,1,high_one     ; Create Value  1 << 32
1545
1546	;
1547	; Load up all of the values we are going to use
1548	;
1549    FLDD      0(a_ptr),a0
1550    FLDD      8(a_ptr),a1
1551    FLDD     16(a_ptr),a2
1552    FLDD     24(a_ptr),a3
1553
1554    FLDD      0(b_ptr),b0
1555    FLDD      8(b_ptr),b1
1556    FLDD     16(b_ptr),b2
1557    FLDD     24(b_ptr),b3
1558
1559	MUL_ADD_C a0L,a0R,b0L,b0R,c1,c2,c3
1560	STD       c1,0(r_ptr)          ; r[0]
1561	COPY      %r0,c1
1562
1563	MUL_ADD_C a0L,a0R,b1L,b1R,c2,c3,c1
1564	MUL_ADD_C a1L,a1R,b0L,b0R,c2,c3,c1
1565	STD       c2,8(r_ptr)          ; r[1]
1566	COPY      %r0,c2
1567
1568	MUL_ADD_C a2L,a2R,b0L,b0R,c3,c1,c2
1569	MUL_ADD_C a1L,a1R,b1L,b1R,c3,c1,c2
1570	MUL_ADD_C a0L,a0R,b2L,b2R,c3,c1,c2
1571	STD       c3,16(r_ptr)         ; r[2]
1572	COPY      %r0,c3
1573
1574	MUL_ADD_C a0L,a0R,b3L,b3R,c1,c2,c3
1575	MUL_ADD_C a1L,a1R,b2L,b2R,c1,c2,c3
1576	MUL_ADD_C a2L,a2R,b1L,b1R,c1,c2,c3
1577	MUL_ADD_C a3L,a3R,b0L,b0R,c1,c2,c3
1578	STD       c1,24(r_ptr)         ; r[3]
1579	COPY      %r0,c1
1580
1581	MUL_ADD_C a3L,a3R,b1L,b1R,c2,c3,c1
1582	MUL_ADD_C a2L,a2R,b2L,b2R,c2,c3,c1
1583	MUL_ADD_C a1L,a1R,b3L,b3R,c2,c3,c1
1584	STD       c2,32(r_ptr)         ; r[4]
1585	COPY      %r0,c2
1586
1587	MUL_ADD_C a2L,a2R,b3L,b3R,c3,c1,c2
1588	MUL_ADD_C a3L,a3R,b2L,b2R,c3,c1,c2
1589	STD       c3,40(r_ptr)         ; r[5]
1590	COPY      %r0,c3
1591
1592	MUL_ADD_C a3L,a3R,b3L,b3R,c1,c2,c3
1593	STD       c1,48(r_ptr)         ; r[6]
1594	STD       c2,56(r_ptr)         ; r[7]
1595
1596    .EXIT
1599    LDD     -104(%sp),%r6        ; restore r6
1600    LDD     -112(%sp),%r5        ; restore r5
1601    LDD     -120(%sp),%r4        ; restore r4
1602    BVE     (%rp)
1603    LDD,MB  -128(%sp),%r3        ; (delay slot) pop the 128-byte frame, restore r3
1604
1605	.PROCEND
1606
1607
1608;--- not PIC	.SPACE	$TEXT$
1609;--- not PIC	.SUBSPA	$CODE$
1610;--- not PIC	.SPACE	$PRIVATE$,SORT=16
1611;--- not PIC	.IMPORT	$global$,DATA
1612;--- not PIC	.SPACE	$TEXT$
1613;--- not PIC	.SUBSPA	$CODE$
1614;--- not PIC	.SUBSPA	$LIT$,ACCESS=0x2c
1615;--- not PIC	C$7
1616;--- not PIC	.ALIGN	8
1617;--- not PIC	.STRINGZ	"Division would overflow (%d)\n"
1618	.END
1619