• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env perl
2#
3# ====================================================================
4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5# project.
6#
7# Rights for redistribution and usage in source and binary forms are
8# granted according to the OpenSSL license. Warranty of any kind is
9# disclaimed.
10# ====================================================================
11
12
13# July 1999
14#
15# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
16#
17# The module is designed to work with either of the "new" MIPS ABI(5),
18# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
19# IRIX 5.x not only because it doesn't support new ABIs but also
20# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
21# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
22# cause illegal instruction exception:-(
23#
24# In addition the code depends on preprocessor flags set up by MIPSpro
25# compiler driver (either as or cc) and therefore (probably?) can't be
26# compiled by the GNU assembler. GNU C driver manages fine though...
27# I mean as long as -mmips-as is specified or is the default option,
28# because then it simply invokes /usr/bin/as which in turn takes
29# perfect care of the preprocessor definitions. Another neat feature
30# offered by the MIPSpro assembler is an optimization pass. This gave
31# me the opportunity to have the code looking more regular as all those
32# architecture dependent instruction rescheduling details were left to
33# the assembler. Cool, huh?
34#
35# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
36# goes way over 3 times faster!
37#
38#					<appro@fy.chalmers.se>
39
40# October 2010
41#
42# Adapt the module even for 32-bit ABIs and other OSes. The former was
43# achieved by mechanical replacement of 64-bit arithmetic instructions
44# such as dmultu, daddu, etc. with their 32-bit counterparts and
45# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
46# >3x performance improvement naturally does not apply to 32-bit code
47# [because there is no instruction 32-bit compiler can't use], one
48# has to be content with 40-85% improvement depending on benchmark and
49# key length, more for longer keys.
50
# Command line handling.
#
# The first argument is the target "flavour"; it is matched further down
# against /64|n32/ (64-bit register mnemonics) and /nubi/ (extra register
# save/restore). Of the remaining arguments, the first one that looks like
# a plain file name ("name.ext") is taken as the output file; anything
# before it is skipped.
$flavour = shift;
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output file was actually named, and fail
# loudly if it cannot be created. The previous unchecked two-argument
# open() was attempted even with an empty name; a failed open() on an
# already-open handle closes it first, so the generated text was silently
# lost. The three-argument form also keeps the file name from being
# interpreted as part of the open mode.
if ($output) {
	open(STDOUT,">",$output) || die "can't open $output: $!";
}
54
# Pick the instruction mnemonics and sizes that the assembly templates
# below interpolate.
#
#   $LD/$ST            load/store of one BN_ULONG
#   $MULTU/$DIVU       unsigned multiply/divide
#   $ADDU/$SUBU        word add/subtract
#   $SRL/$SLL          logical shifts
#   $BNSZ              size of one BN_ULONG in bytes
#   $PTR_ADD/$PTR_SUB  pointer arithmetic
#   $SZREG             size of a saved register in bytes
#   $REG_S/$REG_L      register save/restore
#
# Flavours matching /64|n32/ get the 64-bit ("d"-prefixed) forms; anything
# else gets the 32-bit forms and additionally starts the output with a
# ".set mips2" directive.
if ($flavour =~ /64|n32/i) {
	($LD,$ST)		= ("ld","sd");
	($MULTU,$DIVU)		= ("dmultu","ddivu");
	($ADDU,$SUBU)		= ("daddu","dsubu");
	($SRL,$SLL)		= ("dsrl","dsll");
	($PTR_ADD,$PTR_SUB)	= ("daddu","dsubu");
	($REG_S,$REG_L)		= ("sd","ld");
	($BNSZ,$SZREG)		= (8,8);
} else {
	($LD,$ST)		= ("lw","sw");
	($MULTU,$DIVU)		= ("multu","divu");
	($ADDU,$SUBU)		= ("addu","subu");
	($SRL,$SLL)		= ("srl","sll");
	($PTR_ADD,$PTR_SUB)	= ("addu","subu");
	($REG_S,$REG_L)		= ("sw","lw");
	($BNSZ,$SZREG)		= (4,4);
	$code=".set	mips2\n";
}
87
# Symbolic register names, following the N32/64 layout used in the
# original module. Each variable holds the literal "$<number>" operand
# text that is interpolated into the assembly templates.
($zero,$at,$v0,$v1)			= map { "\$$_" } 0..3;
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)	= map { "\$$_" } 4..11;
($t0,$t1,$t2,$t3)			= map { "\$$_" } 12..15;
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)	= map { "\$$_" } 16..23;
($t8,$t9)				= map { "\$$_" } 24,25;
($gp,$sp,$fp,$ra)			= map { "\$$_" } 28..31;

# $ta0..$ta3 are alternative names for $a4..$a7.
($ta0,$ta1,$ta2,$ta3)			= ($a4,$a5,$a6,$a7);

# No special adaptation is required for O32. NUBI on the other hand is
# treated by saving/restoring ($v1,$t0..$t3); to that end $gp is made an
# alias of $v1 so that the "$gp" save slots in the prologues cover it.
if ($flavour =~ /nubi/i) {
	$gp = $v1;
}

# Register that holds the constant -4 used to mask word counts.
$minus4 = $v1;
103
# Emit the identification strings (.rdata), switch to .text, and emit the
# public bn_mul_add_words entry stub. The stub clears $v0 in the branch
# delay slot and jumps to bn_mul_add_words_internal when $a2 > 0;
# otherwise it returns immediately with 0 in $v0 (also copied to $a0).
104$code.=<<___;
105.rdata
106.asciiz	"mips3.s, Version 1.2"
107.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"
108
109.text
110.set	noat
111
112.align	5
113.globl	bn_mul_add_words
114.ent	bn_mul_add_words
115bn_mul_add_words:
116	.set	noreorder
117	bgtz	$a2,bn_mul_add_words_internal
118	move	$v0,$zero
119	jr	$ra
120	move	$a0,$v0
121.end	bn_mul_add_words
122
123.align	5
124.ent	bn_mul_add_words_internal
125bn_mul_add_words_internal:
126___
# NUBI only: open a 6-register frame and save $ra, $t3..$t0 and $gp
# ($gp is an alias of $v1 for NUBI flavours, see the register layout).
127$code.=<<___ if ($flavour =~ /nubi/i);
128	.frame	$sp,6*$SZREG,$ra
129	.mask	0x8000f008,-$SZREG
130	.set	noreorder
131	$PTR_SUB $sp,6*$SZREG
132	$REG_S	$ra,5*$SZREG($sp)
133	$REG_S	$t3,4*$SZREG($sp)
134	$REG_S	$t2,3*$SZREG($sp)
135	$REG_S	$t1,2*$SZREG($sp)
136	$REG_S	$t0,1*$SZREG($sp)
137	$REG_S	$gp,0*$SZREG($sp)
138___
# Body of bn_mul_add_words_internal.
#
# Register roles as used below:
#   $a0  word array that is read, updated and written back
#   $a1  word array that is only read
#   $a2  remaining word count
#   $a3  multiplier
#   $v0  running carry (cleared by the entry stub)
#
# The main loop handles four words per pass for as long as ($a2 & -4) is
# non-zero; the tail then handles up to three leftover words. For every
# word the product of the $a1 word and $a3 is added, together with the
# incoming carry, to the $a0 word; the new carry is the product's high
# half plus the two overflow bits detected with sltu.
139$code.=<<___;
140	.set	reorder
141	li	$minus4,-4
142	and	$ta0,$a2,$minus4
143	$LD	$t0,0($a1)
144	beqz	$ta0,.L_bn_mul_add_words_tail
145
146.L_bn_mul_add_words_loop:
147	$MULTU	$t0,$a3
148	$LD	$t1,0($a0)
149	$LD	$t2,$BNSZ($a1)
150	$LD	$t3,$BNSZ($a0)
151	$LD	$ta0,2*$BNSZ($a1)
152	$LD	$ta1,2*$BNSZ($a0)
153	$ADDU	$t1,$v0
154	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
155				# values", but it seems to work fine
156				# even on 64-bit registers.
157	mflo	$at
158	mfhi	$t0
159	$ADDU	$t1,$at
160	$ADDU	$v0,$t0
161	 $MULTU	$t2,$a3
162	sltu	$at,$t1,$at
163	$ST	$t1,0($a0)
164	$ADDU	$v0,$at
165
166	$LD	$ta2,3*$BNSZ($a1)
167	$LD	$ta3,3*$BNSZ($a0)
168	$ADDU	$t3,$v0
169	sltu	$v0,$t3,$v0
170	mflo	$at
171	mfhi	$t2
172	$ADDU	$t3,$at
173	$ADDU	$v0,$t2
174	 $MULTU	$ta0,$a3
175	sltu	$at,$t3,$at
176	$ST	$t3,$BNSZ($a0)
177	$ADDU	$v0,$at
178
179	subu	$a2,4
180	$PTR_ADD $a0,4*$BNSZ
181	$PTR_ADD $a1,4*$BNSZ
182	$ADDU	$ta1,$v0
183	sltu	$v0,$ta1,$v0
184	mflo	$at
185	mfhi	$ta0
186	$ADDU	$ta1,$at
187	$ADDU	$v0,$ta0
188	 $MULTU	$ta2,$a3
189	sltu	$at,$ta1,$at
190	$ST	$ta1,-2*$BNSZ($a0)
191	$ADDU	$v0,$at
192
193
194	and	$ta0,$a2,$minus4
195	$ADDU	$ta3,$v0
196	sltu	$v0,$ta3,$v0
197	mflo	$at
198	mfhi	$ta2
199	$ADDU	$ta3,$at
200	$ADDU	$v0,$ta2
201	sltu	$at,$ta3,$at
202	$ST	$ta3,-$BNSZ($a0)
203	$ADDU	$v0,$at
204	.set	noreorder
205	bgtzl	$ta0,.L_bn_mul_add_words_loop
206	$LD	$t0,0($a1)
207
208	beqz	$a2,.L_bn_mul_add_words_return
209	nop
210
211.L_bn_mul_add_words_tail:
212	.set	reorder
213	$LD	$t0,0($a1)
214	$MULTU	$t0,$a3
215	$LD	$t1,0($a0)
216	subu	$a2,1
217	$ADDU	$t1,$v0
218	sltu	$v0,$t1,$v0
219	mflo	$at
220	mfhi	$t0
221	$ADDU	$t1,$at
222	$ADDU	$v0,$t0
223	sltu	$at,$t1,$at
224	$ST	$t1,0($a0)
225	$ADDU	$v0,$at
226	beqz	$a2,.L_bn_mul_add_words_return
227
228	$LD	$t0,$BNSZ($a1)
229	$MULTU	$t0,$a3
230	$LD	$t1,$BNSZ($a0)
231	subu	$a2,1
232	$ADDU	$t1,$v0
233	sltu	$v0,$t1,$v0
234	mflo	$at
235	mfhi	$t0
236	$ADDU	$t1,$at
237	$ADDU	$v0,$t0
238	sltu	$at,$t1,$at
239	$ST	$t1,$BNSZ($a0)
240	$ADDU	$v0,$at
241	beqz	$a2,.L_bn_mul_add_words_return
242
243	$LD	$t0,2*$BNSZ($a1)
244	$MULTU	$t0,$a3
245	$LD	$t1,2*$BNSZ($a0)
246	$ADDU	$t1,$v0
247	sltu	$v0,$t1,$v0
248	mflo	$at
249	mfhi	$t0
250	$ADDU	$t1,$at
251	$ADDU	$v0,$t0
252	sltu	$at,$t1,$at
253	$ST	$t1,2*$BNSZ($a0)
254	$ADDU	$v0,$at
255
256.L_bn_mul_add_words_return:
257	.set	noreorder
258___
# NUBI only: reload $t3..$t0 and $gp and release the frame. The $ra slot
# written by the prologue is not read back here.
259$code.=<<___ if ($flavour =~ /nubi/i);
260	$REG_L	$t3,4*$SZREG($sp)
261	$REG_L	$t2,3*$SZREG($sp)
262	$REG_L	$t1,2*$SZREG($sp)
263	$REG_L	$t0,1*$SZREG($sp)
264	$REG_L	$gp,0*$SZREG($sp)
265	$PTR_ADD $sp,6*$SZREG
266___
# Return from bn_mul_add_words_internal (the carry in $v0 is copied to
# $a0 in the jr delay slot), then the public bn_mul_words entry stub:
# it clears $v0 and continues in bn_mul_words_internal when $a2 > 0,
# otherwise it returns 0 straight away.
267$code.=<<___;
268	jr	$ra
269	move	$a0,$v0
270.end	bn_mul_add_words_internal
271
272.align	5
273.globl	bn_mul_words
274.ent	bn_mul_words
275bn_mul_words:
276	.set	noreorder
277	bgtz	$a2,bn_mul_words_internal
278	move	$v0,$zero
279	jr	$ra
280	move	$a0,$v0
281.end	bn_mul_words
282
283.align	5
284.ent	bn_mul_words_internal
285bn_mul_words_internal:
286___
# NUBI only: open a 6-register frame and save $ra, $t3..$t0 and $gp.
287$code.=<<___ if ($flavour =~ /nubi/i);
288	.frame	$sp,6*$SZREG,$ra
289	.mask	0x8000f008,-$SZREG
290	.set	noreorder
291	$PTR_SUB $sp,6*$SZREG
292	$REG_S	$ra,5*$SZREG($sp)
293	$REG_S	$t3,4*$SZREG($sp)
294	$REG_S	$t2,3*$SZREG($sp)
295	$REG_S	$t1,2*$SZREG($sp)
296	$REG_S	$t0,1*$SZREG($sp)
297	$REG_S	$gp,0*$SZREG($sp)
298___
# Body of bn_mul_words_internal.
#
# Register roles as used below:
#   $a0  destination word array (written only)
#   $a1  source word array
#   $a2  remaining word count
#   $a3  multiplier
#   $v0  running carry (cleared by the entry stub)
#
# Four words per pass in the main loop, up to three more in the tail.
# For every word the low half of the product plus the incoming carry is
# stored; the next carry is the high half plus the overflow bit of that
# addition.
299$code.=<<___;
300	.set	reorder
301	li	$minus4,-4
302	and	$ta0,$a2,$minus4
303	$LD	$t0,0($a1)
304	beqz	$ta0,.L_bn_mul_words_tail
305
306.L_bn_mul_words_loop:
307	$MULTU	$t0,$a3
308	$LD	$t2,$BNSZ($a1)
309	$LD	$ta0,2*$BNSZ($a1)
310	$LD	$ta2,3*$BNSZ($a1)
311	mflo	$at
312	mfhi	$t0
313	$ADDU	$v0,$at
314	sltu	$t1,$v0,$at
315	 $MULTU	$t2,$a3
316	$ST	$v0,0($a0)
317	$ADDU	$v0,$t1,$t0
318
319	subu	$a2,4
320	$PTR_ADD $a0,4*$BNSZ
321	$PTR_ADD $a1,4*$BNSZ
322	mflo	$at
323	mfhi	$t2
324	$ADDU	$v0,$at
325	sltu	$t3,$v0,$at
326	 $MULTU	$ta0,$a3
327	$ST	$v0,-3*$BNSZ($a0)
328	$ADDU	$v0,$t3,$t2
329
330	mflo	$at
331	mfhi	$ta0
332	$ADDU	$v0,$at
333	sltu	$ta1,$v0,$at
334	 $MULTU	$ta2,$a3
335	$ST	$v0,-2*$BNSZ($a0)
336	$ADDU	$v0,$ta1,$ta0
337
338	and	$ta0,$a2,$minus4
339	mflo	$at
340	mfhi	$ta2
341	$ADDU	$v0,$at
342	sltu	$ta3,$v0,$at
343	$ST	$v0,-$BNSZ($a0)
344	$ADDU	$v0,$ta3,$ta2
345	.set	noreorder
346	bgtzl	$ta0,.L_bn_mul_words_loop
347	$LD	$t0,0($a1)
348
349	beqz	$a2,.L_bn_mul_words_return
350	nop
351
352.L_bn_mul_words_tail:
353	.set	reorder
354	$LD	$t0,0($a1)
355	$MULTU	$t0,$a3
356	subu	$a2,1
357	mflo	$at
358	mfhi	$t0
359	$ADDU	$v0,$at
360	sltu	$t1,$v0,$at
361	$ST	$v0,0($a0)
362	$ADDU	$v0,$t1,$t0
363	beqz	$a2,.L_bn_mul_words_return
364
365	$LD	$t0,$BNSZ($a1)
366	$MULTU	$t0,$a3
367	subu	$a2,1
368	mflo	$at
369	mfhi	$t0
370	$ADDU	$v0,$at
371	sltu	$t1,$v0,$at
372	$ST	$v0,$BNSZ($a0)
373	$ADDU	$v0,$t1,$t0
374	beqz	$a2,.L_bn_mul_words_return
375
376	$LD	$t0,2*$BNSZ($a1)
377	$MULTU	$t0,$a3
378	mflo	$at
379	mfhi	$t0
380	$ADDU	$v0,$at
381	sltu	$t1,$v0,$at
382	$ST	$v0,2*$BNSZ($a0)
383	$ADDU	$v0,$t1,$t0
384
385.L_bn_mul_words_return:
386	.set	noreorder
387___
# NUBI only: reload $t3..$t0 and $gp and release the frame.
388$code.=<<___ if ($flavour =~ /nubi/i);
389	$REG_L	$t3,4*$SZREG($sp)
390	$REG_L	$t2,3*$SZREG($sp)
391	$REG_L	$t1,2*$SZREG($sp)
392	$REG_L	$t0,1*$SZREG($sp)
393	$REG_L	$gp,0*$SZREG($sp)
394	$PTR_ADD $sp,6*$SZREG
395___
# Return from bn_mul_words_internal (carry in $v0, copied to $a0), then
# the public bn_sqr_words entry stub: it clears $v0 and continues in
# bn_sqr_words_internal when $a2 > 0, otherwise it returns immediately.
396$code.=<<___;
397	jr	$ra
398	move	$a0,$v0
399.end	bn_mul_words_internal
400
401.align	5
402.globl	bn_sqr_words
403.ent	bn_sqr_words
404bn_sqr_words:
405	.set	noreorder
406	bgtz	$a2,bn_sqr_words_internal
407	move	$v0,$zero
408	jr	$ra
409	move	$a0,$v0
410.end	bn_sqr_words
411
412.align	5
413.ent	bn_sqr_words_internal
414bn_sqr_words_internal:
415___
# NUBI only: open a 6-register frame and save $ra, $t3..$t0 and $gp.
416$code.=<<___ if ($flavour =~ /nubi/i);
417	.frame	$sp,6*$SZREG,$ra
418	.mask	0x8000f008,-$SZREG
419	.set	noreorder
420	$PTR_SUB $sp,6*$SZREG
421	$REG_S	$ra,5*$SZREG($sp)
422	$REG_S	$t3,4*$SZREG($sp)
423	$REG_S	$t2,3*$SZREG($sp)
424	$REG_S	$t1,2*$SZREG($sp)
425	$REG_S	$t0,1*$SZREG($sp)
426	$REG_S	$gp,0*$SZREG($sp)
427___
# Body of bn_sqr_words_internal.
#
# Register roles as used below:
#   $a0  destination word array (two words written per input word)
#   $a1  source word array
#   $a2  remaining word count
#
# Each input word is multiplied by itself and the low and high halves of
# the product are stored to two consecutive destination words. The main
# loop consumes four input words (advancing $a0 by eight words) per pass;
# the tail handles up to three more. No carry is propagated between words.
428$code.=<<___;
429	.set	reorder
430	li	$minus4,-4
431	and	$ta0,$a2,$minus4
432	$LD	$t0,0($a1)
433	beqz	$ta0,.L_bn_sqr_words_tail
434
435.L_bn_sqr_words_loop:
436	$MULTU	$t0,$t0
437	$LD	$t2,$BNSZ($a1)
438	$LD	$ta0,2*$BNSZ($a1)
439	$LD	$ta2,3*$BNSZ($a1)
440	mflo	$t1
441	mfhi	$t0
442	$ST	$t1,0($a0)
443	$ST	$t0,$BNSZ($a0)
444
445	$MULTU	$t2,$t2
446	subu	$a2,4
447	$PTR_ADD $a0,8*$BNSZ
448	$PTR_ADD $a1,4*$BNSZ
449	mflo	$t3
450	mfhi	$t2
451	$ST	$t3,-6*$BNSZ($a0)
452	$ST	$t2,-5*$BNSZ($a0)
453
454	$MULTU	$ta0,$ta0
455	mflo	$ta1
456	mfhi	$ta0
457	$ST	$ta1,-4*$BNSZ($a0)
458	$ST	$ta0,-3*$BNSZ($a0)
459
460
461	$MULTU	$ta2,$ta2
462	and	$ta0,$a2,$minus4
463	mflo	$ta3
464	mfhi	$ta2
465	$ST	$ta3,-2*$BNSZ($a0)
466	$ST	$ta2,-$BNSZ($a0)
467
468	.set	noreorder
469	bgtzl	$ta0,.L_bn_sqr_words_loop
470	$LD	$t0,0($a1)
471
472	beqz	$a2,.L_bn_sqr_words_return
473	nop
474
475.L_bn_sqr_words_tail:
476	.set	reorder
477	$LD	$t0,0($a1)
478	$MULTU	$t0,$t0
479	subu	$a2,1
480	mflo	$t1
481	mfhi	$t0
482	$ST	$t1,0($a0)
483	$ST	$t0,$BNSZ($a0)
484	beqz	$a2,.L_bn_sqr_words_return
485
486	$LD	$t0,$BNSZ($a1)
487	$MULTU	$t0,$t0
488	subu	$a2,1
489	mflo	$t1
490	mfhi	$t0
491	$ST	$t1,2*$BNSZ($a0)
492	$ST	$t0,3*$BNSZ($a0)
493	beqz	$a2,.L_bn_sqr_words_return
494
495	$LD	$t0,2*$BNSZ($a1)
496	$MULTU	$t0,$t0
497	mflo	$t1
498	mfhi	$t0
499	$ST	$t1,4*$BNSZ($a0)
500	$ST	$t0,5*$BNSZ($a0)
501
502.L_bn_sqr_words_return:
503	.set	noreorder
504___
# NUBI only: reload $t3..$t0 and $gp and release the frame.
505$code.=<<___ if ($flavour =~ /nubi/i);
506	$REG_L	$t3,4*$SZREG($sp)
507	$REG_L	$t2,3*$SZREG($sp)
508	$REG_L	$t1,2*$SZREG($sp)
509	$REG_L	$t0,1*$SZREG($sp)
510	$REG_L	$gp,0*$SZREG($sp)
511	$PTR_ADD $sp,6*$SZREG
512___
# Return from bn_sqr_words_internal, then the public bn_add_words entry
# stub. Here the word count is in $a3: the stub clears $v0 and continues
# in bn_add_words_internal when $a3 > 0, otherwise it returns 0.
513$code.=<<___;
514	jr	$ra
515	move	$a0,$v0
516
517.end	bn_sqr_words_internal
518
519.align	5
520.globl	bn_add_words
521.ent	bn_add_words
522bn_add_words:
523	.set	noreorder
524	bgtz	$a3,bn_add_words_internal
525	move	$v0,$zero
526	jr	$ra
527	move	$a0,$v0
528.end	bn_add_words
529
530.align	5
531.ent	bn_add_words_internal
532bn_add_words_internal:
533___
# NUBI only: open a 6-register frame and save $ra, $t3..$t0 and $gp.
534$code.=<<___ if ($flavour =~ /nubi/i);
535	.frame	$sp,6*$SZREG,$ra
536	.mask	0x8000f008,-$SZREG
537	.set	noreorder
538	$PTR_SUB $sp,6*$SZREG
539	$REG_S	$ra,5*$SZREG($sp)
540	$REG_S	$t3,4*$SZREG($sp)
541	$REG_S	$t2,3*$SZREG($sp)
542	$REG_S	$t1,2*$SZREG($sp)
543	$REG_S	$t0,1*$SZREG($sp)
544	$REG_S	$gp,0*$SZREG($sp)
545___
# Body of bn_add_words_internal.
#
# Register roles as used below:
#   $a0      destination word array
#   $a1,$a2  the two source word arrays
#   $a3      remaining word count
#   $v0      running carry (cleared by the entry stub)
#
# Four words per pass in the main loop, up to three more in the tail.
# Each word is formed as a + b + carry; the carry out is the sum of the
# two sltu overflow checks (one per addition).
546$code.=<<___;
547	.set	reorder
548	li	$minus4,-4
549	and	$at,$a3,$minus4
550	$LD	$t0,0($a1)
551	beqz	$at,.L_bn_add_words_tail
552
553.L_bn_add_words_loop:
554	$LD	$ta0,0($a2)
555	subu	$a3,4
556	$LD	$t1,$BNSZ($a1)
557	and	$at,$a3,$minus4
558	$LD	$t2,2*$BNSZ($a1)
559	$PTR_ADD $a2,4*$BNSZ
560	$LD	$t3,3*$BNSZ($a1)
561	$PTR_ADD $a0,4*$BNSZ
562	$LD	$ta1,-3*$BNSZ($a2)
563	$PTR_ADD $a1,4*$BNSZ
564	$LD	$ta2,-2*$BNSZ($a2)
565	$LD	$ta3,-$BNSZ($a2)
566	$ADDU	$ta0,$t0
567	sltu	$t8,$ta0,$t0
568	$ADDU	$t0,$ta0,$v0
569	sltu	$v0,$t0,$ta0
570	$ST	$t0,-4*$BNSZ($a0)
571	$ADDU	$v0,$t8
572
573	$ADDU	$ta1,$t1
574	sltu	$t9,$ta1,$t1
575	$ADDU	$t1,$ta1,$v0
576	sltu	$v0,$t1,$ta1
577	$ST	$t1,-3*$BNSZ($a0)
578	$ADDU	$v0,$t9
579
580	$ADDU	$ta2,$t2
581	sltu	$t8,$ta2,$t2
582	$ADDU	$t2,$ta2,$v0
583	sltu	$v0,$t2,$ta2
584	$ST	$t2,-2*$BNSZ($a0)
585	$ADDU	$v0,$t8
586
587	$ADDU	$ta3,$t3
588	sltu	$t9,$ta3,$t3
589	$ADDU	$t3,$ta3,$v0
590	sltu	$v0,$t3,$ta3
591	$ST	$t3,-$BNSZ($a0)
592	$ADDU	$v0,$t9
593
594	.set	noreorder
595	bgtzl	$at,.L_bn_add_words_loop
596	$LD	$t0,0($a1)
597
598	beqz	$a3,.L_bn_add_words_return
599	nop
600
601.L_bn_add_words_tail:
602	.set	reorder
603	$LD	$t0,0($a1)
604	$LD	$ta0,0($a2)
605	$ADDU	$ta0,$t0
606	subu	$a3,1
607	sltu	$t8,$ta0,$t0
608	$ADDU	$t0,$ta0,$v0
609	sltu	$v0,$t0,$ta0
610	$ST	$t0,0($a0)
611	$ADDU	$v0,$t8
612	beqz	$a3,.L_bn_add_words_return
613
614	$LD	$t1,$BNSZ($a1)
615	$LD	$ta1,$BNSZ($a2)
616	$ADDU	$ta1,$t1
617	subu	$a3,1
618	sltu	$t9,$ta1,$t1
619	$ADDU	$t1,$ta1,$v0
620	sltu	$v0,$t1,$ta1
621	$ST	$t1,$BNSZ($a0)
622	$ADDU	$v0,$t9
623	beqz	$a3,.L_bn_add_words_return
624
625	$LD	$t2,2*$BNSZ($a1)
626	$LD	$ta2,2*$BNSZ($a2)
627	$ADDU	$ta2,$t2
628	sltu	$t8,$ta2,$t2
629	$ADDU	$t2,$ta2,$v0
630	sltu	$v0,$t2,$ta2
631	$ST	$t2,2*$BNSZ($a0)
632	$ADDU	$v0,$t8
633
634.L_bn_add_words_return:
635	.set	noreorder
636___
# NUBI only: reload $t3..$t0 and $gp and release the frame.
637$code.=<<___ if ($flavour =~ /nubi/i);
638	$REG_L	$t3,4*$SZREG($sp)
639	$REG_L	$t2,3*$SZREG($sp)
640	$REG_L	$t1,2*$SZREG($sp)
641	$REG_L	$t0,1*$SZREG($sp)
642	$REG_L	$gp,0*$SZREG($sp)
643	$PTR_ADD $sp,6*$SZREG
644___
# Return from bn_add_words_internal (carry in $v0, copied to $a0), then
# the public bn_sub_words entry stub: with the word count in $a3 it
# clears $v0 and continues in bn_sub_words_internal when $a3 > 0,
# otherwise it returns 0. Note that this stub writes $zero to $a0
# directly where the sibling stubs copy the (equally zero) $v0.
645$code.=<<___;
646	jr	$ra
647	move	$a0,$v0
648
649.end	bn_add_words_internal
650
651.align	5
652.globl	bn_sub_words
653.ent	bn_sub_words
654bn_sub_words:
655	.set	noreorder
656	bgtz	$a3,bn_sub_words_internal
657	move	$v0,$zero
658	jr	$ra
659	move	$a0,$zero
660.end	bn_sub_words
661
662.align	5
663.ent	bn_sub_words_internal
664bn_sub_words_internal:
665___
# NUBI only: open a 6-register frame and save $ra, $t3..$t0 and $gp.
666$code.=<<___ if ($flavour =~ /nubi/i);
667	.frame	$sp,6*$SZREG,$ra
668	.mask	0x8000f008,-$SZREG
669	.set	noreorder
670	$PTR_SUB $sp,6*$SZREG
671	$REG_S	$ra,5*$SZREG($sp)
672	$REG_S	$t3,4*$SZREG($sp)
673	$REG_S	$t2,3*$SZREG($sp)
674	$REG_S	$t1,2*$SZREG($sp)
675	$REG_S	$t0,1*$SZREG($sp)
676	$REG_S	$gp,0*$SZREG($sp)
677___
# Body of bn_sub_words_internal.
#
# Register roles as used below:
#   $a0  destination word array
#   $a1  minuend word array
#   $a2  subtrahend word array
#   $a3  remaining word count
#   $v0  running borrow (cleared by the entry stub)
#
# Four words per pass in the main loop, up to three more in the tail.
# Each word is formed as a - b - borrow; the borrow out is the sum of the
# sltu check (a < b) and the sgtu check on the second subtraction.
678$code.=<<___;
679	.set	reorder
680	li	$minus4,-4
681	and	$at,$a3,$minus4
682	$LD	$t0,0($a1)
683	beqz	$at,.L_bn_sub_words_tail
684
685.L_bn_sub_words_loop:
686	$LD	$ta0,0($a2)
687	subu	$a3,4
688	$LD	$t1,$BNSZ($a1)
689	and	$at,$a3,$minus4
690	$LD	$t2,2*$BNSZ($a1)
691	$PTR_ADD $a2,4*$BNSZ
692	$LD	$t3,3*$BNSZ($a1)
693	$PTR_ADD $a0,4*$BNSZ
694	$LD	$ta1,-3*$BNSZ($a2)
695	$PTR_ADD $a1,4*$BNSZ
696	$LD	$ta2,-2*$BNSZ($a2)
697	$LD	$ta3,-$BNSZ($a2)
698	sltu	$t8,$t0,$ta0
699	$SUBU	$ta0,$t0,$ta0
700	$SUBU	$t0,$ta0,$v0
701	sgtu	$v0,$t0,$ta0
702	$ST	$t0,-4*$BNSZ($a0)
703	$ADDU	$v0,$t8
704
705	sltu	$t9,$t1,$ta1
706	$SUBU	$ta1,$t1,$ta1
707	$SUBU	$t1,$ta1,$v0
708	sgtu	$v0,$t1,$ta1
709	$ST	$t1,-3*$BNSZ($a0)
710	$ADDU	$v0,$t9
711
712
713	sltu	$t8,$t2,$ta2
714	$SUBU	$ta2,$t2,$ta2
715	$SUBU	$t2,$ta2,$v0
716	sgtu	$v0,$t2,$ta2
717	$ST	$t2,-2*$BNSZ($a0)
718	$ADDU	$v0,$t8
719
720	sltu	$t9,$t3,$ta3
721	$SUBU	$ta3,$t3,$ta3
722	$SUBU	$t3,$ta3,$v0
723	sgtu	$v0,$t3,$ta3
724	$ST	$t3,-$BNSZ($a0)
725	$ADDU	$v0,$t9
726
727	.set	noreorder
728	bgtzl	$at,.L_bn_sub_words_loop
729	$LD	$t0,0($a1)
730
731	beqz	$a3,.L_bn_sub_words_return
732	nop
733
734.L_bn_sub_words_tail:
735	.set	reorder
736	$LD	$t0,0($a1)
737	$LD	$ta0,0($a2)
738	subu	$a3,1
739	sltu	$t8,$t0,$ta0
740	$SUBU	$ta0,$t0,$ta0
741	$SUBU	$t0,$ta0,$v0
742	sgtu	$v0,$t0,$ta0
743	$ST	$t0,0($a0)
744	$ADDU	$v0,$t8
745	beqz	$a3,.L_bn_sub_words_return
746
747	$LD	$t1,$BNSZ($a1)
748	subu	$a3,1
749	$LD	$ta1,$BNSZ($a2)
750	sltu	$t9,$t1,$ta1
751	$SUBU	$ta1,$t1,$ta1
752	$SUBU	$t1,$ta1,$v0
753	sgtu	$v0,$t1,$ta1
754	$ST	$t1,$BNSZ($a0)
755	$ADDU	$v0,$t9
756	beqz	$a3,.L_bn_sub_words_return
757
758	$LD	$t2,2*$BNSZ($a1)
759	$LD	$ta2,2*$BNSZ($a2)
760	sltu	$t8,$t2,$ta2
761	$SUBU	$ta2,$t2,$ta2
762	$SUBU	$t2,$ta2,$v0
763	sgtu	$v0,$t2,$ta2
764	$ST	$t2,2*$BNSZ($a0)
765	$ADDU	$v0,$t8
766
767.L_bn_sub_words_return:
768	.set	noreorder
769___
# NUBI only: reload $t3..$t0 and $gp and release the frame.
770$code.=<<___ if ($flavour =~ /nubi/i);
771	$REG_L	$t3,4*$SZREG($sp)
772	$REG_L	$t2,3*$SZREG($sp)
773	$REG_L	$t1,2*$SZREG($sp)
774	$REG_L	$t0,1*$SZREG($sp)
775	$REG_L	$gp,0*$SZREG($sp)
776	$PTR_ADD $sp,6*$SZREG
777___
# Return from bn_sub_words_internal (borrow in $v0, copied to $a0), then
# the public bn_div_3_words entry stub.
#
# The stub keeps its first two arguments in registers instead of on the
# stack: the incoming $a0 (a pointer) is parked in $a3 and the incoming
# $a1 in $ta2. It then loads the word at ($a3) into $a0 and, in the
# branch delay slot, the word just below it into $a1. When the loaded
# $a0 differs from $a2 execution continues in bn_div_3_words_internal;
# when they are equal the stub returns -1 (all bits set).
778$code.=<<___;
779	jr	$ra
780	move	$a0,$v0
781.end	bn_sub_words_internal
782
783.align 5
784.globl	bn_div_3_words
785.ent	bn_div_3_words
786bn_div_3_words:
787	.set	noreorder
788	move	$a3,$a0		# we know that bn_div_words does not
789				# touch $a3, $ta2, $ta3 and preserves $a2
790				# so that we can save two arguments
791				# and return address in registers
792				# instead of stack:-)
793
794	$LD	$a0,($a3)
795	move	$ta2,$a1
796	bne	$a0,$a2,bn_div_3_words_internal
797	$LD	$a1,-$BNSZ($a3)
798	li	$v0,-1
799	jr	$ra
800	move	$a0,$v0
801.end	bn_div_3_words
802
803.align	5
804.ent	bn_div_3_words_internal
805bn_div_3_words_internal:
806___
# NUBI only: open a 6-register frame and save $ra, $t3..$t0 and $gp.
807$code.=<<___ if ($flavour =~ /nubi/i);
808	.frame	$sp,6*$SZREG,$ra
809	.mask	0x8000f008,-$SZREG
810	.set	noreorder
811	$PTR_SUB $sp,6*$SZREG
812	$REG_S	$ra,5*$SZREG($sp)
813	$REG_S	$t3,4*$SZREG($sp)
814	$REG_S	$t2,3*$SZREG($sp)
815	$REG_S	$t1,2*$SZREG($sp)
816	$REG_S	$t0,1*$SZREG($sp)
817	$REG_S	$gp,0*$SZREG($sp)
818___
# Body of bn_div_3_words_internal.
#
# The return address is parked in $ta3 around the "bal bn_div_words" call
# and put back afterwards. The quotient estimate returned in $v0 is then
# multiplied by $ta2 (the stub's saved second argument) and compared, as
# a two-word value in $t1:$t0, against $a1 and the word loaded from two
# positions below ($a3). The inner loop decrements $v0 (in the beqzl
# delay slot) while adjusting $t1:$t0 and $a1, and stops once the
# comparison flags in $t8/$at say the estimate is no longer too large.
# NOTE(review): this looks like the quotient-correction step of the C
# bn_div inner loop -- confirm against the C reference implementation.
819$code.=<<___;
820	.set	reorder
821	move	$ta3,$ra
822	bal	bn_div_words
823	move	$ra,$ta3
824	$MULTU	$ta2,$v0
825	$LD	$t2,-2*$BNSZ($a3)
826	move	$ta0,$zero
827	mfhi	$t1
828	mflo	$t0
829	sltu	$t8,$t1,$a1
830.L_bn_div_3_words_inner_loop:
831	bnez	$t8,.L_bn_div_3_words_inner_loop_done
832	sgeu	$at,$t2,$t0
833	seq	$t9,$t1,$a1
834	and	$at,$t9
835	sltu	$t3,$t0,$ta2
836	$ADDU	$a1,$a2
837	$SUBU	$t1,$t3
838	$SUBU	$t0,$ta2
839	sltu	$t8,$t1,$a1
840	sltu	$ta0,$a1,$a2
841	or	$t8,$ta0
842	.set	noreorder
843	beqzl	$at,.L_bn_div_3_words_inner_loop
844	$SUBU	$v0,1
845	.set	reorder
846.L_bn_div_3_words_inner_loop_done:
847	.set	noreorder
848___
# NUBI only: reload $t3..$t0 and $gp and release the frame.
849$code.=<<___ if ($flavour =~ /nubi/i);
850	$REG_L	$t3,4*$SZREG($sp)
851	$REG_L	$t2,3*$SZREG($sp)
852	$REG_L	$t1,2*$SZREG($sp)
853	$REG_L	$t0,1*$SZREG($sp)
854	$REG_L	$gp,0*$SZREG($sp)
855	$PTR_ADD $sp,6*$SZREG
856___
# Return from bn_div_3_words_internal (result in $v0, copied to $a0),
# then the public bn_div_words entry stub: a non-zero divisor in $a2
# continues in bn_div_words_internal; a zero divisor returns -1 (loaded
# into $v0 in the branch delay slot).
857$code.=<<___;
858	jr	$ra
859	move	$a0,$v0
860.end	bn_div_3_words_internal
861
862.align	5
863.globl	bn_div_words
864.ent	bn_div_words
865bn_div_words:
866	.set	noreorder
867	bnez	$a2,bn_div_words_internal
868	li	$v0,-1		# I would rather signal div-by-zero
869				# which can be done with 'break 7'
870	jr	$ra
871	move	$a0,$v0
872.end	bn_div_words
873
874.align	5
875.ent	bn_div_words_internal
876bn_div_words_internal:
877___
# NUBI only: open a 6-register frame and save $ra, $t3..$t0 and $gp.
878$code.=<<___ if ($flavour =~ /nubi/i);
879	.frame	$sp,6*$SZREG,$ra
880	.mask	0x8000f008,-$SZREG
881	.set	noreorder
882	$PTR_SUB $sp,6*$SZREG
883	$REG_S	$ra,5*$SZREG($sp)
884	$REG_S	$t3,4*$SZREG($sp)
885	$REG_S	$t2,3*$SZREG($sp)
886	$REG_S	$t1,2*$SZREG($sp)
887	$REG_S	$t0,1*$SZREG($sp)
888	$REG_S	$gp,0*$SZREG($sp)
889___
# Body of bn_div_words_internal, part 1: normalization.
#
# $a0:$a1 is the two-word dividend (high:low) and $a2 the divisor. If the
# divisor's top bit is not already set it is shifted left one bit at a
# time until it is, counting the shifts in $t9. The dividend is shifted
# left by the same amount; if that would shift non-zero bits out of $a0
# the code executes "break 6" to signal overflow.
890$code.=<<___;
891	move	$v1,$zero
892	bltz	$a2,.L_bn_div_words_body
893	move	$t9,$v1
894	$SLL	$a2,1
895	bgtz	$a2,.-4
896	addu	$t9,1
897
898	.set	reorder
899	negu	$t1,$t9
900	li	$t2,-1
901	$SLL	$t2,$t1
902	and	$t2,$a0
903	$SRL	$at,$a1,$t1
904	.set	noreorder
905	bnezl	$t2,.+8
906	break	6		# signal overflow
907	.set	reorder
908	$SLL	$a0,$t9
909	$SLL	$a1,$t9
910	or	$a0,$at
911___
# Symbolic names used by the division body below: $QT is the current
# half-word quotient estimate, $HH the upper half of the running
# remainder and $DH the upper half of the (normalized) divisor.
# 4*$BNSZ is the number of bits in half a word.
912$QT=$ta0;
913$HH=$ta1;
914$DH=$v1;
# Part 2: the division proper, done as two half-word steps. Each step
# estimates a quotient half with $DIVU (or uses all ones when $HH equals
# $DH), multiplies it back by the divisor and decrements the estimate in
# the inner loop until the product no longer exceeds the running
# remainder. The two halves are combined in $v0; the remainder and the
# divisor are shifted back right by $t9 at the end, and the remainder is
# left in both $v1 and $a1.
915$code.=<<___;
916.L_bn_div_words_body:
917	$SRL	$DH,$a2,4*$BNSZ	# bits
918	sgeu	$at,$a0,$a2
919	.set	noreorder
920	bnezl	$at,.+8
921	$SUBU	$a0,$a2
922	.set	reorder
923
924	li	$QT,-1
925	$SRL	$HH,$a0,4*$BNSZ	# bits
926	$SRL	$QT,4*$BNSZ	# q=0xffffffff
927	beq	$DH,$HH,.L_bn_div_words_skip_div1
928	$DIVU	$zero,$a0,$DH
929	mflo	$QT
930.L_bn_div_words_skip_div1:
931	$MULTU	$a2,$QT
932	$SLL	$t3,$a0,4*$BNSZ	# bits
933	$SRL	$at,$a1,4*$BNSZ	# bits
934	or	$t3,$at
935	mflo	$t0
936	mfhi	$t1
937.L_bn_div_words_inner_loop1:
938	sltu	$t2,$t3,$t0
939	seq	$t8,$HH,$t1
940	sltu	$at,$HH,$t1
941	and	$t2,$t8
942	sltu	$v0,$t0,$a2
943	or	$at,$t2
944	.set	noreorder
945	beqz	$at,.L_bn_div_words_inner_loop1_done
946	$SUBU	$t1,$v0
947	$SUBU	$t0,$a2
948	b	.L_bn_div_words_inner_loop1
949	$SUBU	$QT,1
950	.set	reorder
951.L_bn_div_words_inner_loop1_done:
952
953	$SLL	$a1,4*$BNSZ	# bits
954	$SUBU	$a0,$t3,$t0
955	$SLL	$v0,$QT,4*$BNSZ	# bits
956
957	li	$QT,-1
958	$SRL	$HH,$a0,4*$BNSZ	# bits
959	$SRL	$QT,4*$BNSZ	# q=0xffffffff
960	beq	$DH,$HH,.L_bn_div_words_skip_div2
961	$DIVU	$zero,$a0,$DH
962	mflo	$QT
963.L_bn_div_words_skip_div2:
964	$MULTU	$a2,$QT
965	$SLL	$t3,$a0,4*$BNSZ	# bits
966	$SRL	$at,$a1,4*$BNSZ	# bits
967	or	$t3,$at
968	mflo	$t0
969	mfhi	$t1
970.L_bn_div_words_inner_loop2:
971	sltu	$t2,$t3,$t0
972	seq	$t8,$HH,$t1
973	sltu	$at,$HH,$t1
974	and	$t2,$t8
975	sltu	$v1,$t0,$a2
976	or	$at,$t2
977	.set	noreorder
978	beqz	$at,.L_bn_div_words_inner_loop2_done
979	$SUBU	$t1,$v1
980	$SUBU	$t0,$a2
981	b	.L_bn_div_words_inner_loop2
982	$SUBU	$QT,1
983	.set	reorder
984.L_bn_div_words_inner_loop2_done:
985
986	$SUBU	$a0,$t3,$t0
987	or	$v0,$QT
988	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
989	$SRL	$a2,$t9		# restore $a2
990
991	.set	noreorder
992	move	$a1,$v1
993___
# NUBI only: reload $t3..$t0 and $gp and release the frame. For NUBI
# flavours $gp is an alias of $v1, so this reload also overwrites the
# remainder that was just placed in $v1 ($a1 still holds it).
994$code.=<<___ if ($flavour =~ /nubi/i);
995	$REG_L	$t3,4*$SZREG($sp)
996	$REG_L	$t2,3*$SZREG($sp)
997	$REG_L	$t1,2*$SZREG($sp)
998	$REG_L	$t0,1*$SZREG($sp)
999	$REG_L	$gp,0*$SZREG($sp)
1000	$PTR_ADD $sp,6*$SZREG
1001___
# Return from bn_div_words_internal: quotient in $v0, copied to $a0 in
# the jr delay slot.
1002$code.=<<___;
1003	jr	$ra
1004	move	$a0,$v0
1005.end	bn_div_words_internal
1006___
# The division-only aliases are no longer needed past this point.
1007undef $HH; undef $QT; undef $DH;
1008
# Register assignment for the comba multiplication templates that follow.
#
# $a_0..$a_7 and $b_0..$b_7 hold the eight words of each operand. The
# last word of each operand reuses the register that carried the
# operand's pointer ($a1 and $a2 respectively): once a[7] / b[7] has been
# loaded the pointer is of no further use.
($a_0,$a_1,$a_2,$a_3,$a_4,$a_5,$a_6,$a_7) =
	($t0, $t1, $t2, $t3, $s0, $s2, $s4, $a1);
($b_0,$b_1,$b_2,$b_3,$b_4,$b_5,$b_6,$b_7) =
	($ta0,$ta1,$ta2,$ta3,$s1, $s3, $s5, $a2);

# $t_1/$t_2 receive the low/high halves of each product; $c_1..$c_3 are
# the three rotating accumulator words.
($t_1,$t_2)      = ($t8,$t9);
($c_1,$c_2,$c_3) = ($v0,$v1,$a3);
1016
# Emit the public bn_mul_comba8 label. Unlike the routines above there
# is no separate entry stub; the register-save prologue follows directly.
1017$code.=<<___;
1018
1019.align	5
1020.globl	bn_mul_comba8
1021.ent	bn_mul_comba8
1022bn_mul_comba8:
1023	.set	noreorder
1024___
# NUBI prologue: 12-register frame saving $ra, $s5..$s0, $t3..$t0 and
# $gp (an alias of $v1 for NUBI flavours).
1025$code.=<<___ if ($flavour =~ /nubi/i);
1026	.frame	$sp,12*$SZREG,$ra
1027	.mask	0x803ff008,-$SZREG
1028	$PTR_SUB $sp,12*$SZREG
1029	$REG_S	$ra,11*$SZREG($sp)
1030	$REG_S	$s5,10*$SZREG($sp)
1031	$REG_S	$s4,9*$SZREG($sp)
1032	$REG_S	$s3,8*$SZREG($sp)
1033	$REG_S	$s2,7*$SZREG($sp)
1034	$REG_S	$s1,6*$SZREG($sp)
1035	$REG_S	$s0,5*$SZREG($sp)
1036	$REG_S	$t3,4*$SZREG($sp)
1037	$REG_S	$t2,3*$SZREG($sp)
1038	$REG_S	$t1,2*$SZREG($sp)
1039	$REG_S	$t0,1*$SZREG($sp)
1040	$REG_S	$gp,0*$SZREG($sp)
1041___
# Prologue for every other flavour: 6-register frame saving only
# $s5..$s0, the registers the comba operand aliases occupy.
1042$code.=<<___ if ($flavour !~ /nubi/i);
1043	.frame	$sp,6*$SZREG,$ra
1044	.mask	0x003f0000,-$SZREG
1045	$PTR_SUB $sp,6*$SZREG
1046	$REG_S	$s5,5*$SZREG($sp)
1047	$REG_S	$s4,4*$SZREG($sp)
1048	$REG_S	$s3,3*$SZREG($sp)
1049	$REG_S	$s2,2*$SZREG($sp)
1050	$REG_S	$s1,1*$SZREG($sp)
1051	$REG_S	$s0,0*$SZREG($sp)
1052___
1053$code.=<<___;
1054
1055	.set	reorder
1056	$LD	$a_0,0($a1)	# If compiled with -mips3 option on
1057				# R5000 box assembler barks on this
1058				# 1ine with "should not have mult/div
1059				# as last instruction in bb (R10K
1060				# bug)" warning. If anybody out there
1061				# has a clue about how to circumvent
1062				# this do send me a note.
1063				#		<appro\@fy.chalmers.se>
1064
1065	$LD	$b_0,0($a2)
1066	$LD	$a_1,$BNSZ($a1)
1067	$LD	$a_2,2*$BNSZ($a1)
1068	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
1069	$LD	$a_3,3*$BNSZ($a1)
1070	$LD	$b_1,$BNSZ($a2)
1071	$LD	$b_2,2*$BNSZ($a2)
1072	$LD	$b_3,3*$BNSZ($a2)
1073	mflo	$c_1
1074	mfhi	$c_2
1075
1076	$LD	$a_4,4*$BNSZ($a1)
1077	$LD	$a_5,5*$BNSZ($a1)
1078	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
1079	$LD	$a_6,6*$BNSZ($a1)
1080	$LD	$a_7,7*$BNSZ($a1)
1081	$LD	$b_4,4*$BNSZ($a2)
1082	$LD	$b_5,5*$BNSZ($a2)
1083	mflo	$t_1
1084	mfhi	$t_2
1085	$ADDU	$c_2,$t_1
1086	sltu	$at,$c_2,$t_1
1087	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
1088	$ADDU	$c_3,$t_2,$at
1089	$LD	$b_6,6*$BNSZ($a2)
1090	$LD	$b_7,7*$BNSZ($a2)
1091	$ST	$c_1,0($a0)	# r[0]=c1;
1092	mflo	$t_1
1093	mfhi	$t_2
1094	$ADDU	$c_2,$t_1
1095	sltu	$at,$c_2,$t_1
1096	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
1097	$ADDU	$t_2,$at
1098	$ADDU	$c_3,$t_2
1099	sltu	$c_1,$c_3,$t_2
1100	$ST	$c_2,$BNSZ($a0)	# r[1]=c2;
1101
1102	mflo	$t_1
1103	mfhi	$t_2
1104	$ADDU	$c_3,$t_1
1105	sltu	$at,$c_3,$t_1
1106	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
1107	$ADDU	$t_2,$at
1108	$ADDU	$c_1,$t_2
1109	mflo	$t_1
1110	mfhi	$t_2
1111	$ADDU	$c_3,$t_1
1112	sltu	$at,$c_3,$t_1
1113	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
1114	$ADDU	$t_2,$at
1115	$ADDU	$c_1,$t_2
1116	sltu	$c_2,$c_1,$t_2
1117	mflo	$t_1
1118	mfhi	$t_2
1119	$ADDU	$c_3,$t_1
1120	sltu	$at,$c_3,$t_1
1121	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
1122	$ADDU	$t_2,$at
1123	$ADDU	$c_1,$t_2
1124	sltu	$at,$c_1,$t_2
1125	$ADDU	$c_2,$at
1126	$ST	$c_3,2*$BNSZ($a0)	# r[2]=c3;
1127
1128	mflo	$t_1
1129	mfhi	$t_2
1130	$ADDU	$c_1,$t_1
1131	sltu	$at,$c_1,$t_1
1132	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
1133	$ADDU	$t_2,$at
1134	$ADDU	$c_2,$t_2
1135	sltu	$c_3,$c_2,$t_2
1136	mflo	$t_1
1137	mfhi	$t_2
1138	$ADDU	$c_1,$t_1
1139	sltu	$at,$c_1,$t_1
1140	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
1141	$ADDU	$t_2,$at
1142	$ADDU	$c_2,$t_2
1143	sltu	$at,$c_2,$t_2
1144	$ADDU	$c_3,$at
1145	mflo	$t_1
1146	mfhi	$t_2
1147	$ADDU	$c_1,$t_1
1148	sltu	$at,$c_1,$t_1
1149	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
1150	$ADDU	$t_2,$at
1151	$ADDU	$c_2,$t_2
1152	sltu	$at,$c_2,$t_2
1153	$ADDU	$c_3,$at
1154	mflo	$t_1
1155	mfhi	$t_2
1156	$ADDU	$c_1,$t_1
1157	sltu	$at,$c_1,$t_1
1158	 $MULTU	$a_4,$b_0		# mul_add_c(a[4],b[0],c2,c3,c1);
1159	$ADDU	$t_2,$at
1160	$ADDU	$c_2,$t_2
1161	sltu	$at,$c_2,$t_2
1162	$ADDU	$c_3,$at
1163	$ST	$c_1,3*$BNSZ($a0)	# r[3]=c1;
1164
1165	mflo	$t_1
1166	mfhi	$t_2
1167	$ADDU	$c_2,$t_1
1168	sltu	$at,$c_2,$t_1
1169	$MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
1170	$ADDU	$t_2,$at
1171	$ADDU	$c_3,$t_2
1172	sltu	$c_1,$c_3,$t_2
1173	mflo	$t_1
1174	mfhi	$t_2
1175	$ADDU	$c_2,$t_1
1176	sltu	$at,$c_2,$t_1
1177	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
1178	$ADDU	$t_2,$at
1179	$ADDU	$c_3,$t_2
1180	sltu	$at,$c_3,$t_2
1181	$ADDU	$c_1,$at
1182	mflo	$t_1
1183	mfhi	$t_2
1184	$ADDU	$c_2,$t_1
1185	sltu	$at,$c_2,$t_1
1186	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
1187	$ADDU	$t_2,$at
1188	$ADDU	$c_3,$t_2
1189	sltu	$at,$c_3,$t_2
1190	$ADDU	$c_1,$at
1191	mflo	$t_1
1192	mfhi	$t_2
1193	$ADDU	$c_2,$t_1
1194	sltu	$at,$c_2,$t_1
1195	$MULTU	$a_0,$b_4		# mul_add_c(a[0],b[4],c2,c3,c1);
1196	$ADDU	$t_2,$at
1197	$ADDU	$c_3,$t_2
1198	sltu	$at,$c_3,$t_2
1199	$ADDU	$c_1,$at
1200	mflo	$t_1
1201	mfhi	$t_2
1202	$ADDU	$c_2,$t_1
1203	sltu	$at,$c_2,$t_1
1204	 $MULTU	$a_0,$b_5		# mul_add_c(a[0],b[5],c3,c1,c2);
1205	$ADDU	$t_2,$at
1206	$ADDU	$c_3,$t_2
1207	sltu	$at,$c_3,$t_2
1208	$ADDU	$c_1,$at
1209	$ST	$c_2,4*$BNSZ($a0)	# r[4]=c2;
1210
1211	mflo	$t_1
1212	mfhi	$t_2
1213	$ADDU	$c_3,$t_1
1214	sltu	$at,$c_3,$t_1
1215	$MULTU	$a_1,$b_4		# mul_add_c(a[1],b[4],c3,c1,c2);
1216	$ADDU	$t_2,$at
1217	$ADDU	$c_1,$t_2
1218	sltu	$c_2,$c_1,$t_2
1219	mflo	$t_1
1220	mfhi	$t_2
1221	$ADDU	$c_3,$t_1
1222	sltu	$at,$c_3,$t_1
1223	$MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
1224	$ADDU	$t_2,$at
1225	$ADDU	$c_1,$t_2
1226	sltu	$at,$c_1,$t_2
1227	$ADDU	$c_2,$at
1228	mflo	$t_1
1229	mfhi	$t_2
1230	$ADDU	$c_3,$t_1
1231	sltu	$at,$c_3,$t_1
1232	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
1233	$ADDU	$t_2,$at
1234	$ADDU	$c_1,$t_2
1235	sltu	$at,$c_1,$t_2
1236	$ADDU	$c_2,$at
1237	mflo	$t_1
1238	mfhi	$t_2
1239	$ADDU	$c_3,$t_1
1240	sltu	$at,$c_3,$t_1
1241	$MULTU	$a_4,$b_1		# mul_add_c(a[4],b[1],c3,c1,c2);
1242	$ADDU	$t_2,$at
1243	$ADDU	$c_1,$t_2
1244	sltu	$at,$c_1,$t_2
1245	$ADDU	$c_2,$at
1246	mflo	$t_1
1247	mfhi	$t_2
1248	$ADDU	$c_3,$t_1
1249	sltu	$at,$c_3,$t_1
1250	$MULTU	$a_5,$b_0		# mul_add_c(a[5],b[0],c3,c1,c2);
1251	$ADDU	$t_2,$at
1252	$ADDU	$c_1,$t_2
1253	sltu	$at,$c_1,$t_2
1254	$ADDU	$c_2,$at
1255	mflo	$t_1
1256	mfhi	$t_2
1257	$ADDU	$c_3,$t_1
1258	sltu	$at,$c_3,$t_1
1259	 $MULTU	$a_6,$b_0		# mul_add_c(a[6],b[0],c1,c2,c3);
1260	$ADDU	$t_2,$at
1261	$ADDU	$c_1,$t_2
1262	sltu	$at,$c_1,$t_2
1263	$ADDU	$c_2,$at
1264	$ST	$c_3,5*$BNSZ($a0)	# r[5]=c3;
1265
1266	mflo	$t_1
1267	mfhi	$t_2
1268	$ADDU	$c_1,$t_1
1269	sltu	$at,$c_1,$t_1
1270	$MULTU	$a_5,$b_1		# mul_add_c(a[5],b[1],c1,c2,c3);
1271	$ADDU	$t_2,$at
1272	$ADDU	$c_2,$t_2
1273	sltu	$c_3,$c_2,$t_2
1274	mflo	$t_1
1275	mfhi	$t_2
1276	$ADDU	$c_1,$t_1
1277	sltu	$at,$c_1,$t_1
1278	$MULTU	$a_4,$b_2		# mul_add_c(a[4],b[2],c1,c2,c3);
1279	$ADDU	$t_2,$at
1280	$ADDU	$c_2,$t_2
1281	sltu	$at,$c_2,$t_2
1282	$ADDU	$c_3,$at
1283	mflo	$t_1
1284	mfhi	$t_2
1285	$ADDU	$c_1,$t_1
1286	sltu	$at,$c_1,$t_1
1287	$MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
1288	$ADDU	$t_2,$at
1289	$ADDU	$c_2,$t_2
1290	sltu	$at,$c_2,$t_2
1291	$ADDU	$c_3,$at
1292	mflo	$t_1
1293	mfhi	$t_2
1294	$ADDU	$c_1,$t_1
1295	sltu	$at,$c_1,$t_1
1296	$MULTU	$a_2,$b_4		# mul_add_c(a[2],b[4],c1,c2,c3);
1297	$ADDU	$t_2,$at
1298	$ADDU	$c_2,$t_2
1299	sltu	$at,$c_2,$t_2
1300	$ADDU	$c_3,$at
1301	mflo	$t_1
1302	mfhi	$t_2
1303	$ADDU	$c_1,$t_1
1304	sltu	$at,$c_1,$t_1
1305	$MULTU	$a_1,$b_5		# mul_add_c(a[1],b[5],c1,c2,c3);
1306	$ADDU	$t_2,$at
1307	$ADDU	$c_2,$t_2
1308	sltu	$at,$c_2,$t_2
1309	$ADDU	$c_3,$at
1310	mflo	$t_1
1311	mfhi	$t_2
1312	$ADDU	$c_1,$t_1
1313	sltu	$at,$c_1,$t_1
1314	$MULTU	$a_0,$b_6		# mul_add_c(a[0],b[6],c1,c2,c3);
1315	$ADDU	$t_2,$at
1316	$ADDU	$c_2,$t_2
1317	sltu	$at,$c_2,$t_2
1318	$ADDU	$c_3,$at
1319	mflo	$t_1
1320	mfhi	$t_2
1321	$ADDU	$c_1,$t_1
1322	sltu	$at,$c_1,$t_1
1323	 $MULTU	$a_0,$b_7		# mul_add_c(a[0],b[7],c2,c3,c1);
1324	$ADDU	$t_2,$at
1325	$ADDU	$c_2,$t_2
1326	sltu	$at,$c_2,$t_2
1327	$ADDU	$c_3,$at
1328	$ST	$c_1,6*$BNSZ($a0)	# r[6]=c1;
1329
1330	mflo	$t_1
1331	mfhi	$t_2
1332	$ADDU	$c_2,$t_1
1333	sltu	$at,$c_2,$t_1
1334	$MULTU	$a_1,$b_6		# mul_add_c(a[1],b[6],c2,c3,c1);
1335	$ADDU	$t_2,$at
1336	$ADDU	$c_3,$t_2
1337	sltu	$c_1,$c_3,$t_2
1338	mflo	$t_1
1339	mfhi	$t_2
1340	$ADDU	$c_2,$t_1
1341	sltu	$at,$c_2,$t_1
1342	$MULTU	$a_2,$b_5		# mul_add_c(a[2],b[5],c2,c3,c1);
1343	$ADDU	$t_2,$at
1344	$ADDU	$c_3,$t_2
1345	sltu	$at,$c_3,$t_2
1346	$ADDU	$c_1,$at
1347	mflo	$t_1
1348	mfhi	$t_2
1349	$ADDU	$c_2,$t_1
1350	sltu	$at,$c_2,$t_1
1351	$MULTU	$a_3,$b_4		# mul_add_c(a[3],b[4],c2,c3,c1);
1352	$ADDU	$t_2,$at
1353	$ADDU	$c_3,$t_2
1354	sltu	$at,$c_3,$t_2
1355	$ADDU	$c_1,$at
1356	mflo	$t_1
1357	mfhi	$t_2
1358	$ADDU	$c_2,$t_1
1359	sltu	$at,$c_2,$t_1
1360	$MULTU	$a_4,$b_3		# mul_add_c(a[4],b[3],c2,c3,c1);
1361	$ADDU	$t_2,$at
1362	$ADDU	$c_3,$t_2
1363	sltu	$at,$c_3,$t_2
1364	$ADDU	$c_1,$at
1365	mflo	$t_1
1366	mfhi	$t_2
1367	$ADDU	$c_2,$t_1
1368	sltu	$at,$c_2,$t_1
1369	$MULTU	$a_5,$b_2		# mul_add_c(a[5],b[2],c2,c3,c1);
1370	$ADDU	$t_2,$at
1371	$ADDU	$c_3,$t_2
1372	sltu	$at,$c_3,$t_2
1373	$ADDU	$c_1,$at
1374	mflo	$t_1
1375	mfhi	$t_2
1376	$ADDU	$c_2,$t_1
1377	sltu	$at,$c_2,$t_1
1378	$MULTU	$a_6,$b_1		# mul_add_c(a[6],b[1],c2,c3,c1);
1379	$ADDU	$t_2,$at
1380	$ADDU	$c_3,$t_2
1381	sltu	$at,$c_3,$t_2
1382	$ADDU	$c_1,$at
1383	mflo	$t_1
1384	mfhi	$t_2
1385	$ADDU	$c_2,$t_1
1386	sltu	$at,$c_2,$t_1
1387	$MULTU	$a_7,$b_0		# mul_add_c(a[7],b[0],c2,c3,c1);
1388	$ADDU	$t_2,$at
1389	$ADDU	$c_3,$t_2
1390	sltu	$at,$c_3,$t_2
1391	$ADDU	$c_1,$at
1392	mflo	$t_1
1393	mfhi	$t_2
1394	$ADDU	$c_2,$t_1
1395	sltu	$at,$c_2,$t_1
1396	 $MULTU	$a_7,$b_1		# mul_add_c(a[7],b[1],c3,c1,c2);
1397	$ADDU	$t_2,$at
1398	$ADDU	$c_3,$t_2
1399	sltu	$at,$c_3,$t_2
1400	$ADDU	$c_1,$at
1401	$ST	$c_2,7*$BNSZ($a0)	# r[7]=c2;
1402
1403	mflo	$t_1
1404	mfhi	$t_2
1405	$ADDU	$c_3,$t_1
1406	sltu	$at,$c_3,$t_1
1407	$MULTU	$a_6,$b_2		# mul_add_c(a[6],b[2],c3,c1,c2);
1408	$ADDU	$t_2,$at
1409	$ADDU	$c_1,$t_2
1410	sltu	$c_2,$c_1,$t_2
1411	mflo	$t_1
1412	mfhi	$t_2
1413	$ADDU	$c_3,$t_1
1414	sltu	$at,$c_3,$t_1
1415	$MULTU	$a_5,$b_3		# mul_add_c(a[5],b[3],c3,c1,c2);
1416	$ADDU	$t_2,$at
1417	$ADDU	$c_1,$t_2
1418	sltu	$at,$c_1,$t_2
1419	$ADDU	$c_2,$at
1420	mflo	$t_1
1421	mfhi	$t_2
1422	$ADDU	$c_3,$t_1
1423	sltu	$at,$c_3,$t_1
1424	$MULTU	$a_4,$b_4		# mul_add_c(a[4],b[4],c3,c1,c2);
1425	$ADDU	$t_2,$at
1426	$ADDU	$c_1,$t_2
1427	sltu	$at,$c_1,$t_2
1428	$ADDU	$c_2,$at
1429	mflo	$t_1
1430	mfhi	$t_2
1431	$ADDU	$c_3,$t_1
1432	sltu	$at,$c_3,$t_1
1433	$MULTU	$a_3,$b_5		# mul_add_c(a[3],b[5],c3,c1,c2);
1434	$ADDU	$t_2,$at
1435	$ADDU	$c_1,$t_2
1436	sltu	$at,$c_1,$t_2
1437	$ADDU	$c_2,$at
1438	mflo	$t_1
1439	mfhi	$t_2
1440	$ADDU	$c_3,$t_1
1441	sltu	$at,$c_3,$t_1
1442	$MULTU	$a_2,$b_6		# mul_add_c(a[2],b[6],c3,c1,c2);
1443	$ADDU	$t_2,$at
1444	$ADDU	$c_1,$t_2
1445	sltu	$at,$c_1,$t_2
1446	$ADDU	$c_2,$at
1447	mflo	$t_1
1448	mfhi	$t_2
1449	$ADDU	$c_3,$t_1
1450	sltu	$at,$c_3,$t_1
1451	$MULTU	$a_1,$b_7		# mul_add_c(a[1],b[7],c3,c1,c2);
1452	$ADDU	$t_2,$at
1453	$ADDU	$c_1,$t_2
1454	sltu	$at,$c_1,$t_2
1455	$ADDU	$c_2,$at
1456	mflo	$t_1
1457	mfhi	$t_2
1458	$ADDU	$c_3,$t_1
1459	sltu	$at,$c_3,$t_1
1460	 $MULTU	$a_2,$b_7		# mul_add_c(a[2],b[7],c1,c2,c3);
1461	$ADDU	$t_2,$at
1462	$ADDU	$c_1,$t_2
1463	sltu	$at,$c_1,$t_2
1464	$ADDU	$c_2,$at
1465	$ST	$c_3,8*$BNSZ($a0)	# r[8]=c3;
1466
1467	mflo	$t_1
1468	mfhi	$t_2
1469	$ADDU	$c_1,$t_1
1470	sltu	$at,$c_1,$t_1
1471	$MULTU	$a_3,$b_6		# mul_add_c(a[3],b[6],c1,c2,c3);
1472	$ADDU	$t_2,$at
1473	$ADDU	$c_2,$t_2
1474	sltu	$c_3,$c_2,$t_2
1475	mflo	$t_1
1476	mfhi	$t_2
1477	$ADDU	$c_1,$t_1
1478	sltu	$at,$c_1,$t_1
1479	$MULTU	$a_4,$b_5		# mul_add_c(a[4],b[5],c1,c2,c3);
1480	$ADDU	$t_2,$at
1481	$ADDU	$c_2,$t_2
1482	sltu	$at,$c_2,$t_2
1483	$ADDU	$c_3,$at
1484	mflo	$t_1
1485	mfhi	$t_2
1486	$ADDU	$c_1,$t_1
1487	sltu	$at,$c_1,$t_1
1488	$MULTU	$a_5,$b_4		# mul_add_c(a[5],b[4],c1,c2,c3);
1489	$ADDU	$t_2,$at
1490	$ADDU	$c_2,$t_2
1491	sltu	$at,$c_2,$t_2
1492	$ADDU	$c_3,$at
1493	mflo	$t_1
1494	mfhi	$t_2
1495	$ADDU	$c_1,$t_1
1496	sltu	$at,$c_1,$t_1
1497	$MULTU	$a_6,$b_3		# mul_add_c(a[6],b[3],c1,c2,c3);
1498	$ADDU	$t_2,$at
1499	$ADDU	$c_2,$t_2
1500	sltu	$at,$c_2,$t_2
1501	$ADDU	$c_3,$at
1502	mflo	$t_1
1503	mfhi	$t_2
1504	$ADDU	$c_1,$t_1
1505	sltu	$at,$c_1,$t_1
1506	$MULTU	$a_7,$b_2		# mul_add_c(a[7],b[2],c1,c2,c3);
1507	$ADDU	$t_2,$at
1508	$ADDU	$c_2,$t_2
1509	sltu	$at,$c_2,$t_2
1510	$ADDU	$c_3,$at
1511	mflo	$t_1
1512	mfhi	$t_2
1513	$ADDU	$c_1,$t_1
1514	sltu	$at,$c_1,$t_1
1515	 $MULTU	$a_7,$b_3		# mul_add_c(a[7],b[3],c2,c3,c1);
1516	$ADDU	$t_2,$at
1517	$ADDU	$c_2,$t_2
1518	sltu	$at,$c_2,$t_2
1519	$ADDU	$c_3,$at
1520	$ST	$c_1,9*$BNSZ($a0)	# r[9]=c1;
1521
1522	mflo	$t_1
1523	mfhi	$t_2
1524	$ADDU	$c_2,$t_1
1525	sltu	$at,$c_2,$t_1
1526	$MULTU	$a_6,$b_4		# mul_add_c(a[6],b[4],c2,c3,c1);
1527	$ADDU	$t_2,$at
1528	$ADDU	$c_3,$t_2
1529	sltu	$c_1,$c_3,$t_2
1530	mflo	$t_1
1531	mfhi	$t_2
1532	$ADDU	$c_2,$t_1
1533	sltu	$at,$c_2,$t_1
1534	$MULTU	$a_5,$b_5		# mul_add_c(a[5],b[5],c2,c3,c1);
1535	$ADDU	$t_2,$at
1536	$ADDU	$c_3,$t_2
1537	sltu	$at,$c_3,$t_2
1538	$ADDU	$c_1,$at
1539	mflo	$t_1
1540	mfhi	$t_2
1541	$ADDU	$c_2,$t_1
1542	sltu	$at,$c_2,$t_1
1543	$MULTU	$a_4,$b_6		# mul_add_c(a[4],b[6],c2,c3,c1);
1544	$ADDU	$t_2,$at
1545	$ADDU	$c_3,$t_2
1546	sltu	$at,$c_3,$t_2
1547	$ADDU	$c_1,$at
1548	mflo	$t_1
1549	mfhi	$t_2
1550	$ADDU	$c_2,$t_1
1551	sltu	$at,$c_2,$t_1
1552	$MULTU	$a_3,$b_7		# mul_add_c(a[3],b[7],c2,c3,c1);
1553	$ADDU	$t_2,$at
1554	$ADDU	$c_3,$t_2
1555	sltu	$at,$c_3,$t_2
1556	$ADDU	$c_1,$at
1557	mflo	$t_1
1558	mfhi	$t_2
1559	$ADDU	$c_2,$t_1
1560	sltu	$at,$c_2,$t_1
1561	$MULTU	$a_4,$b_7		# mul_add_c(a[4],b[7],c3,c1,c2);
1562	$ADDU	$t_2,$at
1563	$ADDU	$c_3,$t_2
1564	sltu	$at,$c_3,$t_2
1565	$ADDU	$c_1,$at
1566	$ST	$c_2,10*$BNSZ($a0)	# r[10]=c2;
1567
1568	mflo	$t_1
1569	mfhi	$t_2
1570	$ADDU	$c_3,$t_1
1571	sltu	$at,$c_3,$t_1
1572	$MULTU	$a_5,$b_6		# mul_add_c(a[5],b[6],c3,c1,c2);
1573	$ADDU	$t_2,$at
1574	$ADDU	$c_1,$t_2
1575	sltu	$c_2,$c_1,$t_2
1576	mflo	$t_1
1577	mfhi	$t_2
1578	$ADDU	$c_3,$t_1
1579	sltu	$at,$c_3,$t_1
1580	$MULTU	$a_6,$b_5		# mul_add_c(a[6],b[5],c3,c1,c2);
1581	$ADDU	$t_2,$at
1582	$ADDU	$c_1,$t_2
1583	sltu	$at,$c_1,$t_2
1584	$ADDU	$c_2,$at
1585	mflo	$t_1
1586	mfhi	$t_2
1587	$ADDU	$c_3,$t_1
1588	sltu	$at,$c_3,$t_1
1589	$MULTU	$a_7,$b_4		# mul_add_c(a[7],b[4],c3,c1,c2);
1590	$ADDU	$t_2,$at
1591	$ADDU	$c_1,$t_2
1592	sltu	$at,$c_1,$t_2
1593	$ADDU	$c_2,$at
1594	mflo	$t_1
1595	mfhi	$t_2
1596	$ADDU	$c_3,$t_1
1597	sltu	$at,$c_3,$t_1
1598	 $MULTU	$a_7,$b_5		# mul_add_c(a[7],b[5],c1,c2,c3);
1599	$ADDU	$t_2,$at
1600	$ADDU	$c_1,$t_2
1601	sltu	$at,$c_1,$t_2
1602	$ADDU	$c_2,$at
1603	$ST	$c_3,11*$BNSZ($a0)	# r[11]=c3;
1604
1605	mflo	$t_1
1606	mfhi	$t_2
1607	$ADDU	$c_1,$t_1
1608	sltu	$at,$c_1,$t_1
1609	$MULTU	$a_6,$b_6		# mul_add_c(a[6],b[6],c1,c2,c3);
1610	$ADDU	$t_2,$at
1611	$ADDU	$c_2,$t_2
1612	sltu	$c_3,$c_2,$t_2
1613	mflo	$t_1
1614	mfhi	$t_2
1615	$ADDU	$c_1,$t_1
1616	sltu	$at,$c_1,$t_1
1617	$MULTU	$a_5,$b_7		# mul_add_c(a[5],b[7],c1,c2,c3);
1618	$ADDU	$t_2,$at
1619	$ADDU	$c_2,$t_2
1620	sltu	$at,$c_2,$t_2
1621	$ADDU	$c_3,$at
1622	mflo	$t_1
1623	mfhi	$t_2
1624	$ADDU	$c_1,$t_1
1625	sltu	$at,$c_1,$t_1
1626	 $MULTU	$a_6,$b_7		# mul_add_c(a[6],b[7],c2,c3,c1);
1627	$ADDU	$t_2,$at
1628	$ADDU	$c_2,$t_2
1629	sltu	$at,$c_2,$t_2
1630	$ADDU	$c_3,$at
1631	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;
1632
1633	mflo	$t_1
1634	mfhi	$t_2
1635	$ADDU	$c_2,$t_1
1636	sltu	$at,$c_2,$t_1
1637	$MULTU	$a_7,$b_6		# mul_add_c(a[7],b[6],c2,c3,c1);
1638	$ADDU	$t_2,$at
1639	$ADDU	$c_3,$t_2
1640	sltu	$c_1,$c_3,$t_2
1641	mflo	$t_1
1642	mfhi	$t_2
1643	$ADDU	$c_2,$t_1
1644	sltu	$at,$c_2,$t_1
1645	$MULTU	$a_7,$b_7		# mul_add_c(a[7],b[7],c3,c1,c2);
1646	$ADDU	$t_2,$at
1647	$ADDU	$c_3,$t_2
1648	sltu	$at,$c_3,$t_2
1649	$ADDU	$c_1,$at
1650	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;
1651
1652	mflo	$t_1
1653	mfhi	$t_2
1654	$ADDU	$c_3,$t_1
1655	sltu	$at,$c_3,$t_1
1656	$ADDU	$t_2,$at
1657	$ADDU	$c_1,$t_2
1658	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
1659	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;
1660
1661	.set	noreorder
1662___
1663$code.=<<___ if ($flavour =~ /nubi/i);
1664	$REG_L	$s5,10*$SZREG($sp)
1665	$REG_L	$s4,9*$SZREG($sp)
1666	$REG_L	$s3,8*$SZREG($sp)
1667	$REG_L	$s2,7*$SZREG($sp)
1668	$REG_L	$s1,6*$SZREG($sp)
1669	$REG_L	$s0,5*$SZREG($sp)
1670	$REG_L	$t3,4*$SZREG($sp)
1671	$REG_L	$t2,3*$SZREG($sp)
1672	$REG_L	$t1,2*$SZREG($sp)
1673	$REG_L	$t0,1*$SZREG($sp)
1674	$REG_L	$gp,0*$SZREG($sp)
1675	jr	$ra
1676	$PTR_ADD $sp,12*$SZREG
1677___
1678$code.=<<___ if ($flavour !~ /nubi/i);
1679	$REG_L	$s5,5*$SZREG($sp)
1680	$REG_L	$s4,4*$SZREG($sp)
1681	$REG_L	$s3,3*$SZREG($sp)
1682	$REG_L	$s2,2*$SZREG($sp)
1683	$REG_L	$s1,1*$SZREG($sp)
1684	$REG_L	$s0,0*$SZREG($sp)
1685	jr	$ra
1686	$PTR_ADD $sp,6*$SZREG
1687___
1688$code.=<<___;
1689.end	bn_mul_comba8
1690
1691.align	5
1692.globl	bn_mul_comba4
1693.ent	bn_mul_comba4
1694bn_mul_comba4:
1695___
# bn_mul_comba4: generated routine that loads four words through $a1
# (a[0..3]) and four words through $a2 (b[0..3]) and stores eight result
# words r[0..7] through $a0.  The partial products are accumulated one
# result column at a time; $c_1, $c_2 and $c_3 rotate through the roles of
# low word, high word and carry word, as spelled out by the
# mul_add_c(a[i],b[j],lo,hi,carry) note attached to every $MULTU.
#
# NUBI flavours only: allocate a 6*$SZREG frame and save $ra, $t3..$t0
# and $gp before the body runs.
1696$code.=<<___ if ($flavour =~ /nubi/i);
1697	.frame	$sp,6*$SZREG,$ra
1698	.mask	0x8000f008,-$SZREG
1699	.set	noreorder
1700	$PTR_SUB $sp,6*$SZREG
1701	$REG_S	$ra,5*$SZREG($sp)
1702	$REG_S	$t3,4*$SZREG($sp)
1703	$REG_S	$t2,3*$SZREG($sp)
1704	$REG_S	$t1,2*$SZREG($sp)
1705	$REG_S	$t0,1*$SZREG($sp)
1706	$REG_S	$gp,0*$SZREG($sp)
1707___
# Main body (all flavours).  Each $MULTU is issued in the middle of the
# carry propagation of the previous product, so the statement order below
# is significant.  A column's result word is stored once its last product
# has been added.  The first product added to a fresh column initialises
# the new carry word with "sltu carry,hi,t_2" instead of adding to it; the
# one exception is the a[2]*b[0] step, which adds into $c_1 without
# touching $c_2 ($c_2 is first written by the following a[1]*b[1] step).
1708$code.=<<___;
1709	.set	reorder
1710	$LD	$a_0,0($a1)
1711	$LD	$b_0,0($a2)
1712	$LD	$a_1,$BNSZ($a1)
1713	$LD	$a_2,2*$BNSZ($a1)
1714	$MULTU	$a_0,$b_0		# mul_add_c(a[0],b[0],c1,c2,c3);
1715	$LD	$a_3,3*$BNSZ($a1)
1716	$LD	$b_1,$BNSZ($a2)
1717	$LD	$b_2,2*$BNSZ($a2)
1718	$LD	$b_3,3*$BNSZ($a2)
1719	mflo	$c_1
1720	mfhi	$c_2
1721	$ST	$c_1,0($a0)
1722
1723	$MULTU	$a_0,$b_1		# mul_add_c(a[0],b[1],c2,c3,c1);
1724	mflo	$t_1
1725	mfhi	$t_2
1726	$ADDU	$c_2,$t_1
1727	sltu	$at,$c_2,$t_1
1728	$MULTU	$a_1,$b_0		# mul_add_c(a[1],b[0],c2,c3,c1);
1729	$ADDU	$c_3,$t_2,$at
1730	mflo	$t_1
1731	mfhi	$t_2
1732	$ADDU	$c_2,$t_1
1733	sltu	$at,$c_2,$t_1
1734	 $MULTU	$a_2,$b_0		# mul_add_c(a[2],b[0],c3,c1,c2);
1735	$ADDU	$t_2,$at
1736	$ADDU	$c_3,$t_2
1737	sltu	$c_1,$c_3,$t_2
1738	$ST	$c_2,$BNSZ($a0)
1739
1740	mflo	$t_1
1741	mfhi	$t_2
1742	$ADDU	$c_3,$t_1
1743	sltu	$at,$c_3,$t_1
1744	$MULTU	$a_1,$b_1		# mul_add_c(a[1],b[1],c3,c1,c2);
1745	$ADDU	$t_2,$at
1746	$ADDU	$c_1,$t_2
1747	mflo	$t_1
1748	mfhi	$t_2
1749	$ADDU	$c_3,$t_1
1750	sltu	$at,$c_3,$t_1
1751	$MULTU	$a_0,$b_2		# mul_add_c(a[0],b[2],c3,c1,c2);
1752	$ADDU	$t_2,$at
1753	$ADDU	$c_1,$t_2
1754	sltu	$c_2,$c_1,$t_2
1755	mflo	$t_1
1756	mfhi	$t_2
1757	$ADDU	$c_3,$t_1
1758	sltu	$at,$c_3,$t_1
1759	 $MULTU	$a_0,$b_3		# mul_add_c(a[0],b[3],c1,c2,c3);
1760	$ADDU	$t_2,$at
1761	$ADDU	$c_1,$t_2
1762	sltu	$at,$c_1,$t_2
1763	$ADDU	$c_2,$at
1764	$ST	$c_3,2*$BNSZ($a0)
1765
1766	mflo	$t_1
1767	mfhi	$t_2
1768	$ADDU	$c_1,$t_1
1769	sltu	$at,$c_1,$t_1
1770	$MULTU	$a_1,$b_2		# mul_add_c(a[1],b[2],c1,c2,c3);
1771	$ADDU	$t_2,$at
1772	$ADDU	$c_2,$t_2
1773	sltu	$c_3,$c_2,$t_2
1774	mflo	$t_1
1775	mfhi	$t_2
1776	$ADDU	$c_1,$t_1
1777	sltu	$at,$c_1,$t_1
1778	$MULTU	$a_2,$b_1		# mul_add_c(a[2],b[1],c1,c2,c3);
1779	$ADDU	$t_2,$at
1780	$ADDU	$c_2,$t_2
1781	sltu	$at,$c_2,$t_2
1782	$ADDU	$c_3,$at
1783	mflo	$t_1
1784	mfhi	$t_2
1785	$ADDU	$c_1,$t_1
1786	sltu	$at,$c_1,$t_1
1787	$MULTU	$a_3,$b_0		# mul_add_c(a[3],b[0],c1,c2,c3);
1788	$ADDU	$t_2,$at
1789	$ADDU	$c_2,$t_2
1790	sltu	$at,$c_2,$t_2
1791	$ADDU	$c_3,$at
1792	mflo	$t_1
1793	mfhi	$t_2
1794	$ADDU	$c_1,$t_1
1795	sltu	$at,$c_1,$t_1
1796	 $MULTU	$a_3,$b_1		# mul_add_c(a[3],b[1],c2,c3,c1);
1797	$ADDU	$t_2,$at
1798	$ADDU	$c_2,$t_2
1799	sltu	$at,$c_2,$t_2
1800	$ADDU	$c_3,$at
1801	$ST	$c_1,3*$BNSZ($a0)
1802
1803	mflo	$t_1
1804	mfhi	$t_2
1805	$ADDU	$c_2,$t_1
1806	sltu	$at,$c_2,$t_1
1807	$MULTU	$a_2,$b_2		# mul_add_c(a[2],b[2],c2,c3,c1);
1808	$ADDU	$t_2,$at
1809	$ADDU	$c_3,$t_2
1810	sltu	$c_1,$c_3,$t_2
1811	mflo	$t_1
1812	mfhi	$t_2
1813	$ADDU	$c_2,$t_1
1814	sltu	$at,$c_2,$t_1
1815	$MULTU	$a_1,$b_3		# mul_add_c(a[1],b[3],c2,c3,c1);
1816	$ADDU	$t_2,$at
1817	$ADDU	$c_3,$t_2
1818	sltu	$at,$c_3,$t_2
1819	$ADDU	$c_1,$at
1820	mflo	$t_1
1821	mfhi	$t_2
1822	$ADDU	$c_2,$t_1
1823	sltu	$at,$c_2,$t_1
1824	 $MULTU	$a_2,$b_3		# mul_add_c(a[2],b[3],c3,c1,c2);
1825	$ADDU	$t_2,$at
1826	$ADDU	$c_3,$t_2
1827	sltu	$at,$c_3,$t_2
1828	$ADDU	$c_1,$at
1829	$ST	$c_2,4*$BNSZ($a0)
1830
1831	mflo	$t_1
1832	mfhi	$t_2
1833	$ADDU	$c_3,$t_1
1834	sltu	$at,$c_3,$t_1
1835	$MULTU	$a_3,$b_2		# mul_add_c(a[3],b[2],c3,c1,c2);
1836	$ADDU	$t_2,$at
1837	$ADDU	$c_1,$t_2
1838	sltu	$c_2,$c_1,$t_2
1839	mflo	$t_1
1840	mfhi	$t_2
1841	$ADDU	$c_3,$t_1
1842	sltu	$at,$c_3,$t_1
1843	 $MULTU	$a_3,$b_3		# mul_add_c(a[3],b[3],c1,c2,c3);
1844	$ADDU	$t_2,$at
1845	$ADDU	$c_1,$t_2
1846	sltu	$at,$c_1,$t_2
1847	$ADDU	$c_2,$at
1848	$ST	$c_3,5*$BNSZ($a0)
1849
1850	mflo	$t_1
1851	mfhi	$t_2
1852	$ADDU	$c_1,$t_1
1853	sltu	$at,$c_1,$t_1
1854	$ADDU	$t_2,$at
1855	$ADDU	$c_2,$t_2
1856	$ST	$c_1,6*$BNSZ($a0)
1857	$ST	$c_2,7*$BNSZ($a0)
1858
1859	.set	noreorder
1860___
# NUBI flavours only: reload $t3..$t0 and $gp and release the frame.  $ra is
# stored by the prologue but is not reloaded here; nothing in the body above
# writes it.
1861$code.=<<___ if ($flavour =~ /nubi/i);
1862	$REG_L	$t3,4*$SZREG($sp)
1863	$REG_L	$t2,3*$SZREG($sp)
1864	$REG_L	$t1,2*$SZREG($sp)
1865	$REG_L	$t0,1*$SZREG($sp)
1866	$REG_L	$gp,0*$SZREG($sp)
1867	$PTR_ADD $sp,6*$SZREG
1868___
# Return to the caller; with noreorder in effect the explicit nop is the
# instruction that follows the jump.
1869$code.=<<___;
1870	jr	$ra
1871	nop
1872.end	bn_mul_comba4
1873___
1874
# From here on $a_4..$a_7 name the registers that $b_0..$b_3 named in the
# multiplication generators above.  The bn_sqr_comba* generators below never
# reference $b_N; bn_sqr_comba8 loads a[4..7] into these rebound names.
# NOTE(review): presumably this lets the squaring routines avoid whatever
# registers $a_4..$a_7 were bound to earlier (their original binding is not
# visible in this part of the file) - confirm against the register map.
1875($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);
1876
1877$code.=<<___;
1878
.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___

# bn_sqr_comba8: generated routine that loads eight words a[0..7] through
# $a1 and stores the sixteen words of their square, r[0..15], through $a0.
#
# The result is built one column at a time in a three-word accumulator
# whose words rotate through $c_1/$c_2/$c_3.  Off-diagonal products
# a[i]*a[j] (i != j) have to be counted twice.  They are added to the
# accumulator twice, with a carry check after every single addition,
# instead of being doubled by shifting first: the shifted high word
# (hi<<1)|msb(lo) can be all ones, and adding the low-word carry to it then
# wraps to zero without leaving any trace, which silently drops a carry
# from the result (the squaring bug published as CVE-2014-3570).
#
# Convention shared by the helpers: on entry $t_1/$t_2 hold the low/high
# word of the product that is to be accumulated; the helper issues the
# *next* multiplication ($an * $bn) while it is still propagating carries
# and finishes by fetching that new product into $t_1/$t_2.

# Add 2*($t_2:$t_1) to the accumulator ($c2:$c1:$c0).
#   $c0,$c1,$c2  low, middle and top accumulator registers
#   $warm        false on the first addition into a column: $c2 still holds
#                a stale value and is initialised from the carry instead of
#                being added to
#   $an,$bn      operands of the next multiplication
#   $note        comment text attached to that multiplication
my $sqr8_add2 = sub {
    my ($c0,$c1,$c2,$warm,$an,$bn,$note)=@_;

    $code.=<<___;
	$ADDU	$c0,$t_1
	sltu	$at,$c0,$t_1
	 $MULTU	$an,$bn		# $note
	$ADDU	$c0,$t_1
	$ADDU	$at,$t_2
	sltu	$t_1,$c0,$t_1
	$ADDU	$c1,$at
	$ADDU	$t_2,$t_1
___
    # $at and $t_2 are now "hi + carry of one low-word addition" each;
    # neither sum can wrap because the high word of a product is at most
    # 2^w-2.
    if ($warm) {
	$code.=<<___;
	sltu	$at,$c1,$at
	$ADDU	$c1,$t_2
	$ADDU	$c2,$at
___
    } else {
	$code.=<<___;
	sltu	$c2,$c1,$at
	$ADDU	$c1,$t_2
___
    }
    $code.=<<___;
	sltu	$t_2,$c1,$t_2
	$ADDU	$c2,$t_2
	mflo	$t_1
	mfhi	$t_2
___
};

# Add ($t_2:$t_1) once to the accumulator ($c2:$c1:$c0); used for the
# diagonal products a[i]*a[i].  $c2 must already be valid (a diagonal
# product is never the first one of its column here).
my $sqr8_add1 = sub {
    my ($c0,$c1,$c2,$an,$bn,$note)=@_;

    $code.=<<___;
	$ADDU	$c0,$t_1
	sltu	$at,$c0,$t_1
	 $MULTU	$an,$bn		# $note
	$ADDU	$t_2,$at
	$ADDU	$c1,$t_2
	sltu	$at,$c1,$t_2
	$ADDU	$c2,$at
	mflo	$t_1
	mfhi	$t_2
___
};

# Store a finished column: r[$idx] = $reg.
my $sqr8_store = sub {
    my ($reg,$idx)=@_;

    $code.="\t$ST\t$reg,$idx*$BNSZ($a0)\n";
};

# NUBI flavours only: allocate a 6*$SZREG frame and save $ra, $t3..$t0, $gp.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Column 0 (a[0]^2) seeds $c_1/$c_2 directly; $c_3 is cleared so that
# column 1 can go through the generic doubled-product helper.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
	move	$c_3,$zero
	mflo	$t_1
	mfhi	$t_2
___

# column 1: 2*a0*a1
$sqr8_add2->($c_2,$c_3,$c_1,0,$a_2,$a_0,'mul_add_c2(a[2],b[0],c3,c1,c2);');
$sqr8_store->($c_2,1);

# column 2: 2*a2*a0 + a1*a1
$sqr8_add2->($c_3,$c_1,$c_2,0,$a_1,$a_1,'mul_add_c(a[1],b[1],c3,c1,c2);');
$sqr8_add1->($c_3,$c_1,$c_2,  $a_0,$a_3,'mul_add_c2(a[0],b[3],c1,c2,c3);');
$sqr8_store->($c_3,2);

# column 3: 2*a0*a3 + 2*a1*a2
$sqr8_add2->($c_1,$c_2,$c_3,0,$a_1,$a_2,'mul_add_c2(a[1],b[2],c1,c2,c3);');
$sqr8_add2->($c_1,$c_2,$c_3,1,$a_4,$a_0,'mul_add_c2(a[4],b[0],c2,c3,c1);');
$sqr8_store->($c_1,3);

# column 4: 2*a4*a0 + 2*a3*a1 + a2*a2
$sqr8_add2->($c_2,$c_3,$c_1,0,$a_3,$a_1,'mul_add_c2(a[3],b[1],c2,c3,c1);');
$sqr8_add2->($c_2,$c_3,$c_1,1,$a_2,$a_2,'mul_add_c(a[2],b[2],c2,c3,c1);');
$sqr8_add1->($c_2,$c_3,$c_1,  $a_0,$a_5,'mul_add_c2(a[0],b[5],c3,c1,c2);');
$sqr8_store->($c_2,4);

# column 5: 2*a0*a5 + 2*a1*a4 + 2*a2*a3
$sqr8_add2->($c_3,$c_1,$c_2,0,$a_1,$a_4,'mul_add_c2(a[1],b[4],c3,c1,c2);');
$sqr8_add2->($c_3,$c_1,$c_2,1,$a_2,$a_3,'mul_add_c2(a[2],b[3],c3,c1,c2);');
$sqr8_add2->($c_3,$c_1,$c_2,1,$a_6,$a_0,'mul_add_c2(a[6],b[0],c1,c2,c3);');
$sqr8_store->($c_3,5);

# column 6: 2*a6*a0 + 2*a5*a1 + 2*a4*a2 + a3*a3
$sqr8_add2->($c_1,$c_2,$c_3,0,$a_5,$a_1,'mul_add_c2(a[5],b[1],c1,c2,c3);');
$sqr8_add2->($c_1,$c_2,$c_3,1,$a_4,$a_2,'mul_add_c2(a[4],b[2],c1,c2,c3);');
$sqr8_add2->($c_1,$c_2,$c_3,1,$a_3,$a_3,'mul_add_c(a[3],b[3],c1,c2,c3);');
$sqr8_add1->($c_1,$c_2,$c_3,  $a_0,$a_7,'mul_add_c2(a[0],b[7],c2,c3,c1);');
$sqr8_store->($c_1,6);

# column 7: 2*a0*a7 + 2*a1*a6 + 2*a2*a5 + 2*a3*a4
$sqr8_add2->($c_2,$c_3,$c_1,0,$a_1,$a_6,'mul_add_c2(a[1],b[6],c2,c3,c1);');
$sqr8_add2->($c_2,$c_3,$c_1,1,$a_2,$a_5,'mul_add_c2(a[2],b[5],c2,c3,c1);');
$sqr8_add2->($c_2,$c_3,$c_1,1,$a_3,$a_4,'mul_add_c2(a[3],b[4],c2,c3,c1);');
$sqr8_add2->($c_2,$c_3,$c_1,1,$a_7,$a_1,'mul_add_c2(a[7],b[1],c3,c1,c2);');
$sqr8_store->($c_2,7);

# column 8: 2*a7*a1 + 2*a6*a2 + 2*a5*a3 + a4*a4
$sqr8_add2->($c_3,$c_1,$c_2,0,$a_6,$a_2,'mul_add_c2(a[6],b[2],c3,c1,c2);');
$sqr8_add2->($c_3,$c_1,$c_2,1,$a_5,$a_3,'mul_add_c2(a[5],b[3],c3,c1,c2);');
$sqr8_add2->($c_3,$c_1,$c_2,1,$a_4,$a_4,'mul_add_c(a[4],b[4],c3,c1,c2);');
$sqr8_add1->($c_3,$c_1,$c_2,  $a_2,$a_7,'mul_add_c2(a[2],b[7],c1,c2,c3);');
$sqr8_store->($c_3,8);

# column 9: 2*a2*a7 + 2*a3*a6 + 2*a4*a5
$sqr8_add2->($c_1,$c_2,$c_3,0,$a_3,$a_6,'mul_add_c2(a[3],b[6],c1,c2,c3);');
$sqr8_add2->($c_1,$c_2,$c_3,1,$a_4,$a_5,'mul_add_c2(a[4],b[5],c1,c2,c3);');
$sqr8_add2->($c_1,$c_2,$c_3,1,$a_7,$a_3,'mul_add_c2(a[7],b[3],c2,c3,c1);');
$sqr8_store->($c_1,9);

# column 10: 2*a7*a3 + 2*a6*a4 + a5*a5
$sqr8_add2->($c_2,$c_3,$c_1,0,$a_6,$a_4,'mul_add_c2(a[6],b[4],c2,c3,c1);');
$sqr8_add2->($c_2,$c_3,$c_1,1,$a_5,$a_5,'mul_add_c(a[5],b[5],c2,c3,c1);');
$sqr8_add1->($c_2,$c_3,$c_1,  $a_4,$a_7,'mul_add_c2(a[4],b[7],c3,c1,c2);');
$sqr8_store->($c_2,10);

# column 11: 2*a4*a7 + 2*a5*a6
$sqr8_add2->($c_3,$c_1,$c_2,0,$a_5,$a_6,'mul_add_c2(a[5],b[6],c3,c1,c2);');
$sqr8_add2->($c_3,$c_1,$c_2,1,$a_7,$a_5,'mul_add_c2(a[7],b[5],c1,c2,c3);');
$sqr8_store->($c_3,11);

# column 12: 2*a7*a5 + a6*a6
$sqr8_add2->($c_1,$c_2,$c_3,0,$a_6,$a_6,'mul_add_c(a[6],b[6],c1,c2,c3);');
$sqr8_add1->($c_1,$c_2,$c_3,  $a_6,$a_7,'mul_add_c2(a[6],b[7],c2,c3,c1);');
$sqr8_store->($c_1,12);

# column 13: 2*a6*a7
$sqr8_add2->($c_2,$c_3,$c_1,0,$a_7,$a_7,'mul_add_c(a[7],b[7],c3,c1,c2);');
$sqr8_store->($c_2,13);

# columns 14/15: a7*a7 is the last product, so only two words remain.
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)
	$ST	$c_1,15*$BNSZ($a0)

	.set	noreorder
___
# NUBI flavours only: reload $t3..$t0 and $gp and release the frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8
2420
.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___

# bn_sqr_comba4: generated routine that loads four words a[0..3] through
# $a1 and stores the eight words of their square, r[0..7], through $a0.
#
# The result is built one column at a time in a three-word accumulator
# whose words rotate through $c_1/$c_2/$c_3.  Off-diagonal products
# a[i]*a[j] (i != j) have to be counted twice.  They are added to the
# accumulator twice, with a carry check after every single addition,
# instead of being doubled by shifting first: the shifted high word
# (hi<<1)|msb(lo) can be all ones, and adding the low-word carry to it then
# wraps to zero without leaving any trace, which silently drops a carry
# from the result (the squaring bug published as CVE-2014-3570).
#
# Convention shared by the helpers: on entry $t_1/$t_2 hold the low/high
# word of the product that is to be accumulated; the helper issues the
# *next* multiplication ($an * $bn) while it is still propagating carries
# and finishes by fetching that new product into $t_1/$t_2.

# Add 2*($t_2:$t_1) to the accumulator ($c2:$c1:$c0).
#   $c0,$c1,$c2  low, middle and top accumulator registers
#   $warm        false on the first addition into a column: $c2 still holds
#                a stale value and is initialised from the carry instead of
#                being added to
#   $an,$bn      operands of the next multiplication
#   $note        comment text attached to that multiplication
my $sqr4_add2 = sub {
    my ($c0,$c1,$c2,$warm,$an,$bn,$note)=@_;

    $code.=<<___;
	$ADDU	$c0,$t_1
	sltu	$at,$c0,$t_1
	 $MULTU	$an,$bn		# $note
	$ADDU	$c0,$t_1
	$ADDU	$at,$t_2
	sltu	$t_1,$c0,$t_1
	$ADDU	$c1,$at
	$ADDU	$t_2,$t_1
___
    # $at and $t_2 are now "hi + carry of one low-word addition" each;
    # neither sum can wrap because the high word of a product is at most
    # 2^w-2.
    if ($warm) {
	$code.=<<___;
	sltu	$at,$c1,$at
	$ADDU	$c1,$t_2
	$ADDU	$c2,$at
___
    } else {
	$code.=<<___;
	sltu	$c2,$c1,$at
	$ADDU	$c1,$t_2
___
    }
    $code.=<<___;
	sltu	$t_2,$c1,$t_2
	$ADDU	$c2,$t_2
	mflo	$t_1
	mfhi	$t_2
___
};

# Add ($t_2:$t_1) once to the accumulator ($c2:$c1:$c0); used for the
# diagonal products a[i]*a[i].  $c2 must already be valid (a diagonal
# product is never the first one of its column here).
my $sqr4_add1 = sub {
    my ($c0,$c1,$c2,$an,$bn,$note)=@_;

    $code.=<<___;
	$ADDU	$c0,$t_1
	sltu	$at,$c0,$t_1
	 $MULTU	$an,$bn		# $note
	$ADDU	$t_2,$at
	$ADDU	$c1,$t_2
	sltu	$at,$c1,$t_2
	$ADDU	$c2,$at
	mflo	$t_1
	mfhi	$t_2
___
};

# Store a finished column: r[$idx] = $reg.
my $sqr4_store = sub {
    my ($reg,$idx)=@_;

    $code.="\t$ST\t$reg,$idx*$BNSZ($a0)\n";
};

# NUBI flavours only: allocate a 6*$SZREG frame and save $ra, $t3..$t0, $gp.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Column 0 (a[0]^2) seeds $c_1/$c_2 directly; $c_3 is cleared so that
# column 1 can go through the generic doubled-product helper.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	$a_0,$a_0		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	$c_1
	mfhi	$c_2
	$ST	$c_1,0($a0)

	$MULTU	$a_0,$a_1		# mul_add_c2(a[0],b[1],c2,c3,c1);
	move	$c_3,$zero
	mflo	$t_1
	mfhi	$t_2
___

# column 1: 2*a0*a1
$sqr4_add2->($c_2,$c_3,$c_1,0,$a_2,$a_0,'mul_add_c2(a[2],b[0],c3,c1,c2);');
$sqr4_store->($c_2,1);

# column 2: 2*a2*a0 + a1*a1
$sqr4_add2->($c_3,$c_1,$c_2,0,$a_1,$a_1,'mul_add_c(a[1],b[1],c3,c1,c2);');
$sqr4_add1->($c_3,$c_1,$c_2,  $a_0,$a_3,'mul_add_c2(a[0],b[3],c1,c2,c3);');
$sqr4_store->($c_3,2);

# column 3: 2*a0*a3 + 2*a1*a2
$sqr4_add2->($c_1,$c_2,$c_3,0,$a_1,$a_2,'mul_add_c2(a[1],b[2],c1,c2,c3);');
$sqr4_add2->($c_1,$c_2,$c_3,1,$a_3,$a_1,'mul_add_c2(a[3],b[1],c2,c3,c1);');
$sqr4_store->($c_1,3);

# column 4: 2*a3*a1 + a2*a2
$sqr4_add2->($c_2,$c_3,$c_1,0,$a_2,$a_2,'mul_add_c(a[2],b[2],c2,c3,c1);');
$sqr4_add1->($c_2,$c_3,$c_1,  $a_2,$a_3,'mul_add_c2(a[2],b[3],c3,c1,c2);');
$sqr4_store->($c_2,4);

# column 5: 2*a2*a3
$sqr4_add2->($c_3,$c_1,$c_2,0,$a_3,$a_3,'mul_add_c(a[3],b[3],c1,c2,c3);');
$sqr4_store->($c_3,5);

# columns 6/7: a3*a3 is the last product, so only two words remain.
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
# NUBI flavours only: reload $t3..$t0 and $gp and release the frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
2583___
# Write the accumulated assembly text to standard output.  Output is
# buffered, so a failed write (full disk, closed pipe) may only be reported
# when the handle is closed; die in that case instead of silently leaving a
# truncated assembly file behind for the build to pick up.
print $code;
close STDOUT or die "error closing STDOUT: $!";
2586