#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project.
#
# Rights for redistribution and usage in source and binary forms are
# granted according to the OpenSSL license. Warranty of any kind is
# disclaimed.
# ====================================================================


# July 1999
#
# This is a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
#
# The module is designed to work with either of the "new" MIPS ABI(5),
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
# IRIX 5.x, not only because it doesn't support the new ABIs, but also
# because 5.x kernels put R4x00 CPUs into 32-bit mode, so all those
# 64-bit instructions (daddu, dmultu, etc.) found below would only
# cause an illegal instruction exception:-(
#
# In addition the code depends on preprocessor flags set up by the
# MIPSpro compiler driver (either as or cc) and therefore (probably?)
# can't be compiled by the GNU assembler. The GNU C driver manages fine
# though... I mean as long as -mmips-as is specified or is the default
# option, because then it simply invokes /usr/bin/as which in turn takes
# perfect care of the preprocessor definitions. Another neat feature
# offered by the MIPSpro assembler is an optimization pass. This gave
# me the opportunity to have the code looking more regular as all those
# architecture dependent instruction rescheduling details were left to
# the assembler. Cool, huh?
#
# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
# goes way over 3 times faster!
#
# <appro@fy.chalmers.se>

# October 2010
#
# Adapt the module even for 32-bit ABIs and other OSes. The former was
# achieved by mechanical replacement of 64-bit arithmetic instructions
# such as dmultu, daddu, etc.
# with their 32-bit counterparts and adjusting offsets denoting
# multiples of BN_ULONG. The above mentioned >3x performance
# improvement naturally does not apply to 32-bit code [because there
# is no instruction a 32-bit compiler can't use]; one has to be
# content with a 40-85% improvement depending on benchmark and key
# length, more for longer keys.

# $flavour selects the target ABI (o32/n32/64, optionally with a
# "nubi" modifier); the remaining arguments are scanned for the first
# one that looks like an output file name.
$flavour = shift;
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Fix: was an unchecked 2-arg `open STDOUT,">$output";` — use the
# 3-arg form and die on failure so a bad path doesn't silently emit
# the generated assembly into the void.
open STDOUT, '>', $output or die "can't open $output: $!";

if ($flavour =~ /64|n32/i) {
	# 64-bit ABIs: doubleword loads/stores and arithmetic.
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$DIVU="ddivu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$SRL="dsrl";
	$SLL="dsll";
	$BNSZ=8;		# sizeof(BN_ULONG)
	$PTR_ADD="daddu";	# pointer arithmetic
	$PTR_SUB="dsubu";
	$SZREG=8;		# size of a register save slot
	$REG_S="sd";		# register save/restore
	$REG_L="ld";
} else {
	# 32-bit ABIs: plain word operations.
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$DIVU="divu";
	$ADDU="addu";
	$SUBU="subu";
	$SRL="srl";
	$SLL="sll";
	$BNSZ=4;
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$SZREG=4;
	$REG_S="sw";
	$REG_L="lw";
	$code=".set	mips2\n";	# needed for branch-likely etc. below
}

# Below is N32/64 register layout used in the original module.
#
($zero,$at,$v0,$v1)=map("\$$_",(0..3));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32. NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3).
# Under NUBI, $v1 doubles as $gp and must therefore be preserved.
$gp=$v1 if ($flavour =~ /nubi/i);

# Register permanently holding -4: used to mask the word count down to
# a multiple of 4 for the 4x-unrolled main loops below.
$minus4=$v1;

# NOTE(review): the loops below use branch-likely instructions
# (bgtzl/beqzl/bnezl) which annul the delay slot on fall-through;
# these were deprecated in later MIPS revisions — presumably fine for
# the MIPS II/III/IV targets this module supports, but worth
# confirming for newer cores.

# ---- bn_mul_add_words(rp,ap,num,w): rp[i] += ap[i]*w, carry in $v0 ----
$code.=<<___;
.rdata
.asciiz	"mips3.s, Version 1.2"
.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"

.text
.set	noat

.align	5
.globl	bn_mul_add_words
.ent	bn_mul_add_words
bn_mul_add_words:
	.set	noreorder
	bgtz	$a2,bn_mul_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words

.align	5
.ent	bn_mul_add_words_internal
bn_mul_add_words_internal:
___
# NUBI prologue: save $ra, $t0..$t3 and $gp (mask 0x8000f008).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# 4x-unrolled multiply-accumulate; the multiplier runs while the
# previous product is folded into rp[] with carry tracked in $v0.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	$LD	$t0,0($a1)
	beqz	$ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	$MULTU	$t0,$a3
	$LD	$t1,0($a0)
	$LD	$t2,$BNSZ($a1)
	$LD	$t3,$BNSZ($a0)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
				# values", but it seems to work fine
				# even on 64-bit registers.
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	$MULTU	$t2,$a3
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at

	$LD	$ta2,3*$BNSZ($a1)
	$LD	$ta3,3*$BNSZ($a0)
	$ADDU	$t3,$v0
	sltu	$v0,$t3,$v0
	mflo	$at
	mfhi	$t2
	$ADDU	$t3,$at
	$ADDU	$v0,$t2
	$MULTU	$ta0,$a3
	sltu	$at,$t3,$at
	$ST	$t3,$BNSZ($a0)
	$ADDU	$v0,$at

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	$ADDU	$ta1,$v0
	sltu	$v0,$ta1,$v0
	mflo	$at
	mfhi	$ta0
	$ADDU	$ta1,$at
	$ADDU	$v0,$ta0
	$MULTU	$ta2,$a3
	sltu	$at,$ta1,$at
	$ST	$ta1,-2*$BNSZ($a0)
	$ADDU	$v0,$at


	and	$ta0,$a2,$minus4
	$ADDU	$ta3,$v0
	sltu	$v0,$ta3,$v0
	mflo	$at
	mfhi	$ta2
	$ADDU	$ta3,$at
	$ADDU	$v0,$ta2
	sltu	$at,$ta3,$at
	$ST	$ta3,-$BNSZ($a0)
	$ADDU	$v0,$at
	.set	noreorder
	bgtzl	$ta0,.L_bn_mul_add_words_loop
	$LD	$t0,0($a1)

	beqz	$a2,.L_bn_mul_add_words_return
	nop

.L_bn_mul_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	$LD	$t1,0($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$a3
	$LD	$t1,$BNSZ($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$a3
	$LD	$t1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	$at
	mfhi	$t0
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,2*$BNSZ($a0)
	$ADDU	$v0,$at

.L_bn_mul_add_words_return:
	.set	noreorder
___
# NUBI epilogue: restore saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words_internal

.align	5
.globl	bn_mul_words
.ent	bn_mul_words
bn_mul_words:
	.set	noreorder
	bgtz	$a2,bn_mul_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words

.align	5
.ent	bn_mul_words_internal
bn_mul_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# ---- bn_mul_words(rp,ap,num,w): rp[i] = low(ap[i]*w + carry),
# running carry kept in $v0 and returned. Same 4x unroll scheme as
# bn_mul_add_words but without the rp[] read-modify-write.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	$LD	$t0,0($a1)
	beqz	$ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	$MULTU	$t0,$a3
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$MULTU	$t2,$a3
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	$at
	mfhi	$t2
	$ADDU	$v0,$at
	sltu	$t3,$v0,$at
	$MULTU	$ta0,$a3
	$ST	$v0,-3*$BNSZ($a0)
	$ADDU	$v0,$t3,$t2

	mflo	$at
	mfhi	$ta0
	$ADDU	$v0,$at
	sltu	$ta1,$v0,$at
	$MULTU	$ta2,$a3
	$ST	$v0,-2*$BNSZ($a0)
	$ADDU	$v0,$ta1,$ta0

	and	$ta0,$a2,$minus4
	mflo	$at
	mfhi	$ta2
	$ADDU	$v0,$at
	sltu	$ta3,$v0,$at
	$ST	$v0,-$BNSZ($a0)
	$ADDU	$v0,$ta3,$ta2
	.set	noreorder
	bgtzl	$ta0,.L_bn_mul_words_loop
	$LD	$t0,0($a1)

	beqz	$a2,.L_bn_mul_words_return
	nop

.L_bn_mul_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$a3
	subu	$a2,1
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$a3
	subu	$a2,1
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,$BNSZ($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$a3
	mflo	$at
	mfhi	$t0
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,2*$BNSZ($a0)
	$ADDU	$v0,$t1,$t0

.L_bn_mul_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words_internal

.align	5
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$a2,bn_sqr_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_sqr_words

.align	5
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# ---- bn_sqr_words(rp,ap,num): rp[2i]/rp[2i+1] = lo/hi of ap[i]^2.
# No carries between words, so the loop is pure mult + paired stores;
# note rp advances twice as fast as ap (8*$BNSZ vs 4*$BNSZ).
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	$LD	$t0,0($a1)
	beqz	$ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	$MULTU	$t0,$t0
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	$t1
	mfhi	$t0
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)

	$MULTU	$t2,$t2
	subu	$a2,4
	$PTR_ADD $a0,8*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	$t3
	mfhi	$t2
	$ST	$t3,-6*$BNSZ($a0)
	$ST	$t2,-5*$BNSZ($a0)

	$MULTU	$ta0,$ta0
	mflo	$ta1
	mfhi	$ta0
	$ST	$ta1,-4*$BNSZ($a0)
	$ST	$ta0,-3*$BNSZ($a0)


	$MULTU	$ta2,$ta2
	and	$ta0,$a2,$minus4
	mflo	$ta3
	mfhi	$ta2
	$ST	$ta3,-2*$BNSZ($a0)
	$ST	$ta2,-$BNSZ($a0)

	.set	noreorder
	bgtzl	$ta0,.L_bn_sqr_words_loop
	$LD	$t0,0($a1)

	beqz	$a2,.L_bn_sqr_words_return
	nop

.L_bn_sqr_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	$t0,$t0
	subu	$a2,1
	mflo	$t1
	mfhi	$t0
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	$t0,$t0
	subu	$a2,1
	mflo	$t1
	mfhi	$t0
	$ST	$t1,2*$BNSZ($a0)
	$ST	$t0,3*$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	$t0,$t0
	mflo	$t1
	mfhi	$t0
	$ST	$t1,4*$BNSZ($a0)
	$ST	$t0,5*$BNSZ($a0)

.L_bn_sqr_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_sqr_words_internal

.align	5
.globl	bn_add_words
.ent	bn_add_words
bn_add_words:
	.set	noreorder
	bgtz	$a3,bn_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_add_words

.align	5
.ent	bn_add_words_internal
bn_add_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# ---- bn_add_words(rp,ap,bp,num): rp[] = ap[] + bp[], carry in $v0.
# Each word does two adds (a+b, then +carry) and collects the carry
# out of each via sltu.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	$LD	$t0,0($a1)
	beqz	$at,.L_bn_add_words_tail

.L_bn_add_words_loop:
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	$ADDU	$ta0,$t0
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta1,$t1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9

	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta3,$t3
	sltu	$t9,$ta3,$t3
	$ADDU	$t3,$ta3,$v0
	sltu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)
	$ADDU	$v0,$t9

	.set	noreorder
	bgtzl	$at,.L_bn_add_words_loop
	$LD	$t0,0($a1)

	beqz	$a3,.L_bn_add_words_return
	nop

.L_bn_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	$ADDU	$ta0,$t0
	subu	$a3,1
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_add_words_return

	$LD	$t1,$BNSZ($a1)
	$LD	$ta1,$BNSZ($a2)
	$ADDU	$ta1,$t1
	subu	$a3,1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_add_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_add_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_add_words_internal

.align	5
.globl	bn_sub_words
.ent	bn_sub_words
bn_sub_words:
	.set	noreorder
	bgtz	$a3,bn_sub_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$zero
.end	bn_sub_words

.align	5
.ent	bn_sub_words_internal
bn_sub_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# ---- bn_sub_words(rp,ap,bp,num): rp[] = ap[] - bp[], borrow in $v0.
# Borrow out of each word is detected before the subtraction (sltu)
# and after subtracting the incoming borrow (sgtu).
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	$LD	$t0,0($a1)
	beqz	$at,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9


	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t3,$ta3
	$SUBU	$ta3,$t3,$ta3
	$SUBU	$t3,$ta3,$v0
	sgtu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)
	$ADDU	$v0,$t9

	.set	noreorder
	bgtzl	$at,.L_bn_sub_words_loop
	$LD	$t0,0($a1)

	beqz	$a3,.L_bn_sub_words_return
	nop

.L_bn_sub_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,1
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t1,$BNSZ($a1)
	subu	$a3,1
	$LD	$ta1,$BNSZ($a2)
	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_sub_words_return:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# ---- bn_div_3_words: two-limb-by-one-limb division helper built on
# bn_div_words; arguments/return address are parked in registers that
# bn_div_words is known to leave alone (see inline comment).
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_sub_words_internal

.align	5
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$a3,$a0		# we know that bn_div_words does not
				# touch $a3, $ta2, $ta3 and preserves $a2
				# so that we can save two arguments
				# and return address in registers
				# instead of stack:-)

	$LD	$a0,($a3)
	move	$ta2,$a1
	bne	$a0,$a2,bn_div_3_words_internal
	$LD	$a1,-$BNSZ($a3)
	li	$v0,-1
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words

.align	5
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Estimate the quotient with bn_div_words, then decrement it in a
# correction loop until the 3-word remainder check passes.
$code.=<<___;
	.set	reorder
	move	$ta3,$ra
	bal	bn_div_words
	move	$ra,$ta3
	$MULTU	$ta2,$v0
	$LD	$t2,-2*$BNSZ($a3)
	move	$ta0,$zero
	mfhi	$t1
	mflo	$t0
	sltu	$t8,$t1,$a1
.L_bn_div_3_words_inner_loop:
	bnez	$t8,.L_bn_div_3_words_inner_loop_done
	sgeu	$at,$t2,$t0
	seq	$t9,$t1,$a1
	and	$at,$t9
	sltu	$t3,$t0,$ta2
	$ADDU	$a1,$a2
	$SUBU	$t1,$t3
	$SUBU	$t0,$ta2
	sltu	$t8,$t1,$a1
	sltu	$ta0,$a1,$a2
	or	$t8,$ta0
	.set	noreorder
	beqzl	$at,.L_bn_div_3_words_inner_loop
	$SUBU	$v0,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# ---- bn_div_words(h,l,d): divide the double-word h:l by d. Returns
# the quotient in $v0; remainder ends up in $v1/$a1 (see below).
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words_internal

.align	5
.globl	bn_div_words
.ent	bn_div_words
bn_div_words:
	.set	noreorder
	bnez	$a2,bn_div_words_internal
	li	$v0,-1		# I would rather signal div-by-zero
				# which can be done with 'break 7'
	jr	$ra
	move	$a0,$v0
.end	bn_div_words

.align	5
.ent	bn_div_words_internal
bn_div_words_internal:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Normalization: left-shift d (and h:l with it) until d's top bit is
# set, counting the shift in $t9; overflow of h over d is trapped.
$code.=<<___;
	move	$v1,$zero
	bltz	$a2,.L_bn_div_words_body
	move	$t9,$v1
	$SLL	$a2,1
	bgtz	$a2,.-4
	addu	$t9,1

	.set	reorder
	negu	$t1,$t9
	li	$t2,-1
	$SLL	$t2,$t1
	and	$t2,$a0
	$SRL	$at,$a1,$t1
	.set	noreorder
	bnezl	$t2,.+8
	break	6		# signal overflow
	.set	reorder
	$SLL	$a0,$t9
	$SLL	$a1,$t9
	or	$a0,$at
___
# Scratch-register aliases for the schoolbook two-step division below.
$QT=$ta0;
$HH=$ta1;
$DH=$v1;
# Two half-word division steps, each: estimate quotient digit via
# divide-by-top-half, then correct downwards in an inner loop.
$code.=<<___;
.L_bn_div_words_body:
	$SRL	$DH,$a2,4*$BNSZ	# bits
	sgeu	$at,$a0,$a2
	.set	noreorder
	bnezl	$at,.+8
	$SUBU	$a0,$a2
	.set	reorder

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div1
	$DIVU	$zero,$a0,$DH
	mflo	$QT
.L_bn_div_words_skip_div1:
	$MULTU	$a2,$QT
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	$t0
	mfhi	$t1
.L_bn_div_words_inner_loop1:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v0,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop1_done
	$SUBU	$t1,$v0
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop1
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	$SLL	$a1,4*$BNSZ	# bits
	$SUBU	$a0,$t3,$t0
	$SLL	$v0,$QT,4*$BNSZ	# bits

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div2
	$DIVU	$zero,$a0,$DH
	mflo	$QT
.L_bn_div_words_skip_div2:
	$MULTU	$a2,$QT
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	$t0
	mfhi	$t1
.L_bn_div_words_inner_loop2:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v1,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop2_done
	$SUBU	$t1,$v1
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop2
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:

	$SUBU	$a0,$t3,$t0
	or	$v0,$QT
	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
	$SRL	$a2,$t9	# restore $a2

	.set	noreorder
	move	$a1,$v1
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_words_internal
___
# The division helpers' aliases are no longer needed.
undef $HH; undef $QT; undef $DH;

# Register assignments for the comba multiplication/squaring routines:
# a[0..7] and b[0..7] operand limbs, plus the rotating accumulator
# triple (c_1,c_2,c_3) and the mflo/mfhi temporaries (t_1,t_2).
($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);

($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1);	# once we load a[7], no use for $a1
($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2);	# once we load b[7], no use for $a2

($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);

$code.=<<___;

.align	5
.globl	bn_mul_comba8
1021.ent bn_mul_comba8 1022bn_mul_comba8: 1023 .set noreorder 1024___ 1025$code.=<<___ if ($flavour =~ /nubi/i); 1026 .frame $sp,12*$SZREG,$ra 1027 .mask 0x803ff008,-$SZREG 1028 $PTR_SUB $sp,12*$SZREG 1029 $REG_S $ra,11*$SZREG($sp) 1030 $REG_S $s5,10*$SZREG($sp) 1031 $REG_S $s4,9*$SZREG($sp) 1032 $REG_S $s3,8*$SZREG($sp) 1033 $REG_S $s2,7*$SZREG($sp) 1034 $REG_S $s1,6*$SZREG($sp) 1035 $REG_S $s0,5*$SZREG($sp) 1036 $REG_S $t3,4*$SZREG($sp) 1037 $REG_S $t2,3*$SZREG($sp) 1038 $REG_S $t1,2*$SZREG($sp) 1039 $REG_S $t0,1*$SZREG($sp) 1040 $REG_S $gp,0*$SZREG($sp) 1041___ 1042$code.=<<___ if ($flavour !~ /nubi/i); 1043 .frame $sp,6*$SZREG,$ra 1044 .mask 0x003f0000,-$SZREG 1045 $PTR_SUB $sp,6*$SZREG 1046 $REG_S $s5,5*$SZREG($sp) 1047 $REG_S $s4,4*$SZREG($sp) 1048 $REG_S $s3,3*$SZREG($sp) 1049 $REG_S $s2,2*$SZREG($sp) 1050 $REG_S $s1,1*$SZREG($sp) 1051 $REG_S $s0,0*$SZREG($sp) 1052___ 1053$code.=<<___; 1054 1055 .set reorder 1056 $LD $a_0,0($a1) # If compiled with -mips3 option on 1057 # R5000 box assembler barks on this 1058 # 1ine with "should not have mult/div 1059 # as last instruction in bb (R10K 1060 # bug)" warning. If anybody out there 1061 # has a clue about how to circumvent 1062 # this do send me a note. 
1063 # <appro\@fy.chalmers.se> 1064 1065 $LD $b_0,0($a2) 1066 $LD $a_1,$BNSZ($a1) 1067 $LD $a_2,2*$BNSZ($a1) 1068 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); 1069 $LD $a_3,3*$BNSZ($a1) 1070 $LD $b_1,$BNSZ($a2) 1071 $LD $b_2,2*$BNSZ($a2) 1072 $LD $b_3,3*$BNSZ($a2) 1073 mflo $c_1 1074 mfhi $c_2 1075 1076 $LD $a_4,4*$BNSZ($a1) 1077 $LD $a_5,5*$BNSZ($a1) 1078 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); 1079 $LD $a_6,6*$BNSZ($a1) 1080 $LD $a_7,7*$BNSZ($a1) 1081 $LD $b_4,4*$BNSZ($a2) 1082 $LD $b_5,5*$BNSZ($a2) 1083 mflo $t_1 1084 mfhi $t_2 1085 $ADDU $c_2,$t_1 1086 sltu $at,$c_2,$t_1 1087 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); 1088 $ADDU $c_3,$t_2,$at 1089 $LD $b_6,6*$BNSZ($a2) 1090 $LD $b_7,7*$BNSZ($a2) 1091 $ST $c_1,0($a0) # r[0]=c1; 1092 mflo $t_1 1093 mfhi $t_2 1094 $ADDU $c_2,$t_1 1095 sltu $at,$c_2,$t_1 1096 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); 1097 $ADDU $t_2,$at 1098 $ADDU $c_3,$t_2 1099 sltu $c_1,$c_3,$t_2 1100 $ST $c_2,$BNSZ($a0) # r[1]=c2; 1101 1102 mflo $t_1 1103 mfhi $t_2 1104 $ADDU $c_3,$t_1 1105 sltu $at,$c_3,$t_1 1106 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); 1107 $ADDU $t_2,$at 1108 $ADDU $c_1,$t_2 1109 mflo $t_1 1110 mfhi $t_2 1111 $ADDU $c_3,$t_1 1112 sltu $at,$c_3,$t_1 1113 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); 1114 $ADDU $t_2,$at 1115 $ADDU $c_1,$t_2 1116 sltu $c_2,$c_1,$t_2 1117 mflo $t_1 1118 mfhi $t_2 1119 $ADDU $c_3,$t_1 1120 sltu $at,$c_3,$t_1 1121 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); 1122 $ADDU $t_2,$at 1123 $ADDU $c_1,$t_2 1124 sltu $at,$c_1,$t_2 1125 $ADDU $c_2,$at 1126 $ST $c_3,2*$BNSZ($a0) # r[2]=c3; 1127 1128 mflo $t_1 1129 mfhi $t_2 1130 $ADDU $c_1,$t_1 1131 sltu $at,$c_1,$t_1 1132 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); 1133 $ADDU $t_2,$at 1134 $ADDU $c_2,$t_2 1135 sltu $c_3,$c_2,$t_2 1136 mflo $t_1 1137 mfhi $t_2 1138 $ADDU $c_1,$t_1 1139 sltu $at,$c_1,$t_1 1140 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); 1141 $ADDU $t_2,$at 1142 $ADDU $c_2,$t_2 1143 
sltu $at,$c_2,$t_2 1144 $ADDU $c_3,$at 1145 mflo $t_1 1146 mfhi $t_2 1147 $ADDU $c_1,$t_1 1148 sltu $at,$c_1,$t_1 1149 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); 1150 $ADDU $t_2,$at 1151 $ADDU $c_2,$t_2 1152 sltu $at,$c_2,$t_2 1153 $ADDU $c_3,$at 1154 mflo $t_1 1155 mfhi $t_2 1156 $ADDU $c_1,$t_1 1157 sltu $at,$c_1,$t_1 1158 $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1); 1159 $ADDU $t_2,$at 1160 $ADDU $c_2,$t_2 1161 sltu $at,$c_2,$t_2 1162 $ADDU $c_3,$at 1163 $ST $c_1,3*$BNSZ($a0) # r[3]=c1; 1164 1165 mflo $t_1 1166 mfhi $t_2 1167 $ADDU $c_2,$t_1 1168 sltu $at,$c_2,$t_1 1169 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); 1170 $ADDU $t_2,$at 1171 $ADDU $c_3,$t_2 1172 sltu $c_1,$c_3,$t_2 1173 mflo $t_1 1174 mfhi $t_2 1175 $ADDU $c_2,$t_1 1176 sltu $at,$c_2,$t_1 1177 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); 1178 $ADDU $t_2,$at 1179 $ADDU $c_3,$t_2 1180 sltu $at,$c_3,$t_2 1181 $ADDU $c_1,$at 1182 mflo $t_1 1183 mfhi $t_2 1184 $ADDU $c_2,$t_1 1185 sltu $at,$c_2,$t_1 1186 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); 1187 $ADDU $t_2,$at 1188 $ADDU $c_3,$t_2 1189 sltu $at,$c_3,$t_2 1190 $ADDU $c_1,$at 1191 mflo $t_1 1192 mfhi $t_2 1193 $ADDU $c_2,$t_1 1194 sltu $at,$c_2,$t_1 1195 $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1); 1196 $ADDU $t_2,$at 1197 $ADDU $c_3,$t_2 1198 sltu $at,$c_3,$t_2 1199 $ADDU $c_1,$at 1200 mflo $t_1 1201 mfhi $t_2 1202 $ADDU $c_2,$t_1 1203 sltu $at,$c_2,$t_1 1204 $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2); 1205 $ADDU $t_2,$at 1206 $ADDU $c_3,$t_2 1207 sltu $at,$c_3,$t_2 1208 $ADDU $c_1,$at 1209 $ST $c_2,4*$BNSZ($a0) # r[4]=c2; 1210 1211 mflo $t_1 1212 mfhi $t_2 1213 $ADDU $c_3,$t_1 1214 sltu $at,$c_3,$t_1 1215 $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2); 1216 $ADDU $t_2,$at 1217 $ADDU $c_1,$t_2 1218 sltu $c_2,$c_1,$t_2 1219 mflo $t_1 1220 mfhi $t_2 1221 $ADDU $c_3,$t_1 1222 sltu $at,$c_3,$t_1 1223 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); 1224 $ADDU $t_2,$at 1225 $ADDU $c_1,$t_2 1226 sltu 
$at,$c_1,$t_2 1227 $ADDU $c_2,$at 1228 mflo $t_1 1229 mfhi $t_2 1230 $ADDU $c_3,$t_1 1231 sltu $at,$c_3,$t_1 1232 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); 1233 $ADDU $t_2,$at 1234 $ADDU $c_1,$t_2 1235 sltu $at,$c_1,$t_2 1236 $ADDU $c_2,$at 1237 mflo $t_1 1238 mfhi $t_2 1239 $ADDU $c_3,$t_1 1240 sltu $at,$c_3,$t_1 1241 $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2); 1242 $ADDU $t_2,$at 1243 $ADDU $c_1,$t_2 1244 sltu $at,$c_1,$t_2 1245 $ADDU $c_2,$at 1246 mflo $t_1 1247 mfhi $t_2 1248 $ADDU $c_3,$t_1 1249 sltu $at,$c_3,$t_1 1250 $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2); 1251 $ADDU $t_2,$at 1252 $ADDU $c_1,$t_2 1253 sltu $at,$c_1,$t_2 1254 $ADDU $c_2,$at 1255 mflo $t_1 1256 mfhi $t_2 1257 $ADDU $c_3,$t_1 1258 sltu $at,$c_3,$t_1 1259 $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3); 1260 $ADDU $t_2,$at 1261 $ADDU $c_1,$t_2 1262 sltu $at,$c_1,$t_2 1263 $ADDU $c_2,$at 1264 $ST $c_3,5*$BNSZ($a0) # r[5]=c3; 1265 1266 mflo $t_1 1267 mfhi $t_2 1268 $ADDU $c_1,$t_1 1269 sltu $at,$c_1,$t_1 1270 $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3); 1271 $ADDU $t_2,$at 1272 $ADDU $c_2,$t_2 1273 sltu $c_3,$c_2,$t_2 1274 mflo $t_1 1275 mfhi $t_2 1276 $ADDU $c_1,$t_1 1277 sltu $at,$c_1,$t_1 1278 $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3); 1279 $ADDU $t_2,$at 1280 $ADDU $c_2,$t_2 1281 sltu $at,$c_2,$t_2 1282 $ADDU $c_3,$at 1283 mflo $t_1 1284 mfhi $t_2 1285 $ADDU $c_1,$t_1 1286 sltu $at,$c_1,$t_1 1287 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); 1288 $ADDU $t_2,$at 1289 $ADDU $c_2,$t_2 1290 sltu $at,$c_2,$t_2 1291 $ADDU $c_3,$at 1292 mflo $t_1 1293 mfhi $t_2 1294 $ADDU $c_1,$t_1 1295 sltu $at,$c_1,$t_1 1296 $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3); 1297 $ADDU $t_2,$at 1298 $ADDU $c_2,$t_2 1299 sltu $at,$c_2,$t_2 1300 $ADDU $c_3,$at 1301 mflo $t_1 1302 mfhi $t_2 1303 $ADDU $c_1,$t_1 1304 sltu $at,$c_1,$t_1 1305 $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3); 1306 $ADDU $t_2,$at 1307 $ADDU $c_2,$t_2 1308 sltu $at,$c_2,$t_2 1309 $ADDU $c_3,$at 1310 
mflo $t_1 1311 mfhi $t_2 1312 $ADDU $c_1,$t_1 1313 sltu $at,$c_1,$t_1 1314 $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3); 1315 $ADDU $t_2,$at 1316 $ADDU $c_2,$t_2 1317 sltu $at,$c_2,$t_2 1318 $ADDU $c_3,$at 1319 mflo $t_1 1320 mfhi $t_2 1321 $ADDU $c_1,$t_1 1322 sltu $at,$c_1,$t_1 1323 $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1); 1324 $ADDU $t_2,$at 1325 $ADDU $c_2,$t_2 1326 sltu $at,$c_2,$t_2 1327 $ADDU $c_3,$at 1328 $ST $c_1,6*$BNSZ($a0) # r[6]=c1; 1329 1330 mflo $t_1 1331 mfhi $t_2 1332 $ADDU $c_2,$t_1 1333 sltu $at,$c_2,$t_1 1334 $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1); 1335 $ADDU $t_2,$at 1336 $ADDU $c_3,$t_2 1337 sltu $c_1,$c_3,$t_2 1338 mflo $t_1 1339 mfhi $t_2 1340 $ADDU $c_2,$t_1 1341 sltu $at,$c_2,$t_1 1342 $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1); 1343 $ADDU $t_2,$at 1344 $ADDU $c_3,$t_2 1345 sltu $at,$c_3,$t_2 1346 $ADDU $c_1,$at 1347 mflo $t_1 1348 mfhi $t_2 1349 $ADDU $c_2,$t_1 1350 sltu $at,$c_2,$t_1 1351 $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1); 1352 $ADDU $t_2,$at 1353 $ADDU $c_3,$t_2 1354 sltu $at,$c_3,$t_2 1355 $ADDU $c_1,$at 1356 mflo $t_1 1357 mfhi $t_2 1358 $ADDU $c_2,$t_1 1359 sltu $at,$c_2,$t_1 1360 $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1); 1361 $ADDU $t_2,$at 1362 $ADDU $c_3,$t_2 1363 sltu $at,$c_3,$t_2 1364 $ADDU $c_1,$at 1365 mflo $t_1 1366 mfhi $t_2 1367 $ADDU $c_2,$t_1 1368 sltu $at,$c_2,$t_1 1369 $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1); 1370 $ADDU $t_2,$at 1371 $ADDU $c_3,$t_2 1372 sltu $at,$c_3,$t_2 1373 $ADDU $c_1,$at 1374 mflo $t_1 1375 mfhi $t_2 1376 $ADDU $c_2,$t_1 1377 sltu $at,$c_2,$t_1 1378 $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1); 1379 $ADDU $t_2,$at 1380 $ADDU $c_3,$t_2 1381 sltu $at,$c_3,$t_2 1382 $ADDU $c_1,$at 1383 mflo $t_1 1384 mfhi $t_2 1385 $ADDU $c_2,$t_1 1386 sltu $at,$c_2,$t_1 1387 $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1); 1388 $ADDU $t_2,$at 1389 $ADDU $c_3,$t_2 1390 sltu $at,$c_3,$t_2 1391 $ADDU $c_1,$at 1392 mflo $t_1 1393 mfhi $t_2 1394 $ADDU 
$c_2,$t_1 1395 sltu $at,$c_2,$t_1 1396 $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2); 1397 $ADDU $t_2,$at 1398 $ADDU $c_3,$t_2 1399 sltu $at,$c_3,$t_2 1400 $ADDU $c_1,$at 1401 $ST $c_2,7*$BNSZ($a0) # r[7]=c2; 1402 1403 mflo $t_1 1404 mfhi $t_2 1405 $ADDU $c_3,$t_1 1406 sltu $at,$c_3,$t_1 1407 $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2); 1408 $ADDU $t_2,$at 1409 $ADDU $c_1,$t_2 1410 sltu $c_2,$c_1,$t_2 1411 mflo $t_1 1412 mfhi $t_2 1413 $ADDU $c_3,$t_1 1414 sltu $at,$c_3,$t_1 1415 $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2); 1416 $ADDU $t_2,$at 1417 $ADDU $c_1,$t_2 1418 sltu $at,$c_1,$t_2 1419 $ADDU $c_2,$at 1420 mflo $t_1 1421 mfhi $t_2 1422 $ADDU $c_3,$t_1 1423 sltu $at,$c_3,$t_1 1424 $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2); 1425 $ADDU $t_2,$at 1426 $ADDU $c_1,$t_2 1427 sltu $at,$c_1,$t_2 1428 $ADDU $c_2,$at 1429 mflo $t_1 1430 mfhi $t_2 1431 $ADDU $c_3,$t_1 1432 sltu $at,$c_3,$t_1 1433 $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2); 1434 $ADDU $t_2,$at 1435 $ADDU $c_1,$t_2 1436 sltu $at,$c_1,$t_2 1437 $ADDU $c_2,$at 1438 mflo $t_1 1439 mfhi $t_2 1440 $ADDU $c_3,$t_1 1441 sltu $at,$c_3,$t_1 1442 $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2); 1443 $ADDU $t_2,$at 1444 $ADDU $c_1,$t_2 1445 sltu $at,$c_1,$t_2 1446 $ADDU $c_2,$at 1447 mflo $t_1 1448 mfhi $t_2 1449 $ADDU $c_3,$t_1 1450 sltu $at,$c_3,$t_1 1451 $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2); 1452 $ADDU $t_2,$at 1453 $ADDU $c_1,$t_2 1454 sltu $at,$c_1,$t_2 1455 $ADDU $c_2,$at 1456 mflo $t_1 1457 mfhi $t_2 1458 $ADDU $c_3,$t_1 1459 sltu $at,$c_3,$t_1 1460 $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3); 1461 $ADDU $t_2,$at 1462 $ADDU $c_1,$t_2 1463 sltu $at,$c_1,$t_2 1464 $ADDU $c_2,$at 1465 $ST $c_3,8*$BNSZ($a0) # r[8]=c3; 1466 1467 mflo $t_1 1468 mfhi $t_2 1469 $ADDU $c_1,$t_1 1470 sltu $at,$c_1,$t_1 1471 $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3); 1472 $ADDU $t_2,$at 1473 $ADDU $c_2,$t_2 1474 sltu $c_3,$c_2,$t_2 1475 mflo $t_1 1476 mfhi $t_2 1477 $ADDU $c_1,$t_1 1478 
sltu $at,$c_1,$t_1 1479 $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3); 1480 $ADDU $t_2,$at 1481 $ADDU $c_2,$t_2 1482 sltu $at,$c_2,$t_2 1483 $ADDU $c_3,$at 1484 mflo $t_1 1485 mfhi $t_2 1486 $ADDU $c_1,$t_1 1487 sltu $at,$c_1,$t_1 1488 $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3); 1489 $ADDU $t_2,$at 1490 $ADDU $c_2,$t_2 1491 sltu $at,$c_2,$t_2 1492 $ADDU $c_3,$at 1493 mflo $t_1 1494 mfhi $t_2 1495 $ADDU $c_1,$t_1 1496 sltu $at,$c_1,$t_1 1497 $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3); 1498 $ADDU $t_2,$at 1499 $ADDU $c_2,$t_2 1500 sltu $at,$c_2,$t_2 1501 $ADDU $c_3,$at 1502 mflo $t_1 1503 mfhi $t_2 1504 $ADDU $c_1,$t_1 1505 sltu $at,$c_1,$t_1 1506 $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3); 1507 $ADDU $t_2,$at 1508 $ADDU $c_2,$t_2 1509 sltu $at,$c_2,$t_2 1510 $ADDU $c_3,$at 1511 mflo $t_1 1512 mfhi $t_2 1513 $ADDU $c_1,$t_1 1514 sltu $at,$c_1,$t_1 1515 $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1); 1516 $ADDU $t_2,$at 1517 $ADDU $c_2,$t_2 1518 sltu $at,$c_2,$t_2 1519 $ADDU $c_3,$at 1520 $ST $c_1,9*$BNSZ($a0) # r[9]=c1; 1521 1522 mflo $t_1 1523 mfhi $t_2 1524 $ADDU $c_2,$t_1 1525 sltu $at,$c_2,$t_1 1526 $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1); 1527 $ADDU $t_2,$at 1528 $ADDU $c_3,$t_2 1529 sltu $c_1,$c_3,$t_2 1530 mflo $t_1 1531 mfhi $t_2 1532 $ADDU $c_2,$t_1 1533 sltu $at,$c_2,$t_1 1534 $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1); 1535 $ADDU $t_2,$at 1536 $ADDU $c_3,$t_2 1537 sltu $at,$c_3,$t_2 1538 $ADDU $c_1,$at 1539 mflo $t_1 1540 mfhi $t_2 1541 $ADDU $c_2,$t_1 1542 sltu $at,$c_2,$t_1 1543 $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1); 1544 $ADDU $t_2,$at 1545 $ADDU $c_3,$t_2 1546 sltu $at,$c_3,$t_2 1547 $ADDU $c_1,$at 1548 mflo $t_1 1549 mfhi $t_2 1550 $ADDU $c_2,$t_1 1551 sltu $at,$c_2,$t_1 1552 $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1); 1553 $ADDU $t_2,$at 1554 $ADDU $c_3,$t_2 1555 sltu $at,$c_3,$t_2 1556 $ADDU $c_1,$at 1557 mflo $t_1 1558 mfhi $t_2 1559 $ADDU $c_2,$t_1 1560 sltu $at,$c_2,$t_1 1561 $MULTU $a_4,$b_7 
# mul_add_c(a[4],b[7],c3,c1,c2); 1562 $ADDU $t_2,$at 1563 $ADDU $c_3,$t_2 1564 sltu $at,$c_3,$t_2 1565 $ADDU $c_1,$at 1566 $ST $c_2,10*$BNSZ($a0) # r[10]=c2; 1567 1568 mflo $t_1 1569 mfhi $t_2 1570 $ADDU $c_3,$t_1 1571 sltu $at,$c_3,$t_1 1572 $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2); 1573 $ADDU $t_2,$at 1574 $ADDU $c_1,$t_2 1575 sltu $c_2,$c_1,$t_2 1576 mflo $t_1 1577 mfhi $t_2 1578 $ADDU $c_3,$t_1 1579 sltu $at,$c_3,$t_1 1580 $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2); 1581 $ADDU $t_2,$at 1582 $ADDU $c_1,$t_2 1583 sltu $at,$c_1,$t_2 1584 $ADDU $c_2,$at 1585 mflo $t_1 1586 mfhi $t_2 1587 $ADDU $c_3,$t_1 1588 sltu $at,$c_3,$t_1 1589 $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2); 1590 $ADDU $t_2,$at 1591 $ADDU $c_1,$t_2 1592 sltu $at,$c_1,$t_2 1593 $ADDU $c_2,$at 1594 mflo $t_1 1595 mfhi $t_2 1596 $ADDU $c_3,$t_1 1597 sltu $at,$c_3,$t_1 1598 $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3); 1599 $ADDU $t_2,$at 1600 $ADDU $c_1,$t_2 1601 sltu $at,$c_1,$t_2 1602 $ADDU $c_2,$at 1603 $ST $c_3,11*$BNSZ($a0) # r[11]=c3; 1604 1605 mflo $t_1 1606 mfhi $t_2 1607 $ADDU $c_1,$t_1 1608 sltu $at,$c_1,$t_1 1609 $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3); 1610 $ADDU $t_2,$at 1611 $ADDU $c_2,$t_2 1612 sltu $c_3,$c_2,$t_2 1613 mflo $t_1 1614 mfhi $t_2 1615 $ADDU $c_1,$t_1 1616 sltu $at,$c_1,$t_1 1617 $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3); 1618 $ADDU $t_2,$at 1619 $ADDU $c_2,$t_2 1620 sltu $at,$c_2,$t_2 1621 $ADDU $c_3,$at 1622 mflo $t_1 1623 mfhi $t_2 1624 $ADDU $c_1,$t_1 1625 sltu $at,$c_1,$t_1 1626 $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1); 1627 $ADDU $t_2,$at 1628 $ADDU $c_2,$t_2 1629 sltu $at,$c_2,$t_2 1630 $ADDU $c_3,$at 1631 $ST $c_1,12*$BNSZ($a0) # r[12]=c1; 1632 1633 mflo $t_1 1634 mfhi $t_2 1635 $ADDU $c_2,$t_1 1636 sltu $at,$c_2,$t_1 1637 $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1); 1638 $ADDU $t_2,$at 1639 $ADDU $c_3,$t_2 1640 sltu $c_1,$c_3,$t_2 1641 mflo $t_1 1642 mfhi $t_2 1643 $ADDU $c_2,$t_1 1644 sltu $at,$c_2,$t_1 1645 
$MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2); 1646 $ADDU $t_2,$at 1647 $ADDU $c_3,$t_2 1648 sltu $at,$c_3,$t_2 1649 $ADDU $c_1,$at 1650 $ST $c_2,13*$BNSZ($a0) # r[13]=c2; 1651 1652 mflo $t_1 1653 mfhi $t_2 1654 $ADDU $c_3,$t_1 1655 sltu $at,$c_3,$t_1 1656 $ADDU $t_2,$at 1657 $ADDU $c_1,$t_2 1658 $ST $c_3,14*$BNSZ($a0) # r[14]=c3; 1659 $ST $c_1,15*$BNSZ($a0) # r[15]=c1; 1660 1661 .set noreorder 1662___ 1663$code.=<<___ if ($flavour =~ /nubi/i); 1664 $REG_L $s5,10*$SZREG($sp) 1665 $REG_L $s4,9*$SZREG($sp) 1666 $REG_L $s3,8*$SZREG($sp) 1667 $REG_L $s2,7*$SZREG($sp) 1668 $REG_L $s1,6*$SZREG($sp) 1669 $REG_L $s0,5*$SZREG($sp) 1670 $REG_L $t3,4*$SZREG($sp) 1671 $REG_L $t2,3*$SZREG($sp) 1672 $REG_L $t1,2*$SZREG($sp) 1673 $REG_L $t0,1*$SZREG($sp) 1674 $REG_L $gp,0*$SZREG($sp) 1675 jr $ra 1676 $PTR_ADD $sp,12*$SZREG 1677___ 1678$code.=<<___ if ($flavour !~ /nubi/i); 1679 $REG_L $s5,5*$SZREG($sp) 1680 $REG_L $s4,4*$SZREG($sp) 1681 $REG_L $s3,3*$SZREG($sp) 1682 $REG_L $s2,2*$SZREG($sp) 1683 $REG_L $s1,1*$SZREG($sp) 1684 $REG_L $s0,0*$SZREG($sp) 1685 jr $ra 1686 $PTR_ADD $sp,6*$SZREG 1687___ 1688$code.=<<___; 1689.end bn_mul_comba8 1690 1691.align 5 1692.globl bn_mul_comba4 1693.ent bn_mul_comba4 1694bn_mul_comba4: 1695___ 1696$code.=<<___ if ($flavour =~ /nubi/i); 1697 .frame $sp,6*$SZREG,$ra 1698 .mask 0x8000f008,-$SZREG 1699 .set noreorder 1700 $PTR_SUB $sp,6*$SZREG 1701 $REG_S $ra,5*$SZREG($sp) 1702 $REG_S $t3,4*$SZREG($sp) 1703 $REG_S $t2,3*$SZREG($sp) 1704 $REG_S $t1,2*$SZREG($sp) 1705 $REG_S $t0,1*$SZREG($sp) 1706 $REG_S $gp,0*$SZREG($sp) 1707___ 1708$code.=<<___; 1709 .set reorder 1710 $LD $a_0,0($a1) 1711 $LD $b_0,0($a2) 1712 $LD $a_1,$BNSZ($a1) 1713 $LD $a_2,2*$BNSZ($a1) 1714 $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); 1715 $LD $a_3,3*$BNSZ($a1) 1716 $LD $b_1,$BNSZ($a2) 1717 $LD $b_2,2*$BNSZ($a2) 1718 $LD $b_3,3*$BNSZ($a2) 1719 mflo $c_1 1720 mfhi $c_2 1721 $ST $c_1,0($a0) 1722 1723 $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); 1724 mflo $t_1 1725 mfhi 
$t_2 1726 $ADDU $c_2,$t_1 1727 sltu $at,$c_2,$t_1 1728 $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); 1729 $ADDU $c_3,$t_2,$at 1730 mflo $t_1 1731 mfhi $t_2 1732 $ADDU $c_2,$t_1 1733 sltu $at,$c_2,$t_1 1734 $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); 1735 $ADDU $t_2,$at 1736 $ADDU $c_3,$t_2 1737 sltu $c_1,$c_3,$t_2 1738 $ST $c_2,$BNSZ($a0) 1739 1740 mflo $t_1 1741 mfhi $t_2 1742 $ADDU $c_3,$t_1 1743 sltu $at,$c_3,$t_1 1744 $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); 1745 $ADDU $t_2,$at 1746 $ADDU $c_1,$t_2 1747 mflo $t_1 1748 mfhi $t_2 1749 $ADDU $c_3,$t_1 1750 sltu $at,$c_3,$t_1 1751 $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); 1752 $ADDU $t_2,$at 1753 $ADDU $c_1,$t_2 1754 sltu $c_2,$c_1,$t_2 1755 mflo $t_1 1756 mfhi $t_2 1757 $ADDU $c_3,$t_1 1758 sltu $at,$c_3,$t_1 1759 $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); 1760 $ADDU $t_2,$at 1761 $ADDU $c_1,$t_2 1762 sltu $at,$c_1,$t_2 1763 $ADDU $c_2,$at 1764 $ST $c_3,2*$BNSZ($a0) 1765 1766 mflo $t_1 1767 mfhi $t_2 1768 $ADDU $c_1,$t_1 1769 sltu $at,$c_1,$t_1 1770 $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); 1771 $ADDU $t_2,$at 1772 $ADDU $c_2,$t_2 1773 sltu $c_3,$c_2,$t_2 1774 mflo $t_1 1775 mfhi $t_2 1776 $ADDU $c_1,$t_1 1777 sltu $at,$c_1,$t_1 1778 $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); 1779 $ADDU $t_2,$at 1780 $ADDU $c_2,$t_2 1781 sltu $at,$c_2,$t_2 1782 $ADDU $c_3,$at 1783 mflo $t_1 1784 mfhi $t_2 1785 $ADDU $c_1,$t_1 1786 sltu $at,$c_1,$t_1 1787 $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); 1788 $ADDU $t_2,$at 1789 $ADDU $c_2,$t_2 1790 sltu $at,$c_2,$t_2 1791 $ADDU $c_3,$at 1792 mflo $t_1 1793 mfhi $t_2 1794 $ADDU $c_1,$t_1 1795 sltu $at,$c_1,$t_1 1796 $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); 1797 $ADDU $t_2,$at 1798 $ADDU $c_2,$t_2 1799 sltu $at,$c_2,$t_2 1800 $ADDU $c_3,$at 1801 $ST $c_1,3*$BNSZ($a0) 1802 1803 mflo $t_1 1804 mfhi $t_2 1805 $ADDU $c_2,$t_1 1806 sltu $at,$c_2,$t_1 1807 $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); 1808 $ADDU $t_2,$at 1809 
$ADDU $c_3,$t_2 1810 sltu $c_1,$c_3,$t_2 1811 mflo $t_1 1812 mfhi $t_2 1813 $ADDU $c_2,$t_1 1814 sltu $at,$c_2,$t_1 1815 $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); 1816 $ADDU $t_2,$at 1817 $ADDU $c_3,$t_2 1818 sltu $at,$c_3,$t_2 1819 $ADDU $c_1,$at 1820 mflo $t_1 1821 mfhi $t_2 1822 $ADDU $c_2,$t_1 1823 sltu $at,$c_2,$t_1 1824 $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); 1825 $ADDU $t_2,$at 1826 $ADDU $c_3,$t_2 1827 sltu $at,$c_3,$t_2 1828 $ADDU $c_1,$at 1829 $ST $c_2,4*$BNSZ($a0) 1830 1831 mflo $t_1 1832 mfhi $t_2 1833 $ADDU $c_3,$t_1 1834 sltu $at,$c_3,$t_1 1835 $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); 1836 $ADDU $t_2,$at 1837 $ADDU $c_1,$t_2 1838 sltu $c_2,$c_1,$t_2 1839 mflo $t_1 1840 mfhi $t_2 1841 $ADDU $c_3,$t_1 1842 sltu $at,$c_3,$t_1 1843 $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); 1844 $ADDU $t_2,$at 1845 $ADDU $c_1,$t_2 1846 sltu $at,$c_1,$t_2 1847 $ADDU $c_2,$at 1848 $ST $c_3,5*$BNSZ($a0) 1849 1850 mflo $t_1 1851 mfhi $t_2 1852 $ADDU $c_1,$t_1 1853 sltu $at,$c_1,$t_1 1854 $ADDU $t_2,$at 1855 $ADDU $c_2,$t_2 1856 $ST $c_1,6*$BNSZ($a0) 1857 $ST $c_2,7*$BNSZ($a0) 1858 1859 .set noreorder 1860___ 1861$code.=<<___ if ($flavour =~ /nubi/i); 1862 $REG_L $t3,4*$SZREG($sp) 1863 $REG_L $t2,3*$SZREG($sp) 1864 $REG_L $t1,2*$SZREG($sp) 1865 $REG_L $t0,1*$SZREG($sp) 1866 $REG_L $gp,0*$SZREG($sp) 1867 $PTR_ADD $sp,6*$SZREG 1868___ 1869$code.=<<___; 1870 jr $ra 1871 nop 1872.end bn_mul_comba4 1873___ 1874 1875($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); 1876 1877$code.=<<___; 1878 1879.align 5 1880.globl bn_sqr_comba8 1881.ent bn_sqr_comba8 1882bn_sqr_comba8: 1883___ 1884$code.=<<___ if ($flavour =~ /nubi/i); 1885 .frame $sp,6*$SZREG,$ra 1886 .mask 0x8000f008,-$SZREG 1887 .set noreorder 1888 $PTR_SUB $sp,6*$SZREG 1889 $REG_S $ra,5*$SZREG($sp) 1890 $REG_S $t3,4*$SZREG($sp) 1891 $REG_S $t2,3*$SZREG($sp) 1892 $REG_S $t1,2*$SZREG($sp) 1893 $REG_S $t0,1*$SZREG($sp) 1894 $REG_S $gp,0*$SZREG($sp) 1895___ 1896$code.=<<___; 1897 .set reorder 1898 $LD 
$a_0,0($a1) 1899 $LD $a_1,$BNSZ($a1) 1900 $LD $a_2,2*$BNSZ($a1) 1901 $LD $a_3,3*$BNSZ($a1) 1902 1903 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); 1904 $LD $a_4,4*$BNSZ($a1) 1905 $LD $a_5,5*$BNSZ($a1) 1906 $LD $a_6,6*$BNSZ($a1) 1907 $LD $a_7,7*$BNSZ($a1) 1908 mflo $c_1 1909 mfhi $c_2 1910 $ST $c_1,0($a0) 1911 1912 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); 1913 mflo $t_1 1914 mfhi $t_2 1915 slt $c_1,$t_2,$zero 1916 $SLL $t_2,1 1917 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); 1918 slt $a2,$t_1,$zero 1919 $ADDU $t_2,$a2 1920 $SLL $t_1,1 1921 $ADDU $c_2,$t_1 1922 sltu $at,$c_2,$t_1 1923 $ADDU $c_3,$t_2,$at 1924 $ST $c_2,$BNSZ($a0) 1925 1926 mflo $t_1 1927 mfhi $t_2 1928 slt $c_2,$t_2,$zero 1929 $SLL $t_2,1 1930 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); 1931 slt $a2,$t_1,$zero 1932 $ADDU $t_2,$a2 1933 $SLL $t_1,1 1934 $ADDU $c_3,$t_1 1935 sltu $at,$c_3,$t_1 1936 $ADDU $t_2,$at 1937 $ADDU $c_1,$t_2 1938 sltu $at,$c_1,$t_2 1939 $ADDU $c_2,$at 1940 mflo $t_1 1941 mfhi $t_2 1942 $ADDU $c_3,$t_1 1943 sltu $at,$c_3,$t_1 1944 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); 1945 $ADDU $t_2,$at 1946 $ADDU $c_1,$t_2 1947 sltu $at,$c_1,$t_2 1948 $ADDU $c_2,$at 1949 $ST $c_3,2*$BNSZ($a0) 1950 1951 mflo $t_1 1952 mfhi $t_2 1953 slt $c_3,$t_2,$zero 1954 $SLL $t_2,1 1955 $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3); 1956 slt $a2,$t_1,$zero 1957 $ADDU $t_2,$a2 1958 $SLL $t_1,1 1959 $ADDU $c_1,$t_1 1960 sltu $at,$c_1,$t_1 1961 $ADDU $t_2,$at 1962 $ADDU $c_2,$t_2 1963 sltu $at,$c_2,$t_2 1964 $ADDU $c_3,$at 1965 mflo $t_1 1966 mfhi $t_2 1967 slt $at,$t_2,$zero 1968 $ADDU $c_3,$at 1969 $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1); 1970 $SLL $t_2,1 1971 slt $a2,$t_1,$zero 1972 $ADDU $t_2,$a2 1973 $SLL $t_1,1 1974 $ADDU $c_1,$t_1 1975 sltu $at,$c_1,$t_1 1976 $ADDU $t_2,$at 1977 $ADDU $c_2,$t_2 1978 sltu $at,$c_2,$t_2 1979 $ADDU $c_3,$at 1980 $ST $c_1,3*$BNSZ($a0) 1981 1982 mflo $t_1 1983 mfhi $t_2 1984 slt $c_1,$t_2,$zero 1985 $SLL $t_2,1 1986 $MULTU 
$a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); 1987 slt $a2,$t_1,$zero 1988 $ADDU $t_2,$a2 1989 $SLL $t_1,1 1990 $ADDU $c_2,$t_1 1991 sltu $at,$c_2,$t_1 1992 $ADDU $t_2,$at 1993 $ADDU $c_3,$t_2 1994 sltu $at,$c_3,$t_2 1995 $ADDU $c_1,$at 1996 mflo $t_1 1997 mfhi $t_2 1998 slt $at,$t_2,$zero 1999 $ADDU $c_1,$at 2000 $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); 2001 $SLL $t_2,1 2002 slt $a2,$t_1,$zero 2003 $ADDU $t_2,$a2 2004 $SLL $t_1,1 2005 $ADDU $c_2,$t_1 2006 sltu $at,$c_2,$t_1 2007 $ADDU $t_2,$at 2008 $ADDU $c_3,$t_2 2009 sltu $at,$c_3,$t_2 2010 $ADDU $c_1,$at 2011 mflo $t_1 2012 mfhi $t_2 2013 $ADDU $c_2,$t_1 2014 sltu $at,$c_2,$t_1 2015 $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2); 2016 $ADDU $t_2,$at 2017 $ADDU $c_3,$t_2 2018 sltu $at,$c_3,$t_2 2019 $ADDU $c_1,$at 2020 $ST $c_2,4*$BNSZ($a0) 2021 2022 mflo $t_1 2023 mfhi $t_2 2024 slt $c_2,$t_2,$zero 2025 $SLL $t_2,1 2026 $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2); 2027 slt $a2,$t_1,$zero 2028 $ADDU $t_2,$a2 2029 $SLL $t_1,1 2030 $ADDU $c_3,$t_1 2031 sltu $at,$c_3,$t_1 2032 $ADDU $t_2,$at 2033 $ADDU $c_1,$t_2 2034 sltu $at,$c_1,$t_2 2035 $ADDU $c_2,$at 2036 mflo $t_1 2037 mfhi $t_2 2038 slt $at,$t_2,$zero 2039 $ADDU $c_2,$at 2040 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); 2041 $SLL $t_2,1 2042 slt $a2,$t_1,$zero 2043 $ADDU $t_2,$a2 2044 $SLL $t_1,1 2045 $ADDU $c_3,$t_1 2046 sltu $at,$c_3,$t_1 2047 $ADDU $t_2,$at 2048 $ADDU $c_1,$t_2 2049 sltu $at,$c_1,$t_2 2050 $ADDU $c_2,$at 2051 mflo $t_1 2052 mfhi $t_2 2053 slt $at,$t_2,$zero 2054 $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3); 2055 $ADDU $c_2,$at 2056 $SLL $t_2,1 2057 slt $a2,$t_1,$zero 2058 $ADDU $t_2,$a2 2059 $SLL $t_1,1 2060 $ADDU $c_3,$t_1 2061 sltu $at,$c_3,$t_1 2062 $ADDU $t_2,$at 2063 $ADDU $c_1,$t_2 2064 sltu $at,$c_1,$t_2 2065 $ADDU $c_2,$at 2066 $ST $c_3,5*$BNSZ($a0) 2067 2068 mflo $t_1 2069 mfhi $t_2 2070 slt $c_3,$t_2,$zero 2071 $SLL $t_2,1 2072 $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3); 2073 slt $a2,$t_1,$zero 
2074 $ADDU $t_2,$a2 2075 $SLL $t_1,1 2076 $ADDU $c_1,$t_1 2077 sltu $at,$c_1,$t_1 2078 $ADDU $t_2,$at 2079 $ADDU $c_2,$t_2 2080 sltu $at,$c_2,$t_2 2081 $ADDU $c_3,$at 2082 mflo $t_1 2083 mfhi $t_2 2084 slt $at,$t_2,$zero 2085 $ADDU $c_3,$at 2086 $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3); 2087 $SLL $t_2,1 2088 slt $a2,$t_1,$zero 2089 $ADDU $t_2,$a2 2090 $SLL $t_1,1 2091 $ADDU $c_1,$t_1 2092 sltu $at,$c_1,$t_1 2093 $ADDU $t_2,$at 2094 $ADDU $c_2,$t_2 2095 sltu $at,$c_2,$t_2 2096 $ADDU $c_3,$at 2097 mflo $t_1 2098 mfhi $t_2 2099 slt $at,$t_2,$zero 2100 $ADDU $c_3,$at 2101 $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); 2102 $SLL $t_2,1 2103 slt $a2,$t_1,$zero 2104 $ADDU $t_2,$a2 2105 $SLL $t_1,1 2106 $ADDU $c_1,$t_1 2107 sltu $at,$c_1,$t_1 2108 $ADDU $t_2,$at 2109 $ADDU $c_2,$t_2 2110 sltu $at,$c_2,$t_2 2111 $ADDU $c_3,$at 2112 mflo $t_1 2113 mfhi $t_2 2114 $ADDU $c_1,$t_1 2115 sltu $at,$c_1,$t_1 2116 $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1); 2117 $ADDU $t_2,$at 2118 $ADDU $c_2,$t_2 2119 sltu $at,$c_2,$t_2 2120 $ADDU $c_3,$at 2121 $ST $c_1,6*$BNSZ($a0) 2122 2123 mflo $t_1 2124 mfhi $t_2 2125 slt $c_1,$t_2,$zero 2126 $SLL $t_2,1 2127 $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1); 2128 slt $a2,$t_1,$zero 2129 $ADDU $t_2,$a2 2130 $SLL $t_1,1 2131 $ADDU $c_2,$t_1 2132 sltu $at,$c_2,$t_1 2133 $ADDU $t_2,$at 2134 $ADDU $c_3,$t_2 2135 sltu $at,$c_3,$t_2 2136 $ADDU $c_1,$at 2137 mflo $t_1 2138 mfhi $t_2 2139 slt $at,$t_2,$zero 2140 $ADDU $c_1,$at 2141 $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1); 2142 $SLL $t_2,1 2143 slt $a2,$t_1,$zero 2144 $ADDU $t_2,$a2 2145 $SLL $t_1,1 2146 $ADDU $c_2,$t_1 2147 sltu $at,$c_2,$t_1 2148 $ADDU $t_2,$at 2149 $ADDU $c_3,$t_2 2150 sltu $at,$c_3,$t_2 2151 $ADDU $c_1,$at 2152 mflo $t_1 2153 mfhi $t_2 2154 slt $at,$t_2,$zero 2155 $ADDU $c_1,$at 2156 $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1); 2157 $SLL $t_2,1 2158 slt $a2,$t_1,$zero 2159 $ADDU $t_2,$a2 2160 $SLL $t_1,1 2161 $ADDU $c_2,$t_1 2162 sltu $at,$c_2,$t_1 
2163 $ADDU $t_2,$at 2164 $ADDU $c_3,$t_2 2165 sltu $at,$c_3,$t_2 2166 $ADDU $c_1,$at 2167 mflo $t_1 2168 mfhi $t_2 2169 slt $at,$t_2,$zero 2170 $ADDU $c_1,$at 2171 $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2); 2172 $SLL $t_2,1 2173 slt $a2,$t_1,$zero 2174 $ADDU $t_2,$a2 2175 $SLL $t_1,1 2176 $ADDU $c_2,$t_1 2177 sltu $at,$c_2,$t_1 2178 $ADDU $t_2,$at 2179 $ADDU $c_3,$t_2 2180 sltu $at,$c_3,$t_2 2181 $ADDU $c_1,$at 2182 $ST $c_2,7*$BNSZ($a0) 2183 2184 mflo $t_1 2185 mfhi $t_2 2186 slt $c_2,$t_2,$zero 2187 $SLL $t_2,1 2188 $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2); 2189 slt $a2,$t_1,$zero 2190 $ADDU $t_2,$a2 2191 $SLL $t_1,1 2192 $ADDU $c_3,$t_1 2193 sltu $at,$c_3,$t_1 2194 $ADDU $t_2,$at 2195 $ADDU $c_1,$t_2 2196 sltu $at,$c_1,$t_2 2197 $ADDU $c_2,$at 2198 mflo $t_1 2199 mfhi $t_2 2200 slt $at,$t_2,$zero 2201 $ADDU $c_2,$at 2202 $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2); 2203 $SLL $t_2,1 2204 slt $a2,$t_1,$zero 2205 $ADDU $t_2,$a2 2206 $SLL $t_1,1 2207 $ADDU $c_3,$t_1 2208 sltu $at,$c_3,$t_1 2209 $ADDU $t_2,$at 2210 $ADDU $c_1,$t_2 2211 sltu $at,$c_1,$t_2 2212 $ADDU $c_2,$at 2213 mflo $t_1 2214 mfhi $t_2 2215 slt $at,$t_2,$zero 2216 $ADDU $c_2,$at 2217 $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2); 2218 $SLL $t_2,1 2219 slt $a2,$t_1,$zero 2220 $ADDU $t_2,$a2 2221 $SLL $t_1,1 2222 $ADDU $c_3,$t_1 2223 sltu $at,$c_3,$t_1 2224 $ADDU $t_2,$at 2225 $ADDU $c_1,$t_2 2226 sltu $at,$c_1,$t_2 2227 $ADDU $c_2,$at 2228 mflo $t_1 2229 mfhi $t_2 2230 $ADDU $c_3,$t_1 2231 sltu $at,$c_3,$t_1 2232 $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3); 2233 $ADDU $t_2,$at 2234 $ADDU $c_1,$t_2 2235 sltu $at,$c_1,$t_2 2236 $ADDU $c_2,$at 2237 $ST $c_3,8*$BNSZ($a0) 2238 2239 mflo $t_1 2240 mfhi $t_2 2241 slt $c_3,$t_2,$zero 2242 $SLL $t_2,1 2243 $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3); 2244 slt $a2,$t_1,$zero 2245 $ADDU $t_2,$a2 2246 $SLL $t_1,1 2247 $ADDU $c_1,$t_1 2248 sltu $at,$c_1,$t_1 2249 $ADDU $t_2,$at 2250 $ADDU $c_2,$t_2 2251 sltu $at,$c_2,$t_2 
2252 $ADDU $c_3,$at 2253 mflo $t_1 2254 mfhi $t_2 2255 slt $at,$t_2,$zero 2256 $ADDU $c_3,$at 2257 $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3); 2258 $SLL $t_2,1 2259 slt $a2,$t_1,$zero 2260 $ADDU $t_2,$a2 2261 $SLL $t_1,1 2262 $ADDU $c_1,$t_1 2263 sltu $at,$c_1,$t_1 2264 $ADDU $t_2,$at 2265 $ADDU $c_2,$t_2 2266 sltu $at,$c_2,$t_2 2267 $ADDU $c_3,$at 2268 mflo $t_1 2269 mfhi $t_2 2270 slt $at,$t_2,$zero 2271 $ADDU $c_3,$at 2272 $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1); 2273 $SLL $t_2,1 2274 slt $a2,$t_1,$zero 2275 $ADDU $t_2,$a2 2276 $SLL $t_1,1 2277 $ADDU $c_1,$t_1 2278 sltu $at,$c_1,$t_1 2279 $ADDU $t_2,$at 2280 $ADDU $c_2,$t_2 2281 sltu $at,$c_2,$t_2 2282 $ADDU $c_3,$at 2283 $ST $c_1,9*$BNSZ($a0) 2284 2285 mflo $t_1 2286 mfhi $t_2 2287 slt $c_1,$t_2,$zero 2288 $SLL $t_2,1 2289 $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1); 2290 slt $a2,$t_1,$zero 2291 $ADDU $t_2,$a2 2292 $SLL $t_1,1 2293 $ADDU $c_2,$t_1 2294 sltu $at,$c_2,$t_1 2295 $ADDU $t_2,$at 2296 $ADDU $c_3,$t_2 2297 sltu $at,$c_3,$t_2 2298 $ADDU $c_1,$at 2299 mflo $t_1 2300 mfhi $t_2 2301 slt $at,$t_2,$zero 2302 $ADDU $c_1,$at 2303 $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1); 2304 $SLL $t_2,1 2305 slt $a2,$t_1,$zero 2306 $ADDU $t_2,$a2 2307 $SLL $t_1,1 2308 $ADDU $c_2,$t_1 2309 sltu $at,$c_2,$t_1 2310 $ADDU $t_2,$at 2311 $ADDU $c_3,$t_2 2312 sltu $at,$c_3,$t_2 2313 $ADDU $c_1,$at 2314 mflo $t_1 2315 mfhi $t_2 2316 $ADDU $c_2,$t_1 2317 sltu $at,$c_2,$t_1 2318 $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2); 2319 $ADDU $t_2,$at 2320 $ADDU $c_3,$t_2 2321 sltu $at,$c_3,$t_2 2322 $ADDU $c_1,$at 2323 $ST $c_2,10*$BNSZ($a0) 2324 2325 mflo $t_1 2326 mfhi $t_2 2327 slt $c_2,$t_2,$zero 2328 $SLL $t_2,1 2329 $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2); 2330 slt $a2,$t_1,$zero 2331 $ADDU $t_2,$a2 2332 $SLL $t_1,1 2333 $ADDU $c_3,$t_1 2334 sltu $at,$c_3,$t_1 2335 $ADDU $t_2,$at 2336 $ADDU $c_1,$t_2 2337 sltu $at,$c_1,$t_2 2338 $ADDU $c_2,$at 2339 mflo $t_1 2340 mfhi $t_2 2341 slt 
$at,$t_2,$zero 2342 $ADDU $c_2,$at 2343 $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3); 2344 $SLL $t_2,1 2345 slt $a2,$t_1,$zero 2346 $ADDU $t_2,$a2 2347 $SLL $t_1,1 2348 $ADDU $c_3,$t_1 2349 sltu $at,$c_3,$t_1 2350 $ADDU $t_2,$at 2351 $ADDU $c_1,$t_2 2352 sltu $at,$c_1,$t_2 2353 $ADDU $c_2,$at 2354 $ST $c_3,11*$BNSZ($a0) 2355 2356 mflo $t_1 2357 mfhi $t_2 2358 slt $c_3,$t_2,$zero 2359 $SLL $t_2,1 2360 $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3); 2361 slt $a2,$t_1,$zero 2362 $ADDU $t_2,$a2 2363 $SLL $t_1,1 2364 $ADDU $c_1,$t_1 2365 sltu $at,$c_1,$t_1 2366 $ADDU $t_2,$at 2367 $ADDU $c_2,$t_2 2368 sltu $at,$c_2,$t_2 2369 $ADDU $c_3,$at 2370 mflo $t_1 2371 mfhi $t_2 2372 $ADDU $c_1,$t_1 2373 sltu $at,$c_1,$t_1 2374 $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1); 2375 $ADDU $t_2,$at 2376 $ADDU $c_2,$t_2 2377 sltu $at,$c_2,$t_2 2378 $ADDU $c_3,$at 2379 $ST $c_1,12*$BNSZ($a0) 2380 2381 mflo $t_1 2382 mfhi $t_2 2383 slt $c_1,$t_2,$zero 2384 $SLL $t_2,1 2385 $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2); 2386 slt $a2,$t_1,$zero 2387 $ADDU $t_2,$a2 2388 $SLL $t_1,1 2389 $ADDU $c_2,$t_1 2390 sltu $at,$c_2,$t_1 2391 $ADDU $t_2,$at 2392 $ADDU $c_3,$t_2 2393 sltu $at,$c_3,$t_2 2394 $ADDU $c_1,$at 2395 $ST $c_2,13*$BNSZ($a0) 2396 2397 mflo $t_1 2398 mfhi $t_2 2399 $ADDU $c_3,$t_1 2400 sltu $at,$c_3,$t_1 2401 $ADDU $t_2,$at 2402 $ADDU $c_1,$t_2 2403 $ST $c_3,14*$BNSZ($a0) 2404 $ST $c_1,15*$BNSZ($a0) 2405 2406 .set noreorder 2407___ 2408$code.=<<___ if ($flavour =~ /nubi/i); 2409 $REG_L $t3,4*$SZREG($sp) 2410 $REG_L $t2,3*$SZREG($sp) 2411 $REG_L $t1,2*$SZREG($sp) 2412 $REG_L $t0,1*$SZREG($sp) 2413 $REG_L $gp,0*$SZREG($sp) 2414 $PTR_ADD $sp,6*$SZREG 2415___ 2416$code.=<<___; 2417 jr $ra 2418 nop 2419.end bn_sqr_comba8 2420 2421.align 5 2422.globl bn_sqr_comba4 2423.ent bn_sqr_comba4 2424bn_sqr_comba4: 2425___ 2426$code.=<<___ if ($flavour =~ /nubi/i); 2427 .frame $sp,6*$SZREG,$ra 2428 .mask 0x8000f008,-$SZREG 2429 .set noreorder 2430 $PTR_SUB $sp,6*$SZREG 2431 $REG_S 
$ra,5*$SZREG($sp) 2432 $REG_S $t3,4*$SZREG($sp) 2433 $REG_S $t2,3*$SZREG($sp) 2434 $REG_S $t1,2*$SZREG($sp) 2435 $REG_S $t0,1*$SZREG($sp) 2436 $REG_S $gp,0*$SZREG($sp) 2437___ 2438$code.=<<___; 2439 .set reorder 2440 $LD $a_0,0($a1) 2441 $LD $a_1,$BNSZ($a1) 2442 $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); 2443 $LD $a_2,2*$BNSZ($a1) 2444 $LD $a_3,3*$BNSZ($a1) 2445 mflo $c_1 2446 mfhi $c_2 2447 $ST $c_1,0($a0) 2448 2449 $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); 2450 mflo $t_1 2451 mfhi $t_2 2452 slt $c_1,$t_2,$zero 2453 $SLL $t_2,1 2454 $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); 2455 slt $a2,$t_1,$zero 2456 $ADDU $t_2,$a2 2457 $SLL $t_1,1 2458 $ADDU $c_2,$t_1 2459 sltu $at,$c_2,$t_1 2460 $ADDU $c_3,$t_2,$at 2461 $ST $c_2,$BNSZ($a0) 2462 2463 mflo $t_1 2464 mfhi $t_2 2465 slt $c_2,$t_2,$zero 2466 $SLL $t_2,1 2467 $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); 2468 slt $a2,$t_1,$zero 2469 $ADDU $t_2,$a2 2470 $SLL $t_1,1 2471 $ADDU $c_3,$t_1 2472 sltu $at,$c_3,$t_1 2473 $ADDU $t_2,$at 2474 $ADDU $c_1,$t_2 2475 sltu $at,$c_1,$t_2 2476 $ADDU $c_2,$at 2477 mflo $t_1 2478 mfhi $t_2 2479 $ADDU $c_3,$t_1 2480 sltu $at,$c_3,$t_1 2481 $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); 2482 $ADDU $t_2,$at 2483 $ADDU $c_1,$t_2 2484 sltu $at,$c_1,$t_2 2485 $ADDU $c_2,$at 2486 $ST $c_3,2*$BNSZ($a0) 2487 2488 mflo $t_1 2489 mfhi $t_2 2490 slt $c_3,$t_2,$zero 2491 $SLL $t_2,1 2492 $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3); 2493 slt $a2,$t_1,$zero 2494 $ADDU $t_2,$a2 2495 $SLL $t_1,1 2496 $ADDU $c_1,$t_1 2497 sltu $at,$c_1,$t_1 2498 $ADDU $t_2,$at 2499 $ADDU $c_2,$t_2 2500 sltu $at,$c_2,$t_2 2501 $ADDU $c_3,$at 2502 mflo $t_1 2503 mfhi $t_2 2504 slt $at,$t_2,$zero 2505 $ADDU $c_3,$at 2506 $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); 2507 $SLL $t_2,1 2508 slt $a2,$t_1,$zero 2509 $ADDU $t_2,$a2 2510 $SLL $t_1,1 2511 $ADDU $c_1,$t_1 2512 sltu $at,$c_1,$t_1 2513 $ADDU $t_2,$at 2514 $ADDU $c_2,$t_2 2515 sltu $at,$c_2,$t_2 2516 $ADDU $c_3,$at 2517 $ST 
$c_1,3*$BNSZ($a0) 2518 2519 mflo $t_1 2520 mfhi $t_2 2521 slt $c_1,$t_2,$zero 2522 $SLL $t_2,1 2523 $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); 2524 slt $a2,$t_1,$zero 2525 $ADDU $t_2,$a2 2526 $SLL $t_1,1 2527 $ADDU $c_2,$t_1 2528 sltu $at,$c_2,$t_1 2529 $ADDU $t_2,$at 2530 $ADDU $c_3,$t_2 2531 sltu $at,$c_3,$t_2 2532 $ADDU $c_1,$at 2533 mflo $t_1 2534 mfhi $t_2 2535 $ADDU $c_2,$t_1 2536 sltu $at,$c_2,$t_1 2537 $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); 2538 $ADDU $t_2,$at 2539 $ADDU $c_3,$t_2 2540 sltu $at,$c_3,$t_2 2541 $ADDU $c_1,$at 2542 $ST $c_2,4*$BNSZ($a0) 2543 2544 mflo $t_1 2545 mfhi $t_2 2546 slt $c_2,$t_2,$zero 2547 $SLL $t_2,1 2548 $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); 2549 slt $a2,$t_1,$zero 2550 $ADDU $t_2,$a2 2551 $SLL $t_1,1 2552 $ADDU $c_3,$t_1 2553 sltu $at,$c_3,$t_1 2554 $ADDU $t_2,$at 2555 $ADDU $c_1,$t_2 2556 sltu $at,$c_1,$t_2 2557 $ADDU $c_2,$at 2558 $ST $c_3,5*$BNSZ($a0) 2559 2560 mflo $t_1 2561 mfhi $t_2 2562 $ADDU $c_1,$t_1 2563 sltu $at,$c_1,$t_1 2564 $ADDU $t_2,$at 2565 $ADDU $c_2,$t_2 2566 $ST $c_1,6*$BNSZ($a0) 2567 $ST $c_2,7*$BNSZ($a0) 2568 2569 .set noreorder 2570___ 2571$code.=<<___ if ($flavour =~ /nubi/i); 2572 $REG_L $t3,4*$SZREG($sp) 2573 $REG_L $t2,3*$SZREG($sp) 2574 $REG_L $t1,2*$SZREG($sp) 2575 $REG_L $t0,1*$SZREG($sp) 2576 $REG_L $gp,0*$SZREG($sp) 2577 $PTR_ADD $sp,6*$SZREG 2578___ 2579$code.=<<___; 2580 jr $ra 2581 nop 2582.end bn_sqr_comba4 2583___ 2584print $code; 2585close STDOUT; 2586