.rdata
.asciiz	"mips3.s, Version 1.1"
.asciiz	"MIPS III/IV ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"

/*
 * ====================================================================
 * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the OpenSSL license. Warranty of any kind is
 * disclaimed.
 * ====================================================================
 */

/*
 * This is my modest contribution to the OpenSSL project (see
 * http://www.openssl.org/ for more information about it) and is
 * a drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c
 * module. For updates see http://fy.chalmers.se/~appro/hpe/.
 *
 * The module is designed to work with either of the "new" MIPS ABI(5),
 * namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
 * IRIX 5.x not only because it doesn't support new ABIs but also
 * because 5.x kernels put R4x00 CPU into 32-bit mode and all those
 * 64-bit instructions (daddu, dmultu, etc.) found below gonna only
 * cause illegal instruction exception:-(
 *
 * In addition the code depends on preprocessor flags set up by MIPSpro
 * compiler driver (either as or cc) and therefore (probably?) can't be
 * compiled by the GNU assembler. GNU C driver manages fine though...
 * I mean as long as -mmips-as is specified or is the default option,
 * because then it simply invokes /usr/bin/as which in turn takes
 * perfect care of the preprocessor definitions. Another neat feature
 * offered by the MIPSpro assembler is an optimization pass. This gave
 * me the opportunity to have the code looking more regular as all those
 * architecture dependent instruction rescheduling details were left to
 * the assembler. Cool, huh?
 *
 * Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
 * goes way over 3 times faster!
 *
 *					<appro@fy.chalmers.se>
 */
#include <asm.h>
#include <regdef.h>

/*
 * MOVNZ(cond,dst,src): dst = src when cond != 0, otherwise dst is left
 * untouched. MIPS IV has a native conditional move; on MIPS III the same
 * effect is obtained with a branch-likely whose delay slot (the move) is
 * nullified when the branch is not taken.
 */
#if _MIPS_ISA>=4
#define	MOVNZ(cond,dst,src) \
	movn	dst,src,cond
#else
#define	MOVNZ(cond,dst,src) \
	.set	noreorder; \
	bnezl	cond,.+8; \
	move	dst,src; \
	.set	reorder
#endif

.text

.set	noat
.set	reorder

#define	MINUS4	v1

/*
 * bn_mul_add_words
 *   In:   a0 = pointer to 64-bit words that are read and updated in place
 *         a1 = pointer to 64-bit source words
 *         a2 = number of words
 *         a3 = 64-bit multiplier
 *   Out:  v0 = carry word out of the last position (0 when a2 <= 0)
 *   Does: a0[i] = low64(a0[i] + a1[i]*a3 + carry) for each word.
 *   Uses: t0-t3, ta0-ta3, v1 (MINUS4), AT, HI/LO.
 * The main loop handles four words per iteration; the tail handles the
 * remaining one to three.
 */
.align	5
LEAF(bn_mul_add_words)
	.set	noreorder
	bgtzl	a2,.L_bn_mul_add_words_proceed
	ld	t0,0(a1)		/* delay slot: preload a1[0], only if taken */
	jr	ra
	move	v0,zero
	.set	reorder

.L_bn_mul_add_words_proceed:
	li	MINUS4,-4
	and	ta0,a2,MINUS4		/* ta0 = count rounded down to a multiple of 4 */
	move	v0,zero
	beqz	ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	dmultu	t0,a3
	ld	t1,0(a0)
	ld	t2,8(a1)
	ld	t3,8(a0)
	ld	ta0,16(a1)
	ld	ta1,16(a0)
	daddu	t1,v0
	sltu	v0,t1,v0	/* All manuals say it "compares 32-bit
				 * values", but it seems to work fine
				 * even on 64-bit registers. */
	mflo	AT
	mfhi	t0
	daddu	t1,AT
	daddu	v0,t0
	sltu	AT,t1,AT
	sd	t1,0(a0)
	daddu	v0,AT

	dmultu	t2,a3
	ld	ta2,24(a1)
	ld	ta3,24(a0)
	daddu	t3,v0
	sltu	v0,t3,v0
	mflo	AT
	mfhi	t2
	daddu	t3,AT
	daddu	v0,t2
	sltu	AT,t3,AT
	sd	t3,8(a0)
	daddu	v0,AT

	dmultu	ta0,a3
	subu	a2,4
	PTR_ADD	a0,32
	PTR_ADD	a1,32
	daddu	ta1,v0
	sltu	v0,ta1,v0
	mflo	AT
	mfhi	ta0
	daddu	ta1,AT
	daddu	v0,ta0
	sltu	AT,ta1,AT
	sd	ta1,-16(a0)
	daddu	v0,AT


	dmultu	ta2,a3
	and	ta0,a2,MINUS4
	daddu	ta3,v0
	sltu	v0,ta3,v0
	mflo	AT
	mfhi	ta2
	daddu	ta3,AT
	daddu	v0,ta2
	sltu	AT,ta3,AT
	sd	ta3,-8(a0)
	daddu	v0,AT
	.set	noreorder
	bgtzl	ta0,.L_bn_mul_add_words_loop
	ld	t0,0(a1)

	bnezl	a2,.L_bn_mul_add_words_tail
	ld	t0,0(a1)
	.set	reorder

.L_bn_mul_add_words_return:
	jr	ra

.L_bn_mul_add_words_tail:
	dmultu	t0,a3
	ld	t1,0(a0)
	subu	a2,1
	daddu	t1,v0
	sltu	v0,t1,v0
	mflo	AT
	mfhi	t0
	daddu	t1,AT
	daddu	v0,t0
	sltu	AT,t1,AT
	sd	t1,0(a0)
	daddu	v0,AT
	beqz	a2,.L_bn_mul_add_words_return

	ld	t0,8(a1)
	dmultu	t0,a3
	ld	t1,8(a0)
	subu	a2,1
	daddu	t1,v0
	sltu	v0,t1,v0
	mflo	AT
	mfhi	t0
	daddu	t1,AT
	daddu	v0,t0
	sltu	AT,t1,AT
	sd	t1,8(a0)
	daddu	v0,AT
	beqz	a2,.L_bn_mul_add_words_return

	ld	t0,16(a1)
	dmultu	t0,a3
	ld	t1,16(a0)
	daddu	t1,v0
	sltu	v0,t1,v0
	mflo	AT
	mfhi	t0
	daddu	t1,AT
	daddu	v0,t0
	sltu	AT,t1,AT
	sd	t1,16(a0)
	daddu	v0,AT
	jr	ra
END(bn_mul_add_words)

/*
 * bn_mul_words
 *   In:   a0 = pointer to 64-bit destination words
 *         a1 = pointer to 64-bit source words
 *         a2 = number of words
 *         a3 = 64-bit multiplier
 *   Out:  v0 = carry word out of the last position (0 when a2 <= 0)
 *   Does: a0[i] = low64(a1[i]*a3 + carry) for each word.
 *   Uses: t0-t3, ta0-ta3, v1 (MINUS4), AT, HI/LO.
 */
.align	5
LEAF(bn_mul_words)
	.set	noreorder
	bgtzl	a2,.L_bn_mul_words_proceed
	ld	t0,0(a1)
	jr	ra
	move	v0,zero
	.set	reorder

.L_bn_mul_words_proceed:
	li	MINUS4,-4
	and	ta0,a2,MINUS4
	move	v0,zero
	beqz	ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	dmultu	t0,a3
	ld	t2,8(a1)
	ld	ta0,16(a1)
	ld	ta2,24(a1)
	mflo	AT
	mfhi	t0
	daddu	v0,AT
	sltu	t1,v0,AT
	sd	v0,0(a0)
	daddu	v0,t1,t0

	dmultu	t2,a3
	subu	a2,4
	PTR_ADD	a0,32
	PTR_ADD	a1,32
	mflo	AT
	mfhi	t2
	daddu	v0,AT
	sltu	t3,v0,AT
	sd	v0,-24(a0)
	daddu	v0,t3,t2

	dmultu	ta0,a3
	mflo	AT
	mfhi	ta0
	daddu	v0,AT
	sltu	ta1,v0,AT
	sd	v0,-16(a0)
	daddu	v0,ta1,ta0


	dmultu	ta2,a3
	and	ta0,a2,MINUS4
	mflo	AT
	mfhi	ta2
	daddu	v0,AT
	sltu	ta3,v0,AT
	sd	v0,-8(a0)
	daddu	v0,ta3,ta2
	.set	noreorder
	bgtzl	ta0,.L_bn_mul_words_loop
	ld	t0,0(a1)

	bnezl	a2,.L_bn_mul_words_tail
	ld	t0,0(a1)
	.set	reorder

.L_bn_mul_words_return:
	jr	ra

.L_bn_mul_words_tail:
	dmultu	t0,a3
	subu	a2,1
	mflo	AT
	mfhi	t0
	daddu	v0,AT
	sltu	t1,v0,AT
	sd	v0,0(a0)
	daddu	v0,t1,t0
	beqz	a2,.L_bn_mul_words_return

	ld	t0,8(a1)
	dmultu	t0,a3
	subu	a2,1
	mflo	AT
	mfhi	t0
	daddu	v0,AT
	sltu	t1,v0,AT
	sd	v0,8(a0)
	daddu	v0,t1,t0
	beqz	a2,.L_bn_mul_words_return

	ld	t0,16(a1)
	dmultu	t0,a3
	mflo	AT
	mfhi	t0
	daddu	v0,AT
	sltu	t1,v0,AT
	sd	v0,16(a0)
	daddu	v0,t1,t0
	jr	ra
END(bn_mul_words)

/*
 * bn_sqr_words
 *   In:   a0 = pointer to 64-bit destination words (two per source word)
 *         a1 = pointer to 64-bit source words
 *         a2 = number of source words
 *   Out:  v0 = 0
 *   Does: a0[2*i] = low64(a1[i]^2), a0[2*i+1] = high64(a1[i]^2).
 *   Uses: t0-t3, ta0-ta3, v1 (MINUS4), HI/LO.
 */
.align	5
LEAF(bn_sqr_words)
	.set	noreorder
	bgtzl	a2,.L_bn_sqr_words_proceed
	ld	t0,0(a1)
	jr	ra
	move	v0,zero
	.set	reorder

.L_bn_sqr_words_proceed:
	li	MINUS4,-4
	and	ta0,a2,MINUS4
	move	v0,zero
	beqz	ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	dmultu	t0,t0
	ld	t2,8(a1)
	ld	ta0,16(a1)
	ld	ta2,24(a1)
	mflo	t1
	mfhi	t0
	sd	t1,0(a0)
	sd	t0,8(a0)

	dmultu	t2,t2
	subu	a2,4
	PTR_ADD	a0,64
	PTR_ADD	a1,32
	mflo	t3
	mfhi	t2
	sd	t3,-48(a0)
	sd	t2,-40(a0)

	dmultu	ta0,ta0
	mflo	ta1
	mfhi	ta0
	sd	ta1,-32(a0)
	sd	ta0,-24(a0)


	dmultu	ta2,ta2
	and	ta0,a2,MINUS4
	mflo	ta3
	mfhi	ta2
	sd	ta3,-16(a0)
	sd	ta2,-8(a0)

	.set	noreorder
	bgtzl	ta0,.L_bn_sqr_words_loop
	ld	t0,0(a1)

	bnezl	a2,.L_bn_sqr_words_tail
	ld	t0,0(a1)
	.set	reorder

.L_bn_sqr_words_return:
	move	v0,zero
	jr	ra

.L_bn_sqr_words_tail:
	dmultu	t0,t0
	subu	a2,1
	mflo	t1
	mfhi	t0
	sd	t1,0(a0)
	sd	t0,8(a0)
	beqz	a2,.L_bn_sqr_words_return

	ld	t0,8(a1)
	dmultu	t0,t0
	subu	a2,1
	mflo	t1
	mfhi	t0
	sd	t1,16(a0)
	sd	t0,24(a0)
	beqz	a2,.L_bn_sqr_words_return

	ld	t0,16(a1)
	dmultu	t0,t0
	mflo	t1
	mfhi	t0
	sd	t1,32(a0)
	sd	t0,40(a0)
	jr	ra
END(bn_sqr_words)

/*
 * bn_add_words
 *   In:   a0 = pointer to 64-bit destination words
 *         a1, a2 = pointers to the two 64-bit source operands
 *         a3 = number of words
 *   Out:  v0 = carry out of the last position (0 when a3 <= 0)
 *   Does: a0[i] = a1[i] + a2[i] + carry for each word.
 *   Uses: t0-t3, ta0-ta3, t8, t9, v1 (MINUS4), AT.
 */
.align	5
LEAF(bn_add_words)
	.set	noreorder
	bgtzl	a3,.L_bn_add_words_proceed
	ld	t0,0(a1)
	jr	ra
	move	v0,zero
	.set	reorder

.L_bn_add_words_proceed:
	li	MINUS4,-4
	and	AT,a3,MINUS4
	move	v0,zero
	beqz	AT,.L_bn_add_words_tail

.L_bn_add_words_loop:
	ld	ta0,0(a2)
	subu	a3,4
	ld	t1,8(a1)
	and	AT,a3,MINUS4
	ld	t2,16(a1)
	PTR_ADD	a2,32
	ld	t3,24(a1)
	PTR_ADD	a0,32
	ld	ta1,-24(a2)
	PTR_ADD	a1,32
	ld	ta2,-16(a2)
	ld	ta3,-8(a2)
	daddu	ta0,t0
	sltu	t8,ta0,t0		/* t8 = carry out of a1[i]+a2[i] */
	daddu	t0,ta0,v0
	sltu	v0,t0,ta0		/* v0 = carry out of adding the carry-in */
	sd	t0,-32(a0)
	daddu	v0,t8

	daddu	ta1,t1
	sltu	t9,ta1,t1
	daddu	t1,ta1,v0
	sltu	v0,t1,ta1
	sd	t1,-24(a0)
	daddu	v0,t9

	daddu	ta2,t2
	sltu	t8,ta2,t2
	daddu	t2,ta2,v0
	sltu	v0,t2,ta2
	sd	t2,-16(a0)
	daddu	v0,t8

	daddu	ta3,t3
	sltu	t9,ta3,t3
	daddu	t3,ta3,v0
	sltu	v0,t3,ta3
	sd	t3,-8(a0)
	daddu	v0,t9

	.set	noreorder
	bgtzl	AT,.L_bn_add_words_loop
	ld	t0,0(a1)

	bnezl	a3,.L_bn_add_words_tail
	ld	t0,0(a1)
	.set	reorder

.L_bn_add_words_return:
	jr	ra

.L_bn_add_words_tail:
	ld	ta0,0(a2)
	daddu	ta0,t0
	subu	a3,1
	sltu	t8,ta0,t0
	daddu	t0,ta0,v0
	sltu	v0,t0,ta0
	sd	t0,0(a0)
	daddu	v0,t8
	beqz	a3,.L_bn_add_words_return

	ld	t1,8(a1)
	ld	ta1,8(a2)
	daddu	ta1,t1
	subu	a3,1
	sltu	t9,ta1,t1
	daddu	t1,ta1,v0
	sltu	v0,t1,ta1
	sd	t1,8(a0)
	daddu	v0,t9
	beqz	a3,.L_bn_add_words_return

	ld	t2,16(a1)
	ld	ta2,16(a2)
	daddu	ta2,t2
	sltu	t8,ta2,t2
	daddu	t2,ta2,v0
	sltu	v0,t2,ta2
	sd	t2,16(a0)
	daddu	v0,t8
	jr	ra
END(bn_add_words)

/*
 * bn_sub_words
 *   In:   a0 = pointer to 64-bit destination words
 *         a1 = pointer to the minuend words, a2 = pointer to the
 *              subtrahend words
 *         a3 = number of words
 *   Out:  v0 = borrow out of the last position (0 when a3 <= 0)
 *   Does: a0[i] = a1[i] - a2[i] - borrow for each word.
 *   Uses: t0-t3, ta0-ta3, t8, t9, v1 (MINUS4), AT.
 * Borrow update: when a1[i]-a2[i] is non-zero the new borrow is
 * (a1[i] < a2[i]); when the difference is zero the incoming borrow is
 * passed through unchanged, hence the MOVNZ keyed on the difference.
 */
.align	5
LEAF(bn_sub_words)
	.set	noreorder
	bgtzl	a3,.L_bn_sub_words_proceed
	ld	t0,0(a1)
	jr	ra
	move	v0,zero
	.set	reorder

.L_bn_sub_words_proceed:
	li	MINUS4,-4
	and	AT,a3,MINUS4
	move	v0,zero
	beqz	AT,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	ld	ta0,0(a2)
	subu	a3,4
	ld	t1,8(a1)
	and	AT,a3,MINUS4
	ld	t2,16(a1)
	PTR_ADD	a2,32
	ld	t3,24(a1)
	PTR_ADD	a0,32
	ld	ta1,-24(a2)
	PTR_ADD	a1,32
	ld	ta2,-16(a2)
	ld	ta3,-8(a2)
	sltu	t8,t0,ta0
	dsubu	t0,ta0
	dsubu	ta0,t0,v0
	sd	ta0,-32(a0)
	MOVNZ	(t0,v0,t8)

	sltu	t9,t1,ta1
	dsubu	t1,ta1
	dsubu	ta1,t1,v0
	sd	ta1,-24(a0)
	MOVNZ	(t1,v0,t9)


	sltu	t8,t2,ta2
	dsubu	t2,ta2
	dsubu	ta2,t2,v0
	sd	ta2,-16(a0)
	MOVNZ	(t2,v0,t8)

	sltu	t9,t3,ta3
	dsubu	t3,ta3
	dsubu	ta3,t3,v0
	sd	ta3,-8(a0)
	MOVNZ	(t3,v0,t9)

	.set	noreorder
	bgtzl	AT,.L_bn_sub_words_loop
	ld	t0,0(a1)

	bnezl	a3,.L_bn_sub_words_tail
	ld	t0,0(a1)
	.set	reorder

.L_bn_sub_words_return:
	jr	ra

.L_bn_sub_words_tail:
	ld	ta0,0(a2)
	subu	a3,1
	sltu	t8,t0,ta0
	dsubu	t0,ta0
	dsubu	ta0,t0,v0
	MOVNZ	(t0,v0,t8)
	sd	ta0,0(a0)
	beqz	a3,.L_bn_sub_words_return

	ld	t1,8(a1)
	subu	a3,1
	ld	ta1,8(a2)
	sltu	t9,t1,ta1
	dsubu	t1,ta1
	dsubu	ta1,t1,v0
	MOVNZ	(t1,v0,t9)
	sd	ta1,8(a0)
	beqz	a3,.L_bn_sub_words_return

	ld	t2,16(a1)
	ld	ta2,16(a2)
	sltu	t8,t2,ta2
	dsubu	t2,ta2
	dsubu	ta2,t2,v0
	MOVNZ	(t2,v0,t8)
	sd	ta2,16(a0)
	jr	ra
END(bn_sub_words)

#undef	MINUS4

/*
 * bn_div_3_words
 *   In:   a0 = pointer to a 64-bit word; the words at 0, -8 and -16
 *              bytes from it are read
 *         a1 = 64-bit word kept in ta2 and multiplied by the quotient
 *         a2 = 64-bit divisor word passed on to bn_div_words
 *   Out:  v0 = -1 when the word at 0(a0) equals a2; otherwise the
 *              quotient returned by bn_div_words for (0(a0), -8(a0), a2),
 *              decremented by the correction loop below.
 * Relies on bn_div_words leaving a3, ta2 and ta3 untouched, preserving
 * a2 and returning the remainder in v1.
 */
.align	5
LEAF(bn_div_3_words)
	.set	reorder
	move	a3,a0		/* we know that bn_div_words doesn't
				 * touch a3, ta2, ta3 and preserves a2
				 * so that we can save two arguments
				 * and return address in registers
				 * instead of stack:-)
				 */
	ld	a0,(a3)
	move	ta2,a1
	ld	a1,-8(a3)
	bne	a0,a2,.L_bn_div_3_words_proceed
	li	v0,-1
	jr	ra
.L_bn_div_3_words_proceed:
	move	ta3,ra
	bal	bn_div_words
	move	ra,ta3
	dmultu	ta2,v0
	ld	t2,-16(a3)
	move	ta0,zero
	mfhi	t1
	mflo	t0
	sltu	t8,t1,v1
.L_bn_div_3_words_inner_loop:
	bnez	t8,.L_bn_div_3_words_inner_loop_done
	sgeu	AT,t2,t0
	seq	t9,t1,v1
	and	AT,t9
	sltu	t3,t0,ta2
	daddu	v1,a2
	dsubu	t1,t3
	dsubu	t0,ta2
	sltu	t8,t1,v1
	sltu	ta0,v1,a2
	or	t8,ta0
	.set	noreorder
	beqzl	AT,.L_bn_div_3_words_inner_loop
	dsubu	v0,1		/* delay slot: q--, only when looping again */
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	jr	ra
END(bn_div_3_words)

/*
 * bn_div_words
 *   In:   a0 = high 64-bit word of the dividend
 *         a1 = low 64-bit word of the dividend
 *         a2 = 64-bit divisor
 *   Out:  v0 = quotient, or -1 when a2 is zero
 *         v1 = remainder
 *   Uses: t0-t3, t8, t9, ta0 (QT), ta1 (HH), AT, HI/LO.
 *         a2 is shifted for normalization and shifted back before return.
 * The divisor is shifted left until its top bit is set (t9 = shift
 * count), the dividend is shifted by the same amount, and the quotient
 * is produced as two 32-bit halves. 'break 6' is raised when the bits
 * shifted out of a0 are non-zero.
 */
.align	5
LEAF(bn_div_words)
	.set	noreorder
	bnezl	a2,.L_bn_div_words_proceed
	move	v1,zero
	jr	ra
	li	v0,-1		/* I'd rather signal div-by-zero
				 * which can be done with 'break 7' */

.L_bn_div_words_proceed:
	bltz	a2,.L_bn_div_words_body
	move	t9,v1
	dsll	a2,1
	bgtz	a2,.-4
	addu	t9,1

	.set	reorder
	negu	t1,t9
	li	t2,-1
	dsll	t2,t1
	and	t2,a0
	dsrl	AT,a1,t1
	.set	noreorder
	bnezl	t2,.+8
	break	6		/* signal overflow */
	.set	reorder
	dsll	a0,t9
	dsll	a1,t9
	or	a0,AT

#define	QT	ta0
#define	HH	ta1
#define	DH	v1
.L_bn_div_words_body:
	dsrl	DH,a2,32
	sgeu	AT,a0,a2
	.set	noreorder
	bnezl	AT,.+8
	dsubu	a0,a2
	.set	reorder

	li	QT,-1
	dsrl	HH,a0,32
	dsrl	QT,32	/* q=0xffffffff */
	beq	DH,HH,.L_bn_div_words_skip_div1
	ddivu	zero,a0,DH
	mflo	QT
.L_bn_div_words_skip_div1:
	dmultu	a2,QT
	dsll	t3,a0,32
	dsrl	AT,a1,32
	or	t3,AT
	mflo	t0
	mfhi	t1
.L_bn_div_words_inner_loop1:
	sltu	t2,t3,t0
	seq	t8,HH,t1
	sltu	AT,HH,t1
	and	t2,t8
	sltu	v0,t0,a2
	or	AT,t2
	.set	noreorder
	beqz	AT,.L_bn_div_words_inner_loop1_done
	dsubu	t1,v0
	dsubu	t0,a2
	b	.L_bn_div_words_inner_loop1
	dsubu	QT,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	dsll	a1,32
	dsubu	a0,t3,t0
	dsll	v0,QT,32

	li	QT,-1
	dsrl	HH,a0,32
	dsrl	QT,32	/* q=0xffffffff */
	beq	DH,HH,.L_bn_div_words_skip_div2
	ddivu	zero,a0,DH
	mflo	QT
.L_bn_div_words_skip_div2:
#undef	DH
	dmultu	a2,QT
	dsll	t3,a0,32
	dsrl	AT,a1,32
	or	t3,AT
	mflo	t0
	mfhi	t1
.L_bn_div_words_inner_loop2:
	sltu	t2,t3,t0
	seq	t8,HH,t1
	sltu	AT,HH,t1
	and	t2,t8
	sltu	v1,t0,a2
	or	AT,t2
	.set	noreorder
	beqz	AT,.L_bn_div_words_inner_loop2_done
	dsubu	t1,v1
	dsubu	t0,a2
	b	.L_bn_div_words_inner_loop2
	dsubu	QT,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:
#undef	HH

	dsubu	a0,t3,t0
	or	v0,QT
	dsrl	v1,a0,t9	/* v1 contains remainder if anybody wants it */
	dsrl	a2,t9		/* restore a2 */
	jr	ra
#undef	QT
END(bn_div_words)

#define	a_0	t0
#define	a_1	t1
#define	a_2	t2
#define	a_3	t3
#define	b_0	ta0
#define	b_1	ta1
#define	b_2	ta2
#define	b_3	ta3

#define	a_4	s0
#define	a_5	s2
#define	a_6	s4
#define	a_7	a1	/* once we load a[7] we don't need a anymore */
#define	b_4	s1
#define	b_5	s3
#define	b_6	s5
#define	b_7	a2	/* once we load b[7] we don't need b anymore */

#define	t_1	t8
#define	t_2	t9

#define	c_1	v0
#define	c_2	v1
#define	c_3	a3

#define	FRAME_SIZE	48

/*
 * bn_mul_comba8
 *   In:   a0 = pointer to 16 64-bit result words r[0..15]
 *         a1 = pointer to 8 64-bit words a[0..7]
 *         a2 = pointer to 8 64-bit words b[0..7]
 *   Does: r = a * b, computed column by column with a three-word
 *         accumulator that rotates through (c_1,c_2,c_3).
 *   Stack: FRAME_SIZE (48) bytes, used to save and restore s0-s5.
 *   Uses: t0-t3, ta0-ta3, t8, t9, v0, v1, a1-a3, s0-s5, AT, HI/LO.
 * Within a column the first product may skip the carry into the third
 * accumulator word, and the second sets that word rather than adding
 * to it; this is why the leading mul_add_c blocks are shorter than the
 * rest.
 */
.align	5
LEAF(bn_mul_comba8)
	.set	noreorder
	PTR_SUB	sp,FRAME_SIZE
	.frame	sp,FRAME_SIZE,ra	/* must match the PTR_SUB above */
	.set	reorder
	ld	a_0,0(a1)	/* If compiled with -mips3 option on
				 * R5000 box assembler barks on this
				 * line with "shouldn't have mult/div
				 * as last instruction in bb (R10K
				 * bug)" warning. If anybody out there
				 * has a clue about how to circumvent
				 * this do send me a note.
				 *		<appro@fy.chalmers.se>
				 */
	ld	b_0,0(a2)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)
	ld	b_1,8(a2)
	ld	b_2,16(a2)
	ld	b_3,24(a2)
	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
	sd	s0,0(sp)
	sd	s1,8(sp)
	sd	s2,16(sp)
	sd	s3,24(sp)
	sd	s4,32(sp)
	sd	s5,40(sp)
	mflo	c_1
	mfhi	c_2

	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
	ld	a_4,32(a1)
	ld	a_5,40(a1)
	ld	a_6,48(a1)
	ld	a_7,56(a1)
	ld	b_4,32(a2)
	ld	b_5,40(a2)
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
	ld	b_6,48(a2)
	ld	b_7,56(a2)
	sd	c_1,0(a0)	/* r[0]=c1; */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	sd	c_2,8(a0)	/* r[1]=c2; */

	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	dmultu	a_1,b_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_0,b_2		/* mul_add_c(a[0],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)	/* r[2]=c3; */

	dmultu	a_0,b_3		/* mul_add_c(a[0],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	c_3,c_2,t_2
	dmultu	a_1,b_2		/* mul_add_c(a[1],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_2,b_1		/* mul_add_c(a[2],b[1],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,b_0		/* mul_add_c(a[3],b[0],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)	/* r[3]=c1; */

	dmultu	a_4,b_0		/* mul_add_c(a[4],b[0],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	dmultu	a_3,b_1		/* mul_add_c(a[3],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,b_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_1,b_3		/* mul_add_c(a[1],b[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_0,b_4		/* mul_add_c(a[0],b[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)	/* r[4]=c2; */

	dmultu	a_0,b_5		/* mul_add_c(a[0],b[5],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_1,b_4		/* mul_add_c(a[1],b[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_2,b_3		/* mul_add_c(a[2],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_3,b_2		/* mul_add_c(a[3],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_4,b_1		/* mul_add_c(a[4],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,b_0		/* mul_add_c(a[5],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)	/* r[5]=c3; */

	dmultu	a_6,b_0		/* mul_add_c(a[6],b[0],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	c_3,c_2,t_2
	dmultu	a_5,b_1		/* mul_add_c(a[5],b[1],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,b_2		/* mul_add_c(a[4],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,b_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_2,b_4		/* mul_add_c(a[2],b[4],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,b_5		/* mul_add_c(a[1],b[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_0,b_6		/* mul_add_c(a[0],b[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,48(a0)	/* r[6]=c1; */

	dmultu	a_0,b_7		/* mul_add_c(a[0],b[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	dmultu	a_1,b_6		/* mul_add_c(a[1],b[6],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,b_5		/* mul_add_c(a[2],b[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,b_4		/* mul_add_c(a[3],b[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_4,b_3		/* mul_add_c(a[4],b[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_5,b_2		/* mul_add_c(a[5],b[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_6,b_1		/* mul_add_c(a[6],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_7,b_0		/* mul_add_c(a[7],b[0],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,56(a0)	/* r[7]=c2; */

	dmultu	a_7,b_1		/* mul_add_c(a[7],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_6,b_2		/* mul_add_c(a[6],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_5,b_3		/* mul_add_c(a[5],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_4,b_4		/* mul_add_c(a[4],b[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_3,b_5		/* mul_add_c(a[3],b[5],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_2,b_6		/* mul_add_c(a[2],b[6],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,b_7		/* mul_add_c(a[1],b[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,64(a0)	/* r[8]=c3; */

	dmultu	a_2,b_7		/* mul_add_c(a[2],b[7],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	c_3,c_2,t_2
	dmultu	a_3,b_6		/* mul_add_c(a[3],b[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_4,b_5		/* mul_add_c(a[4],b[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_5,b_4		/* mul_add_c(a[5],b[4],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_6,b_3		/* mul_add_c(a[6],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_7,b_2		/* mul_add_c(a[7],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,72(a0)	/* r[9]=c1; */

	dmultu	a_7,b_3		/* mul_add_c(a[7],b[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	dmultu	a_6,b_4		/* mul_add_c(a[6],b[4],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_5,b_5		/* mul_add_c(a[5],b[5],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_4,b_6		/* mul_add_c(a[4],b[6],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_3,b_7		/* mul_add_c(a[3],b[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,80(a0)	/* r[10]=c2; */

	dmultu	a_4,b_7		/* mul_add_c(a[4],b[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_5,b_6		/* mul_add_c(a[5],b[6],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_6,b_5		/* mul_add_c(a[6],b[5],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_7,b_4		/* mul_add_c(a[7],b[4],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,88(a0)	/* r[11]=c3; */

	dmultu	a_7,b_5		/* mul_add_c(a[7],b[5],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	c_3,c_2,t_2
	dmultu	a_6,b_6		/* mul_add_c(a[6],b[6],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_5,b_7		/* mul_add_c(a[5],b[7],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,96(a0)	/* r[12]=c1; */

	dmultu	a_6,b_7		/* mul_add_c(a[6],b[7],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	dmultu	a_7,b_6		/* mul_add_c(a[7],b[6],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,104(a0)	/* r[13]=c2; */

	dmultu	a_7,b_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
	ld	s0,0(sp)
	ld	s1,8(sp)
	ld	s2,16(sp)
	ld	s3,24(sp)
	ld	s4,32(sp)
	ld	s5,40(sp)
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sd	c_3,112(a0)	/* r[14]=c3; */
	sd	c_1,120(a0)	/* r[15]=c1; */

	PTR_ADD	sp,FRAME_SIZE

	jr	ra
END(bn_mul_comba8)

/*
 * bn_mul_comba4
 *   In:   a0 = pointer to 8 64-bit result words r[0..7]
 *         a1 = pointer to 4 64-bit words a[0..3]
 *         a2 = pointer to 4 64-bit words b[0..3]
 *   Does: r = a * b, column by column, same accumulator scheme as
 *         bn_mul_comba8. No stack frame; only caller-saved registers
 *         are used.
 *   Uses: t0-t3, ta0-ta3, t8, t9, v0, v1, a3, AT, HI/LO.
 */
.align	5
LEAF(bn_mul_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	b_0,0(a2)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	dmultu	a_0,b_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
	ld	a_3,24(a1)
	ld	b_1,8(a2)
	ld	b_2,16(a2)
	ld	b_3,24(a2)
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)

	dmultu	a_0,b_1		/* mul_add_c(a[0],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT
	dmultu	a_1,b_0		/* mul_add_c(a[1],b[0],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	sd	c_2,8(a0)

	dmultu	a_2,b_0		/* mul_add_c(a[2],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	dmultu	a_1,b_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_0,b_2		/* mul_add_c(a[0],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)

	dmultu	a_0,b_3		/* mul_add_c(a[0],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	c_3,c_2,t_2
	dmultu	a_1,b_2		/* mul_add_c(a[1],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_2,b_1		/* mul_add_c(a[2],b[1],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_3,b_0		/* mul_add_c(a[3],b[0],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)

	dmultu	a_3,b_1		/* mul_add_c(a[3],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	c_1,c_3,t_2
	dmultu	a_2,b_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_1,b_3		/* mul_add_c(a[1],b[3],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)

	dmultu	a_2,b_3		/* mul_add_c(a[2],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	c_2,c_1,t_2
	dmultu	a_3,b_2		/* mul_add_c(a[3],b[2],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)

	dmultu	a_3,b_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sd	c_1,48(a0)
	sd	c_2,56(a0)

	jr	ra
END(bn_mul_comba4)

#undef	a_4
#undef	a_5
#undef	a_6
#undef	a_7
#define	a_4	b_0
#define	a_5	b_1
#define	a_6	b_2
#define	a_7	b_3

.align	5
LEAF(bn_sqr_comba8)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
1559 ld a_2,16(a1) 1560 ld a_3,24(a1) 1561 1562 dmultu a_0,a_0 /* mul_add_c(a[0],b[0],c1,c2,c3); */ 1563 ld a_4,32(a1) 1564 ld a_5,40(a1) 1565 ld a_6,48(a1) 1566 ld a_7,56(a1) 1567 mflo c_1 1568 mfhi c_2 1569 sd c_1,0(a0) 1570 1571 dmultu a_0,a_1 /* mul_add_c2(a[0],b[1],c2,c3,c1); */ 1572 mflo t_1 1573 mfhi t_2 1574 slt c_1,t_2,zero 1575 dsll t_2,1 1576 slt a2,t_1,zero 1577 daddu t_2,a2 1578 dsll t_1,1 1579 daddu c_2,t_1 1580 sltu AT,c_2,t_1 1581 daddu c_3,t_2,AT 1582 sd c_2,8(a0) 1583 1584 dmultu a_2,a_0 /* mul_add_c2(a[2],b[0],c3,c1,c2); */ 1585 mflo t_1 1586 mfhi t_2 1587 slt c_2,t_2,zero 1588 dsll t_2,1 1589 slt a2,t_1,zero 1590 daddu t_2,a2 1591 dsll t_1,1 1592 daddu c_3,t_1 1593 sltu AT,c_3,t_1 1594 daddu t_2,AT 1595 daddu c_1,t_2 1596 sltu AT,c_1,t_2 1597 daddu c_2,AT 1598 dmultu a_1,a_1 /* mul_add_c(a[1],b[1],c3,c1,c2); */ 1599 mflo t_1 1600 mfhi t_2 1601 daddu c_3,t_1 1602 sltu AT,c_3,t_1 1603 daddu t_2,AT 1604 daddu c_1,t_2 1605 sltu AT,c_1,t_2 1606 daddu c_2,AT 1607 sd c_3,16(a0) 1608 1609 dmultu a_0,a_3 /* mul_add_c2(a[0],b[3],c1,c2,c3); */ 1610 mflo t_1 1611 mfhi t_2 1612 slt c_3,t_2,zero 1613 dsll t_2,1 1614 slt a2,t_1,zero 1615 daddu t_2,a2 1616 dsll t_1,1 1617 daddu c_1,t_1 1618 sltu AT,c_1,t_1 1619 daddu t_2,AT 1620 daddu c_2,t_2 1621 sltu AT,c_2,t_2 1622 daddu c_3,AT 1623 dmultu a_1,a_2 /* mul_add_c2(a[1],b[2],c1,c2,c3); */ 1624 mflo t_1 1625 mfhi t_2 1626 slt AT,t_2,zero 1627 daddu c_3,AT 1628 dsll t_2,1 1629 slt a2,t_1,zero 1630 daddu t_2,a2 1631 dsll t_1,1 1632 daddu c_1,t_1 1633 sltu AT,c_1,t_1 1634 daddu t_2,AT 1635 daddu c_2,t_2 1636 sltu AT,c_2,t_2 1637 daddu c_3,AT 1638 sd c_1,24(a0) 1639 1640 dmultu a_4,a_0 /* mul_add_c2(a[4],b[0],c2,c3,c1); */ 1641 mflo t_1 1642 mfhi t_2 1643 slt c_1,t_2,zero 1644 dsll t_2,1 1645 slt a2,t_1,zero 1646 daddu t_2,a2 1647 dsll t_1,1 1648 daddu c_2,t_1 1649 sltu AT,c_2,t_1 1650 daddu t_2,AT 1651 daddu c_3,t_2 1652 sltu AT,c_3,t_2 1653 daddu c_1,AT 1654 dmultu a_3,a_1 /* mul_add_c2(a[3],b[1],c2,c3,c1); */ 1655 
mflo t_1 1656 mfhi t_2 1657 slt AT,t_2,zero 1658 daddu c_1,AT 1659 dsll t_2,1 1660 slt a2,t_1,zero 1661 daddu t_2,a2 1662 dsll t_1,1 1663 daddu c_2,t_1 1664 sltu AT,c_2,t_1 1665 daddu t_2,AT 1666 daddu c_3,t_2 1667 sltu AT,c_3,t_2 1668 daddu c_1,AT 1669 dmultu a_2,a_2 /* mul_add_c(a[2],b[2],c2,c3,c1); */ 1670 mflo t_1 1671 mfhi t_2 1672 daddu c_2,t_1 1673 sltu AT,c_2,t_1 1674 daddu t_2,AT 1675 daddu c_3,t_2 1676 sltu AT,c_3,t_2 1677 daddu c_1,AT 1678 sd c_2,32(a0) 1679 1680 dmultu a_0,a_5 /* mul_add_c2(a[0],b[5],c3,c1,c2); */ 1681 mflo t_1 1682 mfhi t_2 1683 slt c_2,t_2,zero 1684 dsll t_2,1 1685 slt a2,t_1,zero 1686 daddu t_2,a2 1687 dsll t_1,1 1688 daddu c_3,t_1 1689 sltu AT,c_3,t_1 1690 daddu t_2,AT 1691 daddu c_1,t_2 1692 sltu AT,c_1,t_2 1693 daddu c_2,AT 1694 dmultu a_1,a_4 /* mul_add_c2(a[1],b[4],c3,c1,c2); */ 1695 mflo t_1 1696 mfhi t_2 1697 slt AT,t_2,zero 1698 daddu c_2,AT 1699 dsll t_2,1 1700 slt a2,t_1,zero 1701 daddu t_2,a2 1702 dsll t_1,1 1703 daddu c_3,t_1 1704 sltu AT,c_3,t_1 1705 daddu t_2,AT 1706 daddu c_1,t_2 1707 sltu AT,c_1,t_2 1708 daddu c_2,AT 1709 dmultu a_2,a_3 /* mul_add_c2(a[2],b[3],c3,c1,c2); */ 1710 mflo t_1 1711 mfhi t_2 1712 slt AT,t_2,zero 1713 daddu c_2,AT 1714 dsll t_2,1 1715 slt a2,t_1,zero 1716 daddu t_2,a2 1717 dsll t_1,1 1718 daddu c_3,t_1 1719 sltu AT,c_3,t_1 1720 daddu t_2,AT 1721 daddu c_1,t_2 1722 sltu AT,c_1,t_2 1723 daddu c_2,AT 1724 sd c_3,40(a0) 1725 1726 dmultu a_6,a_0 /* mul_add_c2(a[6],b[0],c1,c2,c3); */ 1727 mflo t_1 1728 mfhi t_2 1729 slt c_3,t_2,zero 1730 dsll t_2,1 1731 slt a2,t_1,zero 1732 daddu t_2,a2 1733 dsll t_1,1 1734 daddu c_1,t_1 1735 sltu AT,c_1,t_1 1736 daddu t_2,AT 1737 daddu c_2,t_2 1738 sltu AT,c_2,t_2 1739 daddu c_3,AT 1740 dmultu a_5,a_1 /* mul_add_c2(a[5],b[1],c1,c2,c3); */ 1741 mflo t_1 1742 mfhi t_2 1743 slt AT,t_2,zero 1744 daddu c_3,AT 1745 dsll t_2,1 1746 slt a2,t_1,zero 1747 daddu t_2,a2 1748 dsll t_1,1 1749 daddu c_1,t_1 1750 sltu AT,c_1,t_1 1751 daddu t_2,AT 1752 daddu c_2,t_2 1753 sltu 
AT,c_2,t_2 1754 daddu c_3,AT 1755 dmultu a_4,a_2 /* mul_add_c2(a[4],b[2],c1,c2,c3); */ 1756 mflo t_1 1757 mfhi t_2 1758 slt AT,t_2,zero 1759 daddu c_3,AT 1760 dsll t_2,1 1761 slt a2,t_1,zero 1762 daddu t_2,a2 1763 dsll t_1,1 1764 daddu c_1,t_1 1765 sltu AT,c_1,t_1 1766 daddu t_2,AT 1767 daddu c_2,t_2 1768 sltu AT,c_2,t_2 1769 daddu c_3,AT 1770 dmultu a_3,a_3 /* mul_add_c(a[3],b[3],c1,c2,c3); */ 1771 mflo t_1 1772 mfhi t_2 1773 daddu c_1,t_1 1774 sltu AT,c_1,t_1 1775 daddu t_2,AT 1776 daddu c_2,t_2 1777 sltu AT,c_2,t_2 1778 daddu c_3,AT 1779 sd c_1,48(a0) 1780 1781 dmultu a_0,a_7 /* mul_add_c2(a[0],b[7],c2,c3,c1); */ 1782 mflo t_1 1783 mfhi t_2 1784 slt c_1,t_2,zero 1785 dsll t_2,1 1786 slt a2,t_1,zero 1787 daddu t_2,a2 1788 dsll t_1,1 1789 daddu c_2,t_1 1790 sltu AT,c_2,t_1 1791 daddu t_2,AT 1792 daddu c_3,t_2 1793 sltu AT,c_3,t_2 1794 daddu c_1,AT 1795 dmultu a_1,a_6 /* mul_add_c2(a[1],b[6],c2,c3,c1); */ 1796 mflo t_1 1797 mfhi t_2 1798 slt AT,t_2,zero 1799 daddu c_1,AT 1800 dsll t_2,1 1801 slt a2,t_1,zero 1802 daddu t_2,a2 1803 dsll t_1,1 1804 daddu c_2,t_1 1805 sltu AT,c_2,t_1 1806 daddu t_2,AT 1807 daddu c_3,t_2 1808 sltu AT,c_3,t_2 1809 daddu c_1,AT 1810 dmultu a_2,a_5 /* mul_add_c2(a[2],b[5],c2,c3,c1); */ 1811 mflo t_1 1812 mfhi t_2 1813 slt AT,t_2,zero 1814 daddu c_1,AT 1815 dsll t_2,1 1816 slt a2,t_1,zero 1817 daddu t_2,a2 1818 dsll t_1,1 1819 daddu c_2,t_1 1820 sltu AT,c_2,t_1 1821 daddu t_2,AT 1822 daddu c_3,t_2 1823 sltu AT,c_3,t_2 1824 daddu c_1,AT 1825 dmultu a_3,a_4 /* mul_add_c2(a[3],b[4],c2,c3,c1); */ 1826 mflo t_1 1827 mfhi t_2 1828 slt AT,t_2,zero 1829 daddu c_1,AT 1830 dsll t_2,1 1831 slt a2,t_1,zero 1832 daddu t_2,a2 1833 dsll t_1,1 1834 daddu c_2,t_1 1835 sltu AT,c_2,t_1 1836 daddu t_2,AT 1837 daddu c_3,t_2 1838 sltu AT,c_3,t_2 1839 daddu c_1,AT 1840 sd c_2,56(a0) 1841 1842 dmultu a_7,a_1 /* mul_add_c2(a[7],b[1],c3,c1,c2); */ 1843 mflo t_1 1844 mfhi t_2 1845 slt c_2,t_2,zero 1846 dsll t_2,1 1847 slt a2,t_1,zero 1848 daddu t_2,a2 1849 dsll t_1,1 
1850 daddu c_3,t_1 1851 sltu AT,c_3,t_1 1852 daddu t_2,AT 1853 daddu c_1,t_2 1854 sltu AT,c_1,t_2 1855 daddu c_2,AT 1856 dmultu a_6,a_2 /* mul_add_c2(a[6],b[2],c3,c1,c2); */ 1857 mflo t_1 1858 mfhi t_2 1859 slt AT,t_2,zero 1860 daddu c_2,AT 1861 dsll t_2,1 1862 slt a2,t_1,zero 1863 daddu t_2,a2 1864 dsll t_1,1 1865 daddu c_3,t_1 1866 sltu AT,c_3,t_1 1867 daddu t_2,AT 1868 daddu c_1,t_2 1869 sltu AT,c_1,t_2 1870 daddu c_2,AT 1871 dmultu a_5,a_3 /* mul_add_c2(a[5],b[3],c3,c1,c2); */ 1872 mflo t_1 1873 mfhi t_2 1874 slt AT,t_2,zero 1875 daddu c_2,AT 1876 dsll t_2,1 1877 slt a2,t_1,zero 1878 daddu t_2,a2 1879 dsll t_1,1 1880 daddu c_3,t_1 1881 sltu AT,c_3,t_1 1882 daddu t_2,AT 1883 daddu c_1,t_2 1884 sltu AT,c_1,t_2 1885 daddu c_2,AT 1886 dmultu a_4,a_4 /* mul_add_c(a[4],b[4],c3,c1,c2); */ 1887 mflo t_1 1888 mfhi t_2 1889 daddu c_3,t_1 1890 sltu AT,c_3,t_1 1891 daddu t_2,AT 1892 daddu c_1,t_2 1893 sltu AT,c_1,t_2 1894 daddu c_2,AT 1895 sd c_3,64(a0) 1896 1897 dmultu a_2,a_7 /* mul_add_c2(a[2],b[7],c1,c2,c3); */ 1898 mflo t_1 1899 mfhi t_2 1900 slt c_3,t_2,zero 1901 dsll t_2,1 1902 slt a2,t_1,zero 1903 daddu t_2,a2 1904 dsll t_1,1 1905 daddu c_1,t_1 1906 sltu AT,c_1,t_1 1907 daddu t_2,AT 1908 daddu c_2,t_2 1909 sltu AT,c_2,t_2 1910 daddu c_3,AT 1911 dmultu a_3,a_6 /* mul_add_c2(a[3],b[6],c1,c2,c3); */ 1912 mflo t_1 1913 mfhi t_2 1914 slt AT,t_2,zero 1915 daddu c_3,AT 1916 dsll t_2,1 1917 slt a2,t_1,zero 1918 daddu t_2,a2 1919 dsll t_1,1 1920 daddu c_1,t_1 1921 sltu AT,c_1,t_1 1922 daddu t_2,AT 1923 daddu c_2,t_2 1924 sltu AT,c_2,t_2 1925 daddu c_3,AT 1926 dmultu a_4,a_5 /* mul_add_c2(a[4],b[5],c1,c2,c3); */ 1927 mflo t_1 1928 mfhi t_2 1929 slt AT,t_2,zero 1930 daddu c_3,AT 1931 dsll t_2,1 1932 slt a2,t_1,zero 1933 daddu t_2,a2 1934 dsll t_1,1 1935 daddu c_1,t_1 1936 sltu AT,c_1,t_1 1937 daddu t_2,AT 1938 daddu c_2,t_2 1939 sltu AT,c_2,t_2 1940 daddu c_3,AT 1941 sd c_1,72(a0) 1942 1943 dmultu a_7,a_3 /* mul_add_c2(a[7],b[3],c2,c3,c1); */ 1944 mflo t_1 1945 mfhi t_2 1946 
slt c_1,t_2,zero 1947 dsll t_2,1 1948 slt a2,t_1,zero 1949 daddu t_2,a2 1950 dsll t_1,1 1951 daddu c_2,t_1 1952 sltu AT,c_2,t_1 1953 daddu t_2,AT 1954 daddu c_3,t_2 1955 sltu AT,c_3,t_2 1956 daddu c_1,AT 1957 dmultu a_6,a_4 /* mul_add_c2(a[6],b[4],c2,c3,c1); */ 1958 mflo t_1 1959 mfhi t_2 1960 slt AT,t_2,zero 1961 daddu c_1,AT 1962 dsll t_2,1 1963 slt a2,t_1,zero 1964 daddu t_2,a2 1965 dsll t_1,1 1966 daddu c_2,t_1 1967 sltu AT,c_2,t_1 1968 daddu t_2,AT 1969 daddu c_3,t_2 1970 sltu AT,c_3,t_2 1971 daddu c_1,AT 1972 dmultu a_5,a_5 /* mul_add_c(a[5],b[5],c2,c3,c1); */ 1973 mflo t_1 1974 mfhi t_2 1975 daddu c_2,t_1 1976 sltu AT,c_2,t_1 1977 daddu t_2,AT 1978 daddu c_3,t_2 1979 sltu AT,c_3,t_2 1980 daddu c_1,AT 1981 sd c_2,80(a0) 1982 1983 dmultu a_4,a_7 /* mul_add_c2(a[4],b[7],c3,c1,c2); */ 1984 mflo t_1 1985 mfhi t_2 1986 slt c_2,t_2,zero 1987 dsll t_2,1 1988 slt a2,t_1,zero 1989 daddu t_2,a2 1990 dsll t_1,1 1991 daddu c_3,t_1 1992 sltu AT,c_3,t_1 1993 daddu t_2,AT 1994 daddu c_1,t_2 1995 sltu AT,c_1,t_2 1996 daddu c_2,AT 1997 dmultu a_5,a_6 /* mul_add_c2(a[5],b[6],c3,c1,c2); */ 1998 mflo t_1 1999 mfhi t_2 2000 slt AT,t_2,zero 2001 daddu c_2,AT 2002 dsll t_2,1 2003 slt a2,t_1,zero 2004 daddu t_2,a2 2005 dsll t_1,1 2006 daddu c_3,t_1 2007 sltu AT,c_3,t_1 2008 daddu t_2,AT 2009 daddu c_1,t_2 2010 sltu AT,c_1,t_2 2011 daddu c_2,AT 2012 sd c_3,88(a0) 2013 2014 dmultu a_7,a_5 /* mul_add_c2(a[7],b[5],c1,c2,c3); */ 2015 mflo t_1 2016 mfhi t_2 2017 slt c_3,t_2,zero 2018 dsll t_2,1 2019 slt a2,t_1,zero 2020 daddu t_2,a2 2021 dsll t_1,1 2022 daddu c_1,t_1 2023 sltu AT,c_1,t_1 2024 daddu t_2,AT 2025 daddu c_2,t_2 2026 sltu AT,c_2,t_2 2027 daddu c_3,AT 2028 dmultu a_6,a_6 /* mul_add_c(a[6],b[6],c1,c2,c3); */ 2029 mflo t_1 2030 mfhi t_2 2031 daddu c_1,t_1 2032 sltu AT,c_1,t_1 2033 daddu t_2,AT 2034 daddu c_2,t_2 2035 sltu AT,c_2,t_2 2036 daddu c_3,AT 2037 sd c_1,96(a0) 2038 2039 dmultu a_6,a_7 /* mul_add_c2(a[6],b[7],c2,c3,c1); */ 2040 mflo t_1 2041 mfhi t_2 2042 slt c_1,t_2,zero 
/* bn_sqr_comba8, continued: rest of the a[6]*a[7] step, then a[7]*a[7]. */
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,104(a0)

	dmultu	a_7,a_7		/* mul_add_c(a[7],b[7],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sd	c_3,112(a0)
	sd	c_1,120(a0)

	jr	ra
END(bn_sqr_comba8)

/*
 * bn_sqr_comba4 - square a 4-doubleword number, column by column.
 *
 * Presumably the drop-in for bn_sqr_comba4(r, a) in crypto/bn/bn_asm.c
 * (confirm against the C prototype).
 *
 * In:		a0	result pointer; 8 doublewords are stored at 0..56(a0)
 *		a1	input pointer; 4 doublewords are loaded from 0..24(a1)
 * Clobbers:	a2 (plain scratch, its incoming value is ignored), AT,
 *		a_0..a_3, c_1..c_3, t_1, t_2, HI/LO.
 *
 * c_1, c_2 and c_3 rotate through the roles of low, middle and top
 * word of a three-word column accumulator.  The comment on every dmultu
 * names the step and the accumulator order (low,middle,top):
 *
 *   mul_add_c (a,b,lo,mid,top)	accumulator +=     a*b
 *   mul_add_c2(a,b,lo,mid,top)	accumulator += 2 * a*b
 *
 * mul_add_c2 doubles the 128-bit product in registers:
 *	top word  = bit 63 of HI		(slt  x,t_2,zero)
 *	t_2       = (HI << 1) | bit 63 of LO
 *	t_1       = LO << 1
 * t_2 can be all ones, so adding the carry from the low-word addition
 * into it may wrap it to zero.  That wrap is a carry into the top word
 * and is picked up explicitly with "sltu a2,t_2,AT" (true only when
 * AT==1 and t_2 became 0).  Folding the carry into t_2 without this
 * check silently loses it and yields a wrong square for such inputs.
 * The mul_add_c2 steps of bn_sqr_comba8 above fold the carry the same
 * way and have no such check.
 *
 * In mul_add_c the high word comes straight from HI, which is never
 * all ones for a 64x64-bit product, so "daddu t_2,AT" cannot wrap there.
 */
.align	5
LEAF(bn_sqr_comba4)
	.set	reorder
	ld	a_0,0(a1)
	ld	a_1,8(a1)
	ld	a_2,16(a1)
	ld	a_3,24(a1)

	/* ---- column 0 -> r[0] ---- */
	dmultu	a_0,a_0		/* mul_add_c(a[0],b[0],c1,c2,c3); */
	mflo	c_1
	mfhi	c_2
	sd	c_1,0(a0)

	/* ---- column 1 -> r[1] ---- */
	dmultu	a_0,a_1		/* mul_add_c2(a[0],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero	/* top word starts as bit 63 of HI */
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	c_3,t_2,AT	/* c_3 is initialised here */
	sltu	AT,c_3,AT	/* 1 iff t_2+AT wrapped to zero */
	daddu	c_1,AT
	sd	c_2,8(a0)

	/* ---- column 2 -> r[2] ---- */
	dmultu	a_2,a_0		/* mul_add_c2(a[2],b[0],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry lost by the wrap of t_2 */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	dmultu	a_1,a_1		/* mul_add_c(a[1],b[1],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,16(a0)

	/* ---- column 3 -> r[3] ---- */
	dmultu	a_0,a_3		/* mul_add_c2(a[0],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	c_3,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry lost by the wrap of t_2 */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	dmultu	a_1,a_2		/* mul_add_c2(a[1],b[2],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	slt	AT,t_2,zero	/* top word is live: accumulate bit 63 */
	daddu	c_3,AT
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry lost by the wrap of t_2 */
	daddu	c_3,a2
	daddu	c_2,t_2
	sltu	AT,c_2,t_2
	daddu	c_3,AT
	sd	c_1,24(a0)

	/* ---- column 4 -> r[4] ---- */
	dmultu	a_3,a_1		/* mul_add_c2(a[3],b[1],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	slt	c_1,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry lost by the wrap of t_2 */
	daddu	c_1,a2
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	dmultu	a_2,a_2		/* mul_add_c(a[2],b[2],c2,c3,c1); */
	mflo	t_1
	mfhi	t_2
	daddu	c_2,t_1
	sltu	AT,c_2,t_1
	daddu	t_2,AT
	daddu	c_3,t_2
	sltu	AT,c_3,t_2
	daddu	c_1,AT
	sd	c_2,32(a0)

	/* ---- column 5 -> r[5] ---- */
	dmultu	a_2,a_3		/* mul_add_c2(a[2],b[3],c3,c1,c2); */
	mflo	t_1
	mfhi	t_2
	slt	c_2,t_2,zero
	dsll	t_2,1
	slt	a2,t_1,zero
	daddu	t_2,a2
	dsll	t_1,1
	daddu	c_3,t_1
	sltu	AT,c_3,t_1
	daddu	t_2,AT
	sltu	a2,t_2,AT	/* carry lost by the wrap of t_2 */
	daddu	c_2,a2
	daddu	c_1,t_2
	sltu	AT,c_1,t_2
	daddu	c_2,AT
	sd	c_3,40(a0)

	/* ---- column 6 -> r[6], leftover high word -> r[7] ---- */
	dmultu	a_3,a_3		/* mul_add_c(a[3],b[3],c1,c2,c3); */
	mflo	t_1
	mfhi	t_2
	daddu	c_1,t_1
	sltu	AT,c_1,t_1
	daddu	t_2,AT
	daddu	c_2,t_2
	sd	c_1,48(a0)
	sd	c_2,56(a0)

	jr	ra
END(bn_sqr_comba4)