#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys, at least not on
# in-order-execution cores. While 512-bit RSA sign operations can be
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
# verify:-( All comparisons are against bn_mul_mont-free assembler.
# The module might be of interest to embedded system developers, as
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
# code.

######################################################################
# There is a number of MIPS ABI in use, O32 and N32/64 are most
# widely used. Then there is a new contender: NUBI. It appears that if
# one picks the latter, it's possible to arrange code in ABI neutral
# manner. Therefore let's stick to NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. Following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
#   old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64

# Pointer arithmetic and register save/restore use 64-bit instructions
# for the "64" and "n32" flavours and 32-bit instructions otherwise.
if ($flavour =~ /64|n32/i) {
	$PTR_ADD="dadd";	# incidentally works even on n32
	$PTR_SUB="dsub";	# incidentally works even on n32
	$REG_S="sd";
	$REG_L="ld";
	$SZREG=8;
} else {
	$PTR_ADD="add";
	$PTR_SUB="sub";
	$REG_S="sw";
	$REG_L="lw";
	$SZREG=4;
}
# Register mask advertised in the .mask directive: NUBI flavours save
# $12-$23, the other flavours only $16-$23.
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
#
# <appro@openssl.org>
#
######################################################################

# Skip remaining arguments until one looks like an output file name
# (word characters, a dot, an extension). If none does, $output ends
# up false and the assembly is written to the inherited STDOUT.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Only redirect when a name was actually found, and fail loudly if the
# file cannot be created instead of silently writing somewhere else.
if (defined($output) && length($output)) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}

# BN_ULONG-sized load/store/multiply/add/subtract mnemonics and the
# size of one BN_ULONG in bytes.
if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$BNSZ=8;
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$ADDU="addu";
	$SUBU="subu";
	$BNSZ=4;
}

# int bn_mul_mont(
$rp=$a0;	# BN_ULONG *rp,
$ap=$a1;	# const BN_ULONG *ap,
$bp=$a2;	# const BN_ULONG *bp,
$np=$a3;	# const BN_ULONG *np,
$n0=$a4;	# const BN_ULONG *n0,
$num=$a5;	# int num);

# Working registers. Note that from here on the Perl variable $tp no
# longer names register $28 (see the register layout above): it is
# re-bound to $s3 and used as the running pointer into the temporary
# vector kept on the stack.
$lo0=$a6;
$hi0=$a7;
$lo1=$t1;
$hi1=$t2;
$aj=$s0;
$bi=$s1;
$nj=$s2;
$tp=$s3;
$alo=$s4;
$ahi=$s5;
$nlo=$s6;
$nhi=$s7;
$tj=$s8;
$i=$s9;
$j=$s10;
$m1=$s11;

# Size of the register save area, in $SZREG-sized slots.
$FRAMESIZE=14;

$code=<<___;
.text

.set	noat
.set	noreorder

.align	5
.globl	bn_mul_mont
.ent	bn_mul_mont
bn_mul_mont:
___
# On O32 the fifth and sixth arguments are fetched from the caller's
# stack.
$code.=<<___ if ($flavour =~ /o32/i);
	lw	$n0,16($sp)
	lw	$num,20($sp)
___
# Entry stub: returns 0 (in both $t0 and $a0) when num < 4 or
# num >= 17, otherwise continues in bn_mul_mont_internal, which
# returns 1.
#
# The dispatch branch is a plain "bnez". Its delay slot holds only a
# nop, so it behaves exactly like the branch-likely form while also
# assembling for ISA levels that lack branch-likely instructions.
#
# bn_mul_mont_internal then allocates $FRAMESIZE slots and saves $fp
# and the $s registers it uses.
$code.=<<___;
	slt	$at,$num,4
	bnez	$at,1f
	li	$t0,0
	slt	$at,$num,17	# on in-order CPU
	bnez	$at,bn_mul_mont_internal
	nop
1:	jr	$ra
	li	$a0,0
.end	bn_mul_mont

.align	5
.ent	bn_mul_mont_internal
bn_mul_mont_internal:
	.frame	$fp,$FRAMESIZE*$SZREG,$ra
	.mask	0x40000000|$SAVED_REGS_MASK,-$SZREG
	$PTR_SUB $sp,$FRAMESIZE*$SZREG
	$REG_S	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_S	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_S	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_S	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_S	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_S	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_S	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_S	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_S	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
# $s0-$s3 ($12-$15) are saved for NUBI flavours only.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_S	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_S	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_S	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_S	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
# Set-up: remember the frame in $fp, load *n0, bp[0], ap[0] and np[0],
# convert num into a byte count (the backtick expression is evaluated
# by the substitution at the end of this script) and carve a
# 4096-byte-aligned temporary vector of num+2 words off the stack.
$code.=<<___;
	move	$fp,$sp

	.set	reorder
	$LD	$n0,0($n0)
	$LD	$bi,0($bp)	# bp[0]
	$LD	$aj,0($ap)	# ap[0]
	$LD	$nj,0($np)	# np[0]

	$PTR_SUB $sp,2*$BNSZ	# place for two extra words
	sll	$num,`log($BNSZ)/log(2)`
	li	$at,-4096
	$PTR_SUB $sp,$num
	and	$sp,$at

___
# First pass (bp[0]): m1 is the low word of (ap[0]*bp[0])*n0. The
# .L1st loop walks j (a byte offset, starting at 2*$BNSZ) over ap[]
# and np[], accumulating ap[j]*bp[0] and np[j]*m1 with carries and
# storing the result words through $tp. The two top words are stored
# after the loop.
$code.=<<___;
	$MULTU	$aj,$bi
	$LD	$alo,$BNSZ($ap)
	$LD	$nlo,$BNSZ($np)
	mflo	$lo0
	mfhi	$hi0
	$MULTU	$lo0,$n0
	mflo	$m1

	$MULTU	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	$MULTU	$nj,$m1
	mflo	$lo1
	mfhi	$hi1
	$MULTU	$nlo,$m1
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	$nlo
	mfhi	$nhi

	move	$tp,$sp
	li	$j,2*$BNSZ
.align	4
.L1st:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	$aj,$bi
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	$alo
	mfhi	$ahi

	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$MULTU	$nj,$m1
	$ADDU	$hi1,$at
	addu	$j,$BNSZ
	$ST	$lo1,($tp)
	sltu	$t0,$j,$num
	mflo	$nlo
	mfhi	$nhi

	bnez	$t0,.L1st
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at

	$ADDU	$lo1,$nlo,$hi1
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi1,$nhi,$t0
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at

	$ST	$lo1,($tp)

	$ADDU	$hi1,$hi0
	sltu	$at,$hi1,$hi0
	$ST	$hi1,$BNSZ($tp)
	$ST	$at,2*$BNSZ($tp)

___
# Remaining passes: .Louter steps i (a byte offset, starting at $BNSZ)
# over bp[]. Each pass recomputes m1 from ap[0]*bp[i] plus the first
# word of the temporary vector, and .Linner performs the same
# multiply-accumulate as the first pass while also adding in the words
# of the temporary vector left by the previous pass.
$code.=<<___;
	li	$i,$BNSZ
.align	4
.Louter:
	$PTR_ADD $bi,$bp,$i
	$LD	$bi,($bi)
	$LD	$aj,($ap)
	$LD	$alo,$BNSZ($ap)
	$LD	$tj,($sp)

	$MULTU	$aj,$bi
	$LD	$nj,($np)
	$LD	$nlo,$BNSZ($np)
	mflo	$lo0
	mfhi	$hi0
	$ADDU	$lo0,$tj
	$MULTU	$lo0,$n0
	sltu	$at,$lo0,$tj
	$ADDU	$hi0,$at
	mflo	$m1

	$MULTU	$alo,$bi
	mflo	$alo
	mfhi	$ahi

	$MULTU	$nj,$m1
	mflo	$lo1
	mfhi	$hi1

	$MULTU	$nlo,$m1
	$ADDU	$lo1,$lo0
	sltu	$at,$lo1,$lo0
	$ADDU	$hi1,$at
	mflo	$nlo
	mfhi	$nhi

	move	$tp,$sp
	li	$j,2*$BNSZ
	$LD	$tj,$BNSZ($tp)
.align	4
.Linner:
	.set	noreorder
	$PTR_ADD $aj,$ap,$j
	$PTR_ADD $nj,$np,$j
	$LD	$aj,($aj)
	$LD	$nj,($nj)

	$MULTU	$aj,$bi
	$ADDU	$lo0,$alo,$hi0
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo0,$hi0
	sltu	$t0,$lo1,$hi1
	$ADDU	$hi0,$ahi,$at
	$ADDU	$hi1,$nhi,$t0
	mflo	$alo
	mfhi	$ahi

	$ADDU	$lo0,$tj
	addu	$j,$BNSZ
	$MULTU	$nj,$m1
	sltu	$at,$lo0,$tj
	$ADDU	$lo1,$lo0
	$ADDU	$hi0,$at
	sltu	$t0,$lo1,$lo0
	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$hi1,$t0
	sltu	$at,$j,$num
	mflo	$nlo
	mfhi	$nhi
	$ST	$lo1,($tp)
	bnez	$at,.Linner
	$PTR_ADD $tp,$BNSZ
	.set	reorder

	$ADDU	$lo0,$alo,$hi0
	sltu	$at,$lo0,$hi0
	$ADDU	$hi0,$ahi,$at
	$ADDU	$lo0,$tj
	sltu	$t0,$lo0,$tj
	$ADDU	$hi0,$t0

	$LD	$tj,2*$BNSZ($tp)
	$ADDU	$lo1,$nlo,$hi1
	sltu	$at,$lo1,$hi1
	$ADDU	$hi1,$nhi,$at
	$ADDU	$lo1,$lo0
	sltu	$t0,$lo1,$lo0
	$ADDU	$hi1,$t0
	$ST	$lo1,($tp)

	$ADDU	$lo1,$hi1,$hi0
	sltu	$hi1,$lo1,$hi0
	$ADDU	$lo1,$tj
	sltu	$at,$lo1,$tj
	$ADDU	$hi1,$at
	$ST	$lo1,$BNSZ($tp)
	$ST	$hi1,2*$BNSZ($tp)

	addu	$i,$BNSZ
	sltu	$t0,$i,$num
	bnez	$t0,.Louter

___
# Final reduction: .Lsub stores tp[i]-np[i] (with borrow propagation)
# into rp[]. The borrow, combined with the topmost overflow word, is
# then turned into a mask that selects either the temporary vector or
# rp[] as the source for .Lcopy, which writes the selected words to
# rp[] and overwrites the temporary vector with zeros as it goes.
# The return value 1 is placed in both $a0 and $t0.
$code.=<<___;
	.set	noreorder
	$PTR_ADD $tj,$sp,$num	# &tp[num]
	move	$tp,$sp
	move	$ap,$sp
	li	$hi0,0		# clear borrow bit

.align	4
.Lsub:	$LD	$lo0,($tp)
	$LD	$lo1,($np)
	$PTR_ADD $tp,$BNSZ
	$PTR_ADD $np,$BNSZ
	$SUBU	$lo1,$lo0,$lo1	# tp[i]-np[i]
	sgtu	$at,$lo1,$lo0
	$SUBU	$lo0,$lo1,$hi0
	sgtu	$hi0,$lo0,$lo1
	$ST	$lo0,($rp)
	or	$hi0,$at
	sltu	$at,$tp,$tj
	bnez	$at,.Lsub
	$PTR_ADD $rp,$BNSZ

	$SUBU	$hi0,$hi1,$hi0	# handle upmost overflow bit
	move	$tp,$sp
	$PTR_SUB $rp,$num	# restore rp
	not	$hi1,$hi0

	and	$ap,$hi0,$sp
	and	$bp,$hi1,$rp
	or	$ap,$ap,$bp	# ap=borrow?tp:rp

.align	4
.Lcopy:	$LD	$aj,($ap)
	$PTR_ADD $ap,$BNSZ
	$ST	$zero,($tp)
	$PTR_ADD $tp,$BNSZ
	sltu	$at,$tp,$tj
	$ST	$aj,($rp)
	bnez	$at,.Lcopy
	$PTR_ADD $rp,$BNSZ

	li	$a0,1
	li	$t0,1

___
# Epilogue: drop the temporary vector by restoring $sp from $fp, then
# reload the saved registers in the same slots the prologue used.
$code.=<<___;
	.set	noreorder
	move	$sp,$fp
	$REG_L	$fp,($FRAMESIZE-1)*$SZREG($sp)
	$REG_L	$s11,($FRAMESIZE-2)*$SZREG($sp)
	$REG_L	$s10,($FRAMESIZE-3)*$SZREG($sp)
	$REG_L	$s9,($FRAMESIZE-4)*$SZREG($sp)
	$REG_L	$s8,($FRAMESIZE-5)*$SZREG($sp)
	$REG_L	$s7,($FRAMESIZE-6)*$SZREG($sp)
	$REG_L	$s6,($FRAMESIZE-7)*$SZREG($sp)
	$REG_L	$s5,($FRAMESIZE-8)*$SZREG($sp)
	$REG_L	$s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s3,($FRAMESIZE-10)*$SZREG($sp)
	$REG_L	$s2,($FRAMESIZE-11)*$SZREG($sp)
	$REG_L	$s1,($FRAMESIZE-12)*$SZREG($sp)
	$REG_L	$s0,($FRAMESIZE-13)*$SZREG($sp)
___
# The frame is released in the delay slot of the return jump.
$code.=<<___;
	jr	$ra
	$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end	bn_mul_mont_internal
.rdata
.asciiz	"Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___

# Replace every `expr` in the generated text with the value of the
# Perl expression (used above for the log2 of the word size).
$code =~ s/\`([^\`]*)\`/eval $1/gem;

print $code;
# Buffered write errors only surface at close time, so a failure here
# must abort the build rather than leave a truncated output file.
close STDOUT or die "error closing STDOUT: $!";