#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2012.
#
# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
# onward. There are three new instructions used here: umulxhi,
# addxc[cc] and initializing store. On T3 RSA private key operations
# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
# lengths. This is without dedicated squaring procedure. On T4
# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
# for reference purposes, because T4 has dedicated Montgomery
# multiplication and squaring *instructions* that deliver even more.
27 28$output = pop; 29open STDOUT,">$output"; 30 31$frame = "STACK_FRAME"; 32$bias = "STACK_BIAS"; 33 34$code.=<<___; 35#include "sparc_arch.h" 36 37#ifdef __arch64__ 38.register %g2,#scratch 39.register %g3,#scratch 40#endif 41 42.section ".text",#alloc,#execinstr 43___ 44 45($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)= 46 (map("%g$_",(1..5)),map("%o$_",(0..5,7))); 47 48# int bn_mul_mont( 49$rp="%o0"; # BN_ULONG *rp, 50$ap="%o1"; # const BN_ULONG *ap, 51$bp="%o2"; # const BN_ULONG *bp, 52$np="%o3"; # const BN_ULONG *np, 53$n0p="%o4"; # const BN_ULONG *n0, 54$num="%o5"; # int num); # caller ensures that num is even 55 # and >=6 56$code.=<<___; 57.globl bn_mul_mont_vis3 58.align 32 59bn_mul_mont_vis3: 60 add %sp, $bias, %g4 ! real top of stack 61 sll $num, 2, $num ! size in bytes 62 add $num, 63, %g5 63 andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes 64 add %g5, %g5, %g1 65 add %g5, %g1, %g1 ! 3*buffer size 66 sub %g4, %g1, %g1 67 andn %g1, 63, %g1 ! align at 64 byte 68 sub %g1, $frame, %g1 ! new top of stack 69 sub %g1, %g4, %g1 70 71 save %sp, %g1, %sp 72___ 73 74# +-------------------------------+<----- %sp 75# . . 76# +-------------------------------+<----- aligned at 64 bytes 77# | __int64 tmp[0] | 78# +-------------------------------+ 79# . . 80# . . 81# +-------------------------------+<----- aligned at 64 bytes 82# | __int64 ap[1..0] | converted ap[] 83# +-------------------------------+ 84# | __int64 np[1..0] | converted np[] 85# +-------------------------------+ 86# | __int64 ap[3..2] | 87# . . 88# . . 89# +-------------------------------+ 90($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5)); 91($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7)); 92($ovf,$i)=($t0,$t1); 93$code.=<<___; 94 ld [$n0p+0], $t0 ! pull n0[0..1] value 95 add %sp, $bias+$frame, $tp 96 ld [$n0p+4], $t1 97 add $tp, %g5, $anp 98 ld [$bp+0], $t2 ! m0=bp[0] 99 sllx $t1, 32, $n0 100 ld [$bp+4], $t3 101 or $t0, $n0, $n0 102 add $bp, 8, $bp 103 104 ld [$ap+0], $t0 ! 
ap[0] 105 sllx $t3, 32, $m0 106 ld [$ap+4], $t1 107 or $t2, $m0, $m0 108 109 ld [$ap+8], $t2 ! ap[1] 110 sllx $t1, 32, $aj 111 ld [$ap+12], $t3 112 or $t0, $aj, $aj 113 add $ap, 16, $ap 114 stx $aj, [$anp] ! converted ap[0] 115 116 mulx $aj, $m0, $lo0 ! ap[0]*bp[0] 117 umulxhi $aj, $m0, $hi0 118 119 ld [$np+0], $t0 ! np[0] 120 sllx $t3, 32, $aj 121 ld [$np+4], $t1 122 or $t2, $aj, $aj 123 124 ld [$np+8], $t2 ! np[1] 125 sllx $t1, 32, $nj 126 ld [$np+12], $t3 127 or $t0, $nj, $nj 128 add $np, 16, $np 129 stx $nj, [$anp+8] ! converted np[0] 130 131 mulx $lo0, $n0, $m1 ! "tp[0]"*n0 132 stx $aj, [$anp+16] ! converted ap[1] 133 134 mulx $aj, $m0, $alo ! ap[1]*bp[0] 135 umulxhi $aj, $m0, $aj ! ahi=aj 136 137 mulx $nj, $m1, $lo1 ! np[0]*m1 138 umulxhi $nj, $m1, $hi1 139 140 sllx $t3, 32, $nj 141 or $t2, $nj, $nj 142 stx $nj, [$anp+24] ! converted np[1] 143 add $anp, 32, $anp 144 145 addcc $lo0, $lo1, $lo1 146 addxc %g0, $hi1, $hi1 147 148 mulx $nj, $m1, $nlo ! np[1]*m1 149 umulxhi $nj, $m1, $nj ! nhi=nj 150 151 ba .L1st 152 sub $num, 24, $cnt ! cnt=num-3 153 154.align 16 155.L1st: 156 ld [$ap+0], $t0 ! ap[j] 157 addcc $alo, $hi0, $lo0 158 ld [$ap+4], $t1 159 addxc $aj, %g0, $hi0 160 161 sllx $t1, 32, $aj 162 add $ap, 8, $ap 163 or $t0, $aj, $aj 164 stx $aj, [$anp] ! converted ap[j] 165 166 ld [$np+0], $t2 ! np[j] 167 addcc $nlo, $hi1, $lo1 168 ld [$np+4], $t3 169 addxc $nj, %g0, $hi1 ! nhi=nj 170 171 sllx $t3, 32, $nj 172 add $np, 8, $np 173 mulx $aj, $m0, $alo ! ap[j]*bp[0] 174 or $t2, $nj, $nj 175 umulxhi $aj, $m0, $aj ! ahi=aj 176 stx $nj, [$anp+8] ! converted np[j] 177 add $anp, 16, $anp ! anp++ 178 179 mulx $nj, $m1, $nlo ! np[j]*m1 180 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] 181 umulxhi $nj, $m1, $nj ! nhi=nj 182 addxc %g0, $hi1, $hi1 183 stx $lo1, [$tp] ! tp[j-1] 184 add $tp, 8, $tp ! tp++ 185 186 brnz,pt $cnt, .L1st 187 sub $cnt, 8, $cnt ! j-- 188!.L1st 189 addcc $alo, $hi0, $lo0 190 addxc $aj, %g0, $hi0 ! 
ahi=aj 191 192 addcc $nlo, $hi1, $lo1 193 addxc $nj, %g0, $hi1 194 addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0] 195 addxc %g0, $hi1, $hi1 196 stx $lo1, [$tp] ! tp[j-1] 197 add $tp, 8, $tp 198 199 addcc $hi0, $hi1, $hi1 200 addxc %g0, %g0, $ovf ! upmost overflow bit 201 stx $hi1, [$tp] 202 add $tp, 8, $tp 203 204 ba .Louter 205 sub $num, 16, $i ! i=num-2 206 207.align 16 208.Louter: 209 ld [$bp+0], $t2 ! m0=bp[i] 210 ld [$bp+4], $t3 211 212 sub $anp, $num, $anp ! rewind 213 sub $tp, $num, $tp 214 sub $anp, $num, $anp 215 216 add $bp, 8, $bp 217 sllx $t3, 32, $m0 218 ldx [$anp+0], $aj ! ap[0] 219 or $t2, $m0, $m0 220 ldx [$anp+8], $nj ! np[0] 221 222 mulx $aj, $m0, $lo0 ! ap[0]*bp[i] 223 ldx [$tp], $tj ! tp[0] 224 umulxhi $aj, $m0, $hi0 225 ldx [$anp+16], $aj ! ap[1] 226 addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0] 227 mulx $aj, $m0, $alo ! ap[1]*bp[i] 228 addxc %g0, $hi0, $hi0 229 mulx $lo0, $n0, $m1 ! tp[0]*n0 230 umulxhi $aj, $m0, $aj ! ahi=aj 231 mulx $nj, $m1, $lo1 ! np[0]*m1 232 umulxhi $nj, $m1, $hi1 233 ldx [$anp+24], $nj ! np[1] 234 add $anp, 32, $anp 235 addcc $lo1, $lo0, $lo1 236 mulx $nj, $m1, $nlo ! np[1]*m1 237 addxc %g0, $hi1, $hi1 238 umulxhi $nj, $m1, $nj ! nhi=nj 239 240 ba .Linner 241 sub $num, 24, $cnt ! cnt=num-3 242.align 16 243.Linner: 244 addcc $alo, $hi0, $lo0 245 ldx [$tp+8], $tj ! tp[j] 246 addxc $aj, %g0, $hi0 ! ahi=aj 247 ldx [$anp+0], $aj ! ap[j] 248 addcc $nlo, $hi1, $lo1 249 mulx $aj, $m0, $alo ! ap[j]*bp[i] 250 addxc $nj, %g0, $hi1 ! nhi=nj 251 ldx [$anp+8], $nj ! np[j] 252 add $anp, 16, $anp 253 umulxhi $aj, $m0, $aj ! ahi=aj 254 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] 255 mulx $nj, $m1, $nlo ! np[j]*m1 256 addxc %g0, $hi0, $hi0 257 umulxhi $nj, $m1, $nj ! nhi=nj 258 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] 259 addxc %g0, $hi1, $hi1 260 stx $lo1, [$tp] ! tp[j-1] 261 add $tp, 8, $tp 262 brnz,pt $cnt, .Linner 263 sub $cnt, 8, $cnt 264!.Linner 265 ldx [$tp+8], $tj ! tp[j] 266 addcc $alo, $hi0, $lo0 267 addxc $aj, %g0, $hi0 ! 
ahi=aj 268 addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j] 269 addxc %g0, $hi0, $hi0 270 271 addcc $nlo, $hi1, $lo1 272 addxc $nj, %g0, $hi1 ! nhi=nj 273 addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j] 274 addxc %g0, $hi1, $hi1 275 stx $lo1, [$tp] ! tp[j-1] 276 277 subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc 278 addxccc $hi1, $hi0, $hi1 279 addxc %g0, %g0, $ovf 280 stx $hi1, [$tp+8] 281 add $tp, 16, $tp 282 283 brnz,pt $i, .Louter 284 sub $i, 8, $i 285 286 sub $anp, $num, $anp ! rewind 287 sub $tp, $num, $tp 288 sub $anp, $num, $anp 289 ba .Lsub 290 subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc 291 292.align 16 293.Lsub: 294 ldx [$tp], $tj 295 add $tp, 8, $tp 296 ldx [$anp+8], $nj 297 add $anp, 16, $anp 298 subccc $tj, $nj, $t2 ! tp[j]-np[j] 299 srlx $tj, 32, $tj 300 srlx $nj, 32, $nj 301 subccc $tj, $nj, $t3 302 add $rp, 8, $rp 303 st $t2, [$rp-4] ! reverse order 304 st $t3, [$rp-8] 305 brnz,pt $cnt, .Lsub 306 sub $cnt, 8, $cnt 307 308 sub $anp, $num, $anp ! rewind 309 sub $tp, $num, $tp 310 sub $anp, $num, $anp 311 sub $rp, $num, $rp 312 313 subccc $ovf, %g0, $ovf ! handle upmost overflow bit 314 ba .Lcopy 315 sub $num, 8, $cnt 316 317.align 16 318.Lcopy: ! conditional copy 319 ld [$tp+0], $t0 320 ld [$tp+4], $t1 321 ld [$rp+0], $t2 322 ld [$rp+4], $t3 323 stx %g0, [$tp] ! zap 324 add $tp, 8, $tp 325 stx %g0, [$anp] ! zap 326 stx %g0, [$anp+8] 327 add $anp, 16, $anp 328 movcs %icc, $t0, $t2 329 movcs %icc, $t1, $t3 330 st $t3, [$rp+0] ! flip order 331 st $t2, [$rp+4] 332 add $rp, 8, $rp 333 brnz $cnt, .Lcopy 334 sub $cnt, 8, $cnt 335 336 mov 1, %o0 337 ret 338 restore 339.type bn_mul_mont_vis3, #function 340.size bn_mul_mont_vis3, .-bn_mul_mont_vis3 341.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>" 342.align 4 343___ 344 345# Purpose of these subroutines is to explicitly encode VIS instructions, 346# so that one can compile the module without having to specify VIS 347# extensions on compiler command line, e.g. 
-xarch=v9 vs. -xarch=v9a. 348# Idea is to reserve for option to produce "universal" binary and let 349# programmer detect if current CPU is VIS capable at run-time. 350sub unvis3 { 351my ($mnemonic,$rs1,$rs2,$rd)=@_; 352my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 ); 353my ($ref,$opf); 354my %visopf = ( "addxc" => 0x011, 355 "addxccc" => 0x013, 356 "umulxhi" => 0x016 ); 357 358 $ref = "$mnemonic\t$rs1,$rs2,$rd"; 359 360 if ($opf=$visopf{$mnemonic}) { 361 foreach ($rs1,$rs2,$rd) { 362 return $ref if (!/%([goli])([0-9])/); 363 $_=$bias{$1}+$2; 364 } 365 366 return sprintf ".word\t0x%08x !%s", 367 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2, 368 $ref; 369 } else { 370 return $ref; 371 } 372} 373 374foreach (split("\n",$code)) { 375 s/\`([^\`]*)\`/eval $1/ge; 376 377 s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/ 378 &unvis3($1,$2,$3,$4) 379 /ge; 380 381 print $_,"\n"; 382} 383 384close STDOUT or die "error closing STDOUT: $!"; 385