#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2010
#
# "Teaser" Montgomery multiplication module for IA-64. There are
# several possibilities for improvement:
#
# - modulo-scheduling outer loop would eliminate quite a number of
#   stalls after ldf8, xma and getf.sig outside inner loop and
#   improve shorter key performance;
# - shorter vector support [with input vectors being fetched only
#   once] should be added;
# - 2x unroll with help of n0[1] would make the code scalable on
#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
#   acute interest, because upcoming Tukwila's individual cores are
#   reportedly based on Itanium 2 design;
# - dedicated squaring procedure(?);
#
# January 2010
#
# Shorter vector support is implemented by zero-padding ap and np
# vectors up to 8 elements, or 512 bits. This means that 256-bit
# inputs will be processed only 2 times faster than 512-bit inputs,
# not 4 [as one would expect, because algorithm complexity is n^2].
# The reason for padding is that inputs shorter than 512 bits won't
# be processed faster anyway, because minimal critical path of the
# core loop happens to match 512-bit timing. Either way, it resulted
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
# 1024-bit one [in comparison to original version of *this* module].
#
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
#                   sign    verify    sign/s verify/s
# rsa  512 bits 0.000290s 0.000024s   3452.8  42031.4
# rsa 1024 bits 0.000793s 0.000058s   1261.7  17172.0
# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
# dsa  512 bits 0.000253s 0.000198s   3949.9   5057.0
# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
#
# ... and *without* (but still with ia64.S):
#
# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
#
# As can be seen, RSA sign performance improves by 130-30%,
# hereafter less for longer keys, while verify - by 74-13%.
# DSA performance improves by 115-30%.
use strict;
use warnings;

# Command line: [compiler flags ...] <output file>
#
# The LAST argument is taken as the output file name; whatever remains in
# @ARGV is only inspected for HP-UX data-model flags below.  When no usable
# output name is given the generated assembly is written to STDOUT.
my $output = pop;

# $ADDP is the instruction the generated code uses to form an address from
# each incoming pointer argument.  On HP-UX it is "addp4" unless one of the
# flags "+DD64" or "-mlp64" is present (presumably the 64-bit data-model
# switches of the HP compiler and of gcc respectively - confirm against the
# build configuration); everywhere else it is a plain "add".
my $ADDP = "add";
if ($^O eq "hpux") {
    $ADDP = "addp4";
    for my $flag (@ARGV) {
        # The alternation has to be grouped.  The previous pattern,
        # /[\+DD|\-mlp]64/, was a character class and therefore matched
        # ANY single one of "+D|-mlp" followed by "64".
        $ADDP = "add" if $flag =~ /(?:\+DD|-mlp)64/;
    }
}

# The assembly source emitted by this script.  It contains three
# procedures:
#   bn_mul_mont          - public entry point; returns 0 ("not handled")
#                          when num < 2, otherwise dispatches on num;
#   bn_mul_mont_general  - taken when num > 8;
#   bn_mul_mont_8        - taken when 2 <= num <= 8, with the input
#                          vectors zero-padded to 8 elements.
# Only $ADDP is interpolated; "\@" keeps literal at-signs out of Perl's
# array interpolation.
my $code = <<___;
.explicit
.text

// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
//		    const BN_ULONG *bp,const BN_ULONG *np,
//		    const BN_ULONG *n0p,int num);
.align	64
.global	bn_mul_mont#
.proc	bn_mul_mont#
bn_mul_mont:
	.prologue
	.body
{ .mmi;	cmp4.le		p6,p7=2,r37;;
(p6)	cmp4.lt.unc	p8,p9=8,r37
	mov		ret0=r0		};;
{ .bbb;
(p9)	br.cond.dptk.many	bn_mul_mont_8
(p8)	br.cond.dpnt.many	bn_mul_mont_general
(p7)	br.ret.spnt.many	b0	};;
.endp	bn_mul_mont#

prevfs=r2;	prevpr=r3;	prevlc=r10;	prevsp=r11;

rptr=r8;	aptr=r9;	bptr=r14;	nptr=r15;
tptr=r16;	// &tp[0]
tp_1=r17;	// &tp[-1]
num=r18;	len=r19;	lc=r20;
topbit=r21;	// carry bit from tmp[num]

n0=f6;
m0=f7;
bi=f8;

.align	64
.local	bn_mul_mont_general#
.proc	bn_mul_mont_general#
bn_mul_mont_general:
	.prologue
{ .mmi;	.save	ar.pfs,prevfs
	alloc	prevfs=ar.pfs,6,2,0,8
	$ADDP	aptr=0,in1
	.save	ar.lc,prevlc
	mov	prevlc=ar.lc		}
{ .mmi;	.vframe	prevsp
	mov	prevsp=sp
	$ADDP	bptr=0,in2
	.save	pr,prevpr
	mov	prevpr=pr		};;

	.body
	.rotf		alo[6],nlo[4],ahi[8],nhi[6]
	.rotr		a[3],n[3],t[2]

{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
	ldf8		alo[4]=[aptr],16	// ap[0]
	$ADDP		r30=8,in1	};;
{ .mmi;	ldf8		alo[3]=[r30],16		// ap[1]
	ldf8		alo[2]=[aptr],16	// ap[2]
	$ADDP		in4=0,in4	};;
{ .mmi;	ldf8		alo[1]=[r30]		// ap[3]
	ldf8		n0=[in4]		// n0
	$ADDP		rptr=0,in0		}
{ .mmi;	$ADDP		nptr=0,in3
	mov		r31=16
	zxt4		num=in5		};;
{ .mmi;	ldf8		nlo[2]=[nptr],8		// np[0]
	shladd		len=num,3,r0
	shladd		r31=num,3,r31	};;
{ .mmi;	ldf8		nlo[1]=[nptr],8		// np[1]
	add		lc=-5,num
	sub		r31=sp,r31	};;
{ .mfb;	and		sp=-16,r31		// alloca
	xmpy.hu		ahi[2]=alo[4],bi	// ap[0]*bp[0]
	nop.b		0		}
{ .mfb;	nop.m		0
	xmpy.lu		alo[4]=alo[4],bi
	brp.loop.imp	.L1st_ctop,.L1st_cend-16
					};;
{ .mfi;	nop.m		0
	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[0]
	add		tp_1=8,sp	}
{ .mfi;	nop.m		0
	xma.lu		alo[3]=alo[3],bi,ahi[2]
	mov		pr.rot=0x20001f<<16
			// ------^----- (p40) at first (p23)
			// ----------^^ p[16:20]=1
					};;
{ .mfi;	nop.m		0
	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[0])*n0
	mov		ar.lc=lc	}
{ .mfi;	nop.m		0
	fcvt.fxu.s1	nhi[1]=f0
	mov		ar.ec=8		};;

.align	32
.L1st_ctop:
.pred.rel	"mutex",p40,p42
{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
	(p40)	add		n[2]=n[2],a[2]		}   // (p23)					}
{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)(p16)
	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
	(p42)	cmp.leu		p41,p39=n[2],a[2]	}   // (p23)
{ .mfi;	(p23)	st8		[tp_1]=n[2],8
	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
{ .mmb;	(p21)	getf.sig	n[0]=nlo[3]
	(p16)	nop.m		0
	br.ctop.sptk	.L1st_ctop			};;
.L1st_cend:

{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
	getf.sig	n[0]=nhi[4]
	add		num=-1,num	};;	// num--
{ .mmi;	.pred.rel	"mutex",p40,p42
(p40)	add		n[0]=n[0],a[0]
(p42)	add		n[0]=n[0],a[0],1
	sub		aptr=aptr,len	};;	// rewind
{ .mmi;	.pred.rel	"mutex",p40,p42
(p40)	cmp.ltu		p41,p39=n[0],a[0]
(p42)	cmp.leu		p41,p39=n[0],a[0]
	sub		nptr=nptr,len	};;
{ .mmi;	.pred.rel	"mutex",p39,p41
(p39)	add		topbit=r0,r0
(p41)	add		topbit=r0,r0,1
	nop.i		0		}
{ .mmi;	st8		[tp_1]=n[0]
	add		tptr=16,sp
	add		tp_1=8,sp	};;

.Louter:
{ .mmi;	ldf8		bi=[bptr],8		// (*bp++)
	ldf8		ahi[3]=[tptr]		// tp[0]
	add		r30=8,aptr	};;
{ .mmi;	ldf8		alo[4]=[aptr],16	// ap[0]
	ldf8		alo[3]=[r30],16		// ap[1]
	add		r31=8,nptr	};;
{ .mfb;	ldf8		alo[2]=[aptr],16	// ap[2]
	xma.hu		ahi[2]=alo[4],bi,ahi[3]	// ap[0]*bp[i]+tp[0]
	brp.loop.imp	.Linner_ctop,.Linner_cend-16
					}
{ .mfb;	ldf8		alo[1]=[r30]		// ap[3]
	xma.lu		alo[4]=alo[4],bi,ahi[3]
	clrrrb.pr			};;
{ .mfi;	ldf8		nlo[2]=[nptr],16	// np[0]
	xma.hu		ahi[1]=alo[3],bi,ahi[2]	// ap[1]*bp[i]
	nop.i		0		}
{ .mfi;	ldf8		nlo[1]=[r31]		// np[1]
	xma.lu		alo[3]=alo[3],bi,ahi[2]
	mov		pr.rot=0x20101f<<16
			// ------^----- (p40) at first (p23)
			// --------^--- (p30) at first (p22)
			// ----------^^ p[16:20]=1
					};;
{ .mfi;	st8		[tptr]=r0		// tp[0] is already accounted
	xmpy.lu		m0=alo[4],n0		// (ap[0]*bp[i]+tp[0])*n0
	mov		ar.lc=lc	}
{ .mfi;
	fcvt.fxu.s1	nhi[1]=f0
	mov		ar.ec=8		};;

// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
// in latter case accounts for two-tick pipeline stall, which means
// that its performance would be ~20% lower than optimal one. No
// attempt was made to address this, because original Itanium is
// hardly represented out in the wild...
.align	32
.Linner_ctop:
.pred.rel	"mutex",p40,p42
.pred.rel	"mutex",p30,p32
{ .mfi;	(p16)	ldf8		alo[0]=[aptr],8		    // *(aptr++)
	(p18)	xma.hu		ahi[0]=alo[2],bi,ahi[1]
	(p40)	add		n[2]=n[2],a[2]		}   // (p23)
{ .mfi;	(p16)	nop.m		0
	(p18)	xma.lu		alo[2]=alo[2],bi,ahi[1]
	(p42)	add		n[2]=n[2],a[2],1	};; // (p23)
{ .mfi;	(p21)	getf.sig	a[0]=alo[5]
	(p16)	nop.f		0
	(p40)	cmp.ltu		p41,p39=n[2],a[2]	}   // (p23)
{ .mfi;	(p21)	ld8		t[0]=[tptr],8
	(p16)	nop.f		0
	(p42)	cmp.leu		p41,p39=n[2],a[2]	};; // (p23)
{ .mfi;	(p18)	ldf8		nlo[0]=[nptr],8		    // *(nptr++)
	(p20)	xma.hu		nhi[0]=nlo[2],m0,nhi[1]
	(p30)	add		a[1]=a[1],t[1]		}   // (p22)
{ .mfi;	(p16)	nop.m		0
	(p20)	xma.lu		nlo[2]=nlo[2],m0,nhi[1]
	(p32)	add		a[1]=a[1],t[1],1	};; // (p22)
{ .mmi;	(p21)	getf.sig	n[0]=nlo[3]
	(p16)	nop.m		0
	(p30)	cmp.ltu		p31,p29=a[1],t[1]	}   // (p22)
{ .mmb;	(p23)	st8		[tp_1]=n[2],8
	(p32)	cmp.leu		p31,p29=a[1],t[1]	    // (p22)
	br.ctop.sptk	.Linner_ctop			};;
.Linner_cend:

{ .mmi;	getf.sig	a[0]=ahi[6]		// (p24)
	getf.sig	n[0]=nhi[4]
	nop.i		0		};;

{ .mmi;	.pred.rel	"mutex",p31,p33
(p31)	add		a[0]=a[0],topbit
(p33)	add		a[0]=a[0],topbit,1
	mov		topbit=r0	};;
{ .mfi;	.pred.rel	"mutex",p31,p33
(p31)	cmp.ltu		p32,p30=a[0],topbit
(p33)	cmp.leu		p32,p30=a[0],topbit
					}
{ .mfi;	.pred.rel	"mutex",p40,p42
(p40)	add		n[0]=n[0],a[0]
(p42)	add		n[0]=n[0],a[0],1
					};;
{ .mmi;	.pred.rel	"mutex",p44,p46
(p40)	cmp.ltu		p41,p39=n[0],a[0]
(p42)	cmp.leu		p41,p39=n[0],a[0]
(p32)	add		topbit=r0,r0,1	}

{ .mmi;	st8		[tp_1]=n[0],8
	cmp4.ne		p6,p0=1,num
	sub		aptr=aptr,len	};;	// rewind
{ .mmi;	sub		nptr=nptr,len
(p41)	add		topbit=r0,r0,1
	add		tptr=16,sp	}
{ .mmb;	add		tp_1=8,sp
	add		num=-1,num		// num--
(p6)	br.cond.sptk.many	.Louter	};;

{ .mbb;	add		lc=4,lc
	brp.loop.imp	.Lsub_ctop,.Lsub_cend-16
	clrrrb.pr			};;
{ .mii;	nop.m		0
	mov		pr.rot=0x10001<<16
			// ------^---- (p33) at first (p17)
	mov		ar.lc=lc	}
{ .mii;	nop.m		0
	mov		ar.ec=3
	nop.i		0		};;

.Lsub_ctop:
.pred.rel	"mutex",p33,p35
{ .mfi;	(p16)	ld8		t[0]=[tptr],8		    // t=*(tp++)
	(p16)	nop.f		0
	(p33)	sub		n[1]=t[1],n[1]		}   // (p17)
{ .mfi;	(p16)	ld8		n[0]=[nptr],8		    // n=*(np++)
	(p16)	nop.f		0
	(p35)	sub		n[1]=t[1],n[1],1	};; // (p17)
{ .mib;	(p18)	st8		[rptr]=n[2],8		    // *(rp++)=r
	(p33)	cmp.gtu		p34,p32=n[1],t[1]	    // (p17)
	(p18)	nop.b		0			}
{ .mib;	(p18)	nop.m		0
	(p35)	cmp.geu		p34,p32=n[1],t[1]	    // (p17)
	br.ctop.sptk	.Lsub_ctop			};;
.Lsub_cend:

{ .mmb;	.pred.rel	"mutex",p34,p36
(p34)	sub	topbit=topbit,r0	// (p19)
(p36)	sub	topbit=topbit,r0,1
	brp.loop.imp	.Lcopy_ctop,.Lcopy_cend-16
					}
{ .mmb;	sub	rptr=rptr,len		// rewind
	sub	tptr=tptr,len
	clrrrb.pr			};;
{ .mmi;	mov	aptr=rptr
	mov	bptr=tptr
	mov	pr.rot=1<<16		};;
{ .mii;	cmp.eq	p0,p6=topbit,r0
	mov	ar.lc=lc
	mov	ar.ec=2			};;

.Lcopy_ctop:
{ .mmi;	(p16)	ld8	a[0]=[aptr],8
	(p16)	ld8	t[0]=[bptr],8
	(p6)	mov	a[1]=t[1]	};;	// (p17)
{ .mmb;	(p17)	st8	[rptr]=a[1],8
	(p17)	st8	[tptr]=r0,8
	br.ctop.sptk	.Lcopy_ctop	};;
.Lcopy_cend:

{ .mmi;	mov		ret0=1			// signal "handled"
	rum		1<<5			// clear um.mfh
	mov		ar.lc=prevlc	}
{ .mib;	.restore	sp
	mov		sp=prevsp
	mov		pr=prevpr,0x1ffff
	br.ret.sptk.many	b0	};;
.endp	bn_mul_mont_general#

a1=r16;  a2=r17;  a3=r18;  a4=r19;  a5=r20;  a6=r21;  a7=r22;  a8=r23;
n1=r24;  n2=r25;  n3=r26;  n4=r27;  n5=r28;  n6=r29;  n7=r30;  n8=r31;
t0=r15;

ai0=f8;  ai1=f9;  ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;

.align	64
.skip	48		// aligns loop body
.local	bn_mul_mont_8#
.proc	bn_mul_mont_8#
bn_mul_mont_8:
	.prologue
{ .mmi;	.save		ar.pfs,prevfs
	alloc		prevfs=ar.pfs,6,2,0,8
	.vframe		prevsp
	mov		prevsp=sp
	.save		ar.lc,prevlc
	mov		prevlc=ar.lc	}
{ .mmi;	add		r17=-6*16,sp
	add		sp=-7*16,sp
	.save		pr,prevpr
	mov		prevpr=pr	};;

{ .mmi;	.save.gf	0,0x10
	stf.spill	[sp]=f16,-16
	.save.gf	0,0x20
	stf.spill	[r17]=f17,32
	add		r16=-5*16,prevsp};;
{ .mmi;	.save.gf	0,0x40
	stf.spill	[r16]=f18,32
	.save.gf	0,0x80
	stf.spill	[r17]=f19,32
	$ADDP		aptr=0,in1	};;
{ .mmi;	.save.gf	0,0x100
	stf.spill	[r16]=f20,32
	.save.gf	0,0x200
	stf.spill	[r17]=f21,32
	$ADDP		r29=8,in1	};;
{ .mmi;	.save.gf	0,0x400
	stf.spill	[r16]=f22
	.save.gf	0,0x800
	stf.spill	[r17]=f23
	$ADDP		rptr=0,in0	};;

	.body
	.rotf		bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
	.rotr		t[8]

// load input vectors padding them to 8 elements
{ .mmi;	ldf8		ai0=[aptr],16		// ap[0]
	ldf8		ai1=[r29],16		// ap[1]
	$ADDP		bptr=0,in2	}
{ .mmi;	$ADDP		r30=8,in2
	$ADDP		nptr=0,in3
	$ADDP		r31=8,in3	};;
{ .mmi;	ldf8		bj[7]=[bptr],16		// bp[0]
	ldf8		bj[6]=[r30],16		// bp[1]
	cmp4.le		p4,p5=3,in5	}
{ .mmi;	ldf8		ni0=[nptr],16		// np[0]
	ldf8		ni1=[r31],16		// np[1]
	cmp4.le		p6,p7=4,in5	};;

{ .mfi;	(p4)ldf8	ai2=[aptr],16		// ap[2]
	(p5)fcvt.fxu	ai2=f0
	cmp4.le		p8,p9=5,in5	}
{ .mfi;	(p6)ldf8	ai3=[r29],16		// ap[3]
	(p7)fcvt.fxu	ai3=f0
	cmp4.le		p10,p11=6,in5	}
{ .mfi;	(p4)ldf8	bj[5]=[bptr],16		// bp[2]
	(p5)fcvt.fxu	bj[5]=f0
	cmp4.le		p12,p13=7,in5	}
{ .mfi;	(p6)ldf8	bj[4]=[r30],16		// bp[3]
	(p7)fcvt.fxu	bj[4]=f0
	cmp4.le		p14,p15=8,in5	}
{ .mfi;	(p4)ldf8	ni2=[nptr],16		// np[2]
	(p5)fcvt.fxu	ni2=f0
	addp4		r28=-1,in5	}
{ .mfi;	(p6)ldf8	ni3=[r31],16		// np[3]
	(p7)fcvt.fxu	ni3=f0
	$ADDP		in4=0,in4	};;

{ .mfi;	ldf8		n0=[in4]
	fcvt.fxu	tf[1]=f0
	nop.i		0		}

{ .mfi;	(p8)ldf8	ai4=[aptr],16		// ap[4]
	(p9)fcvt.fxu	ai4=f0
	mov		t[0]=r0		}
{ .mfi;	(p10)ldf8	ai5=[r29],16		// ap[5]
	(p11)fcvt.fxu	ai5=f0
	mov		t[1]=r0		}
{ .mfi;	(p8)ldf8	bj[3]=[bptr],16		// bp[4]
	(p9)fcvt.fxu	bj[3]=f0
	mov		t[2]=r0		}
{ .mfi;	(p10)ldf8	bj[2]=[r30],16		// bp[5]
	(p11)fcvt.fxu	bj[2]=f0
	mov		t[3]=r0		}
{ .mfi;	(p8)ldf8	ni4=[nptr],16		// np[4]
	(p9)fcvt.fxu	ni4=f0
	mov		t[4]=r0		}
{ .mfi;	(p10)ldf8	ni5=[r31],16		// np[5]
	(p11)fcvt.fxu	ni5=f0
	mov		t[5]=r0		};;

{ .mfi;	(p12)ldf8	ai6=[aptr],16		// ap[6]
	(p13)fcvt.fxu	ai6=f0
	mov		t[6]=r0		}
{ .mfi;	(p14)ldf8	ai7=[r29],16		// ap[7]
	(p15)fcvt.fxu	ai7=f0
	mov		t[7]=r0		}
{ .mfi;	(p12)ldf8	bj[1]=[bptr],16		// bp[6]
	(p13)fcvt.fxu	bj[1]=f0
	mov		ar.lc=r28	}
{ .mfi;	(p14)ldf8	bj[0]=[r30],16		// bp[7]
	(p15)fcvt.fxu	bj[0]=f0
	mov		ar.ec=1		}
{ .mfi;	(p12)ldf8	ni6=[nptr],16		// np[6]
	(p13)fcvt.fxu	ni6=f0
	mov		pr.rot=1<<16	}
{ .mfb;	(p14)ldf8	ni7=[r31],16		// np[7]
	(p15)fcvt.fxu	ni7=f0
	brp.loop.imp	.Louter_8_ctop,.Louter_8_cend-16
					};;

// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt
// to measure with help of Interval Time Counter indicated that the
// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
// addressing the issue is problematic, because I don't have access
// to platform-specific instruction-level profiler. On Itanium it
// should run in 56*n ticks, because of higher xma latency...
.Louter_8_ctop:
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mfi;	(p16)	nop.m		0			// 0:
	(p16)	xma.hu		ahi[0]=ai0,bj[7],tf[1]	//	ap[0]*b[i]+t[0]
	(p40)	add		a3=a3,n3	}	//	(p17) a3+=n3
{ .mfi;	(p42)	add		a3=a3,n3,1
	(p16)	xma.lu		alo[0]=ai0,bj[7],tf[1]
	(p16)	nop.i		0		};;
{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
	(p50)	add		t[6]=t[6],a3,1	};;
{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
	(p40)	cmp.ltu		p43,p41=a3,n3	}
{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
	(p16)	nop.i		0		};;
{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
	(p48)	cmp.ltu		p51,p49=t[6],a3
	(p50)	cmp.leu		p51,p49=t[6],a3	};;
	.pred.rel		"mutex",p41,p43
	.pred.rel		"mutex",p49,p51
{ .mfi;	(p16)	nop.m		0			// 4:
	(p16)	xma.hu		ahi[1]=ai1,bj[7],ahi[0]	//	ap[1]*b[i]
	(p41)	add		a4=a4,n4	}	//	(p17) a4+=n4
{ .mfi;	(p43)	add		a4=a4,n4,1
	(p16)	xma.lu		alo[1]=ai1,bj[7],ahi[0]
	(p16)	nop.i		0		};;
{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
	(p16)	xmpy.lu		mj[0]=alo[0],n0		//	(ap[0]*b[i]+t[0])*n0
	(p51)	add		t[5]=t[5],a4,1	};;
{ .mfi;	(p16)	nop.m		0			// 6:
	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
	(p41)	cmp.ltu		p42,p40=a4,n4	}
{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
	(p16)	nop.i		0		};;
{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
	(p49)	cmp.ltu		p50,p48=t[5],a4
	(p51)	cmp.leu		p50,p48=t[5],a4	};;
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mfi;	(p16)	nop.m		0			// 8:
	(p16)	xma.hu		ahi[2]=ai2,bj[7],ahi[1]	//	ap[2]*b[i]
	(p40)	add		a5=a5,n5	}	//	(p17) a5+=n5
{ .mfi;	(p42)	add		a5=a5,n5,1
	(p16)	xma.lu		alo[2]=ai2,bj[7],ahi[1]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	a1=alo[1]		// 9:
	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
	(p50)	add		t[4]=t[4],a5,1	};;
{ .mfi;	(p16)	nop.m		0			// 10:
	(p16)	xma.hu		nhi[0]=ni0,mj[0],alo[0]	//	np[0]*m0
	(p40)	cmp.ltu		p43,p41=a5,n5	}
{ .mfi;	(p42)	cmp.leu		p43,p41=a5,n5
	(p16)	xma.lu		nlo[0]=ni0,mj[0],alo[0]
	(p16)	nop.i		0		};;
{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
	(p48)	cmp.ltu		p51,p49=t[4],a5
	(p50)	cmp.leu		p51,p49=t[4],a5	};;
	.pred.rel		"mutex",p41,p43
	.pred.rel		"mutex",p49,p51
{ .mfi;	(p17)	getf.sig	n8=nhi[8]		// 12:
	(p16)	xma.hu		ahi[3]=ai3,bj[7],ahi[2]	//	ap[3]*b[i]
	(p41)	add		a6=a6,n6	}	//	(p17) a6+=n6
{ .mfi;	(p43)	add		a6=a6,n6,1
	(p16)	xma.lu		alo[3]=ai3,bj[7],ahi[2]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	a2=alo[2]		// 13:
	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
	(p51)	add		t[3]=t[3],a6,1	};;
{ .mfi;	(p16)	nop.m		0			// 14:
	(p16)	xma.hu		nhi[1]=ni1,mj[0],nhi[0]	//	np[1]*m0
	(p41)	cmp.ltu		p42,p40=a6,n6	}
{ .mfi;	(p43)	cmp.leu		p42,p40=a6,n6
	(p16)	xma.lu		nlo[1]=ni1,mj[0],nhi[0]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	nop.m		0			// 15:
	(p49)	cmp.ltu		p50,p48=t[3],a6
	(p51)	cmp.leu		p50,p48=t[3],a6	};;
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mfi;	(p16)	nop.m		0			// 16:
	(p16)	xma.hu		ahi[4]=ai4,bj[7],ahi[3]	//	ap[4]*b[i]
	(p40)	add		a7=a7,n7	}	//	(p17) a7+=n7
{ .mfi;	(p42)	add		a7=a7,n7,1
	(p16)	xma.lu		alo[4]=ai4,bj[7],ahi[3]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	a3=alo[3]		// 17:
	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
	(p50)	add		t[2]=t[2],a7,1	};;
{ .mfi;	(p16)	nop.m		0			// 18:
	(p16)	xma.hu		nhi[2]=ni2,mj[0],nhi[1]	//	np[2]*m0
	(p40)	cmp.ltu		p43,p41=a7,n7	}
{ .mfi;	(p42)	cmp.leu		p43,p41=a7,n7
	(p16)	xma.lu		nlo[2]=ni2,mj[0],nhi[1]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	n1=nlo[1]		// 19:
	(p48)	cmp.ltu		p51,p49=t[2],a7
	(p50)	cmp.leu		p51,p49=t[2],a7	};;
	.pred.rel		"mutex",p41,p43
	.pred.rel		"mutex",p49,p51
{ .mfi;	(p16)	nop.m		0			// 20:
	(p16)	xma.hu		ahi[5]=ai5,bj[7],ahi[4]	//	ap[5]*b[i]
	(p41)	add		a8=a8,n8	}	//	(p17) a8+=n8
{ .mfi;	(p43)	add		a8=a8,n8,1
	(p16)	xma.lu		alo[5]=ai5,bj[7],ahi[4]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	a4=alo[4]		// 21:
	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
	(p51)	add		t[1]=t[1],a8,1	};;
{ .mfi;	(p16)	nop.m		0			// 22:
	(p16)	xma.hu		nhi[3]=ni3,mj[0],nhi[2]	//	np[3]*m0
	(p41)	cmp.ltu		p42,p40=a8,n8	}
{ .mfi;	(p43)	cmp.leu		p42,p40=a8,n8
	(p16)	xma.lu		nlo[3]=ni3,mj[0],nhi[2]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	n2=nlo[2]		// 23:
	(p49)	cmp.ltu		p50,p48=t[1],a8
	(p51)	cmp.leu		p50,p48=t[1],a8	};;
{ .mfi;	(p16)	nop.m		0			// 24:
	(p16)	xma.hu		ahi[6]=ai6,bj[7],ahi[5]	//	ap[6]*b[i]
	(p16)	add		a1=a1,n1	}	//	(p16) a1+=n1
{ .mfi;	(p16)	nop.m		0
	(p16)	xma.lu		alo[6]=ai6,bj[7],ahi[5]
	(p17)	mov		t[0]=r0		};;
{ .mii;	(p16)	getf.sig	a5=alo[5]		// 25:
	(p16)	add		t0=t[7],a1		//	(p16) t[7]+=a1
	(p42)	add		t[0]=t[0],r0,1	};;
{ .mfi;	(p16)	setf.sig	tf[0]=t0		// 26:
	(p16)	xma.hu		nhi[4]=ni4,mj[0],nhi[3]	//	np[4]*m0
	(p50)	add		t[0]=t[0],r0,1	}
{ .mfi;	(p16)	cmp.ltu.unc	p42,p40=a1,n1
	(p16)	xma.lu		nlo[4]=ni4,mj[0],nhi[3]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	n3=nlo[3]		// 27:
	(p16)	cmp.ltu.unc	p50,p48=t0,a1
	(p16)	nop.i		0		};;
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mfi;	(p16)	nop.m		0			// 28:
	(p16)	xma.hu		ahi[7]=ai7,bj[7],ahi[6]	//	ap[7]*b[i]
	(p40)	add		a2=a2,n2	}	//	(p16) a2+=n2
{ .mfi;	(p42)	add		a2=a2,n2,1
	(p16)	xma.lu		alo[7]=ai7,bj[7],ahi[6]
	(p16)	nop.i		0		};;
{ .mii;	(p16)	getf.sig	a6=alo[6]		// 29:
	(p48)	add		t[6]=t[6],a2		//	(p16) t[6]+=a2
	(p50)	add		t[6]=t[6],a2,1	};;
{ .mfi;	(p16)	nop.m		0			// 30:
	(p16)	xma.hu		nhi[5]=ni5,mj[0],nhi[4]	//	np[5]*m0
	(p40)	cmp.ltu		p41,p39=a2,n2	}
{ .mfi;	(p42)	cmp.leu		p41,p39=a2,n2
	(p16)	xma.lu		nlo[5]=ni5,mj[0],nhi[4]
	(p16)	nop.i		0		};;
{ .mfi;	(p16)	getf.sig	n4=nlo[4]		// 31:
	(p16)	nop.f		0
	(p48)	cmp.ltu		p49,p47=t[6],a2	}
{ .mfb;	(p50)	cmp.leu		p49,p47=t[6],a2
	(p16)	nop.f		0
	br.ctop.sptk.many	.Louter_8_ctop	};;
.Louter_8_cend:

// above loop has to execute one more time, without (p16), which is
// replaced with merged move of np[8] to GPR bank
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mmi;	(p0)	getf.sig	n1=ni0			// 0:
	(p40)	add		a3=a3,n3		//	(p17) a3+=n3
	(p42)	add		a3=a3,n3,1	};;
{ .mii;	(p17)	getf.sig	a7=alo[8]		// 1:
	(p48)	add		t[6]=t[6],a3		//	(p17) t[6]+=a3
	(p50)	add		t[6]=t[6],a3,1	};;
{ .mfi;	(p17)	getf.sig	a8=ahi[8]		// 2:
	(p17)	xma.hu		nhi[7]=ni6,mj[1],nhi[6]	//	np[6]*m0
	(p40)	cmp.ltu		p43,p41=a3,n3	}
{ .mfi;	(p42)	cmp.leu		p43,p41=a3,n3
	(p17)	xma.lu		nlo[7]=ni6,mj[1],nhi[6]
	(p0)	nop.i		0		};;
{ .mii;	(p17)	getf.sig	n5=nlo[6]		// 3:
	(p48)	cmp.ltu		p51,p49=t[6],a3
	(p50)	cmp.leu		p51,p49=t[6],a3	};;
	.pred.rel		"mutex",p41,p43
	.pred.rel		"mutex",p49,p51
{ .mmi;	(p0)	getf.sig	n2=ni1			// 4:
	(p41)	add		a4=a4,n4		//	(p17) a4+=n4
	(p43)	add		a4=a4,n4,1	};;
{ .mfi;	(p49)	add		t[5]=t[5],a4		// 5:	(p17) t[5]+=a4
	(p0)	nop.f		0
	(p51)	add		t[5]=t[5],a4,1	};;
{ .mfi;	(p0)	getf.sig	n3=ni2			// 6:
	(p17)	xma.hu		nhi[8]=ni7,mj[1],nhi[7]	//	np[7]*m0
	(p41)	cmp.ltu		p42,p40=a4,n4	}
{ .mfi;	(p43)	cmp.leu		p42,p40=a4,n4
	(p17)	xma.lu		nlo[8]=ni7,mj[1],nhi[7]
	(p0)	nop.i		0		};;
{ .mii;	(p17)	getf.sig	n6=nlo[7]		// 7:
	(p49)	cmp.ltu		p50,p48=t[5],a4
	(p51)	cmp.leu		p50,p48=t[5],a4	};;
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mii;	(p0)	getf.sig	n4=ni3			// 8:
	(p40)	add		a5=a5,n5		//	(p17) a5+=n5
	(p42)	add		a5=a5,n5,1	};;
{ .mii;	(p0)	nop.m		0			// 9:
	(p48)	add		t[4]=t[4],a5		//	p(17) t[4]+=a5
	(p50)	add		t[4]=t[4],a5,1	};;
{ .mii;	(p0)	nop.m		0			// 10:
	(p40)	cmp.ltu		p43,p41=a5,n5
	(p42)	cmp.leu		p43,p41=a5,n5	};;
{ .mii;	(p17)	getf.sig	n7=nlo[8]		// 11:
	(p48)	cmp.ltu		p51,p49=t[4],a5
	(p50)	cmp.leu		p51,p49=t[4],a5	};;
	.pred.rel		"mutex",p41,p43
	.pred.rel		"mutex",p49,p51
{ .mii;	(p17)	getf.sig	n8=nhi[8]		// 12:
	(p41)	add		a6=a6,n6		//	(p17) a6+=n6
	(p43)	add		a6=a6,n6,1	};;
{ .mii;	(p0)	getf.sig	n5=ni4			// 13:
	(p49)	add		t[3]=t[3],a6		//	(p17) t[3]+=a6
	(p51)	add		t[3]=t[3],a6,1	};;
{ .mii;	(p0)	nop.m		0			// 14:
	(p41)	cmp.ltu		p42,p40=a6,n6
	(p43)	cmp.leu		p42,p40=a6,n6	};;
{ .mii;	(p0)	getf.sig	n6=ni5			// 15:
	(p49)	cmp.ltu		p50,p48=t[3],a6
	(p51)	cmp.leu		p50,p48=t[3],a6	};;
	.pred.rel		"mutex",p40,p42
	.pred.rel		"mutex",p48,p50
{ .mii;	(p0)	nop.m		0			// 16:
	(p40)	add		a7=a7,n7		//	(p17) a7+=n7
	(p42)	add		a7=a7,n7,1	};;
{ .mii;	(p0)	nop.m		0			// 17:
	(p48)	add		t[2]=t[2],a7		//	(p17) t[2]+=a7
	(p50)	add		t[2]=t[2],a7,1	};;
{ .mii;	(p0)	nop.m		0			// 18:
	(p40)	cmp.ltu		p43,p41=a7,n7
	(p42)	cmp.leu		p43,p41=a7,n7	};;
{ .mii;	(p0)	getf.sig	n7=ni6			// 19:
	(p48)	cmp.ltu		p51,p49=t[2],a7
	(p50)	cmp.leu		p51,p49=t[2],a7	};;
	.pred.rel		"mutex",p41,p43
	.pred.rel		"mutex",p49,p51
{ .mii;	(p0)	nop.m		0			// 20:
	(p41)	add		a8=a8,n8		//	(p17) a8+=n8
	(p43)	add		a8=a8,n8,1	};;
{ .mmi;	(p0)	nop.m		0			// 21:
	(p49)	add		t[1]=t[1],a8		//	(p17) t[1]+=a8
	(p51)	add		t[1]=t[1],a8,1	}
{ .mmi;	(p17)	mov		t[0]=r0
	(p41)	cmp.ltu		p42,p40=a8,n8
	(p43)	cmp.leu		p42,p40=a8,n8	};;
{ .mmi;	(p0)	getf.sig	n8=ni7			// 22:
	(p49)	cmp.ltu		p50,p48=t[1],a8
	(p51)	cmp.leu		p50,p48=t[1],a8	}
{ .mmi;	(p42)	add		t[0]=t[0],r0,1
	(p0)	add		r16=-7*16,prevsp
	(p0)	add		r17=-6*16,prevsp	};;

// subtract np[8] from carrybit|tmp[8]
// carrybit|tmp[8] layout upon exit from above loop is:
//	t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
{ .mmi;	(p50)add	t[0]=t[0],r0,1
	add		r18=-5*16,prevsp
	sub		n1=t0,n1	};;
{ .mmi;	cmp.gtu		p34,p32=n1,t0;;
	.pred.rel	"mutex",p32,p34
	(p32)sub	n2=t[7],n2
	(p34)sub	n2=t[7],n2,1	};;
{ .mii;	(p32)cmp.gtu	p35,p33=n2,t[7]
	(p34)cmp.geu	p35,p33=n2,t[7];;
	.pred.rel	"mutex",p33,p35
	(p33)sub	n3=t[6],n3	}
{ .mmi;	(p35)sub	n3=t[6],n3,1;;
	(p33)cmp.gtu	p34,p32=n3,t[6]
	(p35)cmp.geu	p34,p32=n3,t[6]	};;
	.pred.rel	"mutex",p32,p34
{ .mii;	(p32)sub	n4=t[5],n4
	(p34)sub	n4=t[5],n4,1;;
	(p32)cmp.gtu	p35,p33=n4,t[5]	}
{ .mmi;	(p34)cmp.geu	p35,p33=n4,t[5];;
	.pred.rel	"mutex",p33,p35
	(p33)sub	n5=t[4],n5
	(p35)sub	n5=t[4],n5,1	};;
{ .mii;	(p33)cmp.gtu	p34,p32=n5,t[4]
	(p35)cmp.geu	p34,p32=n5,t[4];;
	.pred.rel	"mutex",p32,p34
	(p32)sub	n6=t[3],n6	}
{ .mmi;	(p34)sub	n6=t[3],n6,1;;
	(p32)cmp.gtu	p35,p33=n6,t[3]
	(p34)cmp.geu	p35,p33=n6,t[3]	};;
	.pred.rel	"mutex",p33,p35
{ .mii;	(p33)sub	n7=t[2],n7
	(p35)sub	n7=t[2],n7,1;;
	(p33)cmp.gtu	p34,p32=n7,t[2]	}
{ .mmi;	(p35)cmp.geu	p34,p32=n7,t[2];;
	.pred.rel	"mutex",p32,p34
	(p32)sub	n8=t[1],n8
	(p34)sub	n8=t[1],n8,1	};;
{ .mii;	(p32)cmp.gtu	p35,p33=n8,t[1]
	(p34)cmp.geu	p35,p33=n8,t[1];;
	.pred.rel	"mutex",p33,p35
	(p33)sub	a8=t[0],r0	}
{ .mmi;	(p35)sub	a8=t[0],r0,1;;
	(p33)cmp.gtu	p34,p32=a8,t[0]
	(p35)cmp.geu	p34,p32=a8,t[0]	};;

// save the result, either tmp[num] or tmp[num]-np[num]
	.pred.rel	"mutex",p32,p34
{ .mmi;	(p32)st8	[rptr]=n1,8
	(p34)st8	[rptr]=t0,8
	add		r19=-4*16,prevsp};;
{ .mmb;	(p32)st8	[rptr]=n2,8
	(p34)st8	[rptr]=t[7],8
	(p5)br.cond.dpnt.few	.Ldone	};;
{ .mmb;	(p32)st8	[rptr]=n3,8
	(p34)st8	[rptr]=t[6],8
	(p7)br.cond.dpnt.few	.Ldone	};;
{ .mmb;	(p32)st8	[rptr]=n4,8
	(p34)st8	[rptr]=t[5],8
	(p9)br.cond.dpnt.few	.Ldone	};;
{ .mmb;	(p32)st8	[rptr]=n5,8
	(p34)st8	[rptr]=t[4],8
	(p11)br.cond.dpnt.few	.Ldone	};;
{ .mmb;	(p32)st8	[rptr]=n6,8
	(p34)st8	[rptr]=t[3],8
	(p13)br.cond.dpnt.few	.Ldone	};;
{ .mmb;	(p32)st8	[rptr]=n7,8
	(p34)st8	[rptr]=t[2],8
	(p15)br.cond.dpnt.few	.Ldone	};;
{ .mmb;	(p32)st8	[rptr]=n8,8
	(p34)st8	[rptr]=t[1],8
	nop.b		0		};;
.Ldone:						// epilogue
{ .mmi;	ldf.fill	f16=[r16],64
	ldf.fill	f17=[r17],64
	nop.i		0		}
{ .mmi;	ldf.fill	f18=[r18],64
	ldf.fill	f19=[r19],64
	mov		pr=prevpr,0x1ffff	};;
{ .mmi;	ldf.fill	f20=[r16]
	ldf.fill	f21=[r17]
	mov		ar.lc=prevlc	}
{ .mmi;	ldf.fill	f22=[r18]
	ldf.fill	f23=[r19]
	mov		ret0=1		}	// signal "handled"
{ .mib;	rum		1<<5
	.restore	sp
	mov		sp=prevsp
	br.ret.sptk.many	b0	};;
.endp	bn_mul_mont_8#

.type	copyright#,\@object
copyright:
stringz	"Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
___

# Redirect STDOUT to the requested file (three-argument open, so the name
# is never parsed for a mode) and fail loudly instead of silently printing
# the assembly somewhere unexpected.
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
# Buffered write errors only surface at close, so this check matters.
close STDOUT or die "error closing STDOUT: $!";