#! /usr/bin/env perl
# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The
# multiplication instruction issue rate is limited on the processor in
# question, which means that a dedicated squaring procedure is a must.
# In fact, all contemporary AArch64 processors seem to have a limited
# multiplication issue rate, i.e. they can't issue a multiplication
# every cycle, which explains the moderate improvement over
# compiler-generated code. Recall that the compiler is instructed to
# use umulh and therefore uses the same number of multiplication
# instructions to do the job. Assembly's edge is minimizing the number
# of "collateral" instructions and, of course, instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest for the longest key lengths, while other processors
# exhibit e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is
# ~25% faster on Cortex-A57 and ~60-100% faster on others.

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);

$code.=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont
.Lmul_mont:
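	// What follows is a word-serial Montgomery multiplication,
	// rp[] = ap[]*bp[]*2^(-64*num) mod np[], with n0 being the
	// usual -1/np[0] mod 2^64. As a rough sketch, treating the
	// num-word vectors as integers (an illustration only, not the
	// exact scheduling used below):
	//
	//	tp = 0
	//	for (i = 0; i < num; i++) {			// .Louter
	//		m = ((tp[0] + ap[0]*bp[i])*n0) mod 2^64
	//		tp = (tp + ap*bp[i] + np*m)/2^64	// exact
	//	}
	//	rp = tp < np ? tp : tp - np			// .Lsub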
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	On the removal of the first multiplication and addition
	//	instructions: the outcome of the first addition is
	//	guaranteed to be zero, because $m1 was chosen as
	//	$lo0*n0 with n0 = -1/np[0] mod 2^64. That leaves just
	//	one computationally significant outcome: whether or not
	//	the addition carries. And when does it carry? Following
	//	the operations shows that the condition is quite simple:
	//	whenever $lo0 is non-zero. The carry can therefore be
	//	calculated by adding -1 to $lo0, which is what the next
	//	instruction does.
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	str	xzr,[$tp,#-16]		// wipe tp
	str	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	str	xzr,[$tp,#-8]		// wipe tp
	str	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{
########################################################################
# The following is an ARMv8 adaptation of sqrx8x_mont from the
# x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
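	// Squaring uses roughly half the multiplications, because
	// a[i]*a[j] == a[j]*a[i]. The plan (an overview, not the
	// exact schedule): accumulate all cross products a[i]*a[j],
	// i<j, in t[] first, then double the whole vector with a
	// 1-bit left shift across words (extr in .Lsqr4x_shift_n_add)
	// while adding in the squares, and finally Montgomery-reduce
	// by 512 bits per iteration (.Lsqr8x_reduction), i.e.
	//
	//	a^2 = 2*sum(a[i]*a[j]*2^(64*(i+j)), i<j)
	//	    + sum(a[i]^2*2^(128*i))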
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	//                                                 a[1]a[0]	(i)
	//                                             a[2]a[0]
	//                                         a[3]a[0]
	//                                     a[4]a[0]
	//                                 a[5]a[0]
	//                             a[6]a[0]
	//                         a[7]a[0]
	//                                         a[2]a[1]		(ii)
	//                                     a[3]a[1]
	//                                 a[4]a[1]
	//                             a[5]a[1]
	//                         a[6]a[1]
	//                     a[7]a[1]
	//                                 a[3]a[2]			(iii)
	//                             a[4]a[2]
	//                         a[5]a[2]
	//                     a[6]a[2]
	//                 a[7]a[2]
	//                         a[4]a[3]				(iv)
	//                     a[5]a[3]
	//                 a[6]a[3]
	//             a[7]a[3]
	//                 a[5]a[4]					(v)
	//             a[6]a[4]
	//         a[7]a[4]
	//         a[6]a[5]						(vi)
	//     a[7]a[5]
	// a[7]a[6]							(vii)

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])		(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+hi(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])		(ii)
	adcs	$acc6,$acc6,$t3
	mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])		(iii)
	adcs	$acc7,$acc7,$t3
	mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])		(iv)
	adcs	$acc0,$acc0,$t1
	mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])		(v)
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])		(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	mul	$t2,$a7,$a6		// lo(a[7]*a[6])		(vii)
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewound ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
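	// The loop below multiplies the eight-word window held in
	// $a0-$a7 by one scalar per iteration (fetched from [$rp,$cnt])
	// and accumulates into $acc0-7, with the carry bit
	// modulo-scheduled across iterations. .Lsqr8x_tail further
	// down reuses the identical body for the reduction, with n[]
	// words and the saved t[0]*n0 factors instead.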
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldr	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
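	// All scalars for the current window have been consumed.
	// Reload the next eight a[] words; if this was the last
	// window, resume .Lsqr8x_outer_loop with the accumulators as
	// they are, otherwise first exchange them with the t[] words
	// saved for this position.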
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

.Lsqr8x_reduction:
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldr	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
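	// One n[] window has been fully processed; the t[0]*n0
	// factors that .Lsqr8x_reduction put aside are replayed
	// against each remaining n[] window in turn, until
	// .Lsqr8x_tail_break is reached.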
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewound np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldr	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	mov	$tp,sp
	add	$ap,sp,$num
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
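	// The copy below selects between the subtracted and the
	// original value with csel keyed on the borrow, instead of
	// branching on it, and wipes the t[] vector behind itself so
	// that no intermediate result is left on the stack.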
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	stp	xzr,xzr,[$ap,#8*0]
	stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	.inst	0xd50323bf		// autiasp
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it's different in the sense that it
# performs reduction 256 bits at a time.

my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my  $bp_end=$rp;
my  ($carry,$topmost) = ($rp,"x30");

$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
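	// Four b[] words are processed per pass, and reduction is
	// likewise performed 256 bits at a time; the four t[0]*n0
	// factors of a pass are put aside on the stack and replayed
	// by the tail loops. One pass, roughly (an illustration, not
	// the exact schedule):
	//
	//	for (i = 0; i < 4; i++) {	// .Loop_mul4x_1st_reduction
	//		t[] += a[0..3]*b[i]
	//		m = (t[0]*n0) mod 2^64, put m aside
	//		t[] = (t[] + n[0..3]*m)/2^64	// low word vanishes
	//	}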
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail
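	// b[0..3] have now been multiplied through this a[]/n[]
	// window; keep feeding .Loop_mul4x_1st_tail until a[] is
	// exhausted, then advance to the next four b[] words at
	// .Lmul4x_proceed.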

	sub	$t1,$ap_end,$num	// rewound $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewound np?
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction, so we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	mov	$tp,sp
	add	$ap,sp,#8*4
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	ldp	$acc0,$acc1,[$ap,#8*0]
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
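	// Same branch-free finish as in the squaring path: csel keyed
	// on the borrow picks the reduced result, and the temporary
	// vector on the stack is zeroed as it is consumed.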
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-3 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	.inst	0xd50323bf		// autiasp
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print $code;

close STDOUT or die "error closing STDOUT: $!";
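########################################################################
# A note on driving this script (an illustration, not part of the
# OpenSSL build system proper): it expects a perlasm flavour and an
# output file name, and pipes the generated code through arm-xlate.pl
# located as resolved above, e.g.
#
#	perl armv8-mont.pl linux64 armv8-mont.S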