#! /usr/bin/env perl
# Copyright 2015-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The multiplication
# instruction issue rate is limited on the processor in question, meaning
# that a dedicated squaring procedure is a must. Well, actually all
# contemporary AArch64 processors seem to have a limited multiplication
# issue rate, i.e. they can't issue a multiplication every cycle, which
# explains the moderate improvement coefficients in comparison to
# compiler-generated code. Recall that the compiler is instructed to use
# umulh and therefore uses the same number of multiplication instructions
# to do the job. Assembly's edge is minimizing the number of "collateral"
# instructions and, of course, instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60% depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest on the longest key lengths, while others exhibit e.g.
# 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
# on Cortex-A57 and ~60-100% faster on others.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

($lo0,$hi0,$aj,$m0,$alo,$ahi,
 $lo1,$hi1,$nj,$m1,$nlo,$nhi,
 $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);

# int bn_mul_mont(
$rp="x0";	# BN_ULONG *rp,
$ap="x1";	# const BN_ULONG *ap,
$bp="x2";	# const BN_ULONG *bp,
$np="x3";	# const BN_ULONG *np,
$n0="x4";	# const BN_ULONG *n0,
$num="x5";	# int num);

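# Editor's sketch (not part of the original): the scalar code below
# computes rp[] = ap[]*bp[]*2^(-64*num) mod np[], with n0 such that
# n0 == -np[0]^-1 mod 2^64, consuming one word of bp[] per outer pass,
# roughly:
#
#	for (i = 0; i < num; i++) {			// .Louter
#		m1 = (tp[0] + ap[0]*bp[i]) * n0;	// mod 2^64
#		// tp + ap*bp[i] + np*m1 has a zero low word by
#		// construction of m1, so only the shifted-down
#		// words are kept:
#		tp = (tp + ap*bp[i] + np*m1) >> 64;	// .L1st/.Linner
#	}
#	// 0 <= tp < 2*np here; one branchless conditional subtraction
#	// of np (.Lsub + .Lcond_copy) gives the result.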
$code.=<<___;
#ifndef	__KERNEL__
# include "arm_arch.h"
.extern	OPENSSL_armv8_rsa_neonized
.hidden	OPENSSL_armv8_rsa_neonized
#endif
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
.Lbn_mul_mont:
	tst	$num,#3
	b.ne	.Lmul_mont
	cmp	$num,#32
	b.le	.Lscalar_impl
#ifndef	__KERNEL__
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon
#endif

.Lscalar_impl:
	tst	$num,#7
	b.eq	__bn_sqr8x_mont
	tst	$num,#3
	b.eq	__bn_mul4x_mont

.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	$m0,[$bp],#8		// bp[0]
	sub	$tp,sp,$num,lsl#3
	ldp	$hi0,$aj,[$ap],#16	// ap[0..1]
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	and	$tp,$tp,#-16		// ABI says so
	ldp	$hi1,$nj,[$np],#16	// np[0..1]

	mul	$lo0,$hi0,$m0		// ap[0]*bp[0]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	mul	$alo,$aj,$m0		// ap[1]*bp[0]
	umulh	$ahi,$aj,$m0

	mul	$m1,$lo0,$n0		// "tp[0]"*n0
	mov	sp,$tp			// alloca

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0	// discarded
	// (*)	As for the removal of the first multiplication and
	//	addition instructions: the outcome of the first addition
	//	is guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or it doesn't. The
	//	question is, when does it carry? Is there an alternative
	//	way to deduce it? If you follow the operations, you can
	//	observe that the condition for carry is quite simple:
	//	$lo0 being non-zero. So the carry can be calculated by
	//	adding -1 to $lo0. That's what the next instruction does.
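	//	Editor's note, spelling it out: the discarded sum is
	//	lo(np[0]*m1) + $lo0, which is 0 mod 2^64 by construction
	//	of m1, so it is exactly 2^64 (carry set) whenever
	//	$lo0 != 0 and exactly 0 (carry clear) when $lo0 == 0.
	//	"subs xzr,$lo0,#1" sets C under the same condition,
	//	because only $lo0 == 0 borrows from the subtraction.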
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	adc	$hi1,$hi1,xzr
	cbz	$j,.L1st_skip

.L1st:
	ldr	$aj,[$ap],#8
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	ldr	$nj,[$np],#8
	adds	$lo1,$nlo,$hi1
	mul	$alo,$aj,$m0		// ap[j]*bp[0]
	adc	$hi1,$nhi,xzr
	umulh	$ahi,$aj,$m0

	adds	$lo1,$lo1,$lo0
	mul	$nlo,$nj,$m1		// np[j]*m1
	adc	$hi1,$hi1,xzr
	umulh	$nhi,$nj,$m1
	str	$lo1,[$tp],#8		// tp[j-1]
	cbnz	$j,.L1st

.L1st_skip:
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adc	$hi1,$nhi,xzr

	adds	$lo1,$lo1,$lo0
	sub	$i,$num,#8		// i=num-1
	adcs	$hi1,$hi1,$hi0

	adc	$ovf,xzr,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp]

.Louter:
	ldr	$m0,[$bp],#8		// bp[i]
	ldp	$hi0,$aj,[$ap],#16
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8

	mul	$lo0,$hi0,$m0		// ap[0]*bp[i]
	sub	$j,$num,#16		// j=num-2
	umulh	$hi0,$hi0,$m0
	ldp	$hi1,$nj,[$np],#16
	mul	$alo,$aj,$m0		// ap[1]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$m1,$lo0,$n0
	sub	$i,$i,#8		// i--

	// (*)	mul	$lo1,$hi1,$m1	// np[0]*m1
	umulh	$hi1,$hi1,$m1
	mul	$nlo,$nj,$m1		// np[1]*m1
	// (*)	adds	$lo1,$lo1,$lo0
	subs	xzr,$lo0,#1		// (*)
	umulh	$nhi,$nj,$m1
	cbz	$j,.Linner_skip

.Linner:
	ldr	$aj,[$ap],#8
	adc	$hi1,$hi1,xzr
	ldr	$tj,[$tp],#8		// tp[j]
	adds	$lo0,$alo,$hi0
	sub	$j,$j,#8		// j--
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	ldr	$nj,[$np],#8
	adc	$hi1,$nhi,xzr

	mul	$alo,$aj,$m0		// ap[j]*bp[i]
	adds	$lo0,$lo0,$tj
	umulh	$ahi,$aj,$m0
	adc	$hi0,$hi0,xzr

	mul	$nlo,$nj,$m1		// np[j]*m1
	adds	$lo1,$lo1,$lo0
	umulh	$nhi,$nj,$m1
	stur	$lo1,[$tp,#-16]		// tp[j-1]
	cbnz	$j,.Linner

.Linner_skip:
	ldr	$tj,[$tp],#8		// tp[j]
	adc	$hi1,$hi1,xzr
	adds	$lo0,$alo,$hi0
	sub	$ap,$ap,$num		// rewind $ap
	adc	$hi0,$ahi,xzr

	adds	$lo1,$nlo,$hi1
	sub	$np,$np,$num		// rewind $np
	adcs	$hi1,$nhi,$ovf
	adc	$ovf,xzr,xzr

	adds	$lo0,$lo0,$tj
	adc	$hi0,$hi0,xzr

	adds	$lo1,$lo1,$lo0
	adcs	$hi1,$hi1,$hi0
	adc	$ovf,$ovf,xzr		// upmost overflow bit
	stp	$lo1,$hi1,[$tp,#-16]

	cbnz	$i,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$nj,[$np],#8		// np[0]
	subs	$j,$num,#8		// j=num-1 and clear borrow
	mov	$ap,$rp
.Lsub:
	sbcs	$aj,$tj,$nj		// tp[j]-np[j]
	ldr	$tj,[$tp],#8
	sub	$j,$j,#8		// j--
	ldr	$nj,[$np],#8
	str	$aj,[$ap],#8		// rp[j]=tp[j]-np[j]
	cbnz	$j,.Lsub

	sbcs	$aj,$tj,$nj
	sbcs	$ovf,$ovf,xzr		// did it borrow?
	str	$aj,[$ap],#8		// rp[num-1]

	ldr	$tj,[sp]		// tp[0]
	add	$tp,sp,#8
	ldr	$aj,[$rp],#8		// rp[0]
	sub	$num,$num,#8		// num--
	nop
.Lcond_copy:
	sub	$num,$num,#8		// num--
	csel	$nj,$tj,$aj,lo		// did it borrow?
	ldr	$tj,[$tp],#8
	ldr	$aj,[$rp],#8
	stur	xzr,[$tp,#-16]		// wipe tp
	stur	$nj,[$rp,#-16]
	cbnz	$num,.Lcond_copy

	csel	$nj,$tj,$aj,lo
	stur	xzr,[$tp,#-8]		// wipe tp
	stur	$nj,[$rp,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
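# Editor's summary of the NEON path below: the operands are kept
# "smashed", i.e. each 32-bit word is split into its 16-bit halves
# sitting in 32-bit lanes (uxtl), partial products are accumulated with
# umlal into 64-bit lanes, and carries are propagated 16 bits at a time
# (the shl/ushr #16 pairs).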
{
my ($A0,$A1,$N0,$N1)=map("v$_",(0..3));
my ($Z,$Temp)=("v4.16b","v5");
my @ACC=map("v$_",(6..13));
my ($Bi,$Ni,$M0)=map("v$_",(28..30));
my $sBi="s28";
my $sM0="s30";
my $zero="v14";
my $temp="v15";
my $ACCTemp="v16";

my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11));

$code.=<<___;
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	stp	x29,x30,[sp,#-80]!
	mov	x16,sp
	stp	d8,d9,[sp,#16]
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	lsl	$num,$num,#1
	eor	$zero.16b,$zero.16b,$zero.16b

.align	4
.LNEON_8n:
	eor	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b
	sub	$toutptr,sp,#128
	eor	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b
	sub	$toutptr,$toutptr,$num,lsl#4
	eor	@ACC[2].16b,@ACC[2].16b,@ACC[2].16b
	and	$toutptr,$toutptr,#-64
	eor	@ACC[3].16b,@ACC[3].16b,@ACC[3].16b
	mov	sp,$toutptr		// alloca
	eor	@ACC[4].16b,@ACC[4].16b,@ACC[4].16b
	add	$toutptr,$toutptr,#256
	eor	@ACC[5].16b,@ACC[5].16b,@ACC[5].16b
	sub	$inner,$num,#8
	eor	@ACC[6].16b,@ACC[6].16b,@ACC[6].16b
	eor	@ACC[7].16b,@ACC[7].16b,@ACC[7].16b

.LNEON_8n_init:
	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
	subs	$inner,$inner,#8
	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
	st1	{@ACC[6].2d,@ACC[7].2d},[$toutptr],#32
	bne	.LNEON_8n_init

	add	$tinptr,sp,#256
	ld1	{$A0.4s,$A1.4s},[$aptr],#32
	add	$bnptr,sp,#8
	ldr	$sM0,[$n0],#4
	mov	$outer,$num
	b	.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	ldr	$sBi,[$bptr],#4		// *b++
	uxtl	$Bi.4s,$Bi.4h
	add	$toutptr,sp,#128
	ld1	{$N0.4s,$N1.4s},[$nptr],#32

	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	shl	$Ni.2d,@ACC[0].2d,#16
	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	add	$Ni.2d,$Ni.2d,@ACC[0].2d
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	mul	$Ni.2s,$Ni.2s,$M0.2s
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	st1	{$Bi.2s},[sp]		// put aside smashed b[8*i+0]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	uxtl	$Ni.4s,$Ni.4h
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
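# Seven more rounds of the same pattern follow; push(@ACC,shift(@ACC))
# rotates the accumulator list so that @ACC[0] always names the vector
# holding the lowest live limb (editor's note).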
for ($i=0; $i<7;) {
$code.=<<___;
	ldr	$sBi,[$bptr],#4		// *b++
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	uxtl	$Bi.4s,$Bi.4h
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	ushr	$temp.2d,@ACC[0].2d,#16
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	ushr	@ACC[0].2d,@ACC[0].2d,#16
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
	add	$ACCTemp.2d,@ACC[1].2d,@ACC[0].2d
	ins	@ACC[1].d[0],$ACCTemp.d[0]
	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
___
	push(@ACC,shift(@ACC));	$i++;
$code.=<<___;
	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1	{@ACC[7].2d},[$tinptr],#16
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	shl	$Ni.2d,@ACC[0].2d,#16
	ext	$Ni.16b,$Ni.16b,$Ni.16b,#8
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	add	$Ni.2d,$Ni.2d,@ACC[0].2d
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	mul	$Ni.2s,$Ni.2s,$M0.2s
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	st1	{$Bi.2s},[$bnptr],#8	// put aside smashed b[8*i+$i]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	uxtl	$Ni.4s,$Ni.4h
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	ld1	{$A0.4s,$A1.4s},[$aptr],#32
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	mov	$Temp.16b,@ACC[0].16b
	ushr	$Temp.2d,$Temp.2d,#16
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	add	@ACC[0].2d,@ACC[0].2d,$Temp.2d
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	ushr	@ACC[0].2d,@ACC[0].2d,#16
	eor	$temp.16b,$temp.16b,$temp.16b
	ins	@ACC[0].d[1],$temp.d[0]
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
	add	@ACC[1].2d,@ACC[1].2d,@ACC[0].2d
	st1	{$Ni.2s},[$bnptr],#8	// put aside smashed m[8*i+$i]
	add	$bnptr,sp,#8		// rewind
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	sub	$inner,$num,#8
	b	.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs	$inner,$inner,#8
	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1	{@ACC[7].2d},[$tinptr]
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+0]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	ld1	{$N0.4s,$N1.4s},[$nptr],#32
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	b.eq	.LInner_jump
	add	$tinptr,$tinptr,#16	// don't advance in last iteration
.LInner_jump:
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	ld1	{$Bi.2s},[$bnptr],#8	// pull smashed b[8*i+$i]
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]
	st1	{@ACC[0].2d},[$toutptr],#16
___
	push(@ACC,shift(@ACC));
$code.=<<___;
	umlal	@ACC[0].2d,$Bi.2s,$A0.s[0]
	ld1	{@ACC[7].2d},[$tinptr]
	umlal	@ACC[1].2d,$Bi.2s,$A0.s[1]
	ld1	{$Ni.2s},[$bnptr],#8	// pull smashed m[8*i+$i]
	umlal	@ACC[2].2d,$Bi.2s,$A0.s[2]
	b.eq	.LInner_jump$i
	add	$tinptr,$tinptr,#16	// don't advance in last iteration
.LInner_jump$i:
	umlal	@ACC[3].2d,$Bi.2s,$A0.s[3]
	umlal	@ACC[4].2d,$Bi.2s,$A1.s[0]
	umlal	@ACC[5].2d,$Bi.2s,$A1.s[1]
	umlal	@ACC[6].2d,$Bi.2s,$A1.s[2]
	umlal	@ACC[7].2d,$Bi.2s,$A1.s[3]
___
}
$code.=<<___;
	b.ne	.LInner_after_rewind$i
	sub	$aptr,$aptr,$num,lsl#2	// rewind
.LInner_after_rewind$i:
	umlal	@ACC[0].2d,$Ni.2s,$N0.s[0]
	ld1	{$Bi.2s},[sp]		// pull smashed b[8*i+0]
	umlal	@ACC[1].2d,$Ni.2s,$N0.s[1]
	ld1	{$A0.4s,$A1.4s},[$aptr],#32
	umlal	@ACC[2].2d,$Ni.2s,$N0.s[2]
	add	$bnptr,sp,#8		// rewind
	umlal	@ACC[3].2d,$Ni.2s,$N0.s[3]
	umlal	@ACC[4].2d,$Ni.2s,$N1.s[0]
	umlal	@ACC[5].2d,$Ni.2s,$N1.s[1]
	umlal	@ACC[6].2d,$Ni.2s,$N1.s[2]
	st1	{@ACC[0].2d},[$toutptr],#16
	umlal	@ACC[7].2d,$Ni.2s,$N1.s[3]

	bne	.LNEON_8n_inner
___
	push(@ACC,shift(@ACC));
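# Editor's summary: the code below flushes the accumulators to the
# temporary area, then either rewinds $nptr for another outer pass or
# falls into the tail, which propagates carries 16 bits at a time and
# emits one 32-bit result word per step.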
$code.=<<___;
	add	$tinptr,sp,#128
	st1	{@ACC[0].2d,@ACC[1].2d},[$toutptr],#32
	eor	$N0.16b,$N0.16b,$N0.16b	// $N0
	st1	{@ACC[2].2d,@ACC[3].2d},[$toutptr],#32
	eor	$N1.16b,$N1.16b,$N1.16b	// $N1
	st1	{@ACC[4].2d,@ACC[5].2d},[$toutptr],#32
	st1	{@ACC[6].2d},[$toutptr]

	subs	$outer,$outer,#8
	ld1	{@ACC[0].2d,@ACC[1].2d},[$tinptr],#32
	ld1	{@ACC[2].2d,@ACC[3].2d},[$tinptr],#32
	ld1	{@ACC[4].2d,@ACC[5].2d},[$tinptr],#32
	ld1	{@ACC[6].2d,@ACC[7].2d},[$tinptr],#32

	b.eq	.LInner_8n_jump_2steps
	sub	$nptr,$nptr,$num,lsl#2	// rewind
	b	.LNEON_8n_outer

.LInner_8n_jump_2steps:
	add	$toutptr,sp,#128
	st1	{$N0.2d,$N1.2d}, [sp],#32	// start wiping stack frame
	mov	$Temp.16b,@ACC[0].16b
	ushr	$temp.2d,@ACC[0].2d,#16
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	st1	{$N0.2d,$N1.2d}, [sp],#32
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	st1	{$N0.2d,$N1.2d}, [sp],#32
	ushr	$temp.2d,@ACC[0].2d,#16
	st1	{$N0.2d,$N1.2d}, [sp],#32
	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
	ins	$temp.d[1],$zero.d[0]

	mov	$inner,$num
	b	.LNEON_tail_entry

.align	4
.LNEON_tail:
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	mov	$Temp.16b,@ACC[0].16b
	ushr	$temp.2d,@ACC[0].2d,#16
	ext	@ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8
	ld1	{@ACC[2].2d,@ACC[3].2d}, [$tinptr],#32
	add	@ACC[0].2d,@ACC[0].2d,$temp.2d
	ld1	{@ACC[4].2d,@ACC[5].2d}, [$tinptr],#32
	ushr	$temp.2d,@ACC[0].2d,#16
	ld1	{@ACC[6].2d,@ACC[7].2d}, [$tinptr],#32
	zip1	@ACC[0].4h,$Temp.4h,@ACC[0].4h
	ins	$temp.d[1],$zero.d[0]

.LNEON_tail_entry:
___
for ($i=1; $i<8; $i++) {
$code.=<<___;
	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
	st1	{@ACC[0].s}[0], [$toutptr],#4
	ushr	$temp.2d,@ACC[1].2d,#16
	mov	$Temp.16b,@ACC[1].16b
	ext	@ACC[1].16b,@ACC[1].16b,@ACC[1].16b,#8
	add	@ACC[1].2d,@ACC[1].2d,$temp.2d
	ushr	$temp.2d,@ACC[1].2d,#16
	zip1	@ACC[1].4h,$Temp.4h,@ACC[1].4h
	ins	$temp.d[1],$zero.d[0]
___
	push(@ACC,shift(@ACC));
}
	push(@ACC,shift(@ACC));
$code.=<<___;
	ld1	{@ACC[0].2d,@ACC[1].2d}, [$tinptr],#32
	subs	$inner,$inner,#8
	st1	{@ACC[7].s}[0], [$toutptr],#4
	bne	.LNEON_tail

	st1	{$temp.s}[0], [$toutptr],#4	// top-most bit
	sub	$nptr,$nptr,$num,lsl#2		// rewind $nptr
	subs	$aptr,sp,#0			// clear carry flag
	add	$bptr,sp,$num,lsl#2

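	// Final step, mirroring the scalar comment above (editor's
	// note): subtract the modulus with 32-bit limbs; the borrow
	// decides below whether the difference or the original value
	// is kept.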
.LNEON_sub:
	ldp	w4,w5,[$aptr],#8
	ldp	w6,w7,[$aptr],#8
	ldp	w8,w9,[$nptr],#8
	ldp	w10,w11,[$nptr],#8
	sbcs	w8,w4,w8
	sbcs	w9,w5,w9
	sbcs	w10,w6,w10
	sbcs	w11,w7,w11
	sub	x17,$bptr,$aptr
	stp	w8,w9,[$rptr],#8
	stp	w10,w11,[$rptr],#8
	cbnz	x17,.LNEON_sub

	ldr	w10, [$aptr]		// load top-most bit
	mov	x11,sp
	eor	v0.16b,v0.16b,v0.16b
	sub	x11,$bptr,x11		// this is num*4
	eor	v1.16b,v1.16b,v1.16b
	mov	$aptr,sp
	sub	$rptr,$rptr,x11		// rewind $rptr
	mov	$nptr,$bptr		// second 3/4th of frame
	sbcs	w10,w10,wzr		// result is carry flag

.LNEON_copy_n_zap:
	ldp	w4,w5,[$aptr],#8
	ldp	w6,w7,[$aptr],#8
	ldp	w8,w9,[$rptr],#8
	ldp	w10,w11,[$rptr]
	sub	$rptr,$rptr,#8
	b.cs	.LCopy_1
	mov	w8,w4
	mov	w9,w5
	mov	w10,w6
	mov	w11,w7
.LCopy_1:
	st1	{v0.2d,v1.2d}, [$nptr],#32	// wipe
	st1	{v0.2d,v1.2d}, [$nptr],#32	// wipe
	ldp	w4,w5,[$aptr],#8
	ldp	w6,w7,[$aptr],#8
	stp	w8,w9,[$rptr],#8
	stp	w10,w11,[$rptr],#8
	sub	$aptr,$aptr,#32
	ldp	w8,w9,[$rptr],#8
	ldp	w10,w11,[$rptr]
	sub	$rptr,$rptr,#8
	b.cs	.LCopy_2
	mov	w8, w4
	mov	w9, w5
	mov	w10, w6
	mov	w11, w7
.LCopy_2:
	st1	{v0.2d,v1.2d}, [$aptr],#32	// wipe
	st1	{v0.2d,v1.2d}, [$nptr],#32	// wipe
	sub	x17,$bptr,$aptr		// preserves carry
	stp	w8,w9,[$rptr],#8
	stp	w10,w11,[$rptr],#8
	cbnz	x17,.LNEON_copy_n_zap

	mov	sp,x16
	ldp	d14,d15,[sp,#64]
	ldp	d12,d13,[sp,#48]
	ldp	d10,d11,[sp,#32]
	ldp	d8,d9,[sp,#16]
	ldr	x29,[sp],#80
	ret				// bx lr

.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
___
}
{
########################################################################
# Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	$ap,$bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	$rp,$np,[sp,#96]	// offload rp and np

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	ldp	$a4,$a5,[$ap,#8*4]
	ldp	$a6,$a7,[$ap,#8*6]

	sub	$tp,sp,$num,lsl#4
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	mov	sp,$tp			// alloca
	sub	$cnt,$num,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	$cnt,$cnt,#8*8
	stp	xzr,xzr,[$tp,#8*0]
	stp	xzr,xzr,[$tp,#8*2]
	stp	xzr,xzr,[$tp,#8*4]
	stp	xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[$tp,#8*8]
	stp	xzr,xzr,[$tp,#8*10]
	stp	xzr,xzr,[$tp,#8*12]
	stp	xzr,xzr,[$tp,#8*14]
	add	$tp,$tp,#8*16
	cbnz	$cnt,.Lsqr8x_zero

	add	$ap_end,$ap,$num
	add	$ap,$ap,#8*8
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	mov	$acc4,xzr
	mov	$acc5,xzr
	mov	$acc6,xzr
	mov	$acc7,xzr
	mov	$tp,sp
	str	$n0,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
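	// (editor's note: only the off-diagonal products a[i]*a[j],
	// i<j, are accumulated here; .Lsqr8x_outer_break doubles them
	// and adds the squares a[i]*a[i] on top)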
.align	4
.Lsqr8x_outer_loop:
	// a[1]a[0]	(i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1]	(ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2]	(iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3]	(iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4]	(v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5]	(vi)
	// a[7]a[5]
	// a[7]a[6]	(vii)

	mul	$t0,$a1,$a0		// lo(a[1..7]*a[0])	(i)
	mul	$t1,$a2,$a0
	mul	$t2,$a3,$a0
	mul	$t3,$a4,$a0
	adds	$acc1,$acc1,$t0		// t[1]+lo(a[1]*a[0])
	mul	$t0,$a5,$a0
	adcs	$acc2,$acc2,$t1
	mul	$t1,$a6,$a0
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a7,$a0
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a1,$a0		// hi(a[1..7]*a[0])
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a0
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a0
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a0
	stp	$acc0,$acc1,[$tp],#8*2	// t[0..1]
	adc	$acc0,xzr,xzr		// t[8]
	adds	$acc2,$acc2,$t3		// t[2]+lo(a[1]*a[0])
	umulh	$t3,$a5,$a0
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a0
	adcs	$acc4,$acc4,$t1
	umulh	$t1,$a7,$a0
	adcs	$acc5,$acc5,$t2
	mul	$t2,$a2,$a1		// lo(a[2..7]*a[1])	(ii)
	adcs	$acc6,$acc6,$t3
	mul	$t3,$a3,$a1
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a1
	adc	$acc0,$acc0,$t1

	mul	$t1,$a5,$a1
	adds	$acc3,$acc3,$t2
	mul	$t2,$a6,$a1
	adcs	$acc4,$acc4,$t3
	mul	$t3,$a7,$a1
	adcs	$acc5,$acc5,$t0
	umulh	$t0,$a2,$a1		// hi(a[2..7]*a[1])
	adcs	$acc6,$acc6,$t1
	umulh	$t1,$a3,$a1
	adcs	$acc7,$acc7,$t2
	umulh	$t2,$a4,$a1
	adcs	$acc0,$acc0,$t3
	umulh	$t3,$a5,$a1
	stp	$acc2,$acc3,[$tp],#8*2	// t[2..3]
	adc	$acc1,xzr,xzr		// t[9]
	adds	$acc4,$acc4,$t0
	umulh	$t0,$a6,$a1
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a7,$a1
	adcs	$acc6,$acc6,$t2
	mul	$t2,$a3,$a2		// lo(a[3..7]*a[2])	(iii)
	adcs	$acc7,$acc7,$t3
	mul	$t3,$a4,$a2
	adcs	$acc0,$acc0,$t0
	mul	$t0,$a5,$a2
	adc	$acc1,$acc1,$t1

	mul	$t1,$a6,$a2
	adds	$acc5,$acc5,$t2
	mul	$t2,$a7,$a2
	adcs	$acc6,$acc6,$t3
	umulh	$t3,$a3,$a2		// hi(a[3..7]*a[2])
	adcs	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a2
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a2
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a2
	stp	$acc4,$acc5,[$tp],#8*2	// t[4..5]
	adc	$acc2,xzr,xzr		// t[10]
	adds	$acc6,$acc6,$t3
	umulh	$t3,$a7,$a2
	adcs	$acc7,$acc7,$t0
	mul	$t0,$a4,$a3		// lo(a[4..7]*a[3])	(iv)
	adcs	$acc0,$acc0,$t1
	mul	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	mul	$t2,$a6,$a3
	adc	$acc2,$acc2,$t3

	mul	$t3,$a7,$a3
	adds	$acc7,$acc7,$t0
	umulh	$t0,$a4,$a3		// hi(a[4..7]*a[3])
	adcs	$acc0,$acc0,$t1
	umulh	$t1,$a5,$a3
	adcs	$acc1,$acc1,$t2
	umulh	$t2,$a6,$a3
	adcs	$acc2,$acc2,$t3
	umulh	$t3,$a7,$a3
	stp	$acc6,$acc7,[$tp],#8*2	// t[6..7]
	adc	$acc3,xzr,xzr		// t[11]
	adds	$acc0,$acc0,$t0
	mul	$t0,$a5,$a4		// lo(a[5..7]*a[4])	(v)
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a6,$a4
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a7,$a4
	adc	$acc3,$acc3,$t3

	umulh	$t3,$a5,$a4		// hi(a[5..7]*a[4])
	adds	$acc1,$acc1,$t0
	umulh	$t0,$a6,$a4
	adcs	$acc2,$acc2,$t1
	umulh	$t1,$a7,$a4
	adcs	$acc3,$acc3,$t2
	mul	$t2,$a6,$a5		// lo(a[6..7]*a[5])	(vi)
	adc	$acc4,xzr,xzr		// t[12]
	adds	$acc2,$acc2,$t3
	mul	$t3,$a7,$a5
	adcs	$acc3,$acc3,$t0
	umulh	$t0,$a6,$a5		// hi(a[6..7]*a[5])
	adc	$acc4,$acc4,$t1

	umulh	$t1,$a7,$a5
	adds	$acc3,$acc3,$t2
	mul	$t2,$a7,$a6		// lo(a[7]*a[6])	(vii)
	adcs	$acc4,$acc4,$t3
	umulh	$t3,$a7,$a6		// hi(a[7]*a[6])
	adc	$acc5,xzr,xzr		// t[13]
	adds	$acc4,$acc4,$t0
	sub	$cnt,$ap_end,$ap	// done yet?
	adc	$acc5,$acc5,$t1

	adds	$acc5,$acc5,$t2
	sub	$t0,$ap_end,$num	// rewinded ap
	adc	$acc6,xzr,xzr		// t[14]
	add	$acc6,$acc6,$t3

	cbz	$cnt,.Lsqr8x_outer_break

	mov	$n0,$a0
	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$rp,$ap
	adcs	$acc7,xzr,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved below
	mov	$cnt,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
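	// (editor's note: the indices above are hexadecimal, i.e. a[a]
	// stands for a[10] and a[f] for a[15]; the dotted lines stand
	// for the products in between)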
.Lsqr8x_mul:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	$ap,$ap_end		// done yet?
	b.eq	.Lsqr8x_break

	ldp	$a0,$a1,[$tp,#8*0]
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	adds	$acc0,$acc0,$a0
	ldur	$n0,[$rp,#-8*8]
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$ap,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$ap,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$ap,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$ap,#8*6]
	add	$ap,$ap,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	$a0,$a1,[$rp,#8*0]
	add	$ap,$rp,#8*8
	ldp	$a2,$a3,[$rp,#8*2]
	sub	$t0,$ap_end,$ap		// is it last iteration?
	ldp	$a4,$a5,[$rp,#8*4]
	sub	$t1,$tp,$t0
	ldp	$a6,$a7,[$rp,#8*6]
	cbz	$t0,.Lsqr8x_outer_loop

	stp	$acc0,$acc1,[$tp,#8*0]
	ldp	$acc0,$acc1,[$t1,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$t1,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$t1,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$t1
	ldp	$acc6,$acc7,[$t1,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
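	// (editor's note: the doubling below is a multi-limb shift left
	// by one bit: "extr $t1,$t2,$t1,#63" forms $t2<<1|$t1>>63, the
	// next limb of 2*t[], while mul/umulh supply the squares)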
	ldp	$a1,$a3,[$t0,#8*0]	// recall that $t0 is &a[0]
	ldp	$t1,$t2,[sp,#8*1]
	ldp	$a5,$a7,[$t0,#8*2]
	add	$ap,$t0,#8*4
	ldp	$t3,$t0,[sp,#8*3]

	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$acc0,$a1,$a1
	stp	$acc2,$acc3,[$tp,#8*2]
	umulh	$a1,$a1,$a1
	stp	$acc4,$acc5,[$tp,#8*4]
	mul	$a2,$a3,$a3
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,sp
	umulh	$a3,$a3,$a3
	adds	$acc1,$a1,$t1,lsl#1
	extr	$t1,$t2,$t1,#63
	sub	$cnt,$num,#8*4

.Lsqr4x_shift_n_add:
	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	sub	$cnt,$cnt,#8*4
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	ldp	$a1,$a3,[$ap],#8*2
	umulh	$a5,$a5,$a5
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	extr	$t3,$t0,$t3,#63
	stp	$acc0,$acc1,[$tp,#8*0]
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	stp	$acc2,$acc3,[$tp,#8*2]
	adcs	$acc5,$a5,$t0
	ldp	$t3,$t0,[$tp,#8*7]
	extr	$t1,$t2,$t1,#63
	adcs	$acc6,$a6,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc7,$a7,$t2
	ldp	$t1,$t2,[$tp,#8*9]
	mul	$a0,$a1,$a1
	ldp	$a5,$a7,[$ap],#8*2
	umulh	$a1,$a1,$a1
	mul	$a2,$a3,$a3
	umulh	$a3,$a3,$a3
	stp	$acc4,$acc5,[$tp,#8*4]
	extr	$t3,$t0,$t3,#63
	stp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	adcs	$acc0,$a0,$t3
	extr	$t0,$t1,$t0,#63
	adcs	$acc1,$a1,$t0
	ldp	$t3,$t0,[$tp,#8*3]
	extr	$t1,$t2,$t1,#63
	cbnz	$cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
	ldp	$np,$n0,[x29,#104]	// pull np and n0

	adcs	$acc2,$a2,$t1
	extr	$t2,$t3,$t2,#63
	adcs	$acc3,$a3,$t2
	ldp	$t1,$t2,[$tp,#8*5]
	mul	$a4,$a5,$a5
	umulh	$a5,$a5,$a5
	stp	$acc0,$acc1,[$tp,#8*0]
	mul	$a6,$a7,$a7
	umulh	$a7,$a7,$a7
	stp	$acc2,$acc3,[$tp,#8*2]
	extr	$t3,$t0,$t3,#63
	adcs	$acc4,$a4,$t3
	extr	$t0,$t1,$t0,#63
	ldp	$acc0,$acc1,[sp,#8*0]
	adcs	$acc5,$a5,$t0
	extr	$t1,$t2,$t1,#63
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc6,$a6,$t1
	extr	$t2,xzr,$t2,#63
	ldp	$a2,$a3,[$np,#8*2]
	adc	$acc7,$a7,$t2
	ldp	$a4,$a5,[$np,#8*4]

	// Reduce by 512 bits per iteration
	mul	$na0,$n0,$acc0		// t[0]*n0
	ldp	$a6,$a7,[$np,#8*6]
	add	$np_end,$np,$num
	ldp	$acc2,$acc3,[sp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[sp,#8*4]
	stp	$acc6,$acc7,[$tp,#8*6]
	ldp	$acc6,$acc7,[sp,#8*6]
	add	$np,$np,#8*8
	mov	$topmost,xzr		// initial top-most carry
	mov	$tp,sp
	mov	$cnt,#8

.Lsqr8x_reduction:
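	// (editor's note: the (*) lines below reuse the carry trick
	// from bn_mul_mont: the discarded "adds xzr,$acc0,$t0" would
	// carry exactly when $acc0 is non-zero, which
	// "subs xzr,$acc0,#1" reproduces)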
	// (*)	mul	$t0,$a0,$na0	// lo(n[0-7])*lo(t[0]*n0)
	mul	$t1,$a1,$na0
	sub	$cnt,$cnt,#1
	mul	$t2,$a2,$na0
	str	$na0,[$tp],#8		// put aside t[0]*n0 for tail processing
	mul	$t3,$a3,$na0
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	mul	$t0,$a4,$na0
	adcs	$acc0,$acc1,$t1
	mul	$t1,$a5,$na0
	adcs	$acc1,$acc2,$t2
	mul	$t2,$a6,$na0
	adcs	$acc2,$acc3,$t3
	mul	$t3,$a7,$na0
	adcs	$acc3,$acc4,$t0
	umulh	$t0,$a0,$na0		// hi(n[0-7])*lo(t[0]*n0)
	adcs	$acc4,$acc5,$t1
	umulh	$t1,$a1,$na0
	adcs	$acc5,$acc6,$t2
	umulh	$t2,$a2,$na0
	adcs	$acc6,$acc7,$t3
	umulh	$t3,$a3,$na0
	adc	$acc7,xzr,xzr
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a4,$na0
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a5,$na0
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a6,$na0
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a7,$na0
	mul	$na0,$n0,$acc0		// next t[0]*n0
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adc	$acc7,$acc7,$t3
	cbnz	$cnt,.Lsqr8x_reduction

	ldp	$t0,$t1,[$tp,#8*0]
	ldp	$t2,$t3,[$tp,#8*2]
	mov	$rp,$tp
	sub	$cnt,$np_end,$np	// done yet?
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	ldp	$t0,$t1,[$tp,#8*4]
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	ldp	$t2,$t3,[$tp,#8*6]
	adcs	$acc4,$acc4,$t0
	adcs	$acc5,$acc5,$t1
	adcs	$acc6,$acc6,$t2
	adcs	$acc7,$acc7,$t3
	//adc	$carry,xzr,xzr		// moved below
	cbz	$cnt,.Lsqr8x8_post_condition

	ldur	$n0,[$tp,#-8*8]
	ldp	$a0,$a1,[$np,#8*0]
	ldp	$a2,$a3,[$np,#8*2]
	ldp	$a4,$a5,[$np,#8*4]
	mov	$cnt,#-8*8
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8

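	// (editor's summary: the tail below finishes each reduction
	// window by multiplying the remaining modulus limbs with the
	// t[0]*n0 factors that were put aside above)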
.Lsqr8x_tail:
	mul	$t0,$a0,$n0
	adc	$carry,xzr,xzr		// carry bit, modulo-scheduled
	mul	$t1,$a1,$n0
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$n0
	mul	$t3,$a3,$n0
	adds	$acc0,$acc0,$t0
	mul	$t0,$a4,$n0
	adcs	$acc1,$acc1,$t1
	mul	$t1,$a5,$n0
	adcs	$acc2,$acc2,$t2
	mul	$t2,$a6,$n0
	adcs	$acc3,$acc3,$t3
	mul	$t3,$a7,$n0
	adcs	$acc4,$acc4,$t0
	umulh	$t0,$a0,$n0
	adcs	$acc5,$acc5,$t1
	umulh	$t1,$a1,$n0
	adcs	$acc6,$acc6,$t2
	umulh	$t2,$a2,$n0
	adcs	$acc7,$acc7,$t3
	umulh	$t3,$a3,$n0
	adc	$carry,$carry,xzr
	str	$acc0,[$tp],#8
	adds	$acc0,$acc1,$t0
	umulh	$t0,$a4,$n0
	adcs	$acc1,$acc2,$t1
	umulh	$t1,$a5,$n0
	adcs	$acc2,$acc3,$t2
	umulh	$t2,$a6,$n0
	adcs	$acc3,$acc4,$t3
	umulh	$t3,$a7,$n0
	ldr	$n0,[$rp,$cnt]
	adcs	$acc4,$acc5,$t0
	adcs	$acc5,$acc6,$t1
	adcs	$acc6,$acc7,$t2
	adcs	$acc7,$carry,$t3
	//adc	$carry,xzr,xzr		// moved above
	cbnz	$cnt,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	$a0,$a1,[$tp,#8*0]
	sub	$cnt,$np_end,$np	// done yet?
	sub	$t2,$np_end,$num	// rewinded np
	ldp	$a2,$a3,[$tp,#8*2]
	ldp	$a4,$a5,[$tp,#8*4]
	ldp	$a6,$a7,[$tp,#8*6]
	cbz	$cnt,.Lsqr8x_tail_break

	ldur	$n0,[$rp,#-8*8]
	adds	$acc0,$acc0,$a0
	adcs	$acc1,$acc1,$a1
	ldp	$a0,$a1,[$np,#8*0]
	adcs	$acc2,$acc2,$a2
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$np,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$np,#8*4]
	adcs	$acc6,$acc6,$a6
	mov	$cnt,#-8*8
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	//adc	$carry,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	$n0,[x29,#112]		// pull n0
	add	$cnt,$tp,#8*8		// end of current t[num] window

	subs	xzr,$topmost,#1		// "move" top-most carry to carry bit
	adcs	$t0,$acc0,$a0
	adcs	$t1,$acc1,$a1
	ldp	$acc0,$acc1,[$rp,#8*0]
	adcs	$acc2,$acc2,$a2
	ldp	$a0,$a1,[$t2,#8*0]	// recall that $t2 is &n[0]
	adcs	$acc3,$acc3,$a3
	ldp	$a2,$a3,[$t2,#8*2]
	adcs	$acc4,$acc4,$a4
	adcs	$acc5,$acc5,$a5
	ldp	$a4,$a5,[$t2,#8*4]
	adcs	$acc6,$acc6,$a6
	adcs	$acc7,$acc7,$a7
	ldp	$a6,$a7,[$t2,#8*6]
	add	$np,$t2,#8*8
	adc	$topmost,xzr,xzr	// top-most carry
	mul	$na0,$n0,$acc0
	stp	$t0,$t1,[$tp,#8*0]
	stp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc2,$acc3,[$rp,#8*2]
	stp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc4,$acc5,[$rp,#8*4]
	cmp	$cnt,x29		// did we hit the bottom?
	stp	$acc6,$acc7,[$tp,#8*6]
	mov	$tp,$rp			// slide the window
	ldp	$acc6,$acc7,[$rp,#8*6]
	mov	$cnt,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	$rp,[x29,#96]		// pull rp
	add	$tp,$tp,#8*8
	subs	$t0,$acc0,$a0
	sbcs	$t1,$acc1,$a1
	sub	$cnt,$num,#8*8
	mov	$ap_end,$rp		// $rp copy

.Lsqr8x_sub:
	sbcs	$t2,$acc2,$a2
	ldp	$a0,$a1,[$np,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$np,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$a4,$a5,[$np,#8*4]
	sbcs	$t3,$acc7,$a7
	ldp	$a6,$a7,[$np,#8*6]
	add	$np,$np,#8*8
	ldp	$acc0,$acc1,[$tp,#8*0]
	sub	$cnt,$cnt,#8*8
	ldp	$acc2,$acc3,[$tp,#8*2]
	ldp	$acc4,$acc5,[$tp,#8*4]
	ldp	$acc6,$acc7,[$tp,#8*6]
	add	$tp,$tp,#8*8
	stp	$t0,$t1,[$rp,#8*4]
	sbcs	$t0,$acc0,$a0
	stp	$t2,$t3,[$rp,#8*6]
	add	$rp,$rp,#8*8
	sbcs	$t1,$acc1,$a1
	cbnz	$cnt,.Lsqr8x_sub

	sbcs	$t2,$acc2,$a2
	mov	$tp,sp
	add	$ap,sp,$num
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$a3
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc4,$a4
	ldp	$a2,$a3,[$ap_end,#8*2]
	sbcs	$t1,$acc5,$a5
	stp	$t2,$t3,[$rp,#8*2]
	sbcs	$t2,$acc6,$a6
	ldp	$acc0,$acc1,[$ap,#8*0]
	sbcs	$t3,$acc7,$a7
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	$t0,$t1,[$rp,#8*4]
	stp	$t2,$t3,[$rp,#8*6]

	sub	$cnt,$num,#8*4
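	// (editor's note: condition "lo" below means the subtraction
	// borrowed, i.e. the result was smaller than the modulus, so
	// csel keeps the original value; the stack copy is wiped with
	// zeros as it is consumed)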
.Lsqr4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	stp	xzr,xzr,[$ap,#8*0]
	stp	xzr,xzr,[$ap,#8*2]
	cbnz	$cnt,.Lsqr4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	csel	$t3,$acc3,$a3,lo
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	$carry,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// $acc0-7,$carry hold result, $a0-7 hold modulus
	subs	$a0,$acc0,$a0
	ldr	$ap,[x29,#96]		// pull rp
	sbcs	$a1,$acc1,$a1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$a2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$a3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	$a4,$acc4,$a4
	stp	xzr,xzr,[sp,#8*6]
	sbcs	$a5,$acc5,$a5
	stp	xzr,xzr,[sp,#8*8]
	sbcs	$a6,$acc6,$a6
	stp	xzr,xzr,[sp,#8*10]
	sbcs	$a7,$acc7,$a7
	stp	xzr,xzr,[sp,#8*12]
	sbcs	$carry,$carry,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// $a0-7 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	csel	$a4,$acc4,$a4,lo
	csel	$a5,$acc5,$a5,lo
	stp	$a2,$a3,[$ap,#8*2]
	csel	$a6,$acc6,$a6,lo
	csel	$a7,$acc7,$a7,lo
	stp	$a4,$a5,[$ap,#8*4]
	stp	$a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	.inst	0xd50323bf		// autiasp
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it is different in the sense that it
# performs reduction 256 bits at a time.

my ($a0,$a1,$a2,$a3,
    $t0,$t1,$t2,$t3,
    $m0,$m1,$m2,$m3,
    $acc0,$acc1,$acc2,$acc3,$acc4,
    $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my $bp_end=$rp;
my ($carry,$topmost) = ($rp,"x30");

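# Note that $carry aliases $rp (x0): the real rp is offloaded to the
# frame at [x29,#96] on entry and pulled again in .Loop_mul4x_break
# (editor's note).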
$code.=<<___;
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	$tp,sp,$num,lsl#3
	lsl	$num,$num,#3
	ldr	$n0,[$n0]		// *n0
	sub	sp,$tp,#8*4		// alloca

	add	$t0,$bp,$num
	add	$ap_end,$ap,$num
	stp	$rp,$t0,[x29,#96]	// offload rp and &b[num]

	ldr	$bi,[$bp,#8*0]		// b[0]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	mov	$acc0,xzr
	mov	$acc1,xzr
	mov	$acc2,xzr
	mov	$acc3,xzr
	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$cnt,#0
	mov	$tp,sp

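	// Editor's note on the loop control below: $cnt steps through
	// 8,16,24,0 ("and $cnt,$cnt,#31" wraps it after four words),
	// so "ldr $bi,[$bp,$cnt]" fetches the next b[i] within the
	// window and reloads b[0] on the wrap.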
.Loop_mul4x_1st_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[0])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[0])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi	// lo(n[0..3]*t[0]*n0)
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	sub	$t0,$ap_end,$ap
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_reduction

	cbz	$t0,.Lmul4x4_post_condition

	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldr	$mi,[sp]		// a[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.Loop_mul4x_1st_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[i])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[i])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i] (or b[0])
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*a[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*a[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	adcs	$acc4,$acc4,$carry
	umulh	$t3,$m3,$mi
	adc	$carry,xzr,xzr
	ldr	$mi,[sp,$cnt]		// next t[0]*n0
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_1st_tail

	sub	$t1,$ap_end,$num	// rewinded $ap
	cbz	$t0,.Lmul4x_proceed

	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	$bi,[$bp,#8*4]!		// *++b
	adc	$topmost,$carry,xzr
	ldp	$a0,$a1,[$t1,#8*0]	// a[0..3]
	sub	$np,$np,$num		// rewind np
	ldp	$a2,$a3,[$t1,#8*2]
	add	$ap,$t1,#8*4

	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	ldp	$acc2,$acc3,[sp,#8*6]

	ldp	$m0,$m1,[$np,#8*0]	// n[0..3]
	mov	$tp,sp
	ldp	$m2,$m3,[$np,#8*2]
	adds	$np,$np,#8*4		// clear carry bit
	mov	$carry,xzr

.align	4
.Loop_mul4x_reduction:
	mul	$t0,$a0,$bi		// lo(a[0..3]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[0..3]*b[4])
	adcs	$acc1,$acc1,$t1
	mul	$mi,$acc0,$n0		// t[0]*n0
	adcs	$acc2,$acc2,$t2
	umulh	$t1,$a1,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t2,$a2,$bi
	adc	$acc4,xzr,xzr
	umulh	$t3,$a3,$bi
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	// (*)	mul	$t0,$m0,$mi
	str	$mi,[$tp],#8		// put aside t[0]*n0 for tail processing
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi		// lo(n[0..3]*t[0]*n0)
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	// (*)	adds	xzr,$acc0,$t0
	subs	xzr,$acc0,#1		// (*)
	umulh	$t0,$m0,$mi		// hi(n[0..3]*t[0]*n0)
	adcs	$acc0,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc1,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc2,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc3,$acc4,$carry
	adc	$carry,xzr,xzr
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_reduction

	adc	$carry,$carry,xzr
	ldp	$t0,$t1,[$tp,#8*4]	// t[4..7]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]	// a[4..7]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr

	ldr	$mi,[sp]		// t[0]*n0
	ldp	$m0,$m1,[$np,#8*0]	// n[4..7]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4

.align	4
.Loop_mul4x_tail:
	mul	$t0,$a0,$bi		// lo(a[4..7]*b[4])
	adc	$carry,$carry,xzr	// modulo-scheduled
	mul	$t1,$a1,$bi
	add	$cnt,$cnt,#8
	mul	$t2,$a2,$bi
	and	$cnt,$cnt,#31
	mul	$t3,$a3,$bi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$a0,$bi		// hi(a[4..7]*b[4])
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$a1,$bi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$a2,$bi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$a3,$bi
	adc	$acc4,xzr,xzr
	ldr	$bi,[$bp,$cnt]		// next b[i]
	adds	$acc1,$acc1,$t0
	mul	$t0,$m0,$mi		// lo(n[4..7]*t[0]*n0)
	adcs	$acc2,$acc2,$t1
	mul	$t1,$m1,$mi
	adcs	$acc3,$acc3,$t2
	mul	$t2,$m2,$mi
	adc	$acc4,$acc4,$t3		// can't overflow
	mul	$t3,$m3,$mi
	adds	$acc0,$acc0,$t0
	umulh	$t0,$m0,$mi		// hi(n[4..7]*t[0]*n0)
	adcs	$acc1,$acc1,$t1
	umulh	$t1,$m1,$mi
	adcs	$acc2,$acc2,$t2
	umulh	$t2,$m2,$mi
	adcs	$acc3,$acc3,$t3
	umulh	$t3,$m3,$mi
	adcs	$acc4,$acc4,$carry
	ldr	$mi,[sp,$cnt]		// next a[0]*n0
	adc	$carry,xzr,xzr
	str	$acc0,[$tp],#8		// result!!!
	adds	$acc0,$acc1,$t0
	sub	$t0,$ap_end,$ap		// done yet?
	adcs	$acc1,$acc2,$t1
	adcs	$acc2,$acc3,$t2
	adcs	$acc3,$acc4,$t3
	//adc	$carry,$carry,xzr
	cbnz	$cnt,.Loop_mul4x_tail

	sub	$t1,$np,$num		// rewinded np?
	adc	$carry,$carry,xzr
	cbz	$t0,.Loop_mul4x_break

	ldp	$t0,$t1,[$tp,#8*4]
	ldp	$t2,$t3,[$tp,#8*6]
	ldp	$a0,$a1,[$ap,#8*0]
	ldp	$a2,$a3,[$ap,#8*2]
	add	$ap,$ap,#8*4
	adds	$acc0,$acc0,$t0
	adcs	$acc1,$acc1,$t1
	adcs	$acc2,$acc2,$t2
	adcs	$acc3,$acc3,$t3
	//adc	$carry,$carry,xzr
	ldp	$m0,$m1,[$np,#8*0]
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	b	.Loop_mul4x_tail

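	// Editor's summary: out of the tail loop, fold the accumulated
	// top-most carry into the window, advance b by four words, and
	// either start the next pass or fall through to the final
	// subtraction.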
.align	4
.Loop_mul4x_break:
	ldp	$t2,$t3,[x29,#96]	// pull rp and &b[num]
	adds	$acc0,$acc0,$topmost
	add	$bp,$bp,#8*4		// bp++
	adcs	$acc1,$acc1,xzr
	sub	$ap,$ap,$num		// rewind ap
	adcs	$acc2,$acc2,xzr
	stp	$acc0,$acc1,[$tp,#8*0]	// result!!!
	adcs	$acc3,$acc3,xzr
	ldp	$acc0,$acc1,[sp,#8*4]	// t[0..3]
	adc	$topmost,$carry,xzr
	stp	$acc2,$acc3,[$tp,#8*2]	// result!!!
	cmp	$bp,$t3			// done yet?
	ldp	$acc2,$acc3,[sp,#8*6]
	ldp	$m0,$m1,[$t1,#8*0]	// n[0..3]
	ldp	$m2,$m3,[$t1,#8*2]
	add	$np,$t1,#8*4
	b.eq	.Lmul4x_post

	ldr	$bi,[$bp]
	ldp	$a0,$a1,[$ap,#8*0]	// a[0..3]
	ldp	$a2,$a3,[$ap,#8*2]
	adds	$ap,$ap,#8*4		// clear carry bit
	mov	$carry,xzr
	mov	$tp,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	$rp,$t2
	mov	$ap_end,$t2		// $rp copy
	subs	$t0,$acc0,$m0
	add	$tp,sp,#8*8
	sbcs	$t1,$acc1,$m1
	sub	$cnt,$num,#8*4

.Lmul4x_sub:
	sbcs	$t2,$acc2,$m2
	ldp	$m0,$m1,[$np,#8*0]
	sub	$cnt,$cnt,#8*4
	ldp	$acc0,$acc1,[$tp,#8*0]
	sbcs	$t3,$acc3,$m3
	ldp	$m2,$m3,[$np,#8*2]
	add	$np,$np,#8*4
	ldp	$acc2,$acc3,[$tp,#8*2]
	add	$tp,$tp,#8*4
	stp	$t0,$t1,[$rp,#8*0]
	sbcs	$t0,$acc0,$m0
	stp	$t2,$t3,[$rp,#8*2]
	add	$rp,$rp,#8*4
	sbcs	$t1,$acc1,$m1
	cbnz	$cnt,.Lmul4x_sub

	sbcs	$t2,$acc2,$m2
	mov	$tp,sp
	add	$ap,sp,#8*4
	ldp	$a0,$a1,[$ap_end,#8*0]
	sbcs	$t3,$acc3,$m3
	stp	$t0,$t1,[$rp,#8*0]
	ldp	$a2,$a3,[$ap_end,#8*2]
	stp	$t2,$t3,[$rp,#8*2]
	ldp	$acc0,$acc1,[$ap,#8*0]
	ldp	$acc2,$acc3,[$ap,#8*2]
	sbcs	xzr,$topmost,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	$cnt,$num,#8*4
.Lmul4x_cond_copy:
	sub	$cnt,$cnt,#8*4
	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	ldp	$a0,$a1,[$ap_end,#8*4]
	ldp	$acc0,$acc1,[$ap,#8*4]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*2]
	add	$tp,$tp,#8*4
	csel	$t3,$acc3,$a3,lo
	ldp	$a2,$a3,[$ap_end,#8*6]
	ldp	$acc2,$acc3,[$ap,#8*6]
	add	$ap,$ap,#8*4
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]
	add	$ap_end,$ap_end,#8*4
	cbnz	$cnt,.Lmul4x_cond_copy

	csel	$t0,$acc0,$a0,lo
	stp	xzr,xzr,[$tp,#8*0]
	csel	$t1,$acc1,$a1,lo
	stp	xzr,xzr,[$tp,#8*2]
	csel	$t2,$acc2,$a2,lo
	stp	xzr,xzr,[$tp,#8*3]
	csel	$t3,$acc3,$a3,lo
	stp	xzr,xzr,[$tp,#8*4]
	stp	$t0,$t1,[$ap_end,#8*0]
	stp	$t2,$t3,[$ap_end,#8*2]

	b	.Lmul4x_done

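	// Editor's note: this point is reached only when num == 4; the
	// whole result still lives in registers, so subtract the
	// modulus and select in place without a copy loop.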
.align	4
.Lmul4x4_post_condition:
	adc	$carry,$carry,xzr
	ldr	$ap,[x29,#96]		// pull rp
	// $acc0-3,$carry hold result, $m0-3 hold modulus
	subs	$a0,$acc0,$m0
	ldr	x30,[x29,#8]		// pull return address
	sbcs	$a1,$acc1,$m1
	stp	xzr,xzr,[sp,#8*0]
	sbcs	$a2,$acc2,$m2
	stp	xzr,xzr,[sp,#8*2]
	sbcs	$a3,$acc3,$m3
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,$carry,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// $a0-3 hold result-modulus
	csel	$a0,$acc0,$a0,lo
	csel	$a1,$acc1,$a1,lo
	csel	$a2,$acc2,$a2,lo
	csel	$a3,$acc3,$a3,lo
	stp	$a0,$a1,[$ap,#8*0]
	stp	$a2,$a3,[$ap,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	.inst	0xd50323bf		// autiasp
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

print $code;

close STDOUT or die "error closing STDOUT: $!";