// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl bn_mul_mont
.hidden bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst x5,#7
	b.eq __bn_sqr8x_mont
	tst x5,#3
	b.eq __bn_mul4x_mont
.Lmul_mont:
	stp x29,x30,[sp,#-64]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]

	ldr x9,[x2],#8 // bp[0]
	sub x22,sp,x5,lsl#3
	ldp x7,x8,[x1],#16 // ap[0..1]
	lsl x5,x5,#3
	ldr x4,[x4] // *n0
	and x22,x22,#-16 // ABI says so
	ldp x13,x14,[x3],#16 // np[0..1]

	mul x6,x7,x9 // ap[0]*bp[0]
	sub x21,x5,#16 // j=num-2
	umulh x7,x7,x9
	mul x10,x8,x9 // ap[1]*bp[0]
	umulh x11,x8,x9

	mul x15,x6,x4 // "tp[0]"*n0
	mov sp,x22 // alloca

	// (*) mul x12,x13,x15 // np[0]*m1
	umulh x13,x13,x15
	mul x16,x14,x15 // np[1]*m1
	// (*) adds x12,x12,x6 // discarded
	// (*) Regarding the removal of the first multiplication and
	// addition instructions: the outcome of the first addition is
	// guaranteed to be zero, which leaves two computationally
	// significant outcomes: it either carries or it doesn't. The
	// question then is: when does it carry, and is there an
	// alternative way to deduce it? If you follow the operations,
	// you can see that the condition for carry is quite simple:
	// x6 being non-zero. The carry can therefore be calculated
	// by adding -1 to x6, which is what the next instruction does.
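	// To spell the trick out: the discarded addition is
	// x6 + lo(np[0]*m1), which is guaranteed to be 0 mod 2^64,
	// so lo(np[0]*m1) == -x6 mod 2^64. The discarded sum is thus
	// either 0 + 0 = 0 (x6 == 0, no carry) or x6 + (2^64 - x6)
	// = 2^64 (x6 != 0, carry out). "subs xzr,x6,#1" sets the
	// AArch64 carry flag to (x6 >= 1), i.e. (x6 != 0), which is
	// exactly that carry.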
	subs xzr,x6,#1 // (*)
	umulh x17,x14,x15
	adc x13,x13,xzr
	cbz x21,.L1st_skip

.L1st:
	ldr x8,[x1],#8
	adds x6,x10,x7
	sub x21,x21,#8 // j--
	adc x7,x11,xzr

	ldr x14,[x3],#8
	adds x12,x16,x13
	mul x10,x8,x9 // ap[j]*bp[0]
	adc x13,x17,xzr
	umulh x11,x8,x9

	adds x12,x12,x6
	mul x16,x14,x15 // np[j]*m1
	adc x13,x13,xzr
	umulh x17,x14,x15
	str x12,[x22],#8 // tp[j-1]
	cbnz x21,.L1st

.L1st_skip:
	adds x6,x10,x7
	sub x1,x1,x5 // rewind x1
	adc x7,x11,xzr

	adds x12,x16,x13
	sub x3,x3,x5 // rewind x3
	adc x13,x17,xzr

	adds x12,x12,x6
	sub x20,x5,#8 // i=num-1
	adcs x13,x13,x7

	adc x19,xzr,xzr // top-most overflow bit
	stp x12,x13,[x22]

.Louter:
	ldr x9,[x2],#8 // bp[i]
	ldp x7,x8,[x1],#16
	ldr x23,[sp] // tp[0]
	add x22,sp,#8

	mul x6,x7,x9 // ap[0]*bp[i]
	sub x21,x5,#16 // j=num-2
	umulh x7,x7,x9
	ldp x13,x14,[x3],#16
	mul x10,x8,x9 // ap[1]*bp[i]
	adds x6,x6,x23
	umulh x11,x8,x9
	adc x7,x7,xzr

	mul x15,x6,x4
	sub x20,x20,#8 // i--

	// (*) mul x12,x13,x15 // np[0]*m1
	umulh x13,x13,x15
	mul x16,x14,x15 // np[1]*m1
	// (*) adds x12,x12,x6
	subs xzr,x6,#1 // (*)
	umulh x17,x14,x15
	cbz x21,.Linner_skip

.Linner:
	ldr x8,[x1],#8
	adc x13,x13,xzr
	ldr x23,[x22],#8 // tp[j]
	adds x6,x10,x7
	sub x21,x21,#8 // j--
	adc x7,x11,xzr

	adds x12,x16,x13
	ldr x14,[x3],#8
	adc x13,x17,xzr

	mul x10,x8,x9 // ap[j]*bp[i]
	adds x6,x6,x23
	umulh x11,x8,x9
	adc x7,x7,xzr

	mul x16,x14,x15 // np[j]*m1
	adds x12,x12,x6
	umulh x17,x14,x15
	str x12,[x22,#-16] // tp[j-1]
	cbnz x21,.Linner

.Linner_skip:
	ldr x23,[x22],#8 // tp[j]
	adc x13,x13,xzr
	adds x6,x10,x7
	sub x1,x1,x5 // rewind x1
	adc x7,x11,xzr

	adds x12,x16,x13
	sub x3,x3,x5 // rewind x3
	adcs x13,x17,x19
	adc x19,xzr,xzr

	adds x6,x6,x23
	adc x7,x7,xzr

	adds x12,x12,x6
	adcs x13,x13,x7
	adc x19,x19,xzr // top-most overflow bit
	stp x12,x13,[x22,#-16]

	cbnz x20,.Louter

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus unconditionally,
	// check whether that borrowed, and conditionally copy back the
	// original value.
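	// Roughly, in C, with "top" being the overflow bit in x19 and
	// sub_borrow() a hypothetical helper standing in for the sbcs
	// chain below:
	//	borrow = 0;
	//	for (j = 0; j < num; j++)
	//		rp[j] = sub_borrow(tp[j], np[j], &borrow);
	//	(void)sub_borrow(top, 0, &borrow);	// overall borrow?
	//	for (j = 0; j < num; j++) {	// select result, wipe tp
	//		rp[j] = borrow ? tp[j] : rp[j];
	//		tp[j] = 0;
	//	}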
	ldr x23,[sp] // tp[0]
	add x22,sp,#8
	ldr x14,[x3],#8 // np[0]
	subs x21,x5,#8 // j=num-1 and clear borrow
	mov x1,x0
.Lsub:
	sbcs x8,x23,x14 // tp[j]-np[j]
	ldr x23,[x22],#8
	sub x21,x21,#8 // j--
	ldr x14,[x3],#8
	str x8,[x1],#8 // rp[j]=tp[j]-np[j]
	cbnz x21,.Lsub

	sbcs x8,x23,x14
	sbcs x19,x19,xzr // did it borrow?
	str x8,[x1],#8 // rp[num-1]

	ldr x23,[sp] // tp[0]
	add x22,sp,#8
	ldr x8,[x0],#8 // rp[0]
	sub x5,x5,#8 // num--
	nop
.Lcond_copy:
	sub x5,x5,#8 // num--
	csel x14,x23,x8,lo // did it borrow?
	ldr x23,[x22],#8
	ldr x8,[x0],#8
	str xzr,[x22,#-16] // wipe tp
	str x14,[x0,#-16]
	cbnz x5,.Lcond_copy

	csel x14,x23,x8,lo
	str xzr,[x22,#-8] // wipe tp
	str x14,[x0,#-8]

	ldp x19,x20,[x29,#16]
	mov sp,x29
	ldp x21,x22,[x29,#32]
	mov x0,#1
	ldp x23,x24,[x29,#48]
	ldr x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size bn_mul_mont,.-bn_mul_mont
.type __bn_sqr8x_mont,%function
.align 5
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont
	// is jumped to only from bn_mul_mont, which has already signed the
	// return address.
	cmp x1,x2
	b.ne __bn_mul4x_mont
.Lsqr8x_mont:
	stp x29,x30,[sp,#-128]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]
	stp x0,x3,[sp,#96] // offload rp and np

	ldp x6,x7,[x1,#8*0]
	ldp x8,x9,[x1,#8*2]
	ldp x10,x11,[x1,#8*4]
	ldp x12,x13,[x1,#8*6]

	sub x2,sp,x5,lsl#4
	lsl x5,x5,#3
	ldr x4,[x4] // *n0
	mov sp,x2 // alloca
	sub x27,x5,#8*8
	b .Lsqr8x_zero_start

.Lsqr8x_zero:
	sub x27,x27,#8*8
	stp xzr,xzr,[x2,#8*0]
	stp xzr,xzr,[x2,#8*2]
	stp xzr,xzr,[x2,#8*4]
	stp xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp xzr,xzr,[x2,#8*8]
	stp xzr,xzr,[x2,#8*10]
	stp xzr,xzr,[x2,#8*12]
	stp xzr,xzr,[x2,#8*14]
	add x2,x2,#8*16
	cbnz x27,.Lsqr8x_zero

	add x3,x1,x5
	add x1,x1,#8*8
	mov x19,xzr
	mov x20,xzr
	mov x21,xzr
	mov x22,xzr
	mov x23,xzr
	mov x24,xzr
	mov x25,xzr
	mov x26,xzr
	mov x2,sp
	str x4,[x29,#112] // offload n0

	// Multiply everything but a[i]*a[i]
.align 4
.Lsqr8x_outer_loop:
	// a[1]a[0]	(i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1]	(ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2]	(iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3]	(iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4]	(v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5]	(vi)
	// a[7]a[5]
	// a[7]a[6]	(vii)
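	// Only the cross products a[i]*a[j], i<j, are accumulated here.
	// By the identity
	//	a^2 = sum_i a[i]^2*2^(128*i) + 2*sum_{i<j} a[i]*a[j]*2^(64*(i+j))
	// the triangle above is doubled and the squares a[i]*a[i] are
	// folded in afterwards, starting at .Lsqr8x_outer_break.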

	mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
	mul x15,x8,x6
	mul x16,x9,x6
	mul x17,x10,x6
	adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
	mul x14,x11,x6
	adcs x21,x21,x15
	mul x15,x12,x6
	adcs x22,x22,x16
	mul x16,x13,x6
	adcs x23,x23,x17
	umulh x17,x7,x6 // hi(a[1..7]*a[0])
	adcs x24,x24,x14
	umulh x14,x8,x6
	adcs x25,x25,x15
	umulh x15,x9,x6
	adcs x26,x26,x16
	umulh x16,x10,x6
	stp x19,x20,[x2],#8*2 // t[0..1]
	adc x19,xzr,xzr // t[8]
	adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
	umulh x17,x11,x6
	adcs x22,x22,x14
	umulh x14,x12,x6
	adcs x23,x23,x15
	umulh x15,x13,x6
	adcs x24,x24,x16
	mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
	adcs x25,x25,x17
	mul x17,x9,x7
	adcs x26,x26,x14
	mul x14,x10,x7
	adc x19,x19,x15

	mul x15,x11,x7
	adds x22,x22,x16
	mul x16,x12,x7
	adcs x23,x23,x17
	mul x17,x13,x7
	adcs x24,x24,x14
	umulh x14,x8,x7 // hi(a[2..7]*a[1])
	adcs x25,x25,x15
	umulh x15,x9,x7
	adcs x26,x26,x16
	umulh x16,x10,x7
	adcs x19,x19,x17
	umulh x17,x11,x7
	stp x21,x22,[x2],#8*2 // t[2..3]
	adc x20,xzr,xzr // t[9]
	adds x23,x23,x14
	umulh x14,x12,x7
	adcs x24,x24,x15
	umulh x15,x13,x7
	adcs x25,x25,x16
	mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
	adcs x26,x26,x17
	mul x17,x10,x8
	adcs x19,x19,x14
	mul x14,x11,x8
	adc x20,x20,x15

	mul x15,x12,x8
	adds x24,x24,x16
	mul x16,x13,x8
	adcs x25,x25,x17
	umulh x17,x9,x8 // hi(a[3..7]*a[2])
	adcs x26,x26,x14
	umulh x14,x10,x8
	adcs x19,x19,x15
	umulh x15,x11,x8
	adcs x20,x20,x16
	umulh x16,x12,x8
	stp x23,x24,[x2],#8*2 // t[4..5]
	adc x21,xzr,xzr // t[10]
	adds x25,x25,x17
	umulh x17,x13,x8
	adcs x26,x26,x14
	mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
	adcs x19,x19,x15
	mul x15,x11,x9
	adcs x20,x20,x16
	mul x16,x12,x9
	adc x21,x21,x17

	mul x17,x13,x9
	adds x26,x26,x14
	umulh x14,x10,x9 // hi(a[4..7]*a[3])
	adcs x19,x19,x15
	umulh x15,x11,x9
	adcs x20,x20,x16
	umulh x16,x12,x9
	adcs x21,x21,x17
	umulh x17,x13,x9
	stp x25,x26,[x2],#8*2 // t[6..7]
	adc x22,xzr,xzr // t[11]
	adds x19,x19,x14
	mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
	adcs x20,x20,x15
	mul x15,x12,x10
	adcs x21,x21,x16
	mul x16,x13,x10
	adc x22,x22,x17

	umulh x17,x11,x10 // hi(a[5..7]*a[4])
	adds x20,x20,x14
	umulh x14,x12,x10
	adcs x21,x21,x15
	umulh x15,x13,x10
	adcs x22,x22,x16
	mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
	adc x23,xzr,xzr // t[12]
	adds x21,x21,x17
	mul x17,x13,x11
	adcs x22,x22,x14
	umulh x14,x12,x11 // hi(a[6..7]*a[5])
	adc x23,x23,x15

	umulh x15,x13,x11
	adds x22,x22,x16
	mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
	adcs x23,x23,x17
	umulh x17,x13,x12 // hi(a[7]*a[6])
	adc x24,xzr,xzr // t[13]
	adds x23,x23,x14
	sub x27,x3,x1 // done yet?
	adc x24,x24,x15

	adds x24,x24,x16
	sub x14,x3,x5 // rewound ap
	adc x25,xzr,xzr // t[14]
	add x25,x25,x17

	cbz x27,.Lsqr8x_outer_break

	mov x4,x6
	ldp x6,x7,[x2,#8*0]
	ldp x8,x9,[x2,#8*2]
	ldp x10,x11,[x2,#8*4]
	ldp x12,x13,[x2,#8*6]
	adds x19,x19,x6
	adcs x20,x20,x7
	ldp x6,x7,[x1,#8*0]
	adcs x21,x21,x8
	adcs x22,x22,x9
	ldp x8,x9,[x1,#8*2]
	adcs x23,x23,x10
	adcs x24,x24,x11
	ldp x10,x11,[x1,#8*4]
	adcs x25,x25,x12
	mov x0,x1
	adcs x26,xzr,x13
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8
	//adc x28,xzr,xzr // moved below
	mov x27,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
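	// The loop below walks the schedule above: x6-x13 hold the
	// current eight-word window of a (a[8..15] on the first pass),
	// x4 steps through the preceding words via the negative index
	// x27, and x28 is the carry bit, modulo-scheduled across
	// iterations.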
.Lsqr8x_mul:
	mul x14,x6,x4
	adc x28,xzr,xzr // carry bit, modulo-scheduled
	mul x15,x7,x4
	add x27,x27,#8
	mul x16,x8,x4
	mul x17,x9,x4
	adds x19,x19,x14
	mul x14,x10,x4
	adcs x20,x20,x15
	mul x15,x11,x4
	adcs x21,x21,x16
	mul x16,x12,x4
	adcs x22,x22,x17
	mul x17,x13,x4
	adcs x23,x23,x14
	umulh x14,x6,x4
	adcs x24,x24,x15
	umulh x15,x7,x4
	adcs x25,x25,x16
	umulh x16,x8,x4
	adcs x26,x26,x17
	umulh x17,x9,x4
	adc x28,x28,xzr
	str x19,[x2],#8
	adds x19,x20,x14
	umulh x14,x10,x4
	adcs x20,x21,x15
	umulh x15,x11,x4
	adcs x21,x22,x16
	umulh x16,x12,x4
	adcs x22,x23,x17
	umulh x17,x13,x4
	ldr x4,[x0,x27]
	adcs x23,x24,x14
	adcs x24,x25,x15
	adcs x25,x26,x16
	adcs x26,x28,x17
	//adc x28,xzr,xzr // moved above
	cbnz x27,.Lsqr8x_mul
	// note that carry flag is guaranteed
	// to be zero at this point
	cmp x1,x3 // done yet?
	b.eq .Lsqr8x_break

	ldp x6,x7,[x2,#8*0]
	ldp x8,x9,[x2,#8*2]
	ldp x10,x11,[x2,#8*4]
	ldp x12,x13,[x2,#8*6]
	adds x19,x19,x6
	ldr x4,[x0,#-8*8]
	adcs x20,x20,x7
	ldp x6,x7,[x1,#8*0]
	adcs x21,x21,x8
	adcs x22,x22,x9
	ldp x8,x9,[x1,#8*2]
	adcs x23,x23,x10
	adcs x24,x24,x11
	ldp x10,x11,[x1,#8*4]
	adcs x25,x25,x12
	mov x27,#-8*8
	adcs x26,x26,x13
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8
	//adc x28,xzr,xzr // moved above
	b .Lsqr8x_mul

.align 4
.Lsqr8x_break:
	ldp x6,x7,[x0,#8*0]
	add x1,x0,#8*8
	ldp x8,x9,[x0,#8*2]
	sub x14,x3,x1 // is it the last iteration?
	ldp x10,x11,[x0,#8*4]
	sub x15,x2,x14
	ldp x12,x13,[x0,#8*6]
	cbz x14,.Lsqr8x_outer_loop

	stp x19,x20,[x2,#8*0]
	ldp x19,x20,[x15,#8*0]
	stp x21,x22,[x2,#8*2]
	ldp x21,x22,[x15,#8*2]
	stp x23,x24,[x2,#8*4]
	ldp x23,x24,[x15,#8*4]
	stp x25,x26,[x2,#8*6]
	mov x2,x15
	ldp x25,x26,[x15,#8*6]
	b .Lsqr8x_outer_loop

.align 4
.Lsqr8x_outer_break:
	// Now multiply the above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
	ldp x15,x16,[sp,#8*1]
	ldp x11,x13,[x14,#8*2]
	add x1,x14,#8*4
	ldp x17,x14,[sp,#8*3]

	stp x19,x20,[x2,#8*0]
	mul x19,x7,x7
	stp x21,x22,[x2,#8*2]
	umulh x7,x7,x7
	stp x23,x24,[x2,#8*4]
	mul x8,x9,x9
	stp x25,x26,[x2,#8*6]
	mov x2,sp
	umulh x9,x9,x9
	adds x20,x7,x15,lsl#1
	extr x15,x16,x15,#63
	sub x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs x21,x8,x15
	extr x16,x17,x16,#63
	sub x27,x27,#8*4
	adcs x22,x9,x16
	ldp x15,x16,[x2,#8*5]
	mul x10,x11,x11
	ldp x7,x9,[x1],#8*2
	umulh x11,x11,x11
	mul x12,x13,x13
	umulh x13,x13,x13
	extr x17,x14,x17,#63
	stp x19,x20,[x2,#8*0]
	adcs x23,x10,x17
	extr x14,x15,x14,#63
	stp x21,x22,[x2,#8*2]
	adcs x24,x11,x14
	ldp x17,x14,[x2,#8*7]
	extr x15,x16,x15,#63
	adcs x25,x12,x15
	extr x16,x17,x16,#63
	adcs x26,x13,x16
	ldp x15,x16,[x2,#8*9]
	mul x6,x7,x7
	ldp x11,x13,[x1],#8*2
	umulh x7,x7,x7
	mul x8,x9,x9
	umulh x9,x9,x9
	stp x23,x24,[x2,#8*4]
	extr x17,x14,x17,#63
	stp x25,x26,[x2,#8*6]
	add x2,x2,#8*8
	adcs x19,x6,x17
	extr x14,x15,x14,#63
	adcs x20,x7,x14
	ldp x17,x14,[x2,#8*3]
	extr x15,x16,x15,#63
	cbnz x27,.Lsqr4x_shift_n_add
	ldp x1,x4,[x29,#104] // pull np and n0

	adcs x21,x8,x15
	extr x16,x17,x16,#63
	adcs x22,x9,x16
	ldp x15,x16,[x2,#8*5]
	mul x10,x11,x11
	umulh x11,x11,x11
	stp x19,x20,[x2,#8*0]
	mul x12,x13,x13
	umulh x13,x13,x13
	stp x21,x22,[x2,#8*2]
	extr x17,x14,x17,#63
	adcs x23,x10,x17
	extr x14,x15,x14,#63
	ldp x19,x20,[sp,#8*0]
	adcs x24,x11,x14
	extr x15,x16,x15,#63
	ldp x6,x7,[x1,#8*0]
	adcs x25,x12,x15
	extr x16,xzr,x16,#63
	ldp x8,x9,[x1,#8*2]
	adc x26,x13,x16
	ldp x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul x28,x4,x19 // t[0]*n0
	ldp x12,x13,[x1,#8*6]
	add x3,x1,x5
	ldp x21,x22,[sp,#8*2]
	stp x23,x24,[x2,#8*4]
	ldp x23,x24,[sp,#8*4]
	stp x25,x26,[x2,#8*6]
	ldp x25,x26,[sp,#8*6]
	add x1,x1,#8*8
	mov x30,xzr // initial top-most carry
	mov x2,sp
	mov x27,#8

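	// Each pass of the loop below retires one word of t[]: x28 is
	// chosen as t[0]*n0 so that the low word of t[0] + n[0]*x28
	// vanishes (the same (*) trick as in .Lmul_mont), x28*n[0..7]
	// is added to t[0..7], and the window slides down one word.
	// Eight passes reduce by 512 bits; the x28 values are put aside
	// on the stack for the tail processing below.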
.Lsqr8x_reduction:
	// (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
	mul x15,x7,x28
	sub x27,x27,#1
	mul x16,x8,x28
	str x28,[x2],#8 // put aside t[0]*n0 for tail processing
	mul x17,x9,x28
	// (*) adds xzr,x19,x14
	subs xzr,x19,#1 // (*)
	mul x14,x10,x28
	adcs x19,x20,x15
	mul x15,x11,x28
	adcs x20,x21,x16
	mul x16,x12,x28
	adcs x21,x22,x17
	mul x17,x13,x28
	adcs x22,x23,x14
	umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
	adcs x23,x24,x15
	umulh x15,x7,x28
	adcs x24,x25,x16
	umulh x16,x8,x28
	adcs x25,x26,x17
	umulh x17,x9,x28
	adc x26,xzr,xzr
	adds x19,x19,x14
	umulh x14,x10,x28
	adcs x20,x20,x15
	umulh x15,x11,x28
	adcs x21,x21,x16
	umulh x16,x12,x28
	adcs x22,x22,x17
	umulh x17,x13,x28
	mul x28,x4,x19 // next t[0]*n0
	adcs x23,x23,x14
	adcs x24,x24,x15
	adcs x25,x25,x16
	adc x26,x26,x17
	cbnz x27,.Lsqr8x_reduction

	ldp x14,x15,[x2,#8*0]
	ldp x16,x17,[x2,#8*2]
	mov x0,x2
	sub x27,x3,x1 // done yet?
	adds x19,x19,x14
	adcs x20,x20,x15
	ldp x14,x15,[x2,#8*4]
	adcs x21,x21,x16
	adcs x22,x22,x17
	ldp x16,x17,[x2,#8*6]
	adcs x23,x23,x14
	adcs x24,x24,x15
	adcs x25,x25,x16
	adcs x26,x26,x17
	//adc x28,xzr,xzr // moved below
	cbz x27,.Lsqr8x8_post_condition

	ldr x4,[x2,#-8*8]
	ldp x6,x7,[x1,#8*0]
	ldp x8,x9,[x1,#8*2]
	ldp x10,x11,[x1,#8*4]
	mov x27,#-8*8
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8

.Lsqr8x_tail:
	mul x14,x6,x4
	adc x28,xzr,xzr // carry bit, modulo-scheduled
	mul x15,x7,x4
	add x27,x27,#8
	mul x16,x8,x4
	mul x17,x9,x4
	adds x19,x19,x14
	mul x14,x10,x4
	adcs x20,x20,x15
	mul x15,x11,x4
	adcs x21,x21,x16
	mul x16,x12,x4
	adcs x22,x22,x17
	mul x17,x13,x4
	adcs x23,x23,x14
	umulh x14,x6,x4
	adcs x24,x24,x15
	umulh x15,x7,x4
	adcs x25,x25,x16
	umulh x16,x8,x4
	adcs x26,x26,x17
	umulh x17,x9,x4
	adc x28,x28,xzr
	str x19,[x2],#8
	adds x19,x20,x14
	umulh x14,x10,x4
	adcs x20,x21,x15
	umulh x15,x11,x4
	adcs x21,x22,x16
	umulh x16,x12,x4
	adcs x22,x23,x17
	umulh x17,x13,x4
	ldr x4,[x0,x27]
	adcs x23,x24,x14
	adcs x24,x25,x15
	adcs x25,x26,x16
	adcs x26,x28,x17
	//adc x28,xzr,xzr // moved above
	cbnz x27,.Lsqr8x_tail
	// note that carry flag is guaranteed
	// to be zero at this point
	ldp x6,x7,[x2,#8*0]
	sub x27,x3,x1 // done yet?
	sub x16,x3,x5 // rewound np
	ldp x8,x9,[x2,#8*2]
	ldp x10,x11,[x2,#8*4]
	ldp x12,x13,[x2,#8*6]
	cbz x27,.Lsqr8x_tail_break

	ldr x4,[x0,#-8*8]
	adds x19,x19,x6
	adcs x20,x20,x7
	ldp x6,x7,[x1,#8*0]
	adcs x21,x21,x8
	adcs x22,x22,x9
	ldp x8,x9,[x1,#8*2]
	adcs x23,x23,x10
	adcs x24,x24,x11
	ldp x10,x11,[x1,#8*4]
	adcs x25,x25,x12
	mov x27,#-8*8
	adcs x26,x26,x13
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8
	//adc x28,xzr,xzr // moved above
	b .Lsqr8x_tail

.align 4
.Lsqr8x_tail_break:
	ldr x4,[x29,#112] // pull n0
	add x27,x2,#8*8 // end of current t[num] window

	subs xzr,x30,#1 // "move" top-most carry to carry bit
	adcs x14,x19,x6
	adcs x15,x20,x7
	ldp x19,x20,[x0,#8*0]
	adcs x21,x21,x8
	ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
	adcs x22,x22,x9
	ldp x8,x9,[x16,#8*2]
	adcs x23,x23,x10
	adcs x24,x24,x11
	ldp x10,x11,[x16,#8*4]
	adcs x25,x25,x12
	adcs x26,x26,x13
	ldp x12,x13,[x16,#8*6]
	add x1,x16,#8*8
	adc x30,xzr,xzr // top-most carry
	mul x28,x4,x19
	stp x14,x15,[x2,#8*0]
	stp x21,x22,[x2,#8*2]
	ldp x21,x22,[x0,#8*2]
	stp x23,x24,[x2,#8*4]
	ldp x23,x24,[x0,#8*4]
	cmp x27,x29 // did we hit the bottom?
	stp x25,x26,[x2,#8*6]
	mov x2,x0 // slide the window
	ldp x25,x26,[x0,#8*6]
	mov x27,#8
	b.ne .Lsqr8x_reduction

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. As above,
	// comparison implies subtraction, so we subtract the modulus,
	// check whether that borrowed, and conditionally copy back the
	// original value.
	ldr x0,[x29,#96] // pull rp
	add x2,x2,#8*8
	subs x14,x19,x6
	sbcs x15,x20,x7
	sub x27,x5,#8*8
	mov x3,x0 // x0 copy

.Lsqr8x_sub:
	sbcs x16,x21,x8
	ldp x6,x7,[x1,#8*0]
	sbcs x17,x22,x9
	stp x14,x15,[x0,#8*0]
	sbcs x14,x23,x10
	ldp x8,x9,[x1,#8*2]
	sbcs x15,x24,x11
	stp x16,x17,[x0,#8*2]
	sbcs x16,x25,x12
	ldp x10,x11,[x1,#8*4]
	sbcs x17,x26,x13
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8
	ldp x19,x20,[x2,#8*0]
	sub x27,x27,#8*8
	ldp x21,x22,[x2,#8*2]
	ldp x23,x24,[x2,#8*4]
	ldp x25,x26,[x2,#8*6]
	add x2,x2,#8*8
	stp x14,x15,[x0,#8*4]
	sbcs x14,x19,x6
	stp x16,x17,[x0,#8*6]
	add x0,x0,#8*8
	sbcs x15,x20,x7
	cbnz x27,.Lsqr8x_sub

	sbcs x16,x21,x8
	mov x2,sp
	add x1,sp,x5
	ldp x6,x7,[x3,#8*0]
	sbcs x17,x22,x9
	stp x14,x15,[x0,#8*0]
	sbcs x14,x23,x10
	ldp x8,x9,[x3,#8*2]
	sbcs x15,x24,x11
	stp x16,x17,[x0,#8*2]
	sbcs x16,x25,x12
	ldp x19,x20,[x1,#8*0]
	sbcs x17,x26,x13
	ldp x21,x22,[x1,#8*2]
	sbcs xzr,x30,xzr // did it borrow?
	ldr x30,[x29,#8] // pull return address
	stp x14,x15,[x0,#8*4]
	stp x16,x17,[x0,#8*6]

	sub x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub x27,x27,#8*4
	csel x14,x19,x6,lo
	stp xzr,xzr,[x2,#8*0]
	csel x15,x20,x7,lo
	ldp x6,x7,[x3,#8*4]
	ldp x19,x20,[x1,#8*4]
	csel x16,x21,x8,lo
	stp xzr,xzr,[x2,#8*2]
	add x2,x2,#8*4
	csel x17,x22,x9,lo
	ldp x8,x9,[x3,#8*6]
	ldp x21,x22,[x1,#8*6]
	add x1,x1,#8*4
	stp x14,x15,[x3,#8*0]
	stp x16,x17,[x3,#8*2]
	add x3,x3,#8*4
	stp xzr,xzr,[x1,#8*0]
	stp xzr,xzr,[x1,#8*2]
	cbnz x27,.Lsqr4x_cond_copy

	csel x14,x19,x6,lo
	stp xzr,xzr,[x2,#8*0]
	csel x15,x20,x7,lo
	stp xzr,xzr,[x2,#8*2]
	csel x16,x21,x8,lo
	csel x17,x22,x9,lo
	stp x14,x15,[x3,#8*0]
	stp x16,x17,[x3,#8*2]

	b .Lsqr8x_done

.align 4
.Lsqr8x8_post_condition:
	adc x28,xzr,xzr
	ldr x30,[x29,#8] // pull return address
	// x19-x26,x28 hold the result, x6-x13 hold the modulus
	subs x6,x19,x6
	ldr x1,[x29,#96] // pull rp
	sbcs x7,x20,x7
	stp xzr,xzr,[sp,#8*0]
	sbcs x8,x21,x8
	stp xzr,xzr,[sp,#8*2]
	sbcs x9,x22,x9
	stp xzr,xzr,[sp,#8*4]
	sbcs x10,x23,x10
	stp xzr,xzr,[sp,#8*6]
	sbcs x11,x24,x11
	stp xzr,xzr,[sp,#8*8]
	sbcs x12,x25,x12
	stp xzr,xzr,[sp,#8*10]
	sbcs x13,x26,x13
	stp xzr,xzr,[sp,#8*12]
	sbcs x28,x28,xzr // did it borrow?
	stp xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel x6,x19,x6,lo
	csel x7,x20,x7,lo
	csel x8,x21,x8,lo
	csel x9,x22,x9,lo
	stp x6,x7,[x1,#8*0]
	csel x10,x23,x10,lo
	csel x11,x24,x11,lo
	stp x8,x9,[x1,#8*2]
	csel x12,x25,x12,lo
	csel x13,x26,x13,lo
	stp x10,x11,[x1,#8*4]
	stp x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp x19,x20,[x29,#16]
	mov sp,x29
	ldp x21,x22,[x29,#32]
	mov x0,#1
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldr x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
.type __bn_mul4x_mont,%function
.align 5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont
	// is jumped to only from bn_mul_mont or __bn_sqr8x_mont, which have
	// already signed the return address.
	stp x29,x30,[sp,#-128]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]

	sub x26,sp,x5,lsl#3
	lsl x5,x5,#3
	ldr x4,[x4] // *n0
	sub sp,x26,#8*4 // alloca

	add x10,x2,x5
	add x27,x1,x5
	stp x0,x10,[x29,#96] // offload rp and &b[num]

	ldr x24,[x2,#8*0] // b[0]
	ldp x6,x7,[x1,#8*0] // a[0..3]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	mov x19,xzr
	mov x20,xzr
	mov x21,xzr
	mov x22,xzr
	ldp x14,x15,[x3,#8*0] // n[0..3]
	ldp x16,x17,[x3,#8*2]
	adds x3,x3,#8*4 // clear carry bit
	mov x0,xzr
	mov x28,#0
	mov x26,sp

.Loop_mul4x_1st_reduction:
	mul x10,x6,x24 // lo(a[0..3]*b[0])
	adc x0,x0,xzr // modulo-scheduled
	mul x11,x7,x24
	add x28,x28,#8
	mul x12,x8,x24
	and x28,x28,#31
	mul x13,x9,x24
	adds x19,x19,x10
	umulh x10,x6,x24 // hi(a[0..3]*b[0])
	adcs x20,x20,x11
	mul x25,x19,x4 // t[0]*n0
	adcs x21,x21,x12
	umulh x11,x7,x24
	adcs x22,x22,x13
	umulh x12,x8,x24
	adc x23,xzr,xzr
	umulh x13,x9,x24
	ldr x24,[x2,x28] // next b[i] (or b[0])
	adds x20,x20,x10
	// (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
	str x25,[x26],#8 // put aside t[0]*n0 for tail processing
	adcs x21,x21,x11
	mul x11,x15,x25
	adcs x22,x22,x12
	mul x12,x16,x25
	adc x23,x23,x13 // can't overflow
	mul x13,x17,x25
	// (*) adds xzr,x19,x10
	subs xzr,x19,#1 // (*)
	umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
	adcs x19,x20,x11
	umulh x11,x15,x25
	adcs x20,x21,x12
	umulh x12,x16,x25
	adcs x21,x22,x13
	umulh x13,x17,x25
	adcs x22,x23,x0
	adc x0,xzr,xzr
	adds x19,x19,x10
	sub x10,x27,x1
	adcs x20,x20,x11
	adcs x21,x21,x12
	adcs x22,x22,x13
	//adc x0,x0,xzr
	cbnz x28,.Loop_mul4x_1st_reduction

	cbz x10,.Lmul4x4_post_condition

	ldp x6,x7,[x1,#8*0] // a[4..7]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	ldr x25,[sp] // a[0]*n0
	ldp x14,x15,[x3,#8*0] // n[4..7]
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul x10,x6,x24 // lo(a[4..7]*b[i])
	adc x0,x0,xzr // modulo-scheduled
	mul x11,x7,x24
	add x28,x28,#8
	mul x12,x8,x24
	and x28,x28,#31
	mul x13,x9,x24
	adds x19,x19,x10
	umulh x10,x6,x24 // hi(a[4..7]*b[i])
	adcs x20,x20,x11
	umulh x11,x7,x24
	adcs x21,x21,x12
	umulh x12,x8,x24
	adcs x22,x22,x13
	umulh x13,x9,x24
	adc x23,xzr,xzr
	ldr x24,[x2,x28] // next b[i] (or b[0])
	adds x20,x20,x10
	mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
	adcs x21,x21,x11
	mul x11,x15,x25
	adcs x22,x22,x12
	mul x12,x16,x25
	adc x23,x23,x13 // can't overflow
	mul x13,x17,x25
	adds x19,x19,x10
	umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
	adcs x20,x20,x11
	umulh x11,x15,x25
	adcs x21,x21,x12
	umulh x12,x16,x25
	adcs x22,x22,x13
	adcs x23,x23,x0
	umulh x13,x17,x25
	adc x0,xzr,xzr
	ldr x25,[sp,x28] // next t[0]*n0
	str x19,[x26],#8 // result!!!
	adds x19,x20,x10
	sub x10,x27,x1 // done yet?
	adcs x20,x21,x11
	adcs x21,x22,x12
	adcs x22,x23,x13
	//adc x0,x0,xzr
	cbnz x28,.Loop_mul4x_1st_tail

	sub x11,x27,x5 // rewound x1
	cbz x10,.Lmul4x_proceed

	ldp x6,x7,[x1,#8*0]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	ldp x14,x15,[x3,#8*0]
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4
	b .Loop_mul4x_1st_tail

.align 5
.Lmul4x_proceed:
	ldr x24,[x2,#8*4]! // *++b
	adc x30,x0,xzr
	ldp x6,x7,[x11,#8*0] // a[0..3]
	sub x3,x3,x5 // rewind np
	ldp x8,x9,[x11,#8*2]
	add x1,x11,#8*4

	stp x19,x20,[x26,#8*0] // result!!!
	ldp x19,x20,[sp,#8*4] // t[0..3]
	stp x21,x22,[x26,#8*2] // result!!!
	ldp x21,x22,[sp,#8*6]

	ldp x14,x15,[x3,#8*0] // n[0..3]
	mov x26,sp
	ldp x16,x17,[x3,#8*2]
	adds x3,x3,#8*4 // clear carry bit
	mov x0,xzr

.align 4
.Loop_mul4x_reduction:
	mul x10,x6,x24 // lo(a[0..3]*b[4])
	adc x0,x0,xzr // modulo-scheduled
	mul x11,x7,x24
	add x28,x28,#8
	mul x12,x8,x24
	and x28,x28,#31
	mul x13,x9,x24
	adds x19,x19,x10
	umulh x10,x6,x24 // hi(a[0..3]*b[4])
	adcs x20,x20,x11
	mul x25,x19,x4 // t[0]*n0
	adcs x21,x21,x12
	umulh x11,x7,x24
	adcs x22,x22,x13
	umulh x12,x8,x24
	adc x23,xzr,xzr
	umulh x13,x9,x24
	ldr x24,[x2,x28] // next b[i]
	adds x20,x20,x10
	// (*) mul x10,x14,x25
	str x25,[x26],#8 // put aside t[0]*n0 for tail processing
	adcs x21,x21,x11
	mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
	adcs x22,x22,x12
	mul x12,x16,x25
	adc x23,x23,x13 // can't overflow
	mul x13,x17,x25
	// (*) adds xzr,x19,x10
	subs xzr,x19,#1 // (*)
	umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
	adcs x19,x20,x11
	umulh x11,x15,x25
	adcs x20,x21,x12
	umulh x12,x16,x25
	adcs x21,x22,x13
	umulh x13,x17,x25
	adcs x22,x23,x0
	adc x0,xzr,xzr
	adds x19,x19,x10
	adcs x20,x20,x11
	adcs x21,x21,x12
	adcs x22,x22,x13
	//adc x0,x0,xzr
	cbnz x28,.Loop_mul4x_reduction

	adc x0,x0,xzr
	ldp x10,x11,[x26,#8*4] // t[4..7]
	ldp x12,x13,[x26,#8*6]
	ldp x6,x7,[x1,#8*0] // a[4..7]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	adds x19,x19,x10
	adcs x20,x20,x11
	adcs x21,x21,x12
	adcs x22,x22,x13
	//adc x0,x0,xzr

	ldr x25,[sp] // t[0]*n0
	ldp x14,x15,[x3,#8*0] // n[4..7]
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4

.align 4
.Loop_mul4x_tail:
	mul x10,x6,x24 // lo(a[4..7]*b[4])
	adc x0,x0,xzr // modulo-scheduled
	mul x11,x7,x24
	add x28,x28,#8
	mul x12,x8,x24
	and x28,x28,#31
	mul x13,x9,x24
	adds x19,x19,x10
	umulh x10,x6,x24 // hi(a[4..7]*b[4])
	adcs x20,x20,x11
	umulh x11,x7,x24
	adcs x21,x21,x12
	umulh x12,x8,x24
	adcs x22,x22,x13
	umulh x13,x9,x24
	adc x23,xzr,xzr
	ldr x24,[x2,x28] // next b[i]
	adds x20,x20,x10
	mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
	adcs x21,x21,x11
	mul x11,x15,x25
	adcs x22,x22,x12
	mul x12,x16,x25
	adc x23,x23,x13 // can't overflow
	mul x13,x17,x25
	adds x19,x19,x10
	umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
	adcs x20,x20,x11
	umulh x11,x15,x25
	adcs x21,x21,x12
	umulh x12,x16,x25
	adcs x22,x22,x13
	umulh x13,x17,x25
	adcs x23,x23,x0
	ldr x25,[sp,x28] // next a[0]*n0
	adc x0,xzr,xzr
	str x19,[x26],#8 // result!!!
	adds x19,x20,x10
	sub x10,x27,x1 // done yet?
	adcs x20,x21,x11
	adcs x21,x22,x12
	adcs x22,x23,x13
	//adc x0,x0,xzr
	cbnz x28,.Loop_mul4x_tail

	sub x11,x3,x5 // rewound np?
	adc x0,x0,xzr
	cbz x10,.Loop_mul4x_break

	ldp x10,x11,[x26,#8*4]
	ldp x12,x13,[x26,#8*6]
	ldp x6,x7,[x1,#8*0]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	adds x19,x19,x10
	adcs x20,x20,x11
	adcs x21,x21,x12
	adcs x22,x22,x13
	//adc x0,x0,xzr
	ldp x14,x15,[x3,#8*0]
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4
	b .Loop_mul4x_tail

.align 4
.Loop_mul4x_break:
	ldp x12,x13,[x29,#96] // pull rp and &b[num]
	adds x19,x19,x30
	add x2,x2,#8*4 // bp++
	adcs x20,x20,xzr
	sub x1,x1,x5 // rewind ap
	adcs x21,x21,xzr
	stp x19,x20,[x26,#8*0] // result!!!
	adcs x22,x22,xzr
	ldp x19,x20,[sp,#8*4] // t[0..3]
	adc x30,x0,xzr
	stp x21,x22,[x26,#8*2] // result!!!
	cmp x2,x13 // done yet?
	ldp x21,x22,[sp,#8*6]
	ldp x14,x15,[x11,#8*0] // n[0..3]
	ldp x16,x17,[x11,#8*2]
	add x3,x11,#8*4
	b.eq .Lmul4x_post

	ldr x24,[x2]
	ldp x6,x7,[x1,#8*0] // a[0..3]
	ldp x8,x9,[x1,#8*2]
	adds x1,x1,#8*4 // clear carry bit
	mov x0,xzr
	mov x26,sp
	b .Loop_mul4x_reduction

.align 4
.Lmul4x_post:
	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. As above,
	// comparison implies subtraction, so we subtract the modulus,
	// check whether that borrowed, and conditionally copy back the
	// original value.
	mov x0,x12
	mov x27,x12 // x0 copy
	subs x10,x19,x14
	add x26,sp,#8*8
	sbcs x11,x20,x15
	sub x28,x5,#8*4

.Lmul4x_sub:
	sbcs x12,x21,x16
	ldp x14,x15,[x3,#8*0]
	sub x28,x28,#8*4
	ldp x19,x20,[x26,#8*0]
	sbcs x13,x22,x17
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4
	ldp x21,x22,[x26,#8*2]
	add x26,x26,#8*4
	stp x10,x11,[x0,#8*0]
	sbcs x10,x19,x14
	stp x12,x13,[x0,#8*2]
	add x0,x0,#8*4
	sbcs x11,x20,x15
	cbnz x28,.Lmul4x_sub

	sbcs x12,x21,x16
	mov x26,sp
	add x1,sp,#8*4
	ldp x6,x7,[x27,#8*0]
	sbcs x13,x22,x17
	stp x10,x11,[x0,#8*0]
	ldp x8,x9,[x27,#8*2]
	stp x12,x13,[x0,#8*2]
	ldp x19,x20,[x1,#8*0]
	ldp x21,x22,[x1,#8*2]
	sbcs xzr,x30,xzr // did it borrow?
	ldr x30,[x29,#8] // pull return address

	sub x28,x5,#8*4
.Lmul4x_cond_copy:
	sub x28,x28,#8*4
	csel x10,x19,x6,lo
	stp xzr,xzr,[x26,#8*0]
	csel x11,x20,x7,lo
	ldp x6,x7,[x27,#8*4]
	ldp x19,x20,[x1,#8*4]
	csel x12,x21,x8,lo
	stp xzr,xzr,[x26,#8*2]
	add x26,x26,#8*4
	csel x13,x22,x9,lo
	ldp x8,x9,[x27,#8*6]
	ldp x21,x22,[x1,#8*6]
	add x1,x1,#8*4
	stp x10,x11,[x27,#8*0]
	stp x12,x13,[x27,#8*2]
	add x27,x27,#8*4
	cbnz x28,.Lmul4x_cond_copy

	csel x10,x19,x6,lo
	stp xzr,xzr,[x26,#8*0]
	csel x11,x20,x7,lo
	stp xzr,xzr,[x26,#8*2]
	csel x12,x21,x8,lo
	stp xzr,xzr,[x26,#8*3]
	csel x13,x22,x9,lo
	stp xzr,xzr,[x26,#8*4]
	stp x10,x11,[x27,#8*0]
	stp x12,x13,[x27,#8*2]

	b .Lmul4x_done

.align 4
.Lmul4x4_post_condition:
	adc x0,x0,xzr
	ldr x1,[x29,#96] // pull rp
	// x19-x22,x0 hold the result, x14-x17 hold the modulus
	subs x6,x19,x14
	ldr x30,[x29,#8] // pull return address
	sbcs x7,x20,x15
	stp xzr,xzr,[sp,#8*0]
	sbcs x8,x21,x16
	stp xzr,xzr,[sp,#8*2]
	sbcs x9,x22,x17
	stp xzr,xzr,[sp,#8*4]
	sbcs xzr,x0,xzr // did it borrow?
	stp xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel x6,x19,x6,lo
	csel x7,x20,x7,lo
	csel x8,x21,x8,lo
	csel x9,x22,x9,lo
	stp x6,x7,[x1,#8*0]
	stp x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp x19,x20,[x29,#16]
	mov sp,x29
	ldp x21,x22,[x29,#32]
	mov x0,#1
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldr x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size __bn_mul4x_mont,.-__bn_mul4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits