// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#include "ring_core_generated/prefix_symbols_asm.h"
#include <ring-core/arm_arch.h>

.text

.globl bn_mul_mont
.hidden bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst x5,#7
	b.eq __bn_sqr8x_mont
	tst x5,#3
	b.eq __bn_mul4x_mont
.Lmul_mont:
	stp x29,x30,[sp,#-64]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]

	ldr x9,[x2],#8 // bp[0]
	sub x22,sp,x5,lsl#3
	ldp x7,x8,[x1],#16 // ap[0..1]
	lsl x5,x5,#3
	ldr x4,[x4] // *n0
	and x22,x22,#-16 // ABI says so
	ldp x13,x14,[x3],#16 // np[0..1]

	mul x6,x7,x9 // ap[0]*bp[0]
	sub x21,x5,#16 // j=num-2
	umulh x7,x7,x9
	mul x10,x8,x9 // ap[1]*bp[0]
	umulh x11,x8,x9

	mul x15,x6,x4 // "tp[0]"*n0
	mov sp,x22 // alloca

	// (*) mul x12,x13,x15 // np[0]*m1
	umulh x13,x13,x15
	mul x16,x14,x15 // np[1]*m1
	// (*) adds x12,x12,x6 // discarded
	// (*) The first multiplication and addition are removed. The
	//	outcome of the first addition is guaranteed to be zero,
	//	so the only computationally significant question is
	//	whether it carries or not. And the condition for a carry
	//	turns out to be simple: x6 being non-zero. The carry can
	//	therefore be computed by adding -1 to x6, which is what
	//	the next instruction does.
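	// Worked example of the trick above: lo(np[0]*m1) is -x6 mod 2^64
	//	by construction of n0, so for x6 = 5 the discarded addition
	//	would be 0xfffffffffffffffb + 5 = 2^64, i.e. zero with a
	//	carry out. "subs xzr,x6,#1" computes x6-1 and sets the carry
	//	flag (no borrow) exactly when x6 != 0 -- the same carry,
	//	without the multiplication.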
	subs xzr,x6,#1 // (*)
	umulh x17,x14,x15
	adc x13,x13,xzr
	cbz x21,.L1st_skip

.L1st:
	ldr x8,[x1],#8
	adds x6,x10,x7
	sub x21,x21,#8 // j--
	adc x7,x11,xzr

	ldr x14,[x3],#8
	adds x12,x16,x13
	mul x10,x8,x9 // ap[j]*bp[0]
	adc x13,x17,xzr
	umulh x11,x8,x9

	adds x12,x12,x6
	mul x16,x14,x15 // np[j]*m1
	adc x13,x13,xzr
	umulh x17,x14,x15
	str x12,[x22],#8 // tp[j-1]
	cbnz x21,.L1st

.L1st_skip:
	adds x6,x10,x7
	sub x1,x1,x5 // rewind x1
	adc x7,x11,xzr

	adds x12,x16,x13
	sub x3,x3,x5 // rewind x3
	adc x13,x17,xzr

	adds x12,x12,x6
	sub x20,x5,#8 // i=num-1
	adcs x13,x13,x7

	adc x19,xzr,xzr // upmost overflow bit
	stp x12,x13,[x22]

.Louter:
	ldr x9,[x2],#8 // bp[i]
	ldp x7,x8,[x1],#16
	ldr x23,[sp] // tp[0]
	add x22,sp,#8

	mul x6,x7,x9 // ap[0]*bp[i]
	sub x21,x5,#16 // j=num-2
	umulh x7,x7,x9
	ldp x13,x14,[x3],#16
	mul x10,x8,x9 // ap[1]*bp[i]
	adds x6,x6,x23
	umulh x11,x8,x9
	adc x7,x7,xzr

	mul x15,x6,x4
	sub x20,x20,#8 // i--

	// (*) mul x12,x13,x15 // np[0]*m1
	umulh x13,x13,x15
	mul x16,x14,x15 // np[1]*m1
	// (*) adds x12,x12,x6
	subs xzr,x6,#1 // (*)
	umulh x17,x14,x15
	cbz x21,.Linner_skip

.Linner:
	ldr x8,[x1],#8
	adc x13,x13,xzr
	ldr x23,[x22],#8 // tp[j]
	adds x6,x10,x7
	sub x21,x21,#8 // j--
	adc x7,x11,xzr

	adds x12,x16,x13
	ldr x14,[x3],#8
	adc x13,x17,xzr

	mul x10,x8,x9 // ap[j]*bp[i]
	adds x6,x6,x23
	umulh x11,x8,x9
	adc x7,x7,xzr

	mul x16,x14,x15 // np[j]*m1
	adds x12,x12,x6
	umulh x17,x14,x15
	str x12,[x22,#-16] // tp[j-1]
	cbnz x21,.Linner

.Linner_skip:
	ldr x23,[x22],#8 // tp[j]
	adc x13,x13,xzr
	adds x6,x10,x7
	sub x1,x1,x5 // rewind x1
	adc x7,x11,xzr

	adds x12,x16,x13
	sub x3,x3,x5 // rewind x3
	adcs x13,x17,x19
	adc x19,xzr,xzr

	adds x6,x6,x23
	adc x7,x7,xzr

	adds x12,x12,x6
	adcs x13,x13,x7
	adc x19,x19,xzr // upmost overflow bit
	stp x12,x13,[x22,#-16]

	cbnz x20,.Louter

	// Final step. If the result is not smaller than the modulus,
	// subtract the modulus. But comparison itself implies
	// subtraction, so we subtract the modulus, check whether the
	// subtraction borrowed, and conditionally copy the original
	// value back.
	ldr x23,[sp] // tp[0]
	add x22,sp,#8
	ldr x14,[x3],#8 // np[0]
	subs x21,x5,#8 // j=num-1 and clear borrow
	mov x1,x0
.Lsub:
	sbcs x8,x23,x14 // tp[j]-np[j]
	ldr x23,[x22],#8
	sub x21,x21,#8 // j--
	ldr x14,[x3],#8
	str x8,[x1],#8 // rp[j]=tp[j]-np[j]
	cbnz x21,.Lsub

	sbcs x8,x23,x14
	sbcs x19,x19,xzr // did it borrow?
	str x8,[x1],#8 // rp[num-1]

	ldr x23,[sp] // tp[0]
	add x22,sp,#8
	ldr x8,[x0],#8 // rp[0]
	sub x5,x5,#8 // num--
	nop
.Lcond_copy:
	sub x5,x5,#8 // num--
	csel x14,x23,x8,lo // did it borrow?
	ldr x23,[x22],#8
	ldr x8,[x0],#8
	str xzr,[x22,#-16] // wipe tp
	str x14,[x0,#-16]
	cbnz x5,.Lcond_copy

	csel x14,x23,x8,lo
	str xzr,[x22,#-8] // wipe tp
	str x14,[x0,#-8]

	ldp x19,x20,[x29,#16]
	mov sp,x29
	ldp x21,x22,[x29,#32]
	mov x0,#1
	ldp x23,x24,[x29,#48]
	ldr x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size bn_mul_mont,.-bn_mul_mont
.type __bn_sqr8x_mont,%function
.align 5
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	cmp x1,x2
	b.ne __bn_mul4x_mont
.Lsqr8x_mont:
	stp x29,x30,[sp,#-128]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]
	stp x0,x3,[sp,#96] // offload rp and np

	ldp x6,x7,[x1,#8*0]
	ldp x8,x9,[x1,#8*2]
	ldp x10,x11,[x1,#8*4]
	ldp x12,x13,[x1,#8*6]

	sub x2,sp,x5,lsl#4
	lsl x5,x5,#3
	ldr x4,[x4] // *n0
	mov sp,x2 // alloca
	sub x27,x5,#8*8
	b .Lsqr8x_zero_start

.Lsqr8x_zero:
	sub x27,x27,#8*8
	stp xzr,xzr,[x2,#8*0]
	stp xzr,xzr,[x2,#8*2]
	stp xzr,xzr,[x2,#8*4]
	stp xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp xzr,xzr,[x2,#8*8]
	stp xzr,xzr,[x2,#8*10]
	stp xzr,xzr,[x2,#8*12]
	stp xzr,xzr,[x2,#8*14]
	add x2,x2,#8*16
	cbnz x27,.Lsqr8x_zero

	add x3,x1,x5
	add x1,x1,#8*8
	mov x19,xzr
	mov x20,xzr
	mov x21,xzr
	mov x22,xzr
	mov x23,xzr
	mov x24,xzr
	mov x25,xzr
	mov x26,xzr
	mov x2,sp
	str x4,[x29,#112] // offload n0

	// Multiply everything but a[i]*a[i]
.align 4
.Lsqr8x_outer_loop:
	// a[1]a[0] (i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1] (ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2] (iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3] (iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4] (v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5] (vi)
	// a[7]a[5]
	// a[7]a[6] (vii)

	mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
	mul x15,x8,x6
	mul x16,x9,x6
	mul x17,x10,x6
	adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
	mul x14,x11,x6
	adcs x21,x21,x15
	mul x15,x12,x6
	adcs x22,x22,x16
	mul x16,x13,x6
	adcs x23,x23,x17
	umulh x17,x7,x6 // hi(a[1..7]*a[0])
	adcs x24,x24,x14
	umulh x14,x8,x6
	adcs x25,x25,x15
	umulh x15,x9,x6
	adcs x26,x26,x16
	umulh x16,x10,x6
	stp x19,x20,[x2],#8*2 // t[0..1]
	adc x19,xzr,xzr // t[8]
	adds x21,x21,x17 // t[2]+hi(a[1]*a[0])
	umulh x17,x11,x6
	adcs x22,x22,x14
	umulh x14,x12,x6
	adcs x23,x23,x15
	umulh x15,x13,x6
	adcs x24,x24,x16
	mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
	adcs x25,x25,x17
	mul x17,x9,x7
	adcs x26,x26,x14
	mul x14,x10,x7
	adc x19,x19,x15

	mul x15,x11,x7
	adds x22,x22,x16
	mul x16,x12,x7
	adcs x23,x23,x17
	mul x17,x13,x7
	adcs x24,x24,x14
	umulh x14,x8,x7 // hi(a[2..7]*a[1])
	adcs x25,x25,x15
	umulh x15,x9,x7
	adcs x26,x26,x16
	umulh x16,x10,x7
	adcs x19,x19,x17
	umulh x17,x11,x7
	stp x21,x22,[x2],#8*2 // t[2..3]
	adc x20,xzr,xzr // t[9]
	adds x23,x23,x14
	umulh x14,x12,x7
	adcs x24,x24,x15
	umulh x15,x13,x7
	adcs x25,x25,x16
	mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
	adcs x26,x26,x17
	mul x17,x10,x8
	adcs x19,x19,x14
	mul x14,x11,x8
	adc x20,x20,x15

	mul x15,x12,x8
	adds x24,x24,x16
	mul x16,x13,x8
	adcs x25,x25,x17
	umulh x17,x9,x8 // hi(a[3..7]*a[2])
	adcs x26,x26,x14
	umulh x14,x10,x8
	adcs x19,x19,x15
	umulh x15,x11,x8
	adcs x20,x20,x16
	umulh x16,x12,x8
	stp x23,x24,[x2],#8*2 // t[4..5]
	adc x21,xzr,xzr // t[10]
	adds x25,x25,x17
	umulh x17,x13,x8
	adcs x26,x26,x14
	mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
	adcs x19,x19,x15
	mul x15,x11,x9
	adcs x20,x20,x16
	mul x16,x12,x9
	adc x21,x21,x17

	mul x17,x13,x9
	adds x26,x26,x14
	umulh x14,x10,x9 // hi(a[4..7]*a[3])
	adcs x19,x19,x15
	umulh x15,x11,x9
	adcs x20,x20,x16
	umulh x16,x12,x9
	adcs x21,x21,x17
	umulh x17,x13,x9
	stp x25,x26,[x2],#8*2 // t[6..7]
	adc x22,xzr,xzr // t[11]
	adds x19,x19,x14
	mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
	adcs x20,x20,x15
	mul x15,x12,x10
	adcs x21,x21,x16
	mul x16,x13,x10
	adc x22,x22,x17

	umulh x17,x11,x10 // hi(a[5..7]*a[4])
	adds x20,x20,x14
	umulh x14,x12,x10
	adcs x21,x21,x15
	umulh x15,x13,x10
	adcs x22,x22,x16
	mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
	adc x23,xzr,xzr // t[12]
	adds x21,x21,x17
	mul x17,x13,x11
	adcs x22,x22,x14
	umulh x14,x12,x11 // hi(a[6..7]*a[5])
	adc x23,x23,x15

	umulh x15,x13,x11
	adds x22,x22,x16
	mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
	adcs x23,x23,x17
	umulh x17,x13,x12 // hi(a[7]*a[6])
	adc x24,xzr,xzr // t[13]
	adds x23,x23,x14
	sub x27,x3,x1 // done yet?
	adc x24,x24,x15

	adds x24,x24,x16
	sub x14,x3,x5 // rewound ap
	adc x25,xzr,xzr // t[14]
	add x25,x25,x17

	cbz x27,.Lsqr8x_outer_break

	mov x4,x6
	ldp x6,x7,[x2,#8*0]
	ldp x8,x9,[x2,#8*2]
	ldp x10,x11,[x2,#8*4]
	ldp x12,x13,[x2,#8*6]
	adds x19,x19,x6
	adcs x20,x20,x7
	ldp x6,x7,[x1,#8*0]
	adcs x21,x21,x8
	adcs x22,x22,x9
	ldp x8,x9,[x1,#8*2]
	adcs x23,x23,x10
	adcs x24,x24,x11
	ldp x10,x11,[x1,#8*4]
	adcs x25,x25,x12
	mov x0,x1
	adcs x26,xzr,x13
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8
	//adc x28,xzr,xzr // moved below
	mov x27,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
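	// Each dotted row above stands for the full run a[8]a[i] through
	// a[f]a[i]. One pass of .Lsqr8x_mul below accumulates the eight
	// products a[8..f]*a[i] for the a[i] currently held in x4; the
	// next a[i] is fetched with "ldr x4,[x0,x27]" inside the loop.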
.Lsqr8x_mul:
	mul x14,x6,x4
	adc x28,xzr,xzr // carry bit, modulo-scheduled
	mul x15,x7,x4
	add x27,x27,#8
	mul x16,x8,x4
	mul x17,x9,x4
	adds x19,x19,x14
	mul x14,x10,x4
	adcs x20,x20,x15
	mul x15,x11,x4
	adcs x21,x21,x16
	mul x16,x12,x4
	adcs x22,x22,x17
	mul x17,x13,x4
	adcs x23,x23,x14
	umulh x14,x6,x4
	adcs x24,x24,x15
	umulh x15,x7,x4
	adcs x25,x25,x16
	umulh x16,x8,x4
	adcs x26,x26,x17
	umulh x17,x9,x4
	adc x28,x28,xzr
	str x19,[x2],#8
	adds x19,x20,x14
	umulh x14,x10,x4
	adcs x20,x21,x15
	umulh x15,x11,x4
	adcs x21,x22,x16
	umulh x16,x12,x4
	adcs x22,x23,x17
	umulh x17,x13,x4
	ldr x4,[x0,x27]
	adcs x23,x24,x14
	adcs x24,x25,x15
	adcs x25,x26,x16
	adcs x26,x28,x17
	//adc x28,xzr,xzr // moved above
	cbnz x27,.Lsqr8x_mul
	// note that carry flag is guaranteed
	// to be zero at this point
	cmp x1,x3 // done yet?
	b.eq .Lsqr8x_break

	ldp x6,x7,[x2,#8*0]
	ldp x8,x9,[x2,#8*2]
	ldp x10,x11,[x2,#8*4]
	ldp x12,x13,[x2,#8*6]
	adds x19,x19,x6
	ldr x4,[x0,#-8*8]
	adcs x20,x20,x7
	ldp x6,x7,[x1,#8*0]
	adcs x21,x21,x8
	adcs x22,x22,x9
	ldp x8,x9,[x1,#8*2]
	adcs x23,x23,x10
	adcs x24,x24,x11
	ldp x10,x11,[x1,#8*4]
	adcs x25,x25,x12
	mov x27,#-8*8
	adcs x26,x26,x13
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8
	//adc x28,xzr,xzr // moved above
	b .Lsqr8x_mul

.align 4
.Lsqr8x_break:
	ldp x6,x7,[x0,#8*0]
	add x1,x0,#8*8
	ldp x8,x9,[x0,#8*2]
	sub x14,x3,x1 // is it last iteration?
	ldp x10,x11,[x0,#8*4]
	sub x15,x2,x14
	ldp x12,x13,[x0,#8*6]
	cbz x14,.Lsqr8x_outer_loop

	stp x19,x20,[x2,#8*0]
	ldp x19,x20,[x15,#8*0]
	stp x21,x22,[x2,#8*2]
	ldp x21,x22,[x15,#8*2]
	stp x23,x24,[x2,#8*4]
	ldp x23,x24,[x15,#8*4]
	stp x25,x26,[x2,#8*6]
	mov x2,x15
	ldp x25,x26,[x15,#8*6]
	b .Lsqr8x_outer_loop

.align 4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
	ldp x15,x16,[sp,#8*1]
	ldp x11,x13,[x14,#8*2]
	add x1,x14,#8*4
	ldp x17,x14,[sp,#8*3]

	stp x19,x20,[x2,#8*0]
	mul x19,x7,x7
	stp x21,x22,[x2,#8*2]
	umulh x7,x7,x7
	stp x23,x24,[x2,#8*4]
	mul x8,x9,x9
	stp x25,x26,[x2,#8*6]
	mov x2,sp
	umulh x9,x9,x9
	adds x20,x7,x15,lsl#1
	extr x15,x16,x15,#63
	sub x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs x21,x8,x15
	extr x16,x17,x16,#63
	sub x27,x27,#8*4
	adcs x22,x9,x16
	ldp x15,x16,[x2,#8*5]
	mul x10,x11,x11
	ldp x7,x9,[x1],#8*2
	umulh x11,x11,x11
	mul x12,x13,x13
	umulh x13,x13,x13
	extr x17,x14,x17,#63
	stp x19,x20,[x2,#8*0]
	adcs x23,x10,x17
	extr x14,x15,x14,#63
	stp x21,x22,[x2,#8*2]
	adcs x24,x11,x14
	ldp x17,x14,[x2,#8*7]
	extr x15,x16,x15,#63
	adcs x25,x12,x15
	extr x16,x17,x16,#63
	adcs x26,x13,x16
	ldp x15,x16,[x2,#8*9]
	mul x6,x7,x7
	ldp x11,x13,[x1],#8*2
	umulh x7,x7,x7
	mul x8,x9,x9
	umulh x9,x9,x9
	stp x23,x24,[x2,#8*4]
	extr x17,x14,x17,#63
	stp x25,x26,[x2,#8*6]
	add x2,x2,#8*8
	adcs x19,x6,x17
	extr x14,x15,x14,#63
	adcs x20,x7,x14
	ldp x17,x14,[x2,#8*3]
	extr x15,x16,x15,#63
	cbnz x27,.Lsqr4x_shift_n_add
	ldp x1,x4,[x29,#104] // pull np and n0

	adcs x21,x8,x15
	extr x16,x17,x16,#63
	adcs x22,x9,x16
	ldp x15,x16,[x2,#8*5]
	mul x10,x11,x11
	umulh x11,x11,x11
	stp x19,x20,[x2,#8*0]
	mul x12,x13,x13
	umulh x13,x13,x13
	stp x21,x22,[x2,#8*2]
	extr x17,x14,x17,#63
	adcs x23,x10,x17
	extr x14,x15,x14,#63
	ldp x19,x20,[sp,#8*0]
	adcs x24,x11,x14
	extr x15,x16,x15,#63
	ldp x6,x7,[x1,#8*0]
	adcs x25,x12,x15
	extr x16,xzr,x16,#63
	ldp x8,x9,[x1,#8*2]
	adc x26,x13,x16
	ldp x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul x28,x4,x19 // t[0]*n0
	ldp x12,x13,[x1,#8*6]
	add x3,x1,x5
	ldp x21,x22,[sp,#8*2]
	stp x23,x24,[x2,#8*4]
	ldp x23,x24,[sp,#8*4]
	stp x25,x26,[x2,#8*6]
	ldp x25,x26,[sp,#8*6]
	add x1,x1,#8*8
	mov x30,xzr // initial top-most carry
	mov x2,sp
	mov x27,#8

.Lsqr8x_reduction:
	// (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
	mul x15,x7,x28
	sub x27,x27,#1
	mul x16,x8,x28
	str x28,[x2],#8 // put aside t[0]*n0 for tail processing
	mul x17,x9,x28
	// (*) adds xzr,x19,x14
	subs xzr,x19,#1 // (*)
	mul x14,x10,x28
	adcs x19,x20,x15
	mul x15,x11,x28
	adcs x20,x21,x16
	mul x16,x12,x28
	adcs x21,x22,x17
	mul x17,x13,x28
	adcs x22,x23,x14
	umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
	adcs x23,x24,x15
	umulh x15,x7,x28
	adcs x24,x25,x16
	umulh x16,x8,x28
	adcs x25,x26,x17
	umulh x17,x9,x28
	adc x26,xzr,xzr
	adds x19,x19,x14
	umulh x14,x10,x28
	adcs x20,x20,x15
	umulh x15,x11,x28
	adcs x21,x21,x16
	umulh x16,x12,x28
	adcs x22,x22,x17
	umulh x17,x13,x28
	mul x28,x4,x19 // next t[0]*n0
	adcs x23,x23,x14
	adcs x24,x24,x15
	adcs x25,x25,x16
	adc x26,x26,x17
	cbnz x27,.Lsqr8x_reduction

	ldp x14,x15,[x2,#8*0]
	ldp x16,x17,[x2,#8*2]
	mov x0,x2
	sub x27,x3,x1 // done yet?
	adds x19,x19,x14
	adcs x20,x20,x15
	ldp x14,x15,[x2,#8*4]
	adcs x21,x21,x16
	adcs x22,x22,x17
	ldp x16,x17,[x2,#8*6]
	adcs x23,x23,x14
	adcs x24,x24,x15
	adcs x25,x25,x16
	adcs x26,x26,x17
	//adc x28,xzr,xzr // moved below
	cbz x27,.Lsqr8x8_post_condition

	ldr x4,[x2,#-8*8]
	ldp x6,x7,[x1,#8*0]
	ldp x8,x9,[x1,#8*2]
	ldp x10,x11,[x1,#8*4]
	mov x27,#-8*8
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8

.Lsqr8x_tail:
	mul x14,x6,x4
	adc x28,xzr,xzr // carry bit, modulo-scheduled
	mul x15,x7,x4
	add x27,x27,#8
	mul x16,x8,x4
	mul x17,x9,x4
	adds x19,x19,x14
	mul x14,x10,x4
	adcs x20,x20,x15
	mul x15,x11,x4
	adcs x21,x21,x16
	mul x16,x12,x4
	adcs x22,x22,x17
	mul x17,x13,x4
	adcs x23,x23,x14
	umulh x14,x6,x4
	adcs x24,x24,x15
	umulh x15,x7,x4
	adcs x25,x25,x16
	umulh x16,x8,x4
	adcs x26,x26,x17
	umulh x17,x9,x4
	adc x28,x28,xzr
	str x19,[x2],#8
	adds x19,x20,x14
	umulh x14,x10,x4
	adcs x20,x21,x15
	umulh x15,x11,x4
	adcs x21,x22,x16
	umulh x16,x12,x4
	adcs x22,x23,x17
	umulh x17,x13,x4
	ldr x4,[x0,x27]
	adcs x23,x24,x14
	adcs x24,x25,x15
	adcs x25,x26,x16
	adcs x26,x28,x17
	//adc x28,xzr,xzr // moved above
	cbnz x27,.Lsqr8x_tail
	// note that carry flag is guaranteed
	// to be zero at this point
	ldp x6,x7,[x2,#8*0]
	sub x27,x3,x1 // done yet?
	sub x16,x3,x5 // rewound np
	ldp x8,x9,[x2,#8*2]
	ldp x10,x11,[x2,#8*4]
	ldp x12,x13,[x2,#8*6]
	cbz x27,.Lsqr8x_tail_break

	ldr x4,[x0,#-8*8]
	adds x19,x19,x6
	adcs x20,x20,x7
	ldp x6,x7,[x1,#8*0]
	adcs x21,x21,x8
	adcs x22,x22,x9
	ldp x8,x9,[x1,#8*2]
	adcs x23,x23,x10
	adcs x24,x24,x11
	ldp x10,x11,[x1,#8*4]
	adcs x25,x25,x12
	mov x27,#-8*8
	adcs x26,x26,x13
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8
	//adc x28,xzr,xzr // moved above
	b .Lsqr8x_tail

.align 4
.Lsqr8x_tail_break:
	ldr x4,[x29,#112] // pull n0
	add x27,x2,#8*8 // end of current t[num] window

	subs xzr,x30,#1 // "move" top-most carry to carry bit
	adcs x14,x19,x6
	adcs x15,x20,x7
	ldp x19,x20,[x0,#8*0]
	adcs x21,x21,x8
	ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
	adcs x22,x22,x9
	ldp x8,x9,[x16,#8*2]
	adcs x23,x23,x10
	adcs x24,x24,x11
	ldp x10,x11,[x16,#8*4]
	adcs x25,x25,x12
	adcs x26,x26,x13
	ldp x12,x13,[x16,#8*6]
	add x1,x16,#8*8
	adc x30,xzr,xzr // top-most carry
	mul x28,x4,x19
	stp x14,x15,[x2,#8*0]
	stp x21,x22,[x2,#8*2]
	ldp x21,x22,[x0,#8*2]
	stp x23,x24,[x2,#8*4]
	ldp x23,x24,[x0,#8*4]
	cmp x27,x29 // did we hit the bottom?
	stp x25,x26,[x2,#8*6]
	mov x2,x0 // slide the window
	ldp x25,x26,[x0,#8*6]
	mov x27,#8
	b.ne .Lsqr8x_reduction

	// Final step. If the result is not smaller than the modulus,
	// subtract the modulus. But comparison itself implies
	// subtraction, so we subtract the modulus, check whether the
	// subtraction borrowed, and conditionally copy the original
	// value back.
	ldr x0,[x29,#96] // pull rp
	add x2,x2,#8*8
	subs x14,x19,x6
	sbcs x15,x20,x7
	sub x27,x5,#8*8
	mov x3,x0 // x0 copy

.Lsqr8x_sub:
	sbcs x16,x21,x8
	ldp x6,x7,[x1,#8*0]
	sbcs x17,x22,x9
	stp x14,x15,[x0,#8*0]
	sbcs x14,x23,x10
	ldp x8,x9,[x1,#8*2]
	sbcs x15,x24,x11
	stp x16,x17,[x0,#8*2]
	sbcs x16,x25,x12
	ldp x10,x11,[x1,#8*4]
	sbcs x17,x26,x13
	ldp x12,x13,[x1,#8*6]
	add x1,x1,#8*8
	ldp x19,x20,[x2,#8*0]
	sub x27,x27,#8*8
	ldp x21,x22,[x2,#8*2]
	ldp x23,x24,[x2,#8*4]
	ldp x25,x26,[x2,#8*6]
	add x2,x2,#8*8
	stp x14,x15,[x0,#8*4]
	sbcs x14,x19,x6
	stp x16,x17,[x0,#8*6]
	add x0,x0,#8*8
	sbcs x15,x20,x7
	cbnz x27,.Lsqr8x_sub

	sbcs x16,x21,x8
	mov x2,sp
	add x1,sp,x5
	ldp x6,x7,[x3,#8*0]
	sbcs x17,x22,x9
	stp x14,x15,[x0,#8*0]
	sbcs x14,x23,x10
	ldp x8,x9,[x3,#8*2]
	sbcs x15,x24,x11
	stp x16,x17,[x0,#8*2]
	sbcs x16,x25,x12
	ldp x19,x20,[x1,#8*0]
	sbcs x17,x26,x13
	ldp x21,x22,[x1,#8*2]
	sbcs xzr,x30,xzr // did it borrow?
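	// From here on, carry clear ("lo") means the subtraction above
	// borrowed, i.e. the pre-subtraction value (including the
	// top-most carry in x30) was already below the modulus. The
	// csel ...,lo instructions in the copy loop below therefore keep
	// the original words and discard the subtracted ones; either way
	// tp is wiped with zeros as it is consumed.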
	ldr x30,[x29,#8] // pull return address
	stp x14,x15,[x0,#8*4]
	stp x16,x17,[x0,#8*6]

	sub x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub x27,x27,#8*4
	csel x14,x19,x6,lo
	stp xzr,xzr,[x2,#8*0]
	csel x15,x20,x7,lo
	ldp x6,x7,[x3,#8*4]
	ldp x19,x20,[x1,#8*4]
	csel x16,x21,x8,lo
	stp xzr,xzr,[x2,#8*2]
	add x2,x2,#8*4
	csel x17,x22,x9,lo
	ldp x8,x9,[x3,#8*6]
	ldp x21,x22,[x1,#8*6]
	add x1,x1,#8*4
	stp x14,x15,[x3,#8*0]
	stp x16,x17,[x3,#8*2]
	add x3,x3,#8*4
	stp xzr,xzr,[x1,#8*0]
	stp xzr,xzr,[x1,#8*2]
	cbnz x27,.Lsqr4x_cond_copy

	csel x14,x19,x6,lo
	stp xzr,xzr,[x2,#8*0]
	csel x15,x20,x7,lo
	stp xzr,xzr,[x2,#8*2]
	csel x16,x21,x8,lo
	csel x17,x22,x9,lo
	stp x14,x15,[x3,#8*0]
	stp x16,x17,[x3,#8*2]

	b .Lsqr8x_done

.align 4
.Lsqr8x8_post_condition:
	adc x28,xzr,xzr
	ldr x30,[x29,#8] // pull return address
	// x19-x26,x28 hold result, x6-x13 hold modulus
	subs x6,x19,x6
	ldr x1,[x29,#96] // pull rp
	sbcs x7,x20,x7
	stp xzr,xzr,[sp,#8*0]
	sbcs x8,x21,x8
	stp xzr,xzr,[sp,#8*2]
	sbcs x9,x22,x9
	stp xzr,xzr,[sp,#8*4]
	sbcs x10,x23,x10
	stp xzr,xzr,[sp,#8*6]
	sbcs x11,x24,x11
	stp xzr,xzr,[sp,#8*8]
	sbcs x12,x25,x12
	stp xzr,xzr,[sp,#8*10]
	sbcs x13,x26,x13
	stp xzr,xzr,[sp,#8*12]
	sbcs x28,x28,xzr // did it borrow?
	stp xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel x6,x19,x6,lo
	csel x7,x20,x7,lo
	csel x8,x21,x8,lo
	csel x9,x22,x9,lo
	stp x6,x7,[x1,#8*0]
	csel x10,x23,x10,lo
	csel x11,x24,x11,lo
	stp x8,x9,[x1,#8*2]
	csel x12,x25,x12,lo
	csel x13,x26,x13,lo
	stp x10,x11,[x1,#8*4]
	stp x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp x19,x20,[x29,#16]
	mov sp,x29
	ldp x21,x22,[x29,#32]
	mov x0,#1
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldr x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
.type __bn_mul4x_mont,%function
.align 5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont or __bn_sqr8x_mont, which have already signed the
	// return address.
	stp x29,x30,[sp,#-128]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]

	sub x26,sp,x5,lsl#3
	lsl x5,x5,#3
	ldr x4,[x4] // *n0
	sub sp,x26,#8*4 // alloca

	add x10,x2,x5
	add x27,x1,x5
	stp x0,x10,[x29,#96] // offload rp and &b[num]

	ldr x24,[x2,#8*0] // b[0]
	ldp x6,x7,[x1,#8*0] // a[0..3]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	mov x19,xzr
	mov x20,xzr
	mov x21,xzr
	mov x22,xzr
	ldp x14,x15,[x3,#8*0] // n[0..3]
	ldp x16,x17,[x3,#8*2]
	adds x3,x3,#8*4 // clear carry bit
	mov x0,xzr
	mov x28,#0
	mov x26,sp

.Loop_mul4x_1st_reduction:
	mul x10,x6,x24 // lo(a[0..3]*b[0])
	adc x0,x0,xzr // modulo-scheduled
	mul x11,x7,x24
	add x28,x28,#8
	mul x12,x8,x24
	and x28,x28,#31
	mul x13,x9,x24
	adds x19,x19,x10
	umulh x10,x6,x24 // hi(a[0..3]*b[0])
	adcs x20,x20,x11
	mul x25,x19,x4 // t[0]*n0
	adcs x21,x21,x12
	umulh x11,x7,x24
	adcs x22,x22,x13
	umulh x12,x8,x24
	adc x23,xzr,xzr
	umulh x13,x9,x24
	ldr x24,[x2,x28] // next b[i] (or b[0])
	adds x20,x20,x10
	// (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
	str x25,[x26],#8 // put aside t[0]*n0 for tail processing
	adcs x21,x21,x11
	mul x11,x15,x25
	adcs x22,x22,x12
	mul x12,x16,x25
	adc x23,x23,x13 // can't overflow
	mul x13,x17,x25
	// (*) adds xzr,x19,x10
	subs xzr,x19,#1 // (*)
	umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
	adcs x19,x20,x11
	umulh x11,x15,x25
	adcs x20,x21,x12
	umulh x12,x16,x25
	adcs x21,x22,x13
	umulh x13,x17,x25
	adcs x22,x23,x0
	adc x0,xzr,xzr
	adds x19,x19,x10
	sub x10,x27,x1
	adcs x20,x20,x11
	adcs x21,x21,x12
	adcs x22,x22,x13
	//adc x0,x0,xzr
	cbnz x28,.Loop_mul4x_1st_reduction

	cbz x10,.Lmul4x4_post_condition

	ldp x6,x7,[x1,#8*0] // a[4..7]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	ldr x25,[sp] // t[0]*n0
	ldp x14,x15,[x3,#8*0] // n[4..7]
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul x10,x6,x24 // lo(a[4..7]*b[i])
	adc x0,x0,xzr // modulo-scheduled
	mul x11,x7,x24
	add x28,x28,#8
	mul x12,x8,x24
	and x28,x28,#31
	mul x13,x9,x24
	adds x19,x19,x10
	umulh x10,x6,x24 // hi(a[4..7]*b[i])
	adcs x20,x20,x11
	umulh x11,x7,x24
	adcs x21,x21,x12
	umulh x12,x8,x24
	adcs x22,x22,x13
	umulh x13,x9,x24
	adc x23,xzr,xzr
	ldr x24,[x2,x28] // next b[i] (or b[0])
	adds x20,x20,x10
	mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
	adcs x21,x21,x11
	mul x11,x15,x25
	adcs x22,x22,x12
	mul x12,x16,x25
	adc x23,x23,x13 // can't overflow
	mul x13,x17,x25
	adds x19,x19,x10
	umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
	adcs x20,x20,x11
	umulh x11,x15,x25
	adcs x21,x21,x12
	umulh x12,x16,x25
	adcs x22,x22,x13
	adcs x23,x23,x0
	umulh x13,x17,x25
	adc x0,xzr,xzr
	ldr x25,[sp,x28] // next t[0]*n0
	str x19,[x26],#8 // result!!!
	adds x19,x20,x10
	sub x10,x27,x1 // done yet?
	adcs x20,x21,x11
	adcs x21,x22,x12
	adcs x22,x23,x13
	//adc x0,x0,xzr
	cbnz x28,.Loop_mul4x_1st_tail

	sub x11,x27,x5 // rewound x1
	cbz x10,.Lmul4x_proceed

	ldp x6,x7,[x1,#8*0]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	ldp x14,x15,[x3,#8*0]
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4
	b .Loop_mul4x_1st_tail

.align 5
.Lmul4x_proceed:
	ldr x24,[x2,#8*4]! // *++b
	adc x30,x0,xzr
	ldp x6,x7,[x11,#8*0] // a[0..3]
	sub x3,x3,x5 // rewind np
	ldp x8,x9,[x11,#8*2]
	add x1,x11,#8*4

	stp x19,x20,[x26,#8*0] // result!!!
	ldp x19,x20,[sp,#8*4] // t[0..3]
	stp x21,x22,[x26,#8*2] // result!!!
	ldp x21,x22,[sp,#8*6]

	ldp x14,x15,[x3,#8*0] // n[0..3]
	mov x26,sp
	ldp x16,x17,[x3,#8*2]
	adds x3,x3,#8*4 // clear carry bit
	mov x0,xzr

.align 4
.Loop_mul4x_reduction:
	mul x10,x6,x24 // lo(a[0..3]*b[4])
	adc x0,x0,xzr // modulo-scheduled
	mul x11,x7,x24
	add x28,x28,#8
	mul x12,x8,x24
	and x28,x28,#31
	mul x13,x9,x24
	adds x19,x19,x10
	umulh x10,x6,x24 // hi(a[0..3]*b[4])
	adcs x20,x20,x11
	mul x25,x19,x4 // t[0]*n0
	adcs x21,x21,x12
	umulh x11,x7,x24
	adcs x22,x22,x13
	umulh x12,x8,x24
	adc x23,xzr,xzr
	umulh x13,x9,x24
	ldr x24,[x2,x28] // next b[i]
	adds x20,x20,x10
	// (*) mul x10,x14,x25
	str x25,[x26],#8 // put aside t[0]*n0 for tail processing
	adcs x21,x21,x11
	mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
	adcs x22,x22,x12
	mul x12,x16,x25
	adc x23,x23,x13 // can't overflow
	mul x13,x17,x25
	// (*) adds xzr,x19,x10
	subs xzr,x19,#1 // (*)
	umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
	adcs x19,x20,x11
	umulh x11,x15,x25
	adcs x20,x21,x12
	umulh x12,x16,x25
	adcs x21,x22,x13
	umulh x13,x17,x25
	adcs x22,x23,x0
	adc x0,xzr,xzr
	adds x19,x19,x10
	adcs x20,x20,x11
	adcs x21,x21,x12
	adcs x22,x22,x13
	//adc x0,x0,xzr
	cbnz x28,.Loop_mul4x_reduction

	adc x0,x0,xzr
	ldp x10,x11,[x26,#8*4] // t[4..7]
	ldp x12,x13,[x26,#8*6]
	ldp x6,x7,[x1,#8*0] // a[4..7]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	adds x19,x19,x10
	adcs x20,x20,x11
	adcs x21,x21,x12
	adcs x22,x22,x13
	//adc x0,x0,xzr

	ldr x25,[sp] // t[0]*n0
	ldp x14,x15,[x3,#8*0] // n[4..7]
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4

.align 4
.Loop_mul4x_tail:
	mul x10,x6,x24 // lo(a[4..7]*b[4])
	adc x0,x0,xzr // modulo-scheduled
	mul x11,x7,x24
	add x28,x28,#8
	mul x12,x8,x24
	and x28,x28,#31
	mul x13,x9,x24
	adds x19,x19,x10
	umulh x10,x6,x24 // hi(a[4..7]*b[4])
	adcs x20,x20,x11
	umulh x11,x7,x24
	adcs x21,x21,x12
	umulh x12,x8,x24
	adcs x22,x22,x13
	umulh x13,x9,x24
	adc x23,xzr,xzr
	ldr x24,[x2,x28] // next b[i]
	adds x20,x20,x10
	mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
	adcs x21,x21,x11
	mul x11,x15,x25
	adcs x22,x22,x12
	mul x12,x16,x25
	adc x23,x23,x13 // can't overflow
	mul x13,x17,x25
	adds x19,x19,x10
	umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
	adcs x20,x20,x11
	umulh x11,x15,x25
	adcs x21,x21,x12
	umulh x12,x16,x25
	adcs x22,x22,x13
	umulh x13,x17,x25
	adcs x23,x23,x0
	ldr x25,[sp,x28] // next t[0]*n0
	adc x0,xzr,xzr
	str x19,[x26],#8 // result!!!
	adds x19,x20,x10
	sub x10,x27,x1 // done yet?
	adcs x20,x21,x11
	adcs x21,x22,x12
	adcs x22,x23,x13
	//adc x0,x0,xzr
	cbnz x28,.Loop_mul4x_tail

	sub x11,x3,x5 // rewound np?
	adc x0,x0,xzr
	cbz x10,.Loop_mul4x_break

	ldp x10,x11,[x26,#8*4]
	ldp x12,x13,[x26,#8*6]
	ldp x6,x7,[x1,#8*0]
	ldp x8,x9,[x1,#8*2]
	add x1,x1,#8*4
	adds x19,x19,x10
	adcs x20,x20,x11
	adcs x21,x21,x12
	adcs x22,x22,x13
	//adc x0,x0,xzr
	ldp x14,x15,[x3,#8*0]
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4
	b .Loop_mul4x_tail

.align 4
.Loop_mul4x_break:
	ldp x12,x13,[x29,#96] // pull rp and &b[num]
	adds x19,x19,x30
	add x2,x2,#8*4 // bp++
	adcs x20,x20,xzr
	sub x1,x1,x5 // rewind ap
	adcs x21,x21,xzr
	stp x19,x20,[x26,#8*0] // result!!!
	adcs x22,x22,xzr
	ldp x19,x20,[sp,#8*4] // t[0..3]
	adc x30,x0,xzr
	stp x21,x22,[x26,#8*2] // result!!!
	cmp x2,x13 // done yet?
	ldp x21,x22,[sp,#8*6]
	ldp x14,x15,[x11,#8*0] // n[0..3]
	ldp x16,x17,[x11,#8*2]
	add x3,x11,#8*4
	b.eq .Lmul4x_post

	ldr x24,[x2]
	ldp x6,x7,[x1,#8*0] // a[0..3]
	ldp x8,x9,[x1,#8*2]
	adds x1,x1,#8*4 // clear carry bit
	mov x0,xzr
	mov x26,sp
	b .Loop_mul4x_reduction

.align 4
.Lmul4x_post:
	// Final step. If the result is not smaller than the modulus,
	// subtract the modulus. But comparison itself implies
	// subtraction, so we subtract the modulus, check whether the
	// subtraction borrowed, and conditionally copy the original
	// value back.
	mov x0,x12
	mov x27,x12 // x0 copy
	subs x10,x19,x14
	add x26,sp,#8*8
	sbcs x11,x20,x15
	sub x28,x5,#8*4

.Lmul4x_sub:
	sbcs x12,x21,x16
	ldp x14,x15,[x3,#8*0]
	sub x28,x28,#8*4
	ldp x19,x20,[x26,#8*0]
	sbcs x13,x22,x17
	ldp x16,x17,[x3,#8*2]
	add x3,x3,#8*4
	ldp x21,x22,[x26,#8*2]
	add x26,x26,#8*4
	stp x10,x11,[x0,#8*0]
	sbcs x10,x19,x14
	stp x12,x13,[x0,#8*2]
	add x0,x0,#8*4
	sbcs x11,x20,x15
	cbnz x28,.Lmul4x_sub

	sbcs x12,x21,x16
	mov x26,sp
	add x1,sp,#8*4
	ldp x6,x7,[x27,#8*0]
	sbcs x13,x22,x17
	stp x10,x11,[x0,#8*0]
	ldp x8,x9,[x27,#8*2]
	stp x12,x13,[x0,#8*2]
	ldp x19,x20,[x1,#8*0]
	ldp x21,x22,[x1,#8*2]
	sbcs xzr,x30,xzr // did it borrow?
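	// Same convention as the sqr8x conditional copy above: "lo"
	// (borrow) selects the un-subtracted words in .Lmul4x_cond_copy
	// below, and the temporary storage is wiped with zeros.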
	ldr x30,[x29,#8] // pull return address

	sub x28,x5,#8*4
.Lmul4x_cond_copy:
	sub x28,x28,#8*4
	csel x10,x19,x6,lo
	stp xzr,xzr,[x26,#8*0]
	csel x11,x20,x7,lo
	ldp x6,x7,[x27,#8*4]
	ldp x19,x20,[x1,#8*4]
	csel x12,x21,x8,lo
	stp xzr,xzr,[x26,#8*2]
	add x26,x26,#8*4
	csel x13,x22,x9,lo
	ldp x8,x9,[x27,#8*6]
	ldp x21,x22,[x1,#8*6]
	add x1,x1,#8*4
	stp x10,x11,[x27,#8*0]
	stp x12,x13,[x27,#8*2]
	add x27,x27,#8*4
	cbnz x28,.Lmul4x_cond_copy

	csel x10,x19,x6,lo
	stp xzr,xzr,[x26,#8*0]
	csel x11,x20,x7,lo
	stp xzr,xzr,[x26,#8*2]
	csel x12,x21,x8,lo
	stp xzr,xzr,[x26,#8*3]
	csel x13,x22,x9,lo
	stp xzr,xzr,[x26,#8*4]
	stp x10,x11,[x27,#8*0]
	stp x12,x13,[x27,#8*2]

	b .Lmul4x_done

.align 4
.Lmul4x4_post_condition:
	adc x0,x0,xzr
	ldr x1,[x29,#96] // pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs x6,x19,x14
	ldr x30,[x29,#8] // pull return address
	sbcs x7,x20,x15
	stp xzr,xzr,[sp,#8*0]
	sbcs x8,x21,x16
	stp xzr,xzr,[sp,#8*2]
	sbcs x9,x22,x17
	stp xzr,xzr,[sp,#8*4]
	sbcs xzr,x0,xzr // did it borrow?
	stp xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel x6,x19,x6,lo
	csel x7,x20,x7,lo
	csel x8,x21,x8,lo
	csel x9,x22,x9,lo
	stp x6,x7,[x1,#8*0]
	stp x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp x19,x20,[x29,#16]
	mov sp,x29
	ldp x21,x22,[x29,#32]
	mov x0,#1
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldr x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size __bn_mul4x_mont,.-__bn_mul4x_mont
.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits
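// Reference note (a sketch of the contract, inferred from the register
// usage above rather than stated in this file): bn_mul_mont computes a
// Montgomery product,
//
//	rp[0..num-1] = ap[] * bp[] * R^-1 mod np[],  R = 2^(64*num),
//
// with arguments x0=rp, x1=ap, x2=bp, x3=np, x4=&n0 where
// n0 = -np[0]^-1 mod 2^64, and x5=num; it returns 1 in x0. The 8x
// squaring path is taken when num is a multiple of 8 and ap == bp,
// the 4x path when num is a multiple of 4, and .Lmul_mont otherwise.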