// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl bn_mul_mont

.def bn_mul_mont
    .type 32
.endef
.align 5
bn_mul_mont:
    AARCH64_SIGN_LINK_REGISTER
    tst x5,#7
    b.eq __bn_sqr8x_mont
    tst x5,#3
    b.eq __bn_mul4x_mont
Lmul_mont:
    stp x29,x30,[sp,#-64]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]

    ldr x9,[x2],#8 // bp[0]
    sub x22,sp,x5,lsl#3
    ldp x7,x8,[x1],#16 // ap[0..1]
    lsl x5,x5,#3
    ldr x4,[x4] // *n0
    and x22,x22,#-16 // ABI says so
    ldp x13,x14,[x3],#16 // np[0..1]

    mul x6,x7,x9 // ap[0]*bp[0]
    sub x21,x5,#16 // j=num-2
    umulh x7,x7,x9
    mul x10,x8,x9 // ap[1]*bp[0]
    umulh x11,x8,x9

    mul x15,x6,x4 // "tp[0]"*n0
    mov sp,x22 // alloca

    // (*) mul x12,x13,x15 // np[0]*m1
    umulh x13,x13,x15
    mul x16,x14,x15 // np[1]*m1
    // (*) adds x12,x12,x6 // discarded
    // (*) On the removal of the first multiplication and addition:
    //     the outcome of the first addition is guaranteed to be zero,
    //     which leaves only two computationally significant outcomes,
    //     namely whether it carries or not. So when does it carry?
    //     Following the operations shows that the condition for a carry
    //     is simply that x6 is non-zero, so the carry can be recovered
    //     by adding -1 to x6. That is what the next instruction does.
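    // Editorial sketch (not generated output): the identity being used,
    // in C-like terms, with n0 == -n[0]^-1 mod 2^64:
    //     m1 = tp0 * n0;              // mod 2^64
    //     lo = np[0] * m1;            // == (0 - tp0) mod 2^64
    //     (lo + tp0) mod 2^64 == 0, carrying out iff tp0 != 0
    // `subs xzr,x6,#1` sets C exactly when x6 != 0, reproducing that carry.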
    subs xzr,x6,#1 // (*)
    umulh x17,x14,x15
    adc x13,x13,xzr
    cbz x21,L1st_skip

L1st:
    ldr x8,[x1],#8
    adds x6,x10,x7
    sub x21,x21,#8 // j--
    adc x7,x11,xzr

    ldr x14,[x3],#8
    adds x12,x16,x13
    mul x10,x8,x9 // ap[j]*bp[0]
    adc x13,x17,xzr
    umulh x11,x8,x9

    adds x12,x12,x6
    mul x16,x14,x15 // np[j]*m1
    adc x13,x13,xzr
    umulh x17,x14,x15
    str x12,[x22],#8 // tp[j-1]
    cbnz x21,L1st

L1st_skip:
    adds x6,x10,x7
    sub x1,x1,x5 // rewind x1
    adc x7,x11,xzr

    adds x12,x16,x13
    sub x3,x3,x5 // rewind x3
    adc x13,x17,xzr

    adds x12,x12,x6
    sub x20,x5,#8 // i=num-1
    adcs x13,x13,x7

    adc x19,xzr,xzr // upmost overflow bit
    stp x12,x13,[x22]

Louter:
    ldr x9,[x2],#8 // bp[i]
    ldp x7,x8,[x1],#16
    ldr x23,[sp] // tp[0]
    add x22,sp,#8

    mul x6,x7,x9 // ap[0]*bp[i]
    sub x21,x5,#16 // j=num-2
    umulh x7,x7,x9
    ldp x13,x14,[x3],#16
    mul x10,x8,x9 // ap[1]*bp[i]
    adds x6,x6,x23
    umulh x11,x8,x9
    adc x7,x7,xzr

    mul x15,x6,x4
    sub x20,x20,#8 // i--

    // (*) mul x12,x13,x15 // np[0]*m1
    umulh x13,x13,x15
    mul x16,x14,x15 // np[1]*m1
    // (*) adds x12,x12,x6
    subs xzr,x6,#1 // (*)
    umulh x17,x14,x15
    cbz x21,Linner_skip

Linner:
    ldr x8,[x1],#8
    adc x13,x13,xzr
    ldr x23,[x22],#8 // tp[j]
    adds x6,x10,x7
    sub x21,x21,#8 // j--
    adc x7,x11,xzr

    adds x12,x16,x13
    ldr x14,[x3],#8
    adc x13,x17,xzr

    mul x10,x8,x9 // ap[j]*bp[i]
    adds x6,x6,x23
    umulh x11,x8,x9
    adc x7,x7,xzr

    mul x16,x14,x15 // np[j]*m1
    adds x12,x12,x6
    umulh x17,x14,x15
    str x12,[x22,#-16] // tp[j-1]
    cbnz x21,Linner

Linner_skip:
    ldr x23,[x22],#8 // tp[j]
    adc x13,x13,xzr
    adds x6,x10,x7
    sub x1,x1,x5 // rewind x1
    adc x7,x11,xzr

    adds x12,x16,x13
    sub x3,x3,x5 // rewind x3
    adcs x13,x17,x19
    adc x19,xzr,xzr

    adds x6,x6,x23
    adc x7,x7,xzr

    adds x12,x12,x6
    adcs x13,x13,x7
    adc x19,x19,xzr // upmost overflow bit
    stp x12,x13,[x22,#-16]

    cbnz x20,Louter

    // Final step. Check whether the result is no smaller than the modulus
    // and, if it is, subtract the modulus. Since comparison is itself a
    // subtraction, simply subtract the modulus, check whether it borrowed,
    // and conditionally copy the original value back.
    ldr x23,[sp] // tp[0]
    add x22,sp,#8
    ldr x14,[x3],#8 // np[0]
    subs x21,x5,#8 // j=num-1 and clear borrow
    mov x1,x0
Lsub:
    sbcs x8,x23,x14 // tp[j]-np[j]
    ldr x23,[x22],#8
    sub x21,x21,#8 // j--
    ldr x14,[x3],#8
    str x8,[x1],#8 // rp[j]=tp[j]-np[j]
    cbnz x21,Lsub

    sbcs x8,x23,x14
    sbcs x19,x19,xzr // did it borrow?
    str x8,[x1],#8 // rp[num-1]

    ldr x23,[sp] // tp[0]
    add x22,sp,#8
    ldr x8,[x0],#8 // rp[0]
    sub x5,x5,#8 // num--
    nop
Lcond_copy:
    sub x5,x5,#8 // num--
    csel x14,x23,x8,lo // did it borrow?
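    // 'lo' (C clear) means the subtraction above borrowed, i.e. tp < np:
    // keep the original tp[j]; otherwise keep rp[j], which already holds
    // tp[j]-np[j].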
    ldr x23,[x22],#8
    ldr x8,[x0],#8
    str xzr,[x22,#-16] // wipe tp
    str x14,[x0,#-16]
    cbnz x5,Lcond_copy

    csel x14,x23,x8,lo
    str xzr,[x22,#-8] // wipe tp
    str x14,[x0,#-8]

    ldp x19,x20,[x29,#16]
    mov sp,x29
    ldp x21,x22,[x29,#32]
    mov x0,#1
    ldp x23,x24,[x29,#48]
    ldr x29,[sp],#64
    AARCH64_VALIDATE_LINK_REGISTER
    ret

.def __bn_sqr8x_mont
    .type 32
.endef
.align 5
__bn_sqr8x_mont:
    // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
    // only from bn_mul_mont, which has already signed the return address.
    cmp x1,x2
    b.ne __bn_mul4x_mont
Lsqr8x_mont:
    stp x29,x30,[sp,#-128]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]
    stp x0,x3,[sp,#96] // offload rp and np

    ldp x6,x7,[x1,#8*0]
    ldp x8,x9,[x1,#8*2]
    ldp x10,x11,[x1,#8*4]
    ldp x12,x13,[x1,#8*6]

    sub x2,sp,x5,lsl#4
    lsl x5,x5,#3
    ldr x4,[x4] // *n0
    mov sp,x2 // alloca
    sub x27,x5,#8*8
    b Lsqr8x_zero_start

Lsqr8x_zero:
    sub x27,x27,#8*8
    stp xzr,xzr,[x2,#8*0]
    stp xzr,xzr,[x2,#8*2]
    stp xzr,xzr,[x2,#8*4]
    stp xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
    stp xzr,xzr,[x2,#8*8]
    stp xzr,xzr,[x2,#8*10]
    stp xzr,xzr,[x2,#8*12]
    stp xzr,xzr,[x2,#8*14]
    add x2,x2,#8*16
    cbnz x27,Lsqr8x_zero

    add x3,x1,x5
    add x1,x1,#8*8
    mov x19,xzr
    mov x20,xzr
    mov x21,xzr
    mov x22,xzr
    mov x23,xzr
    mov x24,xzr
    mov x25,xzr
    mov x26,xzr
    mov x2,sp
    str x4,[x29,#112] // offload n0

    // Multiply everything but a[i]*a[i]
.align 4
Lsqr8x_outer_loop:
    // a[1]a[0] (i)
    // a[2]a[0]
    // a[3]a[0]
    // a[4]a[0]
    // a[5]a[0]
    // a[6]a[0]
    // a[7]a[0]
    // a[2]a[1] (ii)
    // a[3]a[1]
    // a[4]a[1]
    // a[5]a[1]
    // a[6]a[1]
    // a[7]a[1]
    // a[3]a[2] (iii)
    // a[4]a[2]
    // a[5]a[2]
    // a[6]a[2]
    // a[7]a[2]
    // a[4]a[3] (iv)
    // a[5]a[3]
    // a[6]a[3]
    // a[7]a[3]
    // a[5]a[4] (v)
    // a[6]a[4]
    // a[7]a[4]
    // a[6]a[5] (vi)
    // a[7]a[5]
    // a[7]a[6] (vii)

    mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
    mul x15,x8,x6
    mul x16,x9,x6
    mul x17,x10,x6
    adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
    mul x14,x11,x6
    adcs x21,x21,x15
    mul x15,x12,x6
    adcs x22,x22,x16
    mul x16,x13,x6
    adcs x23,x23,x17
    umulh x17,x7,x6 // hi(a[1..7]*a[0])
    adcs x24,x24,x14
    umulh x14,x8,x6
    adcs x25,x25,x15
    umulh x15,x9,x6
    adcs x26,x26,x16
    umulh x16,x10,x6
    stp x19,x20,[x2],#8*2 // t[0..1]
    adc x19,xzr,xzr // t[8]
    adds x21,x21,x17 // t[2]+hi(a[1]*a[0])
    umulh x17,x11,x6
    adcs x22,x22,x14
    umulh x14,x12,x6
    adcs x23,x23,x15
    umulh x15,x13,x6
    adcs x24,x24,x16
    mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
    adcs x25,x25,x17
    mul x17,x9,x7
    adcs x26,x26,x14
    mul x14,x10,x7
    adc x19,x19,x15

    mul x15,x11,x7
    adds x22,x22,x16
    mul x16,x12,x7
    adcs x23,x23,x17
    mul x17,x13,x7
    adcs x24,x24,x14
    umulh x14,x8,x7 // hi(a[2..7]*a[1])
    adcs x25,x25,x15
    umulh x15,x9,x7
    adcs x26,x26,x16
    umulh x16,x10,x7
    adcs x19,x19,x17
    umulh x17,x11,x7
    stp x21,x22,[x2],#8*2 // t[2..3]
    adc x20,xzr,xzr // t[9]
    adds x23,x23,x14
    umulh x14,x12,x7
    adcs x24,x24,x15
    umulh x15,x13,x7
    adcs x25,x25,x16
    mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
    adcs x26,x26,x17
    mul x17,x10,x8
    adcs x19,x19,x14
    mul x14,x11,x8
    adc x20,x20,x15

    mul x15,x12,x8
    adds x24,x24,x16
    mul x16,x13,x8
    adcs x25,x25,x17
    umulh x17,x9,x8 // hi(a[3..7]*a[2])
    adcs x26,x26,x14
    umulh x14,x10,x8
    adcs x19,x19,x15
    umulh x15,x11,x8
    adcs x20,x20,x16
    umulh x16,x12,x8
    stp x23,x24,[x2],#8*2 // t[4..5]
    adc x21,xzr,xzr // t[10]
    adds x25,x25,x17
    umulh x17,x13,x8
    adcs x26,x26,x14
    mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
    adcs x19,x19,x15
    mul x15,x11,x9
    adcs x20,x20,x16
    mul x16,x12,x9
    adc x21,x21,x17

    mul x17,x13,x9
    adds x26,x26,x14
    umulh x14,x10,x9 // hi(a[4..7]*a[3])
    adcs x19,x19,x15
    umulh x15,x11,x9
    adcs x20,x20,x16
    umulh x16,x12,x9
    adcs x21,x21,x17
    umulh x17,x13,x9
    stp x25,x26,[x2],#8*2 // t[6..7]
    adc x22,xzr,xzr // t[11]
    adds x19,x19,x14
    mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
    adcs x20,x20,x15
    mul x15,x12,x10
    adcs x21,x21,x16
    mul x16,x13,x10
    adc x22,x22,x17

    umulh x17,x11,x10 // hi(a[5..7]*a[4])
    adds x20,x20,x14
    umulh x14,x12,x10
    adcs x21,x21,x15
    umulh x15,x13,x10
    adcs x22,x22,x16
    mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
    adc x23,xzr,xzr // t[12]
    adds x21,x21,x17
    mul x17,x13,x11
    adcs x22,x22,x14
    umulh x14,x12,x11 // hi(a[6..7]*a[5])
    adc x23,x23,x15

    umulh x15,x13,x11
    adds x22,x22,x16
    mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
    adcs x23,x23,x17
    umulh x17,x13,x12 // hi(a[7]*a[6])
    adc x24,xzr,xzr // t[13]
    adds x23,x23,x14
    sub x27,x3,x1 // done yet?
    adc x24,x24,x15

    adds x24,x24,x16
    sub x14,x3,x5 // rewound ap
    adc x25,xzr,xzr // t[14]
    add x25,x25,x17

    cbz x27,Lsqr8x_outer_break

    mov x4,x6
    ldp x6,x7,[x2,#8*0]
    ldp x8,x9,[x2,#8*2]
    ldp x10,x11,[x2,#8*4]
    ldp x12,x13,[x2,#8*6]
    adds x19,x19,x6
    adcs x20,x20,x7
    ldp x6,x7,[x1,#8*0]
    adcs x21,x21,x8
    adcs x22,x22,x9
    ldp x8,x9,[x1,#8*2]
    adcs x23,x23,x10
    adcs x24,x24,x11
    ldp x10,x11,[x1,#8*4]
    adcs x25,x25,x12
    mov x0,x1
    adcs x26,xzr,x13
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
    //adc x28,xzr,xzr // moved below
    mov x27,#-8*8

    // a[8]a[0]
    // a[9]a[0]
    // a[a]a[0]
    // a[b]a[0]
    // a[c]a[0]
    // a[d]a[0]
    // a[e]a[0]
    // a[f]a[0]
    // a[8]a[1]
    // a[f]a[1]........................
    // a[8]a[2]
    // a[f]a[2]........................
    // a[8]a[3]
    // a[f]a[3]........................
    // a[8]a[4]
    // a[f]a[4]........................
    // a[8]a[5]
    // a[f]a[5]........................
    // a[8]a[6]
    // a[f]a[6]........................
    // a[8]a[7]
    // a[f]a[7]........................
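    // Editorial sketch (not generated output): each Lsqr8x_mul iteration
    // below effectively computes, for the current scalar m held in x4,
    //     acc = acc + a[0..7]*m;   // 8x64-bit multiply-accumulate, 9 limbs
    //     t[k++] = low limb of acc; acc >>= 64;
    // with m stepping through the a[i] scalars fetched via ldr x4,[x0,x27].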
Lsqr8x_mul:
    mul x14,x6,x4
    adc x28,xzr,xzr // carry bit, modulo-scheduled
    mul x15,x7,x4
    add x27,x27,#8
    mul x16,x8,x4
    mul x17,x9,x4
    adds x19,x19,x14
    mul x14,x10,x4
    adcs x20,x20,x15
    mul x15,x11,x4
    adcs x21,x21,x16
    mul x16,x12,x4
    adcs x22,x22,x17
    mul x17,x13,x4
    adcs x23,x23,x14
    umulh x14,x6,x4
    adcs x24,x24,x15
    umulh x15,x7,x4
    adcs x25,x25,x16
    umulh x16,x8,x4
    adcs x26,x26,x17
    umulh x17,x9,x4
    adc x28,x28,xzr
    str x19,[x2],#8
    adds x19,x20,x14
    umulh x14,x10,x4
    adcs x20,x21,x15
    umulh x15,x11,x4
    adcs x21,x22,x16
    umulh x16,x12,x4
    adcs x22,x23,x17
    umulh x17,x13,x4
    ldr x4,[x0,x27]
    adcs x23,x24,x14
    adcs x24,x25,x15
    adcs x25,x26,x16
    adcs x26,x28,x17
    //adc x28,xzr,xzr // moved above
    cbnz x27,Lsqr8x_mul
    // note that carry flag is guaranteed
    // to be zero at this point
    cmp x1,x3 // done yet?
    b.eq Lsqr8x_break

    ldp x6,x7,[x2,#8*0]
    ldp x8,x9,[x2,#8*2]
    ldp x10,x11,[x2,#8*4]
    ldp x12,x13,[x2,#8*6]
    adds x19,x19,x6
    ldr x4,[x0,#-8*8]
    adcs x20,x20,x7
    ldp x6,x7,[x1,#8*0]
    adcs x21,x21,x8
    adcs x22,x22,x9
    ldp x8,x9,[x1,#8*2]
    adcs x23,x23,x10
    adcs x24,x24,x11
    ldp x10,x11,[x1,#8*4]
    adcs x25,x25,x12
    mov x27,#-8*8
    adcs x26,x26,x13
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
    //adc x28,xzr,xzr // moved above
    b Lsqr8x_mul

.align 4
Lsqr8x_break:
    ldp x6,x7,[x0,#8*0]
    add x1,x0,#8*8
    ldp x8,x9,[x0,#8*2]
    sub x14,x3,x1 // is it the last iteration?
    ldp x10,x11,[x0,#8*4]
    sub x15,x2,x14
    ldp x12,x13,[x0,#8*6]
    cbz x14,Lsqr8x_outer_loop

    stp x19,x20,[x2,#8*0]
    ldp x19,x20,[x15,#8*0]
    stp x21,x22,[x2,#8*2]
    ldp x21,x22,[x15,#8*2]
    stp x23,x24,[x2,#8*4]
    ldp x23,x24,[x15,#8*4]
    stp x25,x26,[x2,#8*6]
    mov x2,x15
    ldp x25,x26,[x15,#8*6]
    b Lsqr8x_outer_loop

.align 4
Lsqr8x_outer_break:
    // Now multiply the result above by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
    ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
    ldp x15,x16,[sp,#8*1]
    ldp x11,x13,[x14,#8*2]
    add x1,x14,#8*4
    ldp x17,x14,[sp,#8*3]

    stp x19,x20,[x2,#8*0]
    mul x19,x7,x7
    stp x21,x22,[x2,#8*2]
    umulh x7,x7,x7
    stp x23,x24,[x2,#8*4]
    mul x8,x9,x9
    stp x25,x26,[x2,#8*6]
    mov x2,sp
    umulh x9,x9,x9
    adds x20,x7,x15,lsl#1
    extr x15,x16,x15,#63
    sub x27,x5,#8*4

Lsqr4x_shift_n_add:
    adcs x21,x8,x15
    extr x16,x17,x16,#63
    sub x27,x27,#8*4
    adcs x22,x9,x16
    ldp x15,x16,[x2,#8*5]
    mul x10,x11,x11
    ldp x7,x9,[x1],#8*2
    umulh x11,x11,x11
    mul x12,x13,x13
    umulh x13,x13,x13
    extr x17,x14,x17,#63
    stp x19,x20,[x2,#8*0]
    adcs x23,x10,x17
    extr x14,x15,x14,#63
    stp x21,x22,[x2,#8*2]
    adcs x24,x11,x14
    ldp x17,x14,[x2,#8*7]
    extr x15,x16,x15,#63
    adcs x25,x12,x15
    extr x16,x17,x16,#63
    adcs x26,x13,x16
    ldp x15,x16,[x2,#8*9]
    mul x6,x7,x7
    ldp x11,x13,[x1],#8*2
    umulh x7,x7,x7
    mul x8,x9,x9
    umulh x9,x9,x9
    stp x23,x24,[x2,#8*4]
    extr x17,x14,x17,#63
    stp x25,x26,[x2,#8*6]
    add x2,x2,#8*8
    adcs x19,x6,x17
    extr x14,x15,x14,#63
    adcs x20,x7,x14
    ldp x17,x14,[x2,#8*3]
    extr x15,x16,x15,#63
    cbnz x27,Lsqr4x_shift_n_add
    ldp x1,x4,[x29,#104] // pull np and n0

    adcs x21,x8,x15
    extr x16,x17,x16,#63
    adcs x22,x9,x16
    ldp x15,x16,[x2,#8*5]
    mul x10,x11,x11
    umulh x11,x11,x11
    stp x19,x20,[x2,#8*0]
    mul x12,x13,x13
    umulh x13,x13,x13
    stp x21,x22,[x2,#8*2]
    extr x17,x14,x17,#63
    adcs x23,x10,x17
    extr x14,x15,x14,#63
    ldp x19,x20,[sp,#8*0]
    adcs x24,x11,x14
    extr x15,x16,x15,#63
    ldp x6,x7,[x1,#8*0]
    adcs x25,x12,x15
    extr x16,xzr,x16,#63
    ldp x8,x9,[x1,#8*2]
    adc x26,x13,x16
    ldp x10,x11,[x1,#8*4]

    // Reduce by 512 bits per iteration
    mul x28,x4,x19 // t[0]*n0
    ldp x12,x13,[x1,#8*6]
    add x3,x1,x5
    ldp x21,x22,[sp,#8*2]
    stp x23,x24,[x2,#8*4]
    ldp x23,x24,[sp,#8*4]
    stp x25,x26,[x2,#8*6]
    ldp x25,x26,[sp,#8*6]
    add x1,x1,#8*8
    mov x30,xzr // initial top-most carry
    mov x2,sp
    mov x27,#8

Lsqr8x_reduction:
    // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
    mul x15,x7,x28
    sub x27,x27,#1
    mul x16,x8,x28
    str x28,[x2],#8 // put aside t[0]*n0 for tail processing
    mul x17,x9,x28
    // (*) adds xzr,x19,x14
    subs xzr,x19,#1 // (*)
    mul x14,x10,x28
    adcs x19,x20,x15
    mul x15,x11,x28
    adcs x20,x21,x16
    mul x16,x12,x28
    adcs x21,x22,x17
    mul x17,x13,x28
    adcs x22,x23,x14
    umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
    adcs x23,x24,x15
    umulh x15,x7,x28
    adcs x24,x25,x16
    umulh x16,x8,x28
    adcs x25,x26,x17
    umulh x17,x9,x28
    adc x26,xzr,xzr
    adds x19,x19,x14
    umulh x14,x10,x28
    adcs x20,x20,x15
    umulh x15,x11,x28
    adcs x21,x21,x16
    umulh x16,x12,x28
    adcs x22,x22,x17
    umulh x17,x13,x28
    mul x28,x4,x19 // next t[0]*n0
    adcs x23,x23,x14
    adcs x24,x24,x15
    adcs x25,x25,x16
    adc x26,x26,x17
    cbnz x27,Lsqr8x_reduction

    ldp x14,x15,[x2,#8*0]
    ldp x16,x17,[x2,#8*2]
    mov x0,x2
    sub x27,x3,x1 // done yet?
    adds x19,x19,x14
    adcs x20,x20,x15
    ldp x14,x15,[x2,#8*4]
    adcs x21,x21,x16
    adcs x22,x22,x17
    ldp x16,x17,[x2,#8*6]
    adcs x23,x23,x14
    adcs x24,x24,x15
    adcs x25,x25,x16
    adcs x26,x26,x17
    //adc x28,xzr,xzr // moved below
    cbz x27,Lsqr8x8_post_condition

    ldr x4,[x2,#-8*8]
    ldp x6,x7,[x1,#8*0]
    ldp x8,x9,[x1,#8*2]
    ldp x10,x11,[x1,#8*4]
    mov x27,#-8*8
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8

Lsqr8x_tail:
    mul x14,x6,x4
    adc x28,xzr,xzr // carry bit, modulo-scheduled
    mul x15,x7,x4
    add x27,x27,#8
    mul x16,x8,x4
    mul x17,x9,x4
    adds x19,x19,x14
    mul x14,x10,x4
    adcs x20,x20,x15
    mul x15,x11,x4
    adcs x21,x21,x16
    mul x16,x12,x4
    adcs x22,x22,x17
    mul x17,x13,x4
    adcs x23,x23,x14
    umulh x14,x6,x4
    adcs x24,x24,x15
    umulh x15,x7,x4
    adcs x25,x25,x16
    umulh x16,x8,x4
    adcs x26,x26,x17
    umulh x17,x9,x4
    adc x28,x28,xzr
    str x19,[x2],#8
    adds x19,x20,x14
    umulh x14,x10,x4
    adcs x20,x21,x15
    umulh x15,x11,x4
    adcs x21,x22,x16
    umulh x16,x12,x4
    adcs x22,x23,x17
    umulh x17,x13,x4
    ldr x4,[x0,x27]
    adcs x23,x24,x14
    adcs x24,x25,x15
    adcs x25,x26,x16
    adcs x26,x28,x17
    //adc x28,xzr,xzr // moved above
    cbnz x27,Lsqr8x_tail
    // note that carry flag is guaranteed
    // to be zero at this point
    ldp x6,x7,[x2,#8*0]
    sub x27,x3,x1 // done yet?
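    // Editorial note: each Lsqr8x_reduction pass above is the standard
    // word-level Montgomery step, roughly (with n0 == -n^-1 mod 2^64):
    //     m = (t[0]*n0) mod 2^64;  t = (t + m*n) >> 64;
    // the low limb of t + m*n cancels by construction, which is why the
    // (*) subs trick can stand in for the discarded multiply and add.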
    sub x16,x3,x5 // rewound np
    ldp x8,x9,[x2,#8*2]
    ldp x10,x11,[x2,#8*4]
    ldp x12,x13,[x2,#8*6]
    cbz x27,Lsqr8x_tail_break

    ldr x4,[x0,#-8*8]
    adds x19,x19,x6
    adcs x20,x20,x7
    ldp x6,x7,[x1,#8*0]
    adcs x21,x21,x8
    adcs x22,x22,x9
    ldp x8,x9,[x1,#8*2]
    adcs x23,x23,x10
    adcs x24,x24,x11
    ldp x10,x11,[x1,#8*4]
    adcs x25,x25,x12
    mov x27,#-8*8
    adcs x26,x26,x13
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
    //adc x28,xzr,xzr // moved above
    b Lsqr8x_tail

.align 4
Lsqr8x_tail_break:
    ldr x4,[x29,#112] // pull n0
    add x27,x2,#8*8 // end of current t[num] window

    subs xzr,x30,#1 // "move" top-most carry to carry bit
    adcs x14,x19,x6
    adcs x15,x20,x7
    ldp x19,x20,[x0,#8*0]
    adcs x21,x21,x8
    ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
    adcs x22,x22,x9
    ldp x8,x9,[x16,#8*2]
    adcs x23,x23,x10
    adcs x24,x24,x11
    ldp x10,x11,[x16,#8*4]
    adcs x25,x25,x12
    adcs x26,x26,x13
    ldp x12,x13,[x16,#8*6]
    add x1,x16,#8*8
    adc x30,xzr,xzr // top-most carry
    mul x28,x4,x19
    stp x14,x15,[x2,#8*0]
    stp x21,x22,[x2,#8*2]
    ldp x21,x22,[x0,#8*2]
    stp x23,x24,[x2,#8*4]
    ldp x23,x24,[x0,#8*4]
    cmp x27,x29 // did we hit the bottom?
    stp x25,x26,[x2,#8*6]
    mov x2,x0 // slide the window
    ldp x25,x26,[x0,#8*6]
    mov x27,#8
    b.ne Lsqr8x_reduction

    // Final step. Check whether the result is no smaller than the modulus
    // and, if it is, subtract the modulus. Since comparison is itself a
    // subtraction, simply subtract the modulus, check whether it borrowed,
    // and conditionally copy the original value back.
    ldr x0,[x29,#96] // pull rp
    add x2,x2,#8*8
    subs x14,x19,x6
    sbcs x15,x20,x7
    sub x27,x5,#8*8
    mov x3,x0 // x0 copy

Lsqr8x_sub:
    sbcs x16,x21,x8
    ldp x6,x7,[x1,#8*0]
    sbcs x17,x22,x9
    stp x14,x15,[x0,#8*0]
    sbcs x14,x23,x10
    ldp x8,x9,[x1,#8*2]
    sbcs x15,x24,x11
    stp x16,x17,[x0,#8*2]
    sbcs x16,x25,x12
    ldp x10,x11,[x1,#8*4]
    sbcs x17,x26,x13
    ldp x12,x13,[x1,#8*6]
    add x1,x1,#8*8
    ldp x19,x20,[x2,#8*0]
    sub x27,x27,#8*8
    ldp x21,x22,[x2,#8*2]
    ldp x23,x24,[x2,#8*4]
    ldp x25,x26,[x2,#8*6]
    add x2,x2,#8*8
    stp x14,x15,[x0,#8*4]
    sbcs x14,x19,x6
    stp x16,x17,[x0,#8*6]
    add x0,x0,#8*8
    sbcs x15,x20,x7
    cbnz x27,Lsqr8x_sub

    sbcs x16,x21,x8
    mov x2,sp
    add x1,sp,x5
    ldp x6,x7,[x3,#8*0]
    sbcs x17,x22,x9
    stp x14,x15,[x0,#8*0]
    sbcs x14,x23,x10
    ldp x8,x9,[x3,#8*2]
    sbcs x15,x24,x11
    stp x16,x17,[x0,#8*2]
    sbcs x16,x25,x12
    ldp x19,x20,[x1,#8*0]
    sbcs x17,x26,x13
    ldp x21,x22,[x1,#8*2]
    sbcs xzr,x30,xzr // did it borrow?
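    // x30 carries the top-most bit of t; folding it into the borrow chain
    // means C now encodes "t >= n", which the csel's below consume as 'lo'
    // (C clear, i.e. t < n: keep the original value).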
    ldr x30,[x29,#8] // pull return address
    stp x14,x15,[x0,#8*4]
    stp x16,x17,[x0,#8*6]

    sub x27,x5,#8*4
Lsqr4x_cond_copy:
    sub x27,x27,#8*4
    csel x14,x19,x6,lo
    stp xzr,xzr,[x2,#8*0]
    csel x15,x20,x7,lo
    ldp x6,x7,[x3,#8*4]
    ldp x19,x20,[x1,#8*4]
    csel x16,x21,x8,lo
    stp xzr,xzr,[x2,#8*2]
    add x2,x2,#8*4
    csel x17,x22,x9,lo
    ldp x8,x9,[x3,#8*6]
    ldp x21,x22,[x1,#8*6]
    add x1,x1,#8*4
    stp x14,x15,[x3,#8*0]
    stp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
    stp xzr,xzr,[x1,#8*0]
    stp xzr,xzr,[x1,#8*2]
    cbnz x27,Lsqr4x_cond_copy

    csel x14,x19,x6,lo
    stp xzr,xzr,[x2,#8*0]
    csel x15,x20,x7,lo
    stp xzr,xzr,[x2,#8*2]
    csel x16,x21,x8,lo
    csel x17,x22,x9,lo
    stp x14,x15,[x3,#8*0]
    stp x16,x17,[x3,#8*2]

    b Lsqr8x_done

.align 4
Lsqr8x8_post_condition:
    adc x28,xzr,xzr
    ldr x30,[x29,#8] // pull return address
    // x19-x26,x28 hold the result, x6-x13 hold the modulus
    subs x6,x19,x6
    ldr x1,[x29,#96] // pull rp
    sbcs x7,x20,x7
    stp xzr,xzr,[sp,#8*0]
    sbcs x8,x21,x8
    stp xzr,xzr,[sp,#8*2]
    sbcs x9,x22,x9
    stp xzr,xzr,[sp,#8*4]
    sbcs x10,x23,x10
    stp xzr,xzr,[sp,#8*6]
    sbcs x11,x24,x11
    stp xzr,xzr,[sp,#8*8]
    sbcs x12,x25,x12
    stp xzr,xzr,[sp,#8*10]
    sbcs x13,x26,x13
    stp xzr,xzr,[sp,#8*12]
    sbcs x28,x28,xzr // did it borrow?
    stp xzr,xzr,[sp,#8*14]

    // x6-x13 hold result-modulus
    csel x6,x19,x6,lo
    csel x7,x20,x7,lo
    csel x8,x21,x8,lo
    csel x9,x22,x9,lo
    stp x6,x7,[x1,#8*0]
    csel x10,x23,x10,lo
    csel x11,x24,x11,lo
    stp x8,x9,[x1,#8*2]
    csel x12,x25,x12,lo
    csel x13,x26,x13,lo
    stp x10,x11,[x1,#8*4]
    stp x12,x13,[x1,#8*6]

Lsqr8x_done:
    ldp x19,x20,[x29,#16]
    mov sp,x29
    ldp x21,x22,[x29,#32]
    mov x0,#1
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldr x29,[sp],#128
    // x30 is popped earlier
    AARCH64_VALIDATE_LINK_REGISTER
    ret

.def __bn_mul4x_mont
    .type 32
.endef
.align 5
__bn_mul4x_mont:
    // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
    // only from bn_mul_mont or __bn_sqr8x_mont, which have already signed the
    // return address.
    stp x29,x30,[sp,#-128]!
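    // Frame layout (editorial summary of the stores below): x29/x30 at
    // [sp,#0], callee-saved x19-x28 at [sp,#16..#80], rp and &b[num]
    // offloaded at [x29,#96].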
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]

    sub x26,sp,x5,lsl#3
    lsl x5,x5,#3
    ldr x4,[x4] // *n0
    sub sp,x26,#8*4 // alloca

    add x10,x2,x5
    add x27,x1,x5
    stp x0,x10,[x29,#96] // offload rp and &b[num]

    ldr x24,[x2,#8*0] // b[0]
    ldp x6,x7,[x1,#8*0] // a[0..3]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    mov x19,xzr
    mov x20,xzr
    mov x21,xzr
    mov x22,xzr
    ldp x14,x15,[x3,#8*0] // n[0..3]
    ldp x16,x17,[x3,#8*2]
    adds x3,x3,#8*4 // clear carry bit
    mov x0,xzr
    mov x28,#0
    mov x26,sp

Loop_mul4x_1st_reduction:
    mul x10,x6,x24 // lo(a[0..3]*b[0])
    adc x0,x0,xzr // modulo-scheduled
    mul x11,x7,x24
    add x28,x28,#8
    mul x12,x8,x24
    and x28,x28,#31
    mul x13,x9,x24
    adds x19,x19,x10
    umulh x10,x6,x24 // hi(a[0..3]*b[0])
    adcs x20,x20,x11
    mul x25,x19,x4 // t[0]*n0
    adcs x21,x21,x12
    umulh x11,x7,x24
    adcs x22,x22,x13
    umulh x12,x8,x24
    adc x23,xzr,xzr
    umulh x13,x9,x24
    ldr x24,[x2,x28] // next b[i] (or b[0])
    adds x20,x20,x10
    // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
    str x25,[x26],#8 // put aside t[0]*n0 for tail processing
    adcs x21,x21,x11
    mul x11,x15,x25
    adcs x22,x22,x12
    mul x12,x16,x25
    adc x23,x23,x13 // can't overflow
    mul x13,x17,x25
    // (*) adds xzr,x19,x10
    subs xzr,x19,#1 // (*)
    umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
    adcs x19,x20,x11
    umulh x11,x15,x25
    adcs x20,x21,x12
    umulh x12,x16,x25
    adcs x21,x22,x13
    umulh x13,x17,x25
    adcs x22,x23,x0
    adc x0,xzr,xzr
    adds x19,x19,x10
    sub x10,x27,x1
    adcs x20,x20,x11
    adcs x21,x21,x12
    adcs x22,x22,x13
    //adc x0,x0,xzr
    cbnz x28,Loop_mul4x_1st_reduction

    cbz x10,Lmul4x4_post_condition

    ldp x6,x7,[x1,#8*0] // a[4..7]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    ldr x25,[sp] // t[0]*n0
    ldp x14,x15,[x3,#8*0] // n[4..7]
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4

Loop_mul4x_1st_tail:
    mul x10,x6,x24 // lo(a[4..7]*b[i])
    adc x0,x0,xzr // modulo-scheduled
    mul x11,x7,x24
    add x28,x28,#8
    mul x12,x8,x24
    and x28,x28,#31
    mul x13,x9,x24
    adds x19,x19,x10
    umulh x10,x6,x24 // hi(a[4..7]*b[i])
    adcs x20,x20,x11
    umulh x11,x7,x24
    adcs x21,x21,x12
    umulh x12,x8,x24
    adcs x22,x22,x13
    umulh x13,x9,x24
    adc x23,xzr,xzr
    ldr x24,[x2,x28] // next b[i] (or b[0])
    adds x20,x20,x10
    mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
    adcs x21,x21,x11
    mul x11,x15,x25
    adcs x22,x22,x12
    mul x12,x16,x25
    adc x23,x23,x13 // can't overflow
    mul x13,x17,x25
    adds x19,x19,x10
    umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
    adcs x20,x20,x11
    umulh x11,x15,x25
    adcs x21,x21,x12
    umulh x12,x16,x25
    adcs x22,x22,x13
    adcs x23,x23,x0
    umulh x13,x17,x25
    adc x0,xzr,xzr
    ldr x25,[sp,x28] // next t[0]*n0
    str x19,[x26],#8 // result!!!
    adds x19,x20,x10
    sub x10,x27,x1 // done yet?
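    // Note the plain sub above: unlike subs it leaves the flags untouched,
    // so the adcs chain continues below with its carry intact.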
    adcs x20,x21,x11
    adcs x21,x22,x12
    adcs x22,x23,x13
    //adc x0,x0,xzr
    cbnz x28,Loop_mul4x_1st_tail

    sub x11,x27,x5 // rewound x1
    cbz x10,Lmul4x_proceed

    ldp x6,x7,[x1,#8*0]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    ldp x14,x15,[x3,#8*0]
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
    b Loop_mul4x_1st_tail

.align 5
Lmul4x_proceed:
    ldr x24,[x2,#8*4]! // *++b
    adc x30,x0,xzr
    ldp x6,x7,[x11,#8*0] // a[0..3]
    sub x3,x3,x5 // rewind np
    ldp x8,x9,[x11,#8*2]
    add x1,x11,#8*4

    stp x19,x20,[x26,#8*0] // result!!!
    ldp x19,x20,[sp,#8*4] // t[0..3]
    stp x21,x22,[x26,#8*2] // result!!!
    ldp x21,x22,[sp,#8*6]

    ldp x14,x15,[x3,#8*0] // n[0..3]
    mov x26,sp
    ldp x16,x17,[x3,#8*2]
    adds x3,x3,#8*4 // clear carry bit
    mov x0,xzr

.align 4
Loop_mul4x_reduction:
    mul x10,x6,x24 // lo(a[0..3]*b[4])
    adc x0,x0,xzr // modulo-scheduled
    mul x11,x7,x24
    add x28,x28,#8
    mul x12,x8,x24
    and x28,x28,#31
    mul x13,x9,x24
    adds x19,x19,x10
    umulh x10,x6,x24 // hi(a[0..3]*b[4])
    adcs x20,x20,x11
    mul x25,x19,x4 // t[0]*n0
    adcs x21,x21,x12
    umulh x11,x7,x24
    adcs x22,x22,x13
    umulh x12,x8,x24
    adc x23,xzr,xzr
    umulh x13,x9,x24
    ldr x24,[x2,x28] // next b[i]
    adds x20,x20,x10
    // (*) mul x10,x14,x25
    str x25,[x26],#8 // put aside t[0]*n0 for tail processing
    adcs x21,x21,x11
    mul x11,x15,x25 // lo(n[0..3]*t[0]*n0)
    adcs x22,x22,x12
    mul x12,x16,x25
    adc x23,x23,x13 // can't overflow
    mul x13,x17,x25
    // (*) adds xzr,x19,x10
    subs xzr,x19,#1 // (*)
    umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
    adcs x19,x20,x11
    umulh x11,x15,x25
    adcs x20,x21,x12
    umulh x12,x16,x25
    adcs x21,x22,x13
    umulh x13,x17,x25
    adcs x22,x23,x0
    adc x0,xzr,xzr
    adds x19,x19,x10
    adcs x20,x20,x11
    adcs x21,x21,x12
    adcs x22,x22,x13
    //adc x0,x0,xzr
    cbnz x28,Loop_mul4x_reduction

    adc x0,x0,xzr
    ldp x10,x11,[x26,#8*4] // t[4..7]
    ldp x12,x13,[x26,#8*6]
    ldp x6,x7,[x1,#8*0] // a[4..7]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    adds x19,x19,x10
    adcs x20,x20,x11
    adcs x21,x21,x12
    adcs x22,x22,x13
    //adc x0,x0,xzr

    ldr x25,[sp] // t[0]*n0
    ldp x14,x15,[x3,#8*0] // n[4..7]
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4

.align 4
Loop_mul4x_tail:
    mul x10,x6,x24 // lo(a[4..7]*b[4])
    adc x0,x0,xzr // modulo-scheduled
    mul x11,x7,x24
    add x28,x28,#8
    mul x12,x8,x24
    and x28,x28,#31
    mul x13,x9,x24
    adds x19,x19,x10
    umulh x10,x6,x24 // hi(a[4..7]*b[4])
    adcs x20,x20,x11
    umulh x11,x7,x24
    adcs x21,x21,x12
    umulh x12,x8,x24
    adcs x22,x22,x13
    umulh x13,x9,x24
    adc x23,xzr,xzr
    ldr x24,[x2,x28] // next b[i]
    adds x20,x20,x10
    mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
    adcs x21,x21,x11
    mul x11,x15,x25
    adcs x22,x22,x12
    mul x12,x16,x25
    adc x23,x23,x13 // can't overflow
    mul x13,x17,x25
    adds x19,x19,x10
    umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
    adcs x20,x20,x11
    umulh x11,x15,x25
    adcs x21,x21,x12
    umulh x12,x16,x25
    adcs x22,x22,x13
    umulh x13,x17,x25
    adcs x23,x23,x0
    ldr x25,[sp,x28] // next t[0]*n0
    adc x0,xzr,xzr
    str x19,[x26],#8 // result!!!
    adds x19,x20,x10
    sub x10,x27,x1 // done yet?
    adcs x20,x21,x11
    adcs x21,x22,x12
    adcs x22,x23,x13
    //adc x0,x0,xzr
    cbnz x28,Loop_mul4x_tail

    sub x11,x3,x5 // rewound np
    adc x0,x0,xzr
    cbz x10,Loop_mul4x_break

    ldp x10,x11,[x26,#8*4]
    ldp x12,x13,[x26,#8*6]
    ldp x6,x7,[x1,#8*0]
    ldp x8,x9,[x1,#8*2]
    add x1,x1,#8*4
    adds x19,x19,x10
    adcs x20,x20,x11
    adcs x21,x21,x12
    adcs x22,x22,x13
    //adc x0,x0,xzr
    ldp x14,x15,[x3,#8*0]
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
    b Loop_mul4x_tail

.align 4
Loop_mul4x_break:
    ldp x12,x13,[x29,#96] // pull rp and &b[num]
    adds x19,x19,x30
    add x2,x2,#8*4 // bp++
    adcs x20,x20,xzr
    sub x1,x1,x5 // rewind ap
    adcs x21,x21,xzr
    stp x19,x20,[x26,#8*0] // result!!!
    adcs x22,x22,xzr
    ldp x19,x20,[sp,#8*4] // t[0..3]
    adc x30,x0,xzr
    stp x21,x22,[x26,#8*2] // result!!!
    cmp x2,x13 // done yet?
    ldp x21,x22,[sp,#8*6]
    ldp x14,x15,[x11,#8*0] // n[0..3]
    ldp x16,x17,[x11,#8*2]
    add x3,x11,#8*4
    b.eq Lmul4x_post

    ldr x24,[x2]
    ldp x6,x7,[x1,#8*0] // a[0..3]
    ldp x8,x9,[x1,#8*2]
    adds x1,x1,#8*4 // clear carry bit
    mov x0,xzr
    mov x26,sp
    b Loop_mul4x_reduction

.align 4
Lmul4x_post:
    // Final step. Check whether the result is no smaller than the modulus
    // and, if it is, subtract the modulus. Since comparison is itself a
    // subtraction, simply subtract the modulus, check whether it borrowed,
    // and conditionally copy the original value back.
    mov x0,x12
    mov x27,x12 // x0 copy
    subs x10,x19,x14
    add x26,sp,#8*8
    sbcs x11,x20,x15
    sub x28,x5,#8*4

Lmul4x_sub:
    sbcs x12,x21,x16
    ldp x14,x15,[x3,#8*0]
    sub x28,x28,#8*4
    ldp x19,x20,[x26,#8*0]
    sbcs x13,x22,x17
    ldp x16,x17,[x3,#8*2]
    add x3,x3,#8*4
    ldp x21,x22,[x26,#8*2]
    add x26,x26,#8*4
    stp x10,x11,[x0,#8*0]
    sbcs x10,x19,x14
    stp x12,x13,[x0,#8*2]
    add x0,x0,#8*4
    sbcs x11,x20,x15
    cbnz x28,Lmul4x_sub

    sbcs x12,x21,x16
    mov x26,sp
    add x1,sp,#8*4
    ldp x6,x7,[x27,#8*0]
    sbcs x13,x22,x17
    stp x10,x11,[x0,#8*0]
    ldp x8,x9,[x27,#8*2]
    stp x12,x13,[x0,#8*2]
    ldp x19,x20,[x1,#8*0]
    ldp x21,x22,[x1,#8*2]
    sbcs xzr,x30,xzr // did it borrow?
    ldr x30,[x29,#8] // pull return address

    sub x28,x5,#8*4
Lmul4x_cond_copy:
    sub x28,x28,#8*4
    csel x10,x19,x6,lo
    stp xzr,xzr,[x26,#8*0]
    csel x11,x20,x7,lo
    ldp x6,x7,[x27,#8*4]
    ldp x19,x20,[x1,#8*4]
    csel x12,x21,x8,lo
    stp xzr,xzr,[x26,#8*2]
    add x26,x26,#8*4
    csel x13,x22,x9,lo
    ldp x8,x9,[x27,#8*6]
    ldp x21,x22,[x1,#8*6]
    add x1,x1,#8*4
    stp x10,x11,[x27,#8*0]
    stp x12,x13,[x27,#8*2]
    add x27,x27,#8*4
    cbnz x28,Lmul4x_cond_copy

    csel x10,x19,x6,lo
    stp xzr,xzr,[x26,#8*0]
    csel x11,x20,x7,lo
    stp xzr,xzr,[x26,#8*2]
    csel x12,x21,x8,lo
    stp xzr,xzr,[x26,#8*3]
    csel x13,x22,x9,lo
    stp xzr,xzr,[x26,#8*4]
    stp x10,x11,[x27,#8*0]
    stp x12,x13,[x27,#8*2]

    b Lmul4x_done

.align 4
Lmul4x4_post_condition:
    adc x0,x0,xzr
    ldr x1,[x29,#96] // pull rp
    // x19-x22,x0 hold the result, x14-x17 hold the modulus
    subs x6,x19,x14
    ldr x30,[x29,#8] // pull return address
    sbcs x7,x20,x15
    stp xzr,xzr,[sp,#8*0]
    sbcs x8,x21,x16
    stp xzr,xzr,[sp,#8*2]
    sbcs x9,x22,x17
    stp xzr,xzr,[sp,#8*4]
    sbcs xzr,x0,xzr // did it borrow?
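    // The t[] scratch area on the stack is wiped with xzr stores as it is
    // consumed, so no intermediate values survive on the stack.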
    stp xzr,xzr,[sp,#8*6]

    // x6-x9 hold result-modulus
    csel x6,x19,x6,lo
    csel x7,x20,x7,lo
    csel x8,x21,x8,lo
    csel x9,x22,x9,lo
    stp x6,x7,[x1,#8*0]
    stp x8,x9,[x1,#8*2]

Lmul4x_done:
    ldp x19,x20,[x29,#16]
    mov sp,x29
    ldp x21,x22,[x29,#32]
    mov x0,#1
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldr x29,[sp],#128
    // x30 is popped earlier
    AARCH64_VALIDATE_LINK_REGISTER
    ret

.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4
#endif
#endif // !OPENSSL_NO_ASM