// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl	_bn_mul_mont
.private_extern	_bn_mul_mont

// Montgomery multiplication: rp = ap * bp * R^-1 mod np, R = 2^(64*num).
// In (AAPCS64):
//   x0 = rp  (result, num 64-bit limbs)
//   x1 = ap  (first operand)
//   x2 = bp  (second operand)
//   x3 = np  (modulus)
//   x4 = &n0 (pointer to -np^-1 mod 2^64; dereferenced below)
//   x5 = num (limb count)
// Out: x0 = 1.
// Dispatch: num % 8 == 0 and ap == bp -> 8x squaring path; num % 4 == 0 ->
// 4x path; otherwise the generic 1x loop below. Temporary t[] vector is
// allocated on the stack (sp is lowered and restored from x29 on exit).
.align	5
_bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16		// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,L1st_skip

// First outer iteration (i=0): tp[] = ap[]*bp[0] + m1*np[].
L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,L1st

L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

// Remaining outer iterations (i=1..num-1): tp[] += ap[]*bp[i] + m1*np[].
Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,Linner_skip

Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,Linner

Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
// Constant-time select of tp[] vs rp[] based on the borrow flag, while
// also zeroing the stack temporary (secret data must not be left behind).
Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// Montgomery squaring path for num % 8 == 0 and ap == bp. Same register
// arguments as _bn_mul_mont; falls through to __bn_mul4x_mont when ap != bp.
.align	5
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]		// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	Lsqr8x_zero_start

Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
Lsqr8x_outer_loop:
	//                                                 a[1]a[0]	(i)
	//                                             a[2]a[0]
	//                                         a[3]a[0]
	//                                     a[4]a[0]
	//                                 a[5]a[0]
	//                             a[6]a[0]
	//                         a[7]a[0]
	//                                         a[2]a[1]		(ii)
	//                                     a[3]a[1]
	//                                 a[4]a[1]
	//                             a[5]a[1]
	//                         a[6]a[1]
	//                     a[7]a[1]
	//                                 a[3]a[2]			(iii)
	//                             a[4]a[2]
	//                         a[5]a[2]
	//                     a[6]a[2]
	//                 a[7]a[2]
	//                         a[4]a[3]				(iv)
	//                     a[5]a[3]
	//                 a[6]a[3]
	//             a[7]a[3]
	//                 a[5]a[4]					(v)
	//             a[6]a[4]
	//         a[7]a[4]
	//         a[6]a[5]						(vi)
	//     a[7]a[5]
	// a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_mul
	// note that carry flag is guaranteed
	// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_mul

.align	4
Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	Lsqr8x_outer_loop

.align	4
Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

// Shift the cross-product sum left by one (via extr) while folding in the
// a[i]^2 diagonal terms, four limbs per iteration.
Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8

Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_tail
	// note that carry flag is guaranteed
	// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_tail

.align	4
Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
// Constant-time select based on the borrow flag; also wipes the stack
// temporary so no secret data is left behind.
Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	Lsqr8x_done

.align	4
Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret


// Montgomery multiplication path for num % 4 == 0 (and the num % 8 == 0
// ap != bp case). Same register arguments as _bn_mul_mont.
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont or __bn_mul8x_mont which have already signed the
	// return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_reduction

	cbz	x10,Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewinded x1
	cbz	x10,Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_1st_tail

.align	5
Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_tail

.align	4
Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	Loop_mul4x_reduction

.align	4
Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
// Constant-time select based on the borrow flag; also wipes the stack
// temporary so no secret data is left behind.
Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	Lmul4x_done

.align	4
Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// Version string: "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4
#endif  // !OPENSSL_NO_ASM