// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#include <GFp/arm_arch.h>

.text

.globl _GFp_bn_mul_mont
.private_extern _GFp_bn_mul_mont

.align 5
_GFp_bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16		// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	On the removal of the first multiplication and addition
	//	instructions: the result of the first addition is
	//	guaranteed to be zero, which leaves a single
	//	computationally significant question, whether or not it
	//	carries. And when does it carry? Following the operations,
	//	the condition turns out to be simple: x6 being non-zero.
	//	The carry can therefore be recovered by adding -1 to x6,
	//	which is what the next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,L1st_skip

L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,L1st

L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// topmost overflow bit
	stp	x12,x13,[x22]

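	// In outline: each Louter pass below computes t += a[]*b[i] and
	// then performs one word of Montgomery reduction: m1 = t[0]*n0
	// mod 2^64 makes t + m1*n[] divisible by 2^64, so the running
	// window can be shifted down one word. Together with the first
	// pass above, num passes leave t congruent to a*b/2^(64*num)
	// mod n, up to the one final conditional subtraction of n.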
Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,Linner_skip

Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,Linner

Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// topmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,Louter

	// Final step. Check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. Since comparison
	// is itself a subtraction, we subtract the modulus
	// unconditionally, check whether that borrowed, and
	// conditionally copy the original value back.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret

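	// __bn_sqr8x_mont squares a (num divisible by 8, ap == bp).
	// Broadly: compute the cross products a[i]*a[j], i<j, double
	// them with the shift-and-add pass further below, add the
	// squares a[i]*a[i], and then Montgomery-reduce the
	// double-width result eight limbs at a time.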
.align 5
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]		// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	Lsqr8x_zero_start

Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align 4
Lsqr8x_outer_loop:
	//                                                 a[1]a[0]	(i)
	//                                             a[2]a[0]
	//                                         a[3]a[0]
	//                                     a[4]a[0]
	//                                 a[5]a[0]
	//                             a[6]a[0]
	//                         a[7]a[0]
	//                                         a[2]a[1]		(ii)
	//                                     a[3]a[1]
	//                                 a[4]a[1]
	//                             a[5]a[1]
	//                         a[6]a[1]
	//                     a[7]a[1]
	//                                 a[3]a[2]			(iii)
	//                             a[4]a[2]
	//                         a[5]a[2]
	//                     a[6]a[2]
	//                 a[7]a[2]
	//                         a[4]a[3]				(iv)
	//                     a[5]a[3]
	//                 a[6]a[3]
	//             a[7]a[3]
	//                 a[5]a[4]					(v)
	//             a[6]a[4]
	//         a[7]a[4]
	//         a[6]a[5]						(vi)
	//     a[7]a[5]
	// a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
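	// Lsqr8x_mul: multiply the eight limbs in x6-x13 by the word in
	// x4, accumulating into the running window x19-x26; the next
	// multiplier is fetched via [x0,x27] and the inter-pass carry
	// lives in x28, modulo-scheduled.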
Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_mul

.align 4
Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	Lsqr8x_outer_loop

.align 4
Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

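	// The shift-and-add pass doubles the accumulated cross products
	// and adds the diagonal squares: extr Xd,Xhi,Xlo,#63 computes
	// (Xhi<<1)|(Xlo>>63), i.e. the doubled limb with the shifted-out
	// bit carried across the limb boundary, which is then added to
	// the mul/umulh halves of a[i]*a[i] with carries chained by adcs.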
Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8

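	// Each Lsqr8x_reduction pass retires one limb: x28 = t[0]*n0
	// mod 2^64, then t += n[0..7]*x28, which forces the bottom limb
	// to zero so the window slides down one word; eight passes
	// retire 512 bits. The subs-xzr trick below recovers the
	// discarded low-limb carry, just as in Lmul_mont above.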
Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

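	// Lsqr8x_tail extends the reduction to the upper limbs: each
	// pass multiplies n[8..] (x6-x13) by one of the t[0]*n0 values
	// saved on the stack (fetched via [x0,x27]) and accumulates
	// into x19-x26, mirroring Lsqr8x_mul.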
Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_tail

.align 4
Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	Lsqr8x_reduction

	// Final step. Check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. Since comparison
	// is itself a subtraction, we subtract the modulus
	// unconditionally, check whether that borrowed, and
	// conditionally copy the original value back.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	Lsqr8x_done

.align 4
Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold result, x6-x13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret

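	// __bn_mul4x_mont handles num divisible by 4: it is reached
	// either directly from bn_mul_mont or from __bn_sqr8x_mont when
	// ap != bp. It interleaves a 4-limb multiply by b[i] with a
	// 4-limb Montgomery reduction, saving each t[0]*n0 so the tail
	// loops can finish the wider products.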
.align 5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont or __bn_sqr8x_mont which have already signed the
	// return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_reduction

	cbz	x10,Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

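	// The 1st tail loop extends each of the four b[i] products and
	// the matching saved t[0]*n0 reductions across the remaining
	// limbs of a[] and n[]. x28 steps through the 4-word b group in
	// 8-byte increments, wrapping at 32 bytes (and x28,x28,#31), so
	// [x2,x28] fetches the next b[i] or b[0].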
Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewound x1
	cbz	x10,Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_1st_tail

.align 5
Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align 4
Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

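	// Loop_mul4x_tail mirrors the 1st tail loop for subsequent b
	// words, additionally folding in the previous t[] window that
	// was read back from the stack just above.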
.align 4
Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_tail

.align 4
Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	Loop_mul4x_reduction

.align 4
Lmul4x_post:
	// Final step. Check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. Since comparison
	// is itself a subtraction, we subtract the modulus
	// unconditionally, check whether that borrowed, and
	// conditionally copy the original value back.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

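	// The conditional copy below is branch-free: each csel picks
	// the reduced or unreduced word based on the borrow flag, and
	// the stores of xzr wipe the temporary t[] on the stack as it
	// goes.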
	sub	x28,x5,#8*4
Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	Lmul4x_done

.align 4
Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 4
#endif  // !OPENSSL_NO_ASM