// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include "openssl/arm_arch.h"

.text
.align 5
Lpoly:
.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
LRR: // 2^512 mod P precomputed for NIST P256 polynomial
.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
Lone_mont:
.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
Lone:
.quad 1,0,0,0
Lord:
.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
LordK:
.quad 0xccd1c8aaee00bc4f
.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2

// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//                            const BN_ULONG x2[4]);
.globl ecp_nistz256_mul_mont

.def ecp_nistz256_mul_mont
    .type 32
.endef
.align 4
ecp_nistz256_mul_mont:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-32]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]

    ldr x3,[x2] // bp[0]
    ldp x4,x5,[x1]
    ldp x6,x7,[x1,#16]
    ldr x12,Lpoly+8
    ldr x13,Lpoly+24

    bl __ecp_nistz256_mul_mont

    ldp x19,x20,[sp,#16]
    ldp x29,x30,[sp],#32
    AARCH64_VALIDATE_LINK_REGISTER
    ret


// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_sqr_mont

.def ecp_nistz256_sqr_mont
    .type 32
.endef
.align 4
ecp_nistz256_sqr_mont:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-32]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]

    ldp x4,x5,[x1]
    ldp x6,x7,[x1,#16]
    ldr x12,Lpoly+8
    ldr x13,Lpoly+24

    bl __ecp_nistz256_sqr_mont

    ldp x19,x20,[sp,#16]
    ldp x29,x30,[sp],#32
    AARCH64_VALIDATE_LINK_REGISTER
    ret


// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_div_by_2

.def ecp_nistz256_div_by_2
    .type 32
.endef
.align 4
ecp_nistz256_div_by_2:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ldp x14,x15,[x1]
    ldp x16,x17,[x1,#16]
    ldr x12,Lpoly+8
    ldr x13,Lpoly+24

    bl __ecp_nistz256_div_by_2

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret


// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_mul_by_2

.def ecp_nistz256_mul_by_2
    .type 32
.endef
.align 4
ecp_nistz256_mul_by_2:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ldp x14,x15,[x1]
    ldp x16,x17,[x1,#16]
    ldr x12,Lpoly+8
    ldr x13,Lpoly+24
    mov x8,x14
    mov x9,x15
    mov x10,x16
    mov x11,x17

    bl __ecp_nistz256_add_to // ret = a+a // 2*a

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret


// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_mul_by_3

.def ecp_nistz256_mul_by_3
    .type 32
.endef
.align 4
ecp_nistz256_mul_by_3:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ldp x14,x15,[x1]
    ldp x16,x17,[x1,#16]
    ldr x12,Lpoly+8
    ldr x13,Lpoly+24
    mov x8,x14
    mov x9,x15
    mov x10,x16
    mov x11,x17
    mov x4,x14
    mov x5,x15
    mov x6,x16
    mov x7,x17

    bl __ecp_nistz256_add_to // ret = a+a // 2*a

    mov x8,x4
    mov x9,x5
    mov x10,x6
    mov x11,x7

    bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret


// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//                       const BN_ULONG x2[4]);
.globl ecp_nistz256_sub

.def ecp_nistz256_sub
    .type 32
.endef
.align 4
ecp_nistz256_sub:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    ldp x14,x15,[x1]
    ldp x16,x17,[x1,#16]
    ldr x12,Lpoly+8
    ldr x13,Lpoly+24

    bl __ecp_nistz256_sub_from

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret


// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_neg

.def ecp_nistz256_neg
    .type 32
.endef
.align 4
ecp_nistz256_neg:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-16]!
    add x29,sp,#0

    mov x2,x1
    mov x14,xzr // a = 0
    mov x15,xzr
    mov x16,xzr
    mov x17,xzr
    ldr x12,Lpoly+8
    ldr x13,Lpoly+24

    bl __ecp_nistz256_sub_from

    ldp x29,x30,[sp],#16
    AARCH64_VALIDATE_LINK_REGISTER
    ret


// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded
// to x4-x7 and b[0] - to x3
.def __ecp_nistz256_mul_mont
    .type 32
.endef
.align 4
__ecp_nistz256_mul_mont:
    mul x14,x4,x3 // a[0]*b[0]
    umulh x8,x4,x3

    mul x15,x5,x3 // a[1]*b[0]
    umulh x9,x5,x3

    mul x16,x6,x3 // a[2]*b[0]
    umulh x10,x6,x3

    mul x17,x7,x3 // a[3]*b[0]
    umulh x11,x7,x3
    ldr x3,[x2,#8] // b[1]

    adds x15,x15,x8 // accumulate high parts of multiplication
    lsl x8,x14,#32
    adcs x16,x16,x9
    lsr x9,x14,#32
    adcs x17,x17,x10
    adc x19,xzr,x11
    mov x20,xzr
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    mul x8,x4,x3 // lo(a[0]*b[i])
    adcs x15,x16,x9
    mul x9,x5,x3 // lo(a[1]*b[i])
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    mul x10,x6,x3 // lo(a[2]*b[i])
    adcs x17,x19,x11
    mul x11,x7,x3 // lo(a[3]*b[i])
    adc x19,x20,xzr

    adds x14,x14,x8 // accumulate low parts of multiplication
    umulh x8,x4,x3 // hi(a[0]*b[i])
    adcs x15,x15,x9
    umulh x9,x5,x3 // hi(a[1]*b[i])
    adcs x16,x16,x10
    umulh x10,x6,x3 // hi(a[2]*b[i])
    adcs x17,x17,x11
    umulh x11,x7,x3 // hi(a[3]*b[i])
    adc x19,x19,xzr
    ldr x3,[x2,#8*(1+1)] // b[1+1]
    adds x15,x15,x8 // accumulate high parts of multiplication
    lsl x8,x14,#32
    adcs x16,x16,x9
    lsr x9,x14,#32
    adcs x17,x17,x10
    adcs x19,x19,x11
    adc x20,xzr,xzr
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    mul x8,x4,x3 // lo(a[0]*b[i])
    adcs x15,x16,x9
    mul x9,x5,x3 // lo(a[1]*b[i])
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    mul x10,x6,x3 // lo(a[2]*b[i])
    adcs x17,x19,x11
    mul x11,x7,x3 // lo(a[3]*b[i])
    adc x19,x20,xzr

    adds x14,x14,x8 // accumulate low parts of multiplication
    umulh x8,x4,x3 // hi(a[0]*b[i])
    adcs x15,x15,x9
    umulh x9,x5,x3 // hi(a[1]*b[i])
    adcs x16,x16,x10
    umulh x10,x6,x3 // hi(a[2]*b[i])
    adcs x17,x17,x11
    umulh x11,x7,x3 // hi(a[3]*b[i])
    adc x19,x19,xzr
    ldr x3,[x2,#8*(2+1)] // b[2+1]
    adds x15,x15,x8 // accumulate high parts of multiplication
    lsl x8,x14,#32
    adcs x16,x16,x9
    lsr x9,x14,#32
    adcs x17,x17,x10
    adcs x19,x19,x11
    adc x20,xzr,xzr
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    mul x8,x4,x3 // lo(a[0]*b[i])
    adcs x15,x16,x9
    mul x9,x5,x3 // lo(a[1]*b[i])
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    mul x10,x6,x3 // lo(a[2]*b[i])
    adcs x17,x19,x11
    mul x11,x7,x3 // lo(a[3]*b[i])
    adc x19,x20,xzr

    adds x14,x14,x8 // accumulate low parts of multiplication
    umulh x8,x4,x3 // hi(a[0]*b[i])
    adcs x15,x15,x9
    umulh x9,x5,x3 // hi(a[1]*b[i])
    adcs x16,x16,x10
    umulh x10,x6,x3 // hi(a[2]*b[i])
    adcs x17,x17,x11
    umulh x11,x7,x3 // hi(a[3]*b[i])
    adc x19,x19,xzr
    adds x15,x15,x8 // accumulate high parts of multiplication
    lsl x8,x14,#32
    adcs x16,x16,x9
    lsr x9,x14,#32
    adcs x17,x17,x10
    adcs x19,x19,x11
    adc x20,xzr,xzr
    // last reduction
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    adcs x15,x16,x9
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    adcs x17,x19,x11
    adc x19,x20,xzr

    adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
    sbcs x9,x15,x12
    sbcs x10,x16,xzr
    sbcs x11,x17,x13
    sbcs xzr,x19,xzr // did it borrow?

    csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
    csel x15,x15,x9,lo
    csel x16,x16,x10,lo
    stp x14,x15,[x0]
    csel x17,x17,x11,lo
    stp x16,x17,[x0,#16]

    ret


// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded
// to x4-x7
.def __ecp_nistz256_sqr_mont
    .type 32
.endef
.align 4
__ecp_nistz256_sqr_mont:
    // |  |  |  |  |  |a1*a0|  |
    // |  |  |  |  |a2*a0|  |  |
    // |  |a3*a2|a3*a0|  |  |  |
    // |  |  |  |a2*a1|  |  |  |
    // |  |  |a3*a1|  |  |  |  |
    // *|  |  |  |  |  |  |  | 2|
    // +|a3*a3|a2*a2|a1*a1|a0*a0|
    // |--+--+--+--+--+--+--+--|
    // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes the x-th 64-bit limb of
    // the 512-bit accumulator.
    //
    // "can't overflow" below marks carrying into the high part of a
    // multiplication result, which can't overflow because it
    // can never be all ones.
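    // A rough sketch of the reduction used below (illustrative only; acc[]
    // is an informal name for the limbs held in x14-x17,x19,x20). Because
    // the least significant word of the P-256 prime is 2^64-1, the Montgomery
    // factor for each step is simply acc[0], so acc[0]*modulus can be built
    // from two shifted copies of acc[0] rather than with multiplies:
    //
    //   acc += acc[0] << 96                              // "+=acc[0]<<96"
    //   acc += (acc[0] * 0xffffffff00000001) << 192      // "*0xffff0001"
    //   acc -= acc[0]   // low limb becomes zero ("omit acc[0]")
    //   acc >>= 64      // Montgomery divide by 2^64
    //
    // Four such steps reduce the 512-bit square back to four limbs before
    // the final conditional subtraction of the modulus.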

    mul x15,x5,x4 // a[1]*a[0]
    umulh x9,x5,x4
    mul x16,x6,x4 // a[2]*a[0]
    umulh x10,x6,x4
    mul x17,x7,x4 // a[3]*a[0]
    umulh x19,x7,x4

    adds x16,x16,x9 // accumulate high parts of multiplication
    mul x8,x6,x5 // a[2]*a[1]
    umulh x9,x6,x5
    adcs x17,x17,x10
    mul x10,x7,x5 // a[3]*a[1]
    umulh x11,x7,x5
    adc x19,x19,xzr // can't overflow

    mul x20,x7,x6 // a[3]*a[2]
    umulh x1,x7,x6

    adds x9,x9,x10 // accumulate high parts of multiplication
    mul x14,x4,x4 // a[0]*a[0]
    adc x10,x11,xzr // can't overflow

    adds x17,x17,x8 // accumulate low parts of multiplication
    umulh x4,x4,x4
    adcs x19,x19,x9
    mul x9,x5,x5 // a[1]*a[1]
    adcs x20,x20,x10
    umulh x5,x5,x5
    adc x1,x1,xzr // can't overflow

    adds x15,x15,x15 // acc[1-6]*=2
    mul x10,x6,x6 // a[2]*a[2]
    adcs x16,x16,x16
    umulh x6,x6,x6
    adcs x17,x17,x17
    mul x11,x7,x7 // a[3]*a[3]
    adcs x19,x19,x19
    umulh x7,x7,x7
    adcs x20,x20,x20
    adcs x1,x1,x1
    adc x2,xzr,xzr

    adds x15,x15,x4 // +a[i]*a[i]
    adcs x16,x16,x9
    adcs x17,x17,x5
    adcs x19,x19,x10
    adcs x20,x20,x6
    lsl x8,x14,#32
    adcs x1,x1,x11
    lsr x9,x14,#32
    adc x2,x2,x7
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    adcs x15,x16,x9
    lsl x8,x14,#32
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    lsr x9,x14,#32
    adc x17,x11,xzr // can't overflow
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    adcs x15,x16,x9
    lsl x8,x14,#32
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    lsr x9,x14,#32
    adc x17,x11,xzr // can't overflow
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    adcs x15,x16,x9
    lsl x8,x14,#32
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    lsr x9,x14,#32
    adc x17,x11,xzr // can't overflow
    subs x10,x14,x8 // "*0xffff0001"
    sbc x11,x14,x9
    adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
    adcs x15,x16,x9
    adcs x16,x17,x10 // +=acc[0]*0xffff0001
    adc x17,x11,xzr // can't overflow

    adds x14,x14,x19 // accumulate upper half
    adcs x15,x15,x20
    adcs x16,x16,x1
    adcs x17,x17,x2
    adc x19,xzr,xzr

    adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
    sbcs x9,x15,x12
    sbcs x10,x16,xzr
    sbcs x11,x17,x13
    sbcs xzr,x19,xzr // did it borrow?

    csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
    csel x15,x15,x9,lo
    csel x16,x16,x10,lo
    stp x14,x15,[x0]
    csel x17,x17,x11,lo
    stp x16,x17,[x0,#16]

    ret


// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to
// x4-x7 and x8-x11. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.def __ecp_nistz256_add_to
    .type 32
.endef
.align 4
__ecp_nistz256_add_to:
    adds x14,x14,x8 // ret = a+b
    adcs x15,x15,x9
    adcs x16,x16,x10
    adcs x17,x17,x11
    adc x1,xzr,xzr // zap x1

    adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
    sbcs x9,x15,x12
    sbcs x10,x16,xzr
    sbcs x11,x17,x13
    sbcs xzr,x1,xzr // did subtraction borrow?

    csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
    csel x15,x15,x9,lo
    csel x16,x16,x10,lo
    stp x14,x15,[x0]
    csel x17,x17,x11,lo
    stp x16,x17,[x0,#16]

    ret


.def __ecp_nistz256_sub_from
    .type 32
.endef
.align 4
__ecp_nistz256_sub_from:
    ldp x8,x9,[x2]
    ldp x10,x11,[x2,#16]
    subs x14,x14,x8 // ret = a-b
    sbcs x15,x15,x9
    sbcs x16,x16,x10
    sbcs x17,x17,x11
    sbc x1,xzr,xzr // zap x1

    subs x8,x14,#1 // adds x8,x14,#-1 // tmp = ret+modulus
    adcs x9,x15,x12
    adcs x10,x16,xzr
    adc x11,x17,x13
    cmp x1,xzr // did subtraction borrow?

    csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
    csel x15,x15,x9,eq
    csel x16,x16,x10,eq
    stp x14,x15,[x0]
    csel x17,x17,x11,eq
    stp x16,x17,[x0,#16]

    ret


.def __ecp_nistz256_sub_morf
    .type 32
.endef
.align 4
__ecp_nistz256_sub_morf:
    ldp x8,x9,[x2]
    ldp x10,x11,[x2,#16]
    subs x14,x8,x14 // ret = b-a
    sbcs x15,x9,x15
    sbcs x16,x10,x16
    sbcs x17,x11,x17
    sbc x1,xzr,xzr // zap x1

    subs x8,x14,#1 // adds x8,x14,#-1 // tmp = ret+modulus
    adcs x9,x15,x12
    adcs x10,x16,xzr
    adc x11,x17,x13
    cmp x1,xzr // did subtraction borrow?

    csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
    csel x15,x15,x9,eq
    csel x16,x16,x10,eq
    stp x14,x15,[x0]
    csel x17,x17,x11,eq
    stp x16,x17,[x0,#16]

    ret


.def __ecp_nistz256_div_by_2
    .type 32
.endef
.align 4
__ecp_nistz256_div_by_2:
    subs x8,x14,#1 // adds x8,x14,#-1 // tmp = a+modulus
    adcs x9,x15,x12
    adcs x10,x16,xzr
    adcs x11,x17,x13
    adc x1,xzr,xzr // zap x1
    tst x14,#1 // is a even?

    csel x14,x14,x8,eq // ret = even ? a : a+modulus
    csel x15,x15,x9,eq
    csel x16,x16,x10,eq
    csel x17,x17,x11,eq
    csel x1,xzr,x1,eq

    lsr x14,x14,#1 // ret >>= 1
    orr x14,x14,x15,lsl#63
    lsr x15,x15,#1
    orr x15,x15,x16,lsl#63
    lsr x16,x16,#1
    orr x16,x16,x17,lsl#63
    lsr x17,x17,#1
    stp x14,x15,[x0]
    orr x17,x17,x1,lsl#63
    stp x16,x17,[x0,#16]

    ret

.globl ecp_nistz256_point_double

.def ecp_nistz256_point_double
    .type 32
.endef
.align 5
ecp_nistz256_point_double:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-96]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    sub sp,sp,#32*4

Ldouble_shortcut:
    ldp x14,x15,[x1,#32]
    mov x21,x0
    ldp x16,x17,[x1,#48]
    mov x22,x1
    ldr x12,Lpoly+8
    mov x8,x14
    ldr x13,Lpoly+24
    mov x9,x15
    ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont
    mov x10,x16
    mov x11,x17
    ldp x6,x7,[x22,#64+16]
    add x0,sp,#0
    bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y);

    add x0,sp,#64
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);

    ldp x8,x9,[x22]
    ldp x10,x11,[x22,#16]
    mov x4,x14 // put Zsqr aside for p256_sub
    mov x5,x15
    mov x6,x16
    mov x7,x17
    add x0,sp,#32
    bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x);

    add x2,x22,#0
    mov x14,x4 // restore Zsqr
    mov x15,x5
    ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
    mov x16,x6
    mov x17,x7
    ldp x6,x7,[sp,#0+16]
    add x0,sp,#64
    bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);

    add x0,sp,#0
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);

    ldr x3,[x22,#32]
    ldp x4,x5,[x22,#64]
    ldp x6,x7,[x22,#64+16]
    add x2,x22,#32
    add x0,sp,#96
    bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);

    mov x8,x14
    mov x9,x15
    ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
    mov x10,x16
    mov x11,x17
    ldp x6,x7,[sp,#0+16]
    add x0,x21,#64
    bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0);

    add x0,sp,#96
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);

    ldr x3,[sp,#64] // forward load for p256_mul_mont
    ldp x4,x5,[sp,#32]
    ldp x6,x7,[sp,#32+16]
    add x0,x21,#32
    bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);

    add x2,sp,#64
    add x0,sp,#32
    bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);

    mov x8,x14 // duplicate M
    mov x9,x15
    mov x10,x16
    mov x11,x17
    mov x4,x14 // put M aside
    mov x5,x15
    mov x6,x16
    mov x7,x17
    add x0,sp,#32
    bl __ecp_nistz256_add_to
    mov x8,x4 // restore M
    mov x9,x5
    ldr x3,[x22] // forward load for p256_mul_mont
    mov x10,x6
    ldp x4,x5,[sp,#0]
    mov x11,x7
    ldp x6,x7,[sp,#0+16]
    bl __ecp_nistz256_add_to // p256_mul_by_3(M, M);

    add x2,x22,#0
    add x0,sp,#0
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);

    mov x8,x14
    mov x9,x15
    ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont
    mov x10,x16
    mov x11,x17
    ldp x6,x7,[sp,#32+16]
    add x0,sp,#96
    bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S);

    add x0,x21,#0
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);

    add x2,sp,#96
    bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);

    add x2,sp,#0
    add x0,sp,#0
    bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);

    ldr x3,[sp,#32]
    mov x4,x14 // copy S
    mov x5,x15
    mov x6,x16
    mov x7,x17
    add x2,sp,#32
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);

    add x2,x21,#32
    add x0,x21,#32
    bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);

    add sp,x29,#0 // destroy frame
    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldp x29,x30,[sp],#96
    AARCH64_VALIDATE_LINK_REGISTER
    ret

.globl ecp_nistz256_point_add

.def ecp_nistz256_point_add
    .type 32
.endef
.align 5
ecp_nistz256_point_add:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-96]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    stp x27,x28,[sp,#80]
    sub sp,sp,#32*12

    ldp x4,x5,[x2,#64] // in2_z
    ldp x6,x7,[x2,#64+16]
    mov x21,x0
    mov x22,x1
    mov x23,x2
    ldr x12,Lpoly+8
    ldr x13,Lpoly+24
    orr x8,x4,x5
    orr x10,x6,x7
    orr x25,x8,x10
    cmp x25,#0
    csetm x25,ne // ~in2infty
    add x0,sp,#192
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);

    ldp x4,x5,[x22,#64] // in1_z
    ldp x6,x7,[x22,#64+16]
    orr x8,x4,x5
    orr x10,x6,x7
    orr x24,x8,x10
    cmp x24,#0
    csetm x24,ne // ~in1infty
    add x0,sp,#128
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);

    ldr x3,[x23,#64]
    ldp x4,x5,[sp,#192]
    ldp x6,x7,[sp,#192+16]
    add x2,x23,#64
    add x0,sp,#320
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);

    ldr x3,[x22,#64]
    ldp x4,x5,[sp,#128]
    ldp x6,x7,[sp,#128+16]
    add x2,x22,#64
    add x0,sp,#352
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);

    ldr x3,[x22,#32]
    ldp x4,x5,[sp,#320]
    ldp x6,x7,[sp,#320+16]
    add x2,x22,#32
    add x0,sp,#320
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);

    ldr x3,[x23,#32]
    ldp x4,x5,[sp,#352]
    ldp x6,x7,[sp,#352+16]
    add x2,x23,#32
    add x0,sp,#352
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);

    add x2,sp,#320
    ldr x3,[sp,#192] // forward load for p256_mul_mont
    ldp x4,x5,[x22]
    ldp x6,x7,[x22,#16]
    add x0,sp,#160
    bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);

    orr x14,x14,x15 // see if result is zero
    orr x16,x16,x17
    orr x26,x14,x16 // ~is_equal(S1,S2)

    add x2,sp,#192
    add x0,sp,#256
    bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);

    ldr x3,[sp,#128]
    ldp x4,x5,[x23]
    ldp x6,x7,[x23,#16]
    add x2,sp,#128
    add x0,sp,#288
    bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);

    add x2,sp,#256
    ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
    ldp x6,x7,[sp,#160+16]
    add x0,sp,#96
    bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);

    orr x14,x14,x15 // see if result is zero
    orr x16,x16,x17
    orr x14,x14,x16 // ~is_equal(U1,U2)

    mvn x27,x24 // -1/0 -> 0/-1
    mvn x28,x25 // -1/0 -> 0/-1
    orr x14,x14,x27
    orr x14,x14,x28
    orr x14,x14,x26
    cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

Ladd_double:
    mov x1,x22
    mov x0,x21
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames
    b Ldouble_shortcut

.align 4
Ladd_proceed:
    add x0,sp,#192
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);

    ldr x3,[x22,#64]
    ldp x4,x5,[sp,#96]
    ldp x6,x7,[sp,#96+16]
    add x2,x22,#64
    add x0,sp,#64
    bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);

    ldp x4,x5,[sp,#96]
    ldp x6,x7,[sp,#96+16]
    add x0,sp,#128
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);

    ldr x3,[x23,#64]
    ldp x4,x5,[sp,#64]
    ldp x6,x7,[sp,#64+16]
    add x2,x23,#64
    add x0,sp,#64
    bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);

    ldr x3,[sp,#96]
    ldp x4,x5,[sp,#128]
    ldp x6,x7,[sp,#128+16]
    add x2,sp,#96
    add x0,sp,#224
    bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);

    ldr x3,[sp,#128]
    ldp x4,x5,[sp,#256]
    ldp x6,x7,[sp,#256+16]
    add x2,sp,#128
    add x0,sp,#288
    bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);

    mov x8,x14
    mov x9,x15
    mov x10,x16
    mov x11,x17
    add x0,sp,#128
    bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);

    add x2,sp,#192
    add x0,sp,#0
    bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);

    add x2,sp,#224
    bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);

    add x2,sp,#288
    ldr x3,[sp,#224] // forward load for p256_mul_mont
    ldp x4,x5,[sp,#320]
    ldp x6,x7,[sp,#320+16]
    add x0,sp,#32
    bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);

    add x2,sp,#224
    add x0,sp,#352
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);

    ldr x3,[sp,#160]
    ldp x4,x5,[sp,#32]
    ldp x6,x7,[sp,#32+16]
    add x2,sp,#160
    add x0,sp,#32
    bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);

    add x2,sp,#352
    bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);

    ldp x4,x5,[sp,#0] // res
    ldp x6,x7,[sp,#0+16]
    ldp x8,x9,[x23] // in2
    ldp x10,x11,[x23,#16]
    ldp x14,x15,[x22,#0] // in1
    cmp x24,#0 // ~, remember?
    ldp x16,x17,[x22,#0+16]
    csel x8,x4,x8,ne
    csel x9,x5,x9,ne
    ldp x4,x5,[sp,#0+0+32] // res
    csel x10,x6,x10,ne
    csel x11,x7,x11,ne
    cmp x25,#0 // ~, remember?
    ldp x6,x7,[sp,#0+0+48]
    csel x14,x8,x14,ne
    csel x15,x9,x15,ne
    ldp x8,x9,[x23,#0+32] // in2
    csel x16,x10,x16,ne
    csel x17,x11,x17,ne
    ldp x10,x11,[x23,#0+48]
    stp x14,x15,[x21,#0]
    stp x16,x17,[x21,#0+16]
    ldp x14,x15,[x22,#32] // in1
    cmp x24,#0 // ~, remember?
    ldp x16,x17,[x22,#32+16]
    csel x8,x4,x8,ne
    csel x9,x5,x9,ne
    ldp x4,x5,[sp,#0+32+32] // res
    csel x10,x6,x10,ne
    csel x11,x7,x11,ne
    cmp x25,#0 // ~, remember?
    ldp x6,x7,[sp,#0+32+48]
    csel x14,x8,x14,ne
    csel x15,x9,x15,ne
    ldp x8,x9,[x23,#32+32] // in2
    csel x16,x10,x16,ne
    csel x17,x11,x17,ne
    ldp x10,x11,[x23,#32+48]
    stp x14,x15,[x21,#32]
    stp x16,x17,[x21,#32+16]
    ldp x14,x15,[x22,#64] // in1
    cmp x24,#0 // ~, remember?
    ldp x16,x17,[x22,#64+16]
    csel x8,x4,x8,ne
    csel x9,x5,x9,ne
    csel x10,x6,x10,ne
    csel x11,x7,x11,ne
    cmp x25,#0 // ~, remember?
    csel x14,x8,x14,ne
    csel x15,x9,x15,ne
    csel x16,x10,x16,ne
    csel x17,x11,x17,ne
    stp x14,x15,[x21,#64]
    stp x16,x17,[x21,#64+16]

Ladd_done:
    add sp,x29,#0 // destroy frame
    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x27,x28,[x29,#80]
    ldp x29,x30,[sp],#96
    AARCH64_VALIDATE_LINK_REGISTER
    ret

.globl ecp_nistz256_point_add_affine

.def ecp_nistz256_point_add_affine
    .type 32
.endef
.align 5
ecp_nistz256_point_add_affine:
    AARCH64_SIGN_LINK_REGISTER
    stp x29,x30,[sp,#-80]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]
    stp x25,x26,[sp,#64]
    sub sp,sp,#32*10

    mov x21,x0
    mov x22,x1
    mov x23,x2
    ldr x12,Lpoly+8
    ldr x13,Lpoly+24

    ldp x4,x5,[x1,#64] // in1_z
    ldp x6,x7,[x1,#64+16]
    orr x8,x4,x5
    orr x10,x6,x7
    orr x24,x8,x10
    cmp x24,#0
    csetm x24,ne // ~in1infty

    ldp x14,x15,[x2] // in2_x
    ldp x16,x17,[x2,#16]
    ldp x8,x9,[x2,#32] // in2_y
    ldp x10,x11,[x2,#48]
    orr x14,x14,x15
    orr x16,x16,x17
    orr x8,x8,x9
    orr x10,x10,x11
    orr x14,x14,x16
    orr x8,x8,x10
    orr x25,x14,x8
    cmp x25,#0
    csetm x25,ne // ~in2infty

    add x0,sp,#128
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);

    mov x4,x14
    mov x5,x15
    mov x6,x16
    mov x7,x17
    ldr x3,[x23]
    add x2,x23,#0
    add x0,sp,#96
    bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);

    add x2,x22,#0
    ldr x3,[x22,#64] // forward load for p256_mul_mont
    ldp x4,x5,[sp,#128]
    ldp x6,x7,[sp,#128+16]
    add x0,sp,#160
    bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);

    add x2,x22,#64
    add x0,sp,#128
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);

    ldr x3,[x22,#64]
    ldp x4,x5,[sp,#160]
    ldp x6,x7,[sp,#160+16]
    add x2,x22,#64
    add x0,sp,#64
    bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);

    ldr x3,[x23,#32]
    ldp x4,x5,[sp,#128]
    ldp x6,x7,[sp,#128+16]
    add x2,x23,#32
    add x0,sp,#128
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);

    add x2,x22,#32
    ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
    ldp x6,x7,[sp,#160+16]
    add x0,sp,#192
    bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);

    add x0,sp,#224
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);

    ldp x4,x5,[sp,#192]
    ldp x6,x7,[sp,#192+16]
    add x0,sp,#288
    bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);

    ldr x3,[sp,#160]
    ldp x4,x5,[sp,#224]
    ldp x6,x7,[sp,#224+16]
    add x2,sp,#160
    add x0,sp,#256
    bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);

    ldr x3,[x22]
    ldp x4,x5,[sp,#224]
    ldp x6,x7,[sp,#224+16]
    add x2,x22,#0
    add x0,sp,#96
    bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);

    mov x8,x14
    mov x9,x15
    mov x10,x16
    mov x11,x17
    add x0,sp,#224
    bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);

    add x2,sp,#288
    add x0,sp,#0
    bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);

    add x2,sp,#256
    bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);

    add x2,sp,#96
    ldr x3,[x22,#32] // forward load for p256_mul_mont
    ldp x4,x5,[sp,#256]
    ldp x6,x7,[sp,#256+16]
    add x0,sp,#32
    bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);

    add x2,x22,#32
    add x0,sp,#128
    bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);

    ldr x3,[sp,#192]
    ldp x4,x5,[sp,#32]
    ldp x6,x7,[sp,#32+16]
    add x2,sp,#192
    add x0,sp,#32
    bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);

    add x2,sp,#128
    bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);

    ldp x4,x5,[sp,#0] // res
    ldp x6,x7,[sp,#0+16]
    ldp x8,x9,[x23] // in2
    ldp x10,x11,[x23,#16]
    ldp x14,x15,[x22,#0] // in1
    cmp x24,#0 // ~, remember?
    ldp x16,x17,[x22,#0+16]
    csel x8,x4,x8,ne
    csel x9,x5,x9,ne
    ldp x4,x5,[sp,#0+0+32] // res
    csel x10,x6,x10,ne
    csel x11,x7,x11,ne
    cmp x25,#0 // ~, remember?
    ldp x6,x7,[sp,#0+0+48]
    csel x14,x8,x14,ne
    csel x15,x9,x15,ne
    ldp x8,x9,[x23,#0+32] // in2
    csel x16,x10,x16,ne
    csel x17,x11,x17,ne
    ldp x10,x11,[x23,#0+48]
    stp x14,x15,[x21,#0]
    stp x16,x17,[x21,#0+16]
    adr x23,Lone_mont-64
    ldp x14,x15,[x22,#32] // in1
    cmp x24,#0 // ~, remember?
    ldp x16,x17,[x22,#32+16]
    csel x8,x4,x8,ne
    csel x9,x5,x9,ne
    ldp x4,x5,[sp,#0+32+32] // res
    csel x10,x6,x10,ne
    csel x11,x7,x11,ne
    cmp x25,#0 // ~, remember?
    ldp x6,x7,[sp,#0+32+48]
    csel x14,x8,x14,ne
    csel x15,x9,x15,ne
    ldp x8,x9,[x23,#32+32] // in2
    csel x16,x10,x16,ne
    csel x17,x11,x17,ne
    ldp x10,x11,[x23,#32+48]
    stp x14,x15,[x21,#32]
    stp x16,x17,[x21,#32+16]
    ldp x14,x15,[x22,#64] // in1
    cmp x24,#0 // ~, remember?
    ldp x16,x17,[x22,#64+16]
    csel x8,x4,x8,ne
    csel x9,x5,x9,ne
    csel x10,x6,x10,ne
    csel x11,x7,x11,ne
    cmp x25,#0 // ~, remember?
    csel x14,x8,x14,ne
    csel x15,x9,x15,ne
    csel x16,x10,x16,ne
    csel x17,x11,x17,ne
    stp x14,x15,[x21,#64]
    stp x16,x17,[x21,#64+16]

    add sp,x29,#0 // destroy frame
    ldp x19,x20,[x29,#16]
    ldp x21,x22,[x29,#32]
    ldp x23,x24,[x29,#48]
    ldp x25,x26,[x29,#64]
    ldp x29,x30,[sp],#80
    AARCH64_VALIDATE_LINK_REGISTER
    ret

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl ecp_nistz256_ord_mul_mont

.def ecp_nistz256_ord_mul_mont
    .type 32
.endef
.align 4
ecp_nistz256_ord_mul_mont:
    AARCH64_VALID_CALL_TARGET
    // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
    stp x29,x30,[sp,#-64]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]

    adr x23,Lord
    ldr x3,[x2] // bp[0]
    ldp x4,x5,[x1]
    ldp x6,x7,[x1,#16]

    ldp x12,x13,[x23,#0]
    ldp x21,x22,[x23,#16]
    ldr x23,[x23,#32]

    mul x14,x4,x3 // a[0]*b[0]
    umulh x8,x4,x3

    mul x15,x5,x3 // a[1]*b[0]
    umulh x9,x5,x3

    mul x16,x6,x3 // a[2]*b[0]
    umulh x10,x6,x3

    mul x17,x7,x3 // a[3]*b[0]
    umulh x19,x7,x3

    mul x24,x14,x23

    adds x15,x15,x8 // accumulate high parts of multiplication
    adcs x16,x16,x9
    adcs x17,x17,x10
    adc x19,x19,xzr
    mov x20,xzr
    ldr x3,[x2,#8*1] // b[i]

    lsl x8,x24,#32
    subs x16,x16,x24
    lsr x9,x24,#32
    sbcs x17,x17,x8
    sbcs x19,x19,x9
    sbc x20,x20,xzr

    subs xzr,x14,#1
    umulh x9,x12,x24
    mul x10,x13,x24
    umulh x11,x13,x24

    adcs x10,x10,x9
    mul x8,x4,x3
    adc x11,x11,xzr
    mul x9,x5,x3

    adds x14,x15,x10
    mul x10,x6,x3
    adcs x15,x16,x11
    mul x11,x7,x3
    adcs x16,x17,x24
    adcs x17,x19,x24
    adc x19,x20,xzr

    adds x14,x14,x8 // accumulate low parts
    umulh x8,x4,x3
    adcs x15,x15,x9
    umulh x9,x5,x3
    adcs x16,x16,x10
    umulh x10,x6,x3
    adcs x17,x17,x11
    umulh x11,x7,x3
    adc x19,x19,xzr
    mul x24,x14,x23
    adds x15,x15,x8 // accumulate high parts
    adcs x16,x16,x9
    adcs x17,x17,x10
    adcs x19,x19,x11
    adc x20,xzr,xzr
    ldr x3,[x2,#8*2] // b[i]

    lsl x8,x24,#32
    subs x16,x16,x24
    lsr x9,x24,#32
    sbcs x17,x17,x8
    sbcs x19,x19,x9
    sbc x20,x20,xzr

    subs xzr,x14,#1
    umulh x9,x12,x24
    mul x10,x13,x24
    umulh x11,x13,x24

    adcs x10,x10,x9
    mul x8,x4,x3
    adc x11,x11,xzr
    mul x9,x5,x3

    adds x14,x15,x10
    mul x10,x6,x3
    adcs x15,x16,x11
    mul x11,x7,x3
    adcs x16,x17,x24
    adcs x17,x19,x24
    adc x19,x20,xzr

    adds x14,x14,x8 // accumulate low parts
    umulh x8,x4,x3
    adcs x15,x15,x9
    umulh x9,x5,x3
    adcs x16,x16,x10
    umulh x10,x6,x3
    adcs x17,x17,x11
    umulh x11,x7,x3
    adc x19,x19,xzr
    mul x24,x14,x23
    adds x15,x15,x8 // accumulate high parts
    adcs x16,x16,x9
    adcs x17,x17,x10
    adcs x19,x19,x11
    adc x20,xzr,xzr
    ldr x3,[x2,#8*3] // b[i]

    lsl x8,x24,#32
    subs x16,x16,x24
    lsr x9,x24,#32
    sbcs x17,x17,x8
    sbcs x19,x19,x9
    sbc x20,x20,xzr

    subs xzr,x14,#1
    umulh x9,x12,x24
    mul x10,x13,x24
    umulh x11,x13,x24

    adcs x10,x10,x9
    mul x8,x4,x3
    adc x11,x11,xzr
    mul x9,x5,x3

    adds x14,x15,x10
    mul x10,x6,x3
    adcs x15,x16,x11
    mul x11,x7,x3
    adcs x16,x17,x24
    adcs x17,x19,x24
    adc x19,x20,xzr

    adds x14,x14,x8 // accumulate low parts
    umulh x8,x4,x3
    adcs x15,x15,x9
    umulh x9,x5,x3
    adcs x16,x16,x10
    umulh x10,x6,x3
    adcs x17,x17,x11
    umulh x11,x7,x3
    adc x19,x19,xzr
    mul x24,x14,x23
    adds x15,x15,x8 // accumulate high parts
    adcs x16,x16,x9
    adcs x17,x17,x10
    adcs x19,x19,x11
    adc x20,xzr,xzr
    lsl x8,x24,#32 // last reduction
    subs x16,x16,x24
    lsr x9,x24,#32
    sbcs x17,x17,x8
    sbcs x19,x19,x9
    sbc x20,x20,xzr

    subs xzr,x14,#1
    umulh x9,x12,x24
    mul x10,x13,x24
    umulh x11,x13,x24

    adcs x10,x10,x9
    adc x11,x11,xzr

    adds x14,x15,x10
    adcs x15,x16,x11
    adcs x16,x17,x24
    adcs x17,x19,x24
    adc x19,x20,xzr

    subs x8,x14,x12 // ret -= modulus
    sbcs x9,x15,x13
    sbcs x10,x16,x21
    sbcs x11,x17,x22
    sbcs xzr,x19,xzr

    csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
    csel x15,x15,x9,lo
    csel x16,x16,x10,lo
    stp x14,x15,[x0]
    csel x17,x17,x11,lo
    stp x16,x17,[x0,#16]

    ldp x19,x20,[sp,#16]
    ldp x21,x22,[sp,#32]
    ldp x23,x24,[sp,#48]
    ldr x29,[sp],#64
    ret


////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                int rep);
.globl ecp_nistz256_ord_sqr_mont

.def ecp_nistz256_ord_sqr_mont
    .type 32
.endef
.align 4
ecp_nistz256_ord_sqr_mont:
    AARCH64_VALID_CALL_TARGET
    // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
    stp x29,x30,[sp,#-64]!
    add x29,sp,#0
    stp x19,x20,[sp,#16]
    stp x21,x22,[sp,#32]
    stp x23,x24,[sp,#48]

    adr x23,Lord
    ldp x4,x5,[x1]
    ldp x6,x7,[x1,#16]

    ldp x12,x13,[x23,#0]
    ldp x21,x22,[x23,#16]
    ldr x23,[x23,#32]
    b Loop_ord_sqr

.align 4
Loop_ord_sqr:
    sub x2,x2,#1
    ////////////////////////////////////////////////////////////////
    // |  |  |  |  |  |a1*a0|  |
    // |  |  |  |  |a2*a0|  |  |
    // |  |a3*a2|a3*a0|  |  |  |
    // |  |  |  |a2*a1|  |  |  |
    // |  |  |a3*a1|  |  |  |  |
    // *|  |  |  |  |  |  |  | 2|
    // +|a3*a3|a2*a2|a1*a1|a0*a0|
    // |--+--+--+--+--+--+--+--|
    // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax denotes the x-th 64-bit limb of
    // the 512-bit accumulator.
    //
    // "can't overflow" below marks carrying into the high part of a
    // multiplication result, which can't overflow because it
    // can never be all ones.
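    // The multiplication pattern below matches __ecp_nistz256_sqr_mont; only
    // the reduction differs (a rough sketch, for orientation). The group
    // order's low word has no special form, so the Montgomery factor for
    // each step is acc[0]*LordK, with LordK kept in x23. The products with
    // ord[0] and ord[1] (held in x12,x13) use real multiplies, while the
    // products with the two upper words of the order, 2^64-1 and 2^64-2^32,
    // are formed from 32-bit shifts of the factor instead of multiplies.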

    mul x15,x5,x4 // a[1]*a[0]
    umulh x9,x5,x4
    mul x16,x6,x4 // a[2]*a[0]
    umulh x10,x6,x4
    mul x17,x7,x4 // a[3]*a[0]
    umulh x19,x7,x4

    adds x16,x16,x9 // accumulate high parts of multiplication
    mul x8,x6,x5 // a[2]*a[1]
    umulh x9,x6,x5
    adcs x17,x17,x10
    mul x10,x7,x5 // a[3]*a[1]
    umulh x11,x7,x5
    adc x19,x19,xzr // can't overflow

    mul x20,x7,x6 // a[3]*a[2]
    umulh x1,x7,x6

    adds x9,x9,x10 // accumulate high parts of multiplication
    mul x14,x4,x4 // a[0]*a[0]
    adc x10,x11,xzr // can't overflow

    adds x17,x17,x8 // accumulate low parts of multiplication
    umulh x4,x4,x4
    adcs x19,x19,x9
    mul x9,x5,x5 // a[1]*a[1]
    adcs x20,x20,x10
    umulh x5,x5,x5
    adc x1,x1,xzr // can't overflow

    adds x15,x15,x15 // acc[1-6]*=2
    mul x10,x6,x6 // a[2]*a[2]
    adcs x16,x16,x16
    umulh x6,x6,x6
    adcs x17,x17,x17
    mul x11,x7,x7 // a[3]*a[3]
    adcs x19,x19,x19
    umulh x7,x7,x7
    adcs x20,x20,x20
    adcs x1,x1,x1
    adc x3,xzr,xzr

    adds x15,x15,x4 // +a[i]*a[i]
    mul x24,x14,x23
    adcs x16,x16,x9
    adcs x17,x17,x5
    adcs x19,x19,x10
    adcs x20,x20,x6
    adcs x1,x1,x11
    adc x3,x3,x7
    subs xzr,x14,#1
    umulh x9,x12,x24
    mul x10,x13,x24
    umulh x11,x13,x24

    adcs x10,x10,x9
    adc x11,x11,xzr

    adds x14,x15,x10
    adcs x15,x16,x11
    adcs x16,x17,x24
    adc x17,xzr,x24 // can't overflow
    mul x11,x14,x23
    lsl x8,x24,#32
    subs x15,x15,x24
    lsr x9,x24,#32
    sbcs x16,x16,x8
    sbc x17,x17,x9 // can't borrow
    subs xzr,x14,#1
    umulh x9,x12,x11
    mul x10,x13,x11
    umulh x24,x13,x11

    adcs x10,x10,x9
    adc x24,x24,xzr

    adds x14,x15,x10
    adcs x15,x16,x24
    adcs x16,x17,x11
    adc x17,xzr,x11 // can't overflow
    mul x24,x14,x23
    lsl x8,x11,#32
    subs x15,x15,x11
    lsr x9,x11,#32
    sbcs x16,x16,x8
    sbc x17,x17,x9 // can't borrow
    subs xzr,x14,#1
    umulh x9,x12,x24
    mul x10,x13,x24
    umulh x11,x13,x24

    adcs x10,x10,x9
    adc x11,x11,xzr

    adds x14,x15,x10
    adcs x15,x16,x11
    adcs x16,x17,x24
    adc x17,xzr,x24 // can't overflow
    mul x11,x14,x23
    lsl x8,x24,#32
    subs x15,x15,x24
    lsr x9,x24,#32
    sbcs x16,x16,x8
    sbc x17,x17,x9 // can't borrow
    subs xzr,x14,#1
    umulh x9,x12,x11
    mul x10,x13,x11
    umulh x24,x13,x11

    adcs x10,x10,x9
    adc x24,x24,xzr

    adds x14,x15,x10
    adcs x15,x16,x24
    adcs x16,x17,x11
    adc x17,xzr,x11 // can't overflow
    lsl x8,x11,#32
    subs x15,x15,x11
    lsr x9,x11,#32
    sbcs x16,x16,x8
    sbc x17,x17,x9 // can't borrow
    adds x14,x14,x19 // accumulate upper half
    adcs x15,x15,x20
    adcs x16,x16,x1
    adcs x17,x17,x3
    adc x19,xzr,xzr

    subs x8,x14,x12 // ret -= modulus
    sbcs x9,x15,x13
    sbcs x10,x16,x21
    sbcs x11,x17,x22
    sbcs xzr,x19,xzr

    csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus
    csel x5,x15,x9,lo
    csel x6,x16,x10,lo
    csel x7,x17,x11,lo

    cbnz x2,Loop_ord_sqr

    stp x4,x5,[x0]
    stp x6,x7,[x0,#16]

    ldp x19,x20,[sp,#16]
    ldp x21,x22,[sp,#32]
    ldp x23,x24,[sp,#48]
    ldr x29,[sp],#64
    ret

////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_select_w5

.def ecp_nistz256_select_w5
    .type 32
.endef
.align 4
ecp_nistz256_select_w5:
    AARCH64_VALID_CALL_TARGET

    // x10 := x0
    // w9 := 0; loop counter and incremented internal index
    mov x10, x0
    mov w9, #0

    // [v16-v21] := 0
    movi v16.16b, #0
    movi v17.16b, #0
    movi v18.16b, #0
    movi v19.16b, #0
    movi v20.16b, #0
    movi v21.16b, #0

Lselect_w5_loop:
    // Loop 16 times.

    // Increment index (loop counter); tested at the end of the loop
    add w9, w9, #1

    // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
    // and advance x1 to point to the next entry
    ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

    // x11 := (w9 == w2)? All 1s : All 0s
    cmp w9, w2
    csetm x11, eq

    // continue loading ...
    ld1 {v26.2d, v27.2d}, [x1],#32

    // duplicate mask_64 into Mask (all 0s or all 1s)
    dup v3.2d, x11

    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
    // i.e., values in output registers will remain the same if w9 != w2
    bit v16.16b, v22.16b, v3.16b
    bit v17.16b, v23.16b, v3.16b

    bit v18.16b, v24.16b, v3.16b
    bit v19.16b, v25.16b, v3.16b

    bit v20.16b, v26.16b, v3.16b
    bit v21.16b, v27.16b, v3.16b

    // If bit #4 is still 0 (i.e. idx_ctr < 16) loop back
    tbz w9, #4, Lselect_w5_loop

    // Write [v16-v21] to memory at the output pointer
    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
    st1 {v20.2d, v21.2d}, [x10]

    ret



////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_select_w7

.def ecp_nistz256_select_w7
    .type 32
.endef
.align 4
ecp_nistz256_select_w7:
    AARCH64_VALID_CALL_TARGET

    // w9 := 0; loop counter and incremented internal index
    mov w9, #0

    // [v16-v19] := 0
    movi v16.16b, #0
    movi v17.16b, #0
    movi v18.16b, #0
    movi v19.16b, #0

Lselect_w7_loop:
    // Loop 64 times.

    // Increment index (loop counter); tested at the end of the loop
    add w9, w9, #1

    // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
    // and advance x1 to point to the next entry
    ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

    // x11 := (w9 == w2)? All 1s : All 0s
    cmp w9, w2
    csetm x11, eq

    // duplicate mask_64 into Mask (all 0s or all 1s)
    dup v3.2d, x11

    // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
    // i.e., values in output registers will remain the same if w9 != w2
    bit v16.16b, v22.16b, v3.16b
    bit v17.16b, v23.16b, v3.16b

    bit v18.16b, v24.16b, v3.16b
    bit v19.16b, v25.16b, v3.16b

    // If bit #6 is still 0 (i.e. idx_ctr < 64) loop back
    tbz w9, #6, Lselect_w7_loop

    // Write [v16-v19] to memory at the output pointer
    st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0]

    ret

#endif
#endif // !OPENSSL_NO_ASM