// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include "openssl/arm_arch.h"

.text
.align 5
.Lpoly:
.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.LRR: // 2^512 mod P precomputed for NIST P256 polynomial
.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd
.Lone_mont:
.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lone:
.quad 1,0,0,0
.Lord:
.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:
.quad 0xccd1c8aaee00bc4f
.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2

// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4],
//                            const BN_ULONG x2[4]);
.globl ecp_nistz256_mul_mont
.hidden ecp_nistz256_mul_mont
.type ecp_nistz256_mul_mont,%function
.align 4
ecp_nistz256_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-32]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]

	ldr x3,[x2] // bp[0]
	ldp x4,x5,[x1]
	ldp x6,x7,[x1,#16]
	ldr x12,.Lpoly+8
	ldr x13,.Lpoly+24

	bl __ecp_nistz256_mul_mont

	ldp x19,x20,[sp,#16]
	ldp x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont

// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_sqr_mont
.hidden ecp_nistz256_sqr_mont
.type ecp_nistz256_sqr_mont,%function
.align 4
ecp_nistz256_sqr_mont:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-32]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]

	ldp x4,x5,[x1]
	ldp x6,x7,[x1,#16]
	ldr x12,.Lpoly+8
	ldr x13,.Lpoly+24

	bl __ecp_nistz256_sqr_mont

	ldp x19,x20,[sp,#16]
	ldp x29,x30,[sp],#32
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont

// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_div_by_2
.hidden ecp_nistz256_div_by_2
.type ecp_nistz256_div_by_2,%function
.align 4
ecp_nistz256_div_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp x14,x15,[x1]
	ldp x16,x17,[x1,#16]
	ldr x12,.Lpoly+8
	ldr x13,.Lpoly+24

	bl __ecp_nistz256_div_by_2

	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2

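// For reference, a C sketch of the halving that __ecp_nistz256_div_by_2
// performs (assuming BN_ULONG == uint64_t; "p" is .Lpoly; the helper name is
// illustrative, and the branches here are csel instructions in the assembly):
//
//	#include <stdint.h>
//	// r = a/2 mod p: if a is odd, add p first so the shift is exact.
//	static void p256_div_by_2(uint64_t r[4], const uint64_t a[4],
//	                          const uint64_t p[4]) {
//		uint64_t tmp[4], carry = 0;
//		for (int i = 0; i < 4; i++) {	// tmp = a + p, with carry out
//			unsigned __int128 t = (unsigned __int128)a[i] + p[i] + carry;
//			tmp[i] = (uint64_t)t;
//			carry = (uint64_t)(t >> 64);
//		}
//		uint64_t odd = a[0] & 1;	// select a or a+p (csel in asm)
//		uint64_t top = odd ? carry : 0;
//		for (int i = 0; i < 4; i++)
//			tmp[i] = odd ? tmp[i] : a[i];
//		for (int i = 0; i < 3; i++)	// 257-bit shift right by one
//			r[i] = (tmp[i] >> 1) | (tmp[i + 1] << 63);
//		r[3] = (tmp[3] >> 1) | (top << 63);
//	}
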
// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_mul_by_2
.hidden ecp_nistz256_mul_by_2
.type ecp_nistz256_mul_by_2,%function
.align 4
ecp_nistz256_mul_by_2:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp x14,x15,[x1]
	ldp x16,x17,[x1,#16]
	ldr x12,.Lpoly+8
	ldr x13,.Lpoly+24
	mov x8,x14
	mov x9,x15
	mov x10,x16
	mov x11,x17

	bl __ecp_nistz256_add_to // ret = a+a // 2*a

	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2

// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_mul_by_3
.hidden ecp_nistz256_mul_by_3
.type ecp_nistz256_mul_by_3,%function
.align 4
ecp_nistz256_mul_by_3:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp x14,x15,[x1]
	ldp x16,x17,[x1,#16]
	ldr x12,.Lpoly+8
	ldr x13,.Lpoly+24
	mov x8,x14
	mov x9,x15
	mov x10,x16
	mov x11,x17
	mov x4,x14
	mov x5,x15
	mov x6,x16
	mov x7,x17

	bl __ecp_nistz256_add_to // ret = a+a // 2*a

	mov x8,x4
	mov x9,x5
	mov x10,x6
	mov x11,x7

	bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a

	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3

// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4],
//                       const BN_ULONG x2[4]);
.globl ecp_nistz256_sub
.hidden ecp_nistz256_sub
.type ecp_nistz256_sub,%function
.align 4
ecp_nistz256_sub:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	ldp x14,x15,[x1]
	ldp x16,x17,[x1,#16]
	ldr x12,.Lpoly+8
	ldr x13,.Lpoly+24

	bl __ecp_nistz256_sub_from

	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_nistz256_sub,.-ecp_nistz256_sub

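// For reference, a C sketch of the subtraction pattern used by
// __ecp_nistz256_sub_from further below (assuming BN_ULONG == uint64_t; the
// helper name is illustrative): subtract with borrow, then add p back if and
// only if the subtraction borrowed.
//
//	#include <stdint.h>
//	static void p256_sub(uint64_t r[4], const uint64_t a[4],
//	                     const uint64_t b[4], const uint64_t p[4]) {
//		uint64_t borrow = 0, carry = 0;
//		for (int i = 0; i < 4; i++) {	// r = a - b
//			unsigned __int128 t = (unsigned __int128)a[i] - b[i] - borrow;
//			r[i] = (uint64_t)t;
//			borrow = (uint64_t)(t >> 64) & 1;
//		}
//		uint64_t mask = 0 - borrow;	// all ones iff a < b
//		for (int i = 0; i < 4; i++) {	// r += p, masked
//			unsigned __int128 t =
//			    (unsigned __int128)r[i] + (p[i] & mask) + carry;
//			r[i] = (uint64_t)t;
//			carry = (uint64_t)(t >> 64);
//		}
//	}
//
// ecp_nistz256_neg below is the same primitive called with a == 0, i.e. it
// computes 0 - b mod p.
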
// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]);
.globl ecp_nistz256_neg
.hidden ecp_nistz256_neg
.type ecp_nistz256_neg,%function
.align 4
ecp_nistz256_neg:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0

	mov x2,x1
	mov x14,xzr // a = 0
	mov x15,xzr
	mov x16,xzr
	mov x17,xzr
	ldr x12,.Lpoly+8
	ldr x13,.Lpoly+24

	bl __ecp_nistz256_sub_from

	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_nistz256_neg,.-ecp_nistz256_neg

// Note that __ecp_nistz256_mul_mont expects the a[0-3] input pre-loaded
// in x4-x7 and b[0] in x3.
.type __ecp_nistz256_mul_mont,%function
.align 4
__ecp_nistz256_mul_mont:
	mul x14,x4,x3 // a[0]*b[0]
	umulh x8,x4,x3

	mul x15,x5,x3 // a[1]*b[0]
	umulh x9,x5,x3

	mul x16,x6,x3 // a[2]*b[0]
	umulh x10,x6,x3

	mul x17,x7,x3 // a[3]*b[0]
	umulh x11,x7,x3
	ldr x3,[x2,#8] // b[1]

	adds x15,x15,x8 // accumulate high parts of multiplication
	lsl x8,x14,#32
	adcs x16,x16,x9
	lsr x9,x14,#32
	adcs x17,x17,x10
	adc x19,xzr,x11
	mov x20,xzr
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	mul x8,x4,x3 // lo(a[0]*b[i])
	adcs x15,x16,x9
	mul x9,x5,x3 // lo(a[1]*b[i])
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	mul x10,x6,x3 // lo(a[2]*b[i])
	adcs x17,x19,x11
	mul x11,x7,x3 // lo(a[3]*b[i])
	adc x19,x20,xzr

	adds x14,x14,x8 // accumulate low parts of multiplication
	umulh x8,x4,x3 // hi(a[0]*b[i])
	adcs x15,x15,x9
	umulh x9,x5,x3 // hi(a[1]*b[i])
	adcs x16,x16,x10
	umulh x10,x6,x3 // hi(a[2]*b[i])
	adcs x17,x17,x11
	umulh x11,x7,x3 // hi(a[3]*b[i])
	adc x19,x19,xzr
	ldr x3,[x2,#8*(1+1)] // b[1+1]
	adds x15,x15,x8 // accumulate high parts of multiplication
	lsl x8,x14,#32
	adcs x16,x16,x9
	lsr x9,x14,#32
	adcs x17,x17,x10
	adcs x19,x19,x11
	adc x20,xzr,xzr
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	mul x8,x4,x3 // lo(a[0]*b[i])
	adcs x15,x16,x9
	mul x9,x5,x3 // lo(a[1]*b[i])
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	mul x10,x6,x3 // lo(a[2]*b[i])
	adcs x17,x19,x11
	mul x11,x7,x3 // lo(a[3]*b[i])
	adc x19,x20,xzr

	adds x14,x14,x8 // accumulate low parts of multiplication
	umulh x8,x4,x3 // hi(a[0]*b[i])
	adcs x15,x15,x9
	umulh x9,x5,x3 // hi(a[1]*b[i])
	adcs x16,x16,x10
	umulh x10,x6,x3 // hi(a[2]*b[i])
	adcs x17,x17,x11
	umulh x11,x7,x3 // hi(a[3]*b[i])
	adc x19,x19,xzr
	ldr x3,[x2,#8*(2+1)] // b[2+1]
	adds x15,x15,x8 // accumulate high parts of multiplication
	lsl x8,x14,#32
	adcs x16,x16,x9
	lsr x9,x14,#32
	adcs x17,x17,x10
	adcs x19,x19,x11
	adc x20,xzr,xzr
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	mul x8,x4,x3 // lo(a[0]*b[i])
	adcs x15,x16,x9
	mul x9,x5,x3 // lo(a[1]*b[i])
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	mul x10,x6,x3 // lo(a[2]*b[i])
	adcs x17,x19,x11
	mul x11,x7,x3 // lo(a[3]*b[i])
	adc x19,x20,xzr

	adds x14,x14,x8 // accumulate low parts of multiplication
	umulh x8,x4,x3 // hi(a[0]*b[i])
	adcs x15,x15,x9
	umulh x9,x5,x3 // hi(a[1]*b[i])
	adcs x16,x16,x10
	umulh x10,x6,x3 // hi(a[2]*b[i])
	adcs x17,x17,x11
	umulh x11,x7,x3 // hi(a[3]*b[i])
	adc x19,x19,xzr
	adds x15,x15,x8 // accumulate high parts of multiplication
	lsl x8,x14,#32
	adcs x16,x16,x9
	lsr x9,x14,#32
	adcs x17,x17,x10
	adcs x19,x19,x11
	adc x20,xzr,xzr
	// last reduction
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	adcs x15,x16,x9
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	adcs x17,x19,x11
	adc x19,x20,xzr

	adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
	sbcs x9,x15,x12
	sbcs x10,x16,xzr
	sbcs x11,x17,x13
	sbcs xzr,x19,xzr // did it borrow?

	csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
	csel x15,x15,x9,lo
	csel x16,x16,x10,lo
	stp x14,x15,[x0]
	csel x17,x17,x11,lo
	stp x16,x17,[x0,#16]

	ret
.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont

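// The reduction steps in __ecp_nistz256_mul_mont exploit the shape of
// p = 2^256 - 2^224 + 2^192 + 2^96 - 1. Because the low word of p is 2^64-1,
// -p^-1 mod 2^64 == 1, so the Montgomery factor of each step is simply
// acc[0], and acc[0]*p is assembled from shifts alone: the "+=acc[0]<<96"
// and "*0xffff0001" comments above refer to the 2^96 term and to the top
// word 0xffffffff00000001 of p. A C sketch of one step (illustrative only;
// 128-bit products stand in for the lsl/lsr/subs/sbc sequences):
//
//	#include <stdint.h>
//	typedef unsigned __int128 u128;
//	static const uint64_t P[4] = {
//		0xffffffffffffffffULL, 0x00000000ffffffffULL,
//		0x0000000000000000ULL, 0xffffffff00000001ULL};
//	// One Montgomery step on a widened accumulator:
//	// acc = (acc + m*P) / 2^64 with m = acc[0], which zeroes the low word.
//	static void p256_redc_step(uint64_t acc[6]) {
//		uint64_t m = acc[0], carry = 0; // m = acc[0]: -P^-1 mod 2^64 == 1
//		for (int i = 0; i < 4; i++) {
//			u128 t = (u128)acc[i] + (u128)m * P[i] + carry;
//			acc[i] = (uint64_t)t;
//			carry = (uint64_t)(t >> 64);
//		}
//		u128 t = (u128)acc[4] + carry;	// the asm keeps this in x19/x20
//		acc[4] = (uint64_t)t;
//		acc[5] = (uint64_t)(t >> 64);
//		for (int i = 0; i < 5; i++)	// acc /= 2^64 (acc[0] is now zero)
//			acc[i] = acc[i + 1];
//		acc[5] = 0;
//	}
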
// Note that __ecp_nistz256_sqr_mont expects the a[0-3] input pre-loaded
// in x4-x7.
.type __ecp_nistz256_sqr_mont,%function
.align 4
__ecp_nistz256_sqr_mont:
	// |  |  |  |  |  |a1*a0|  |
	// |  |  |  |  |a2*a0|  |  |
	// |  |a3*a2|a3*a0|  |  |  |
	// |  |  |  |a2*a1|  |  |  |
	// |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	// |--+--+--+--+--+--+--+--|
	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is the x-th 64-bit word of
	// the 512-bit result.
	//
	// The "can't overflow" comments below mark carries into the high
	// part of a multiplication result, which can't overflow because
	// the high part can never be all ones.

	mul x15,x5,x4 // a[1]*a[0]
	umulh x9,x5,x4
	mul x16,x6,x4 // a[2]*a[0]
	umulh x10,x6,x4
	mul x17,x7,x4 // a[3]*a[0]
	umulh x19,x7,x4

	adds x16,x16,x9 // accumulate high parts of multiplication
	mul x8,x6,x5 // a[2]*a[1]
	umulh x9,x6,x5
	adcs x17,x17,x10
	mul x10,x7,x5 // a[3]*a[1]
	umulh x11,x7,x5
	adc x19,x19,xzr // can't overflow

	mul x20,x7,x6 // a[3]*a[2]
	umulh x1,x7,x6

	adds x9,x9,x10 // accumulate high parts of multiplication
	mul x14,x4,x4 // a[0]*a[0]
	adc x10,x11,xzr // can't overflow

	adds x17,x17,x8 // accumulate low parts of multiplication
	umulh x4,x4,x4
	adcs x19,x19,x9
	mul x9,x5,x5 // a[1]*a[1]
	adcs x20,x20,x10
	umulh x5,x5,x5
	adc x1,x1,xzr // can't overflow

	adds x15,x15,x15 // acc[1-6]*=2
	mul x10,x6,x6 // a[2]*a[2]
	adcs x16,x16,x16
	umulh x6,x6,x6
	adcs x17,x17,x17
	mul x11,x7,x7 // a[3]*a[3]
	adcs x19,x19,x19
	umulh x7,x7,x7
	adcs x20,x20,x20
	adcs x1,x1,x1
	adc x2,xzr,xzr

	adds x15,x15,x4 // +a[i]*a[i]
	adcs x16,x16,x9
	adcs x17,x17,x5
	adcs x19,x19,x10
	adcs x20,x20,x6
	lsl x8,x14,#32
	adcs x1,x1,x11
	lsr x9,x14,#32
	adc x2,x2,x7
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	adcs x15,x16,x9
	lsl x8,x14,#32
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	lsr x9,x14,#32
	adc x17,x11,xzr // can't overflow
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	adcs x15,x16,x9
	lsl x8,x14,#32
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	lsr x9,x14,#32
	adc x17,x11,xzr // can't overflow
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	adcs x15,x16,x9
	lsl x8,x14,#32
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	lsr x9,x14,#32
	adc x17,x11,xzr // can't overflow
	subs x10,x14,x8 // "*0xffff0001"
	sbc x11,x14,x9
	adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0]
	adcs x15,x16,x9
	adcs x16,x17,x10 // +=acc[0]*0xffff0001
	adc x17,x11,xzr // can't overflow

	adds x14,x14,x19 // accumulate upper half
	adcs x15,x15,x20
	adcs x16,x16,x1
	adcs x17,x17,x2
	adc x19,xzr,xzr

	adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
	sbcs x9,x15,x12
	sbcs x10,x16,xzr
	sbcs x11,x17,x13
	sbcs xzr,x19,xzr // did it borrow?

	csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
	csel x15,x15,x9,lo
	csel x16,x16,x10,lo
	stp x14,x15,[x0]
	csel x17,x17,x11,lo
	stp x16,x17,[x0,#16]

	ret
.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont

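// The diagram in __ecp_nistz256_sqr_mont: squaring computes each off-diagonal
// product a[i]*a[j] (j > i) once, doubles the whole middle, then adds the
// diagonal squares. A C sketch of that structure (illustrative only; the
// Montgomery reduction that follows is not shown):
//
//	#include <stdint.h>
//	typedef unsigned __int128 u128;
//	static void sqr_512(uint64_t r[8], const uint64_t a[4]) {
//		uint64_t lo[8] = {0};
//		for (int i = 0; i < 4; i++) {	// off-diagonal products, once each
//			uint64_t carry = 0;
//			for (int j = i + 1; j < 4; j++) {
//				u128 t = (u128)a[i] * a[j] + lo[i + j] + carry;
//				lo[i + j] = (uint64_t)t;
//				carry = (uint64_t)(t >> 64);
//			}
//			lo[i + 4] = carry;
//		}
//		uint64_t carry = 0;
//		for (int k = 0; k < 8; k++) {	// acc[1-6] *= 2 in the assembly
//			uint64_t hi = lo[k] >> 63;
//			lo[k] = (lo[k] << 1) | carry;
//			carry = hi;
//		}
//		carry = 0;
//		for (int i = 0; i < 4; i++) {	// + a[i]*a[i]
//			u128 t = (u128)a[i] * a[i];
//			u128 s = (u128)lo[2 * i] + (uint64_t)t + carry;
//			lo[2 * i] = (uint64_t)s;
//			s = (u128)lo[2 * i + 1] + (uint64_t)(t >> 64)
//			    + (uint64_t)(s >> 64);
//			lo[2 * i + 1] = (uint64_t)s;
//			carry = (uint64_t)(s >> 64);
//		}
//		for (int k = 0; k < 8; k++) r[k] = lo[k];
//	}
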
// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded in
// x14-x17 and x8-x11. This is done because it's used in multiple
// contexts, e.g. in multiplication by 2 and 3...
.type __ecp_nistz256_add_to,%function
.align 4
__ecp_nistz256_add_to:
	adds x14,x14,x8 // ret = a+b
	adcs x15,x15,x9
	adcs x16,x16,x10
	adcs x17,x17,x11
	adc x1,xzr,xzr // zap x1

	adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus
	sbcs x9,x15,x12
	sbcs x10,x16,xzr
	sbcs x11,x17,x13
	sbcs xzr,x1,xzr // did subtraction borrow?

	csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
	csel x15,x15,x9,lo
	csel x16,x16,x10,lo
	stp x14,x15,[x0]
	csel x17,x17,x11,lo
	stp x16,x17,[x0,#16]

	ret
.size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to

.type __ecp_nistz256_sub_from,%function
.align 4
__ecp_nistz256_sub_from:
	ldp x8,x9,[x2]
	ldp x10,x11,[x2,#16]
	subs x14,x14,x8 // ret = a-b
	sbcs x15,x15,x9
	sbcs x16,x16,x10
	sbcs x17,x17,x11
	sbc x1,xzr,xzr // zap x1

	subs x8,x14,#1 // adds x8,x14,#-1 // tmp = ret+modulus
	adcs x9,x15,x12
	adcs x10,x16,xzr
	adc x11,x17,x13
	cmp x1,xzr // did subtraction borrow?

	csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
	csel x15,x15,x9,eq
	csel x16,x16,x10,eq
	stp x14,x15,[x0]
	csel x17,x17,x11,eq
	stp x16,x17,[x0,#16]

	ret
.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from

.type __ecp_nistz256_sub_morf,%function
.align 4
__ecp_nistz256_sub_morf:
	ldp x8,x9,[x2]
	ldp x10,x11,[x2,#16]
	subs x14,x8,x14 // ret = b-a
	sbcs x15,x9,x15
	sbcs x16,x10,x16
	sbcs x17,x11,x17
	sbc x1,xzr,xzr // zap x1

	subs x8,x14,#1 // adds x8,x14,#-1 // tmp = ret+modulus
	adcs x9,x15,x12
	adcs x10,x16,xzr
	adc x11,x17,x13
	cmp x1,xzr // did subtraction borrow?

	csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret
	csel x15,x15,x9,eq
	csel x16,x16,x10,eq
	stp x14,x15,[x0]
	csel x17,x17,x11,eq
	stp x16,x17,[x0,#16]

	ret
.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf

.type __ecp_nistz256_div_by_2,%function
.align 4
__ecp_nistz256_div_by_2:
	subs x8,x14,#1 // adds x8,x14,#-1 // tmp = a+modulus
	adcs x9,x15,x12
	adcs x10,x16,xzr
	adcs x11,x17,x13
	adc x1,xzr,xzr // zap x1
	tst x14,#1 // is a even?

	csel x14,x14,x8,eq // ret = even ? a : a+modulus
	csel x15,x15,x9,eq
	csel x16,x16,x10,eq
	csel x17,x17,x11,eq
	csel x1,xzr,x1,eq

	lsr x14,x14,#1 // ret >>= 1
	orr x14,x14,x15,lsl#63
	lsr x15,x15,#1
	orr x15,x15,x16,lsl#63
	lsr x16,x16,#1
	orr x16,x16,x17,lsl#63
	lsr x17,x17,#1
	stp x14,x15,[x0]
	orr x17,x17,x1,lsl#63
	stp x16,x17,[x0,#16]

	ret
.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
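
// ecp_nistz256_point_double implements Jacobian doubling with the schedule
// spelled out in its trailing comments. In pseudo-C (field operations mod p;
// S, M, Zsqr, tmp0 are the stack temporaries; illustrative only):
//
//	S     = 2*in_y;                  // p256_mul_by_2(S, in_y)
//	Zsqr  = in_z^2;                  // p256_sqr_mont(Zsqr, in_z)
//	M     = in_x + Zsqr;             // p256_add(M, Zsqr, in_x)
//	Zsqr  = in_x - Zsqr;             // p256_sub(Zsqr, in_x, Zsqr)
//	S     = S^2;                     // S = 4*in_y^2
//	tmp0  = in_z * in_y;
//	res_z = 2*tmp0;                  // res_z = 2*in_y*in_z
//	tmp0  = S^2;
//	res_y = tmp0/2;                  // res_y = 8*in_y^4
//	M     = M * Zsqr;                // (in_x + in_z^2)*(in_x - in_z^2)
//	M     = 3*M;                     // uses the curve's a == -3 shortcut
//	S     = S * in_x;                // S = 4*in_x*in_y^2
//	tmp0  = 2*S;
//	res_x = M^2 - tmp0;              // res_x = M^2 - 8*in_x*in_y^2
//	S     = (S - res_x) * M;
//	res_y = S - res_y;
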
.globl ecp_nistz256_point_double
.hidden ecp_nistz256_point_double
.type ecp_nistz256_point_double,%function
.align 5
ecp_nistz256_point_double:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-96]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	sub sp,sp,#32*4

.Ldouble_shortcut:
	ldp x14,x15,[x1,#32]
	mov x21,x0
	ldp x16,x17,[x1,#48]
	mov x22,x1
	ldr x12,.Lpoly+8
	mov x8,x14
	ldr x13,.Lpoly+24
	mov x9,x15
	ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont
	mov x10,x16
	mov x11,x17
	ldp x6,x7,[x22,#64+16]
	add x0,sp,#0
	bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y);

	add x0,sp,#64
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z);

	ldp x8,x9,[x22]
	ldp x10,x11,[x22,#16]
	mov x4,x14 // put Zsqr aside for p256_sub
	mov x5,x15
	mov x6,x16
	mov x7,x17
	add x0,sp,#32
	bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x);

	add x2,x22,#0
	mov x14,x4 // restore Zsqr
	mov x15,x5
	ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
	mov x16,x6
	mov x17,x7
	ldp x6,x7,[sp,#0+16]
	add x0,sp,#64
	bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr);

	add x0,sp,#0
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S);

	ldr x3,[x22,#32]
	ldp x4,x5,[x22,#64]
	ldp x6,x7,[x22,#64+16]
	add x2,x22,#32
	add x0,sp,#96
	bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y);

	mov x8,x14
	mov x9,x15
	ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont
	mov x10,x16
	mov x11,x17
	ldp x6,x7,[sp,#0+16]
	add x0,x21,#64
	bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0);

	add x0,sp,#96
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S);

	ldr x3,[sp,#64] // forward load for p256_mul_mont
	ldp x4,x5,[sp,#32]
	ldp x6,x7,[sp,#32+16]
	add x0,x21,#32
	bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0);

	add x2,sp,#64
	add x0,sp,#32
	bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr);

	mov x8,x14 // duplicate M
	mov x9,x15
	mov x10,x16
	mov x11,x17
	mov x4,x14 // put M aside
	mov x5,x15
	mov x6,x16
	mov x7,x17
	add x0,sp,#32
	bl __ecp_nistz256_add_to
	mov x8,x4 // restore M
	mov x9,x5
	ldr x3,[x22] // forward load for p256_mul_mont
	mov x10,x6
	ldp x4,x5,[sp,#0]
	mov x11,x7
	ldp x6,x7,[sp,#0+16]
	bl __ecp_nistz256_add_to // p256_mul_by_3(M, M);

	add x2,x22,#0
	add x0,sp,#0
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x);

	mov x8,x14
	mov x9,x15
	ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont
	mov x10,x16
	mov x11,x17
	ldp x6,x7,[sp,#32+16]
	add x0,sp,#96
	bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S);

	add x0,x21,#0
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M);

	add x2,sp,#96
	bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0);

	add x2,sp,#0
	add x0,sp,#0
	bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x);

	ldr x3,[sp,#32]
	mov x4,x14 // copy S
	mov x5,x15
	mov x6,x16
	mov x7,x17
	add x2,sp,#32
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M);

	add x2,x21,#32
	add x0,x21,#32
	bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y);

	add sp,x29,#0 // destroy frame
	ldp x19,x20,[x29,#16]
	ldp x21,x22,[x29,#32]
	ldp x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_nistz256_point_double,.-ecp_nistz256_point_double
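
// ecp_nistz256_point_add first derives all-ones/all-zero masks recording
// whether each input is the point at infinity (encoded as Z == 0). A C
// sketch of the orr/cmp/csetm idiom (illustrative; the C ternary stands in
// for the branchless csetm):
//
//	#include <stdint.h>
//	// all ones if z != 0 (finite point), all zeros otherwise
//	static uint64_t not_infinity_mask(const uint64_t z[4]) {
//		uint64_t t = (z[0] | z[1]) | (z[2] | z[3]);
//		return t ? ~(uint64_t)0 : 0;	// cmp xN,#0; csetm xN,ne
//	}
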
.globl ecp_nistz256_point_add
.hidden ecp_nistz256_point_add
.type ecp_nistz256_point_add,%function
.align 5
ecp_nistz256_point_add:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-96]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]
	sub sp,sp,#32*12

	ldp x4,x5,[x2,#64] // in2_z
	ldp x6,x7,[x2,#64+16]
	mov x21,x0
	mov x22,x1
	mov x23,x2
	ldr x12,.Lpoly+8
	ldr x13,.Lpoly+24
	orr x8,x4,x5
	orr x10,x6,x7
	orr x25,x8,x10
	cmp x25,#0
	csetm x25,ne // ~in2infty
	add x0,sp,#192
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z);

	ldp x4,x5,[x22,#64] // in1_z
	ldp x6,x7,[x22,#64+16]
	orr x8,x4,x5
	orr x10,x6,x7
	orr x24,x8,x10
	cmp x24,#0
	csetm x24,ne // ~in1infty
	add x0,sp,#128
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);

	ldr x3,[x23,#64]
	ldp x4,x5,[sp,#192]
	ldp x6,x7,[sp,#192+16]
	add x2,x23,#64
	add x0,sp,#320
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z);

	ldr x3,[x22,#64]
	ldp x4,x5,[sp,#128]
	ldp x6,x7,[sp,#128+16]
	add x2,x22,#64
	add x0,sp,#352
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);

	ldr x3,[x22,#32]
	ldp x4,x5,[sp,#320]
	ldp x6,x7,[sp,#320+16]
	add x2,x22,#32
	add x0,sp,#320
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y);

	ldr x3,[x23,#32]
	ldp x4,x5,[sp,#352]
	ldp x6,x7,[sp,#352+16]
	add x2,x23,#32
	add x0,sp,#352
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);

	add x2,sp,#320
	ldr x3,[sp,#192] // forward load for p256_mul_mont
	ldp x4,x5,[x22]
	ldp x6,x7,[x22,#16]
	add x0,sp,#160
	bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1);

	orr x14,x14,x15 // see if result is zero
	orr x16,x16,x17
	orr x26,x14,x16 // ~is_equal(S1,S2)

	add x2,sp,#192
	add x0,sp,#256
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr);

	ldr x3,[sp,#128]
	ldp x4,x5,[x23]
	ldp x6,x7,[x23,#16]
	add x2,sp,#128
	add x0,sp,#288
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr);

	add x2,sp,#256
	ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
	ldp x6,x7,[sp,#160+16]
	add x0,sp,#96
	bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1);

	orr x14,x14,x15 // see if result is zero
	orr x16,x16,x17
	orr x14,x14,x16 // ~is_equal(U1,U2)

	mvn x27,x24 // -1/0 -> 0/-1
	mvn x28,x25 // -1/0 -> 0/-1
	orr x14,x14,x27
	orr x14,x14,x28
	orr x14,x14,x26
	cbnz x14,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2))

.Ladd_double:
	mov x1,x22
	mov x0,x21
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames
	b .Ldouble_shortcut

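// The cbnz above handles the one exceptional case: when neither input is
// infinity and U1 == U2 and S1 == S2, the two points are equal and the
// addition formulas degenerate, so the code reuses the doubling body. In C
// terms (illustrative; the flags are the masks computed above):
//
//	if (is_equal(U1, U2) && !in1infty && !in2infty && is_equal(S1, S2))
//		goto double_shortcut;	// .Ladd_double: b .Ldouble_shortcut
//	/* else fall through to .Ladd_proceed */
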
.align 4
.Ladd_proceed:
	add x0,sp,#192
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);

	ldr x3,[x22,#64]
	ldp x4,x5,[sp,#96]
	ldp x6,x7,[sp,#96+16]
	add x2,x22,#64
	add x0,sp,#64
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);

	ldp x4,x5,[sp,#96]
	ldp x6,x7,[sp,#96+16]
	add x0,sp,#128
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);

	ldr x3,[x23,#64]
	ldp x4,x5,[sp,#64]
	ldp x6,x7,[sp,#64+16]
	add x2,x23,#64
	add x0,sp,#64
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z);

	ldr x3,[sp,#96]
	ldp x4,x5,[sp,#128]
	ldp x6,x7,[sp,#128+16]
	add x2,sp,#96
	add x0,sp,#224
	bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);

	ldr x3,[sp,#128]
	ldp x4,x5,[sp,#256]
	ldp x6,x7,[sp,#256+16]
	add x2,sp,#128
	add x0,sp,#288
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr);

	mov x8,x14
	mov x9,x15
	mov x10,x16
	mov x11,x17
	add x0,sp,#128
	bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);

	add x2,sp,#192
	add x0,sp,#0
	bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);

	add x2,sp,#224
	bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);

	add x2,sp,#288
	ldr x3,[sp,#224] // forward load for p256_mul_mont
	ldp x4,x5,[sp,#320]
	ldp x6,x7,[sp,#320+16]
	add x0,sp,#32
	bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);

	add x2,sp,#224
	add x0,sp,#352
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub);

	ldr x3,[sp,#160]
	ldp x4,x5,[sp,#32]
	ldp x6,x7,[sp,#32+16]
	add x2,sp,#160
	add x0,sp,#32
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);

	add x2,sp,#352
	bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);

	ldp x4,x5,[sp,#0] // res
	ldp x6,x7,[sp,#0+16]
	ldp x8,x9,[x23] // in2
	ldp x10,x11,[x23,#16]
	ldp x14,x15,[x22,#0] // in1
	cmp x24,#0 // ~, remember?
	ldp x16,x17,[x22,#0+16]
	csel x8,x4,x8,ne
	csel x9,x5,x9,ne
	ldp x4,x5,[sp,#0+0+32] // res
	csel x10,x6,x10,ne
	csel x11,x7,x11,ne
	cmp x25,#0 // ~, remember?
	ldp x6,x7,[sp,#0+0+48]
	csel x14,x8,x14,ne
	csel x15,x9,x15,ne
	ldp x8,x9,[x23,#0+32] // in2
	csel x16,x10,x16,ne
	csel x17,x11,x17,ne
	ldp x10,x11,[x23,#0+48]
	stp x14,x15,[x21,#0]
	stp x16,x17,[x21,#0+16]
	ldp x14,x15,[x22,#32] // in1
	cmp x24,#0 // ~, remember?
	ldp x16,x17,[x22,#32+16]
	csel x8,x4,x8,ne
	csel x9,x5,x9,ne
	ldp x4,x5,[sp,#0+32+32] // res
	csel x10,x6,x10,ne
	csel x11,x7,x11,ne
	cmp x25,#0 // ~, remember?
	ldp x6,x7,[sp,#0+32+48]
	csel x14,x8,x14,ne
	csel x15,x9,x15,ne
	ldp x8,x9,[x23,#32+32] // in2
	csel x16,x10,x16,ne
	csel x17,x11,x17,ne
	ldp x10,x11,[x23,#32+48]
	stp x14,x15,[x21,#32]
	stp x16,x17,[x21,#32+16]
	ldp x14,x15,[x22,#64] // in1
	cmp x24,#0 // ~, remember?
	ldp x16,x17,[x22,#64+16]
	csel x8,x4,x8,ne
	csel x9,x5,x9,ne
	csel x10,x6,x10,ne
	csel x11,x7,x11,ne
	cmp x25,#0 // ~, remember?
	csel x14,x8,x14,ne
	csel x15,x9,x15,ne
	csel x16,x10,x16,ne
	csel x17,x11,x17,ne
	stp x14,x15,[x21,#64]
	stp x16,x17,[x21,#64+16]

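// The csel ladder above performs, for each 32-byte coordinate, the
// branchless select res = in2infty ? in1 : (in1infty ? in2 : res). A C
// sketch (illustrative; finite1/finite2 are the ~infty masks in x24/x25,
// and csel keeps the selection constant-time):
//
//	#include <stdint.h>
//	static void select_output(uint64_t r[4], const uint64_t res[4],
//	                          const uint64_t in1[4], const uint64_t in2[4],
//	                          uint64_t finite1, uint64_t finite2) {
//		for (int i = 0; i < 4; i++) {
//			uint64_t t = finite1 ? res[i] : in2[i]; // cmp x24; csel ..,ne
//			r[i] = finite2 ? t : in1[i];            // cmp x25; csel ..,ne
//		}
//	}
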
.Ladd_done:
	add sp,x29,#0 // destroy frame
	ldp x19,x20,[x29,#16]
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldp x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_nistz256_point_add,.-ecp_nistz256_point_add
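
// ecp_nistz256_point_add_affine adds a Jacobian point to an affine one
// (implicit Z2 == 1, so Z2sqr and S1 drop out of the schedule). The affine
// input carries no stored Z, so in the output selection the code substitutes
// 1 in Montgomery form when in1 is infinity: "adr x23,.Lone_mont-64" inside
// the copy sequence repoints the in2 base so that the Z-phase loads at
// offset #64 read .Lone_mont. The value involved, shown as C data for
// reference (same words as the table at the top of the file):
//
//	static const uint64_t one_mont[4] = {	// 2^256 mod p, see .Lone_mont
//		0x0000000000000001ULL, 0xffffffff00000000ULL,
//		0xffffffffffffffffULL, 0x00000000fffffffeULL};
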
.globl ecp_nistz256_point_add_affine
.hidden ecp_nistz256_point_add_affine
.type ecp_nistz256_point_add_affine,%function
.align 5
ecp_nistz256_point_add_affine:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-80]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	sub sp,sp,#32*10

	mov x21,x0
	mov x22,x1
	mov x23,x2
	ldr x12,.Lpoly+8
	ldr x13,.Lpoly+24

	ldp x4,x5,[x1,#64] // in1_z
	ldp x6,x7,[x1,#64+16]
	orr x8,x4,x5
	orr x10,x6,x7
	orr x24,x8,x10
	cmp x24,#0
	csetm x24,ne // ~in1infty

	ldp x14,x15,[x2] // in2_x
	ldp x16,x17,[x2,#16]
	ldp x8,x9,[x2,#32] // in2_y
	ldp x10,x11,[x2,#48]
	orr x14,x14,x15
	orr x16,x16,x17
	orr x8,x8,x9
	orr x10,x10,x11
	orr x14,x14,x16
	orr x8,x8,x10
	orr x25,x14,x8
	cmp x25,#0
	csetm x25,ne // ~in2infty

	add x0,sp,#128
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z);

	mov x4,x14
	mov x5,x15
	mov x6,x16
	mov x7,x17
	ldr x3,[x23]
	add x2,x23,#0
	add x0,sp,#96
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x);

	add x2,x22,#0
	ldr x3,[x22,#64] // forward load for p256_mul_mont
	ldp x4,x5,[sp,#128]
	ldp x6,x7,[sp,#128+16]
	add x0,sp,#160
	bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x);

	add x2,x22,#64
	add x0,sp,#128
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z);

	ldr x3,[x22,#64]
	ldp x4,x5,[sp,#160]
	ldp x6,x7,[sp,#160+16]
	add x2,x22,#64
	add x0,sp,#64
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z);

	ldr x3,[x23,#32]
	ldp x4,x5,[sp,#128]
	ldp x6,x7,[sp,#128+16]
	add x2,x23,#32
	add x0,sp,#128
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y);

	add x2,x22,#32
	ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont
	ldp x6,x7,[sp,#160+16]
	add x0,sp,#192
	bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y);

	add x0,sp,#224
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H);

	ldp x4,x5,[sp,#192]
	ldp x6,x7,[sp,#192+16]
	add x0,sp,#288
	bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R);

	ldr x3,[sp,#160]
	ldp x4,x5,[sp,#224]
	ldp x6,x7,[sp,#224+16]
	add x2,sp,#160
	add x0,sp,#256
	bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H);

	ldr x3,[x22]
	ldp x4,x5,[sp,#224]
	ldp x6,x7,[sp,#224+16]
	add x2,x22,#0
	add x0,sp,#96
	bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr);

	mov x8,x14
	mov x9,x15
	mov x10,x16
	mov x11,x17
	add x0,sp,#224
	bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2);

	add x2,sp,#288
	add x0,sp,#0
	bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr);

	add x2,sp,#256
	bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub);

	add x2,sp,#96
	ldr x3,[x22,#32] // forward load for p256_mul_mont
	ldp x4,x5,[sp,#256]
	ldp x6,x7,[sp,#256+16]
	add x0,sp,#32
	bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x);

	add x2,x22,#32
	add x0,sp,#128
	bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub);

	ldr x3,[sp,#192]
	ldp x4,x5,[sp,#32]
	ldp x6,x7,[sp,#32+16]
	add x2,sp,#192
	add x0,sp,#32
	bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R);

	add x2,sp,#128
	bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2);

	ldp x4,x5,[sp,#0] // res
	ldp x6,x7,[sp,#0+16]
	ldp x8,x9,[x23] // in2
	ldp x10,x11,[x23,#16]
	ldp x14,x15,[x22,#0] // in1
	cmp x24,#0 // ~, remember?
	ldp x16,x17,[x22,#0+16]
	csel x8,x4,x8,ne
	csel x9,x5,x9,ne
	ldp x4,x5,[sp,#0+0+32] // res
	csel x10,x6,x10,ne
	csel x11,x7,x11,ne
	cmp x25,#0 // ~, remember?
	ldp x6,x7,[sp,#0+0+48]
	csel x14,x8,x14,ne
	csel x15,x9,x15,ne
	ldp x8,x9,[x23,#0+32] // in2
	csel x16,x10,x16,ne
	csel x17,x11,x17,ne
	ldp x10,x11,[x23,#0+48]
	stp x14,x15,[x21,#0]
	stp x16,x17,[x21,#0+16]
	adr x23,.Lone_mont-64
	ldp x14,x15,[x22,#32] // in1
	cmp x24,#0 // ~, remember?
	ldp x16,x17,[x22,#32+16]
	csel x8,x4,x8,ne
	csel x9,x5,x9,ne
	ldp x4,x5,[sp,#0+32+32] // res
	csel x10,x6,x10,ne
	csel x11,x7,x11,ne
	cmp x25,#0 // ~, remember?
	ldp x6,x7,[sp,#0+32+48]
	csel x14,x8,x14,ne
	csel x15,x9,x15,ne
	ldp x8,x9,[x23,#32+32] // in2
	csel x16,x10,x16,ne
	csel x17,x11,x17,ne
	ldp x10,x11,[x23,#32+48]
	stp x14,x15,[x21,#32]
	stp x16,x17,[x21,#32+16]
	ldp x14,x15,[x22,#64] // in1
	cmp x24,#0 // ~, remember?
	ldp x16,x17,[x22,#64+16]
	csel x8,x4,x8,ne
	csel x9,x5,x9,ne
	csel x10,x6,x10,ne
	csel x11,x7,x11,ne
	cmp x25,#0 // ~, remember?
	csel x14,x8,x14,ne
	csel x15,x9,x15,ne
	csel x16,x10,x16,ne
	csel x17,x11,x17,ne
	stp x14,x15,[x21,#64]
	stp x16,x17,[x21,#64+16]

	add sp,x29,#0 // destroy frame
	ldp x19,x20,[x29,#16]
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
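
// .Lord at the top of the file holds the group order n of P-256, and .LordK
// holds its Montgomery constant n0 = -n^-1 mod 2^64. ecp_nistz256_ord_mul_mont
// computes a*b*2^-256 mod n; unlike the field case, n's low word is not
// 2^64-1, so each reduction step derives its factor with a multiply by n0. A
// C sketch of that factor (illustrative only):
//
//	#include <stdint.h>
//	#define ORD_K 0xccd1c8aaee00bc4fULL	// -n^-1 mod 2^64, see .LordK
//	static inline uint64_t ord_mont_factor(uint64_t acc0) {
//		return acc0 * ORD_K;	// mul x24,x14,x23 below
//	}
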
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4],
//                                uint64_t b[4]);
.globl ecp_nistz256_ord_mul_mont
.hidden ecp_nistz256_ord_mul_mont
.type ecp_nistz256_ord_mul_mont,%function
.align 4
ecp_nistz256_ord_mul_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
	stp x29,x30,[sp,#-64]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]

	adr x23,.Lord
	ldr x3,[x2] // bp[0]
	ldp x4,x5,[x1]
	ldp x6,x7,[x1,#16]

	ldp x12,x13,[x23,#0]
	ldp x21,x22,[x23,#16]
	ldr x23,[x23,#32]

	mul x14,x4,x3 // a[0]*b[0]
	umulh x8,x4,x3

	mul x15,x5,x3 // a[1]*b[0]
	umulh x9,x5,x3

	mul x16,x6,x3 // a[2]*b[0]
	umulh x10,x6,x3

	mul x17,x7,x3 // a[3]*b[0]
	umulh x19,x7,x3

	mul x24,x14,x23

	adds x15,x15,x8 // accumulate high parts of multiplication
	adcs x16,x16,x9
	adcs x17,x17,x10
	adc x19,x19,xzr
	mov x20,xzr
	ldr x3,[x2,#8*1] // b[i]

	lsl x8,x24,#32
	subs x16,x16,x24
	lsr x9,x24,#32
	sbcs x17,x17,x8
	sbcs x19,x19,x9
	sbc x20,x20,xzr

	subs xzr,x14,#1
	umulh x9,x12,x24
	mul x10,x13,x24
	umulh x11,x13,x24

	adcs x10,x10,x9
	mul x8,x4,x3
	adc x11,x11,xzr
	mul x9,x5,x3

	adds x14,x15,x10
	mul x10,x6,x3
	adcs x15,x16,x11
	mul x11,x7,x3
	adcs x16,x17,x24
	adcs x17,x19,x24
	adc x19,x20,xzr

	adds x14,x14,x8 // accumulate low parts
	umulh x8,x4,x3
	adcs x15,x15,x9
	umulh x9,x5,x3
	adcs x16,x16,x10
	umulh x10,x6,x3
	adcs x17,x17,x11
	umulh x11,x7,x3
	adc x19,x19,xzr
	mul x24,x14,x23
	adds x15,x15,x8 // accumulate high parts
	adcs x16,x16,x9
	adcs x17,x17,x10
	adcs x19,x19,x11
	adc x20,xzr,xzr
	ldr x3,[x2,#8*2] // b[i]

	lsl x8,x24,#32
	subs x16,x16,x24
	lsr x9,x24,#32
	sbcs x17,x17,x8
	sbcs x19,x19,x9
	sbc x20,x20,xzr

	subs xzr,x14,#1
	umulh x9,x12,x24
	mul x10,x13,x24
	umulh x11,x13,x24

	adcs x10,x10,x9
	mul x8,x4,x3
	adc x11,x11,xzr
	mul x9,x5,x3

	adds x14,x15,x10
	mul x10,x6,x3
	adcs x15,x16,x11
	mul x11,x7,x3
	adcs x16,x17,x24
	adcs x17,x19,x24
	adc x19,x20,xzr

	adds x14,x14,x8 // accumulate low parts
	umulh x8,x4,x3
	adcs x15,x15,x9
	umulh x9,x5,x3
	adcs x16,x16,x10
	umulh x10,x6,x3
	adcs x17,x17,x11
	umulh x11,x7,x3
	adc x19,x19,xzr
	mul x24,x14,x23
	adds x15,x15,x8 // accumulate high parts
	adcs x16,x16,x9
	adcs x17,x17,x10
	adcs x19,x19,x11
	adc x20,xzr,xzr
	ldr x3,[x2,#8*3] // b[i]

	lsl x8,x24,#32
	subs x16,x16,x24
	lsr x9,x24,#32
	sbcs x17,x17,x8
	sbcs x19,x19,x9
	sbc x20,x20,xzr

	subs xzr,x14,#1
	umulh x9,x12,x24
	mul x10,x13,x24
	umulh x11,x13,x24

	adcs x10,x10,x9
	mul x8,x4,x3
	adc x11,x11,xzr
	mul x9,x5,x3

	adds x14,x15,x10
	mul x10,x6,x3
	adcs x15,x16,x11
	mul x11,x7,x3
	adcs x16,x17,x24
	adcs x17,x19,x24
	adc x19,x20,xzr

	adds x14,x14,x8 // accumulate low parts
	umulh x8,x4,x3
	adcs x15,x15,x9
	umulh x9,x5,x3
	adcs x16,x16,x10
	umulh x10,x6,x3
	adcs x17,x17,x11
	umulh x11,x7,x3
	adc x19,x19,xzr
	mul x24,x14,x23
	adds x15,x15,x8 // accumulate high parts
	adcs x16,x16,x9
	adcs x17,x17,x10
	adcs x19,x19,x11
	adc x20,xzr,xzr
	lsl x8,x24,#32 // last reduction
	subs x16,x16,x24
	lsr x9,x24,#32
	sbcs x17,x17,x8
	sbcs x19,x19,x9
	sbc x20,x20,xzr

	subs xzr,x14,#1
	umulh x9,x12,x24
	mul x10,x13,x24
	umulh x11,x13,x24

	adcs x10,x10,x9
	adc x11,x11,xzr

	adds x14,x15,x10
	adcs x15,x16,x11
	adcs x16,x17,x24
	adcs x17,x19,x24
	adc x19,x20,xzr

	subs x8,x14,x12 // ret -= modulus
	sbcs x9,x15,x13
	sbcs x10,x16,x21
	sbcs x11,x17,x22
	sbcs xzr,x19,xzr

	csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus
	csel x15,x15,x9,lo
	csel x16,x16,x10,lo
	stp x14,x15,[x0]
	csel x17,x17,x11,lo
	stp x16,x17,[x0,#16]

	ldp x19,x20,[sp,#16]
	ldp x21,x22,[sp,#32]
	ldp x23,x24,[sp,#48]
	ldr x29,[sp],#64
	ret
.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont

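// ecp_nistz256_ord_sqr_mont below squares its input rep times in place,
// i.e. given the Montgomery representation of a it returns that of
// a^(2^rep) mod n. Repeated-squaring blocks like this are how callers build
// fixed addition chains, e.g. for constant-time scalar inversion
// (a^(n-2) mod n). A usage sketch (illustrative only):
//
//	uint64_t t[4];
//	// t = a^(2^5) mod n (Montgomery domain); equivalent to five
//	// successive calls of ecp_nistz256_ord_mul_mont(t, t, t) from t = a.
//	ecp_nistz256_ord_sqr_mont(t, a, 5);
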
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4],
//                                int rep);
.globl ecp_nistz256_ord_sqr_mont
.hidden ecp_nistz256_ord_sqr_mont
.type ecp_nistz256_ord_sqr_mont,%function
.align 4
ecp_nistz256_ord_sqr_mont:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later.
	stp x29,x30,[sp,#-64]!
	add x29,sp,#0
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]

	adr x23,.Lord
	ldp x4,x5,[x1]
	ldp x6,x7,[x1,#16]

	ldp x12,x13,[x23,#0]
	ldp x21,x22,[x23,#16]
	ldr x23,[x23,#32]
	b .Loop_ord_sqr

.align 4
.Loop_ord_sqr:
	sub x2,x2,#1
	////////////////////////////////////////////////////////////////
	// |  |  |  |  |  |a1*a0|  |
	// |  |  |  |  |a2*a0|  |  |
	// |  |a3*a2|a3*a0|  |  |  |
	// |  |  |  |a2*a1|  |  |  |
	// |  |  |a3*a1|  |  |  |  |
	// *|  |  |  |  |  |  |  | 2|
	// +|a3*a3|a2*a2|a1*a1|a0*a0|
	// |--+--+--+--+--+--+--+--|
	// |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is the x-th 64-bit word of
	// the 512-bit result.
	//
	// The "can't overflow" comments below mark carries into the high
	// part of a multiplication result, which can't overflow because
	// the high part can never be all ones.

	mul x15,x5,x4 // a[1]*a[0]
	umulh x9,x5,x4
	mul x16,x6,x4 // a[2]*a[0]
	umulh x10,x6,x4
	mul x17,x7,x4 // a[3]*a[0]
	umulh x19,x7,x4

	adds x16,x16,x9 // accumulate high parts of multiplication
	mul x8,x6,x5 // a[2]*a[1]
	umulh x9,x6,x5
	adcs x17,x17,x10
	mul x10,x7,x5 // a[3]*a[1]
	umulh x11,x7,x5
	adc x19,x19,xzr // can't overflow

	mul x20,x7,x6 // a[3]*a[2]
	umulh x1,x7,x6

	adds x9,x9,x10 // accumulate high parts of multiplication
	mul x14,x4,x4 // a[0]*a[0]
	adc x10,x11,xzr // can't overflow

	adds x17,x17,x8 // accumulate low parts of multiplication
	umulh x4,x4,x4
	adcs x19,x19,x9
	mul x9,x5,x5 // a[1]*a[1]
	adcs x20,x20,x10
	umulh x5,x5,x5
	adc x1,x1,xzr // can't overflow

	adds x15,x15,x15 // acc[1-6]*=2
	mul x10,x6,x6 // a[2]*a[2]
	adcs x16,x16,x16
	umulh x6,x6,x6
	adcs x17,x17,x17
	mul x11,x7,x7 // a[3]*a[3]
	adcs x19,x19,x19
	umulh x7,x7,x7
	adcs x20,x20,x20
	adcs x1,x1,x1
	adc x3,xzr,xzr

	adds x15,x15,x4 // +a[i]*a[i]
	mul x24,x14,x23
	adcs x16,x16,x9
	adcs x17,x17,x5
	adcs x19,x19,x10
	adcs x20,x20,x6
	adcs x1,x1,x11
	adc x3,x3,x7
	subs xzr,x14,#1
	umulh x9,x12,x24
	mul x10,x13,x24
	umulh x11,x13,x24

	adcs x10,x10,x9
	adc x11,x11,xzr

	adds x14,x15,x10
	adcs x15,x16,x11
	adcs x16,x17,x24
	adc x17,xzr,x24 // can't overflow
	mul x11,x14,x23
	lsl x8,x24,#32
	subs x15,x15,x24
	lsr x9,x24,#32
	sbcs x16,x16,x8
	sbc x17,x17,x9 // can't borrow
	subs xzr,x14,#1
	umulh x9,x12,x11
	mul x10,x13,x11
	umulh x24,x13,x11

	adcs x10,x10,x9
	adc x24,x24,xzr

	adds x14,x15,x10
	adcs x15,x16,x24
	adcs x16,x17,x11
	adc x17,xzr,x11 // can't overflow
	mul x24,x14,x23
	lsl x8,x11,#32
	subs x15,x15,x11
	lsr x9,x11,#32
	sbcs x16,x16,x8
	sbc x17,x17,x9 // can't borrow
	subs xzr,x14,#1
	umulh x9,x12,x24
	mul x10,x13,x24
	umulh x11,x13,x24

	adcs x10,x10,x9
	adc x11,x11,xzr

	adds x14,x15,x10
	adcs x15,x16,x11
	adcs x16,x17,x24
	adc x17,xzr,x24 // can't overflow
	mul x11,x14,x23
	lsl x8,x24,#32
	subs x15,x15,x24
	lsr x9,x24,#32
	sbcs x16,x16,x8
	sbc x17,x17,x9 // can't borrow
	subs xzr,x14,#1
	umulh x9,x12,x11
	mul x10,x13,x11
	umulh x24,x13,x11

	adcs x10,x10,x9
	adc x24,x24,xzr

	adds x14,x15,x10
	adcs x15,x16,x24
	adcs x16,x17,x11
	adc x17,xzr,x11 // can't overflow
	lsl x8,x11,#32
	subs x15,x15,x11
	lsr x9,x11,#32
	sbcs x16,x16,x8
	sbc x17,x17,x9 // can't borrow
	adds x14,x14,x19 // accumulate upper half
	adcs x15,x15,x20
	adcs x16,x16,x1
	adcs x17,x17,x3
	adc x19,xzr,xzr

	subs x8,x14,x12 // ret -= modulus
	sbcs x9,x15,x13
	sbcs x10,x16,x21
	sbcs x11,x17,x22
	sbcs xzr,x19,xzr

	csel x4,x14,x8,lo // ret = borrow ? ret : ret-modulus
	csel x5,x15,x9,lo
	csel x6,x16,x10,lo
	csel x7,x17,x11,lo

	cbnz x2,.Loop_ord_sqr

	stp x4,x5,[x0]
	stp x6,x7,[x0,#16]

	ldp x19,x20,[sp,#16]
	ldp x21,x22,[sp,#32]
	ldp x23,x24,[sp,#48]
	ldr x29,[sp],#64
	ret
.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
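
// ecp_nistz256_select_w5 and ecp_nistz256_select_w7 below are constant-time
// table lookups: every entry is read, and a bitwise mask keeps only the one
// whose index matches. A C sketch of the idea (illustrative only; the
// assembly does the same with NEON csetm/dup/bit, several vectors at a time):
//
//	#include <stdint.h>
//	static void select_entry(uint64_t *val, const uint64_t *table,
//	                         int n_entries, int words, int index) {
//		for (int j = 0; j < words; j++)
//			val[j] = 0;
//		for (int i = 1; i <= n_entries; i++) {	// scan the whole table
//			uint64_t mask = 0 - (uint64_t)(i == index);
//			for (int j = 0; j < words; j++)
//				val[j] |= table[(i - 1) * words + j] & mask;
//		}
//	}
//
// select_w5 corresponds to n_entries = 16, words = 12 (a projective point);
// select_w7 to n_entries = 64, words = 8 (an affine point).
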
////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_select_w5
.hidden ecp_nistz256_select_w5
.type ecp_nistz256_select_w5,%function
.align 4
ecp_nistz256_select_w5:
	AARCH64_VALID_CALL_TARGET

	// x10 := x0
	// w9 := 0; loop counter and incremented internal index
	mov x10, x0
	mov w9, #0

	// [v16-v21] := 0
	movi v16.16b, #0
	movi v17.16b, #0
	movi v18.16b, #0
	movi v19.16b, #0
	movi v20.16b, #0
	movi v21.16b, #0

.Lselect_w5_loop:
	// Loop 16 times.

	// Increment index (loop counter); tested at the end of the loop
	add w9, w9, #1

	// [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1
	// and advance x1 to point to the next entry
	ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

	// x11 := (w9 == w2)? All 1s : All 0s
	cmp w9, w2
	csetm x11, eq

	// continue loading ...
	ld1 {v26.2d, v27.2d}, [x1],#32

	// duplicate mask_64 into Mask (all 0s or all 1s)
	dup v3.2d, x11

	// [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
	// i.e., values in output registers will remain the same if w9 != w2
	bit v16.16b, v22.16b, v3.16b
	bit v17.16b, v23.16b, v3.16b

	bit v18.16b, v24.16b, v3.16b
	bit v19.16b, v25.16b, v3.16b

	bit v20.16b, v26.16b, v3.16b
	bit v21.16b, v27.16b, v3.16b

	// If bit #4 is 0 (i.e. idx_ctr < 16) loop back
	tbz w9, #4, .Lselect_w5_loop

	// Write [v16-v21] to memory at the output pointer
	st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64
	st1 {v20.2d, v21.2d}, [x10]

	ret
.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5


////////////////////////////////////////////////////////////////////////
// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
.globl ecp_nistz256_select_w7
.hidden ecp_nistz256_select_w7
.type ecp_nistz256_select_w7,%function
.align 4
ecp_nistz256_select_w7:
	AARCH64_VALID_CALL_TARGET

	// w9 := 0; loop counter and incremented internal index
	mov w9, #0

	// [v16-v19] := 0
	movi v16.16b, #0
	movi v17.16b, #0
	movi v18.16b, #0
	movi v19.16b, #0

.Lselect_w7_loop:
	// Loop 64 times.

	// Increment index (loop counter); tested at the end of the loop
	add w9, w9, #1

	// [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1
	// and advance x1 to point to the next entry
	ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64

	// x11 := (w9 == w2)? All 1s : All 0s
	cmp w9, w2
	csetm x11, eq

	// duplicate mask_64 into Mask (all 0s or all 1s)
	dup v3.2d, x11

	// [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19]
	// i.e., values in output registers will remain the same if w9 != w2
	bit v16.16b, v22.16b, v3.16b
	bit v17.16b, v23.16b, v3.16b

	bit v18.16b, v24.16b, v3.16b
	bit v19.16b, v25.16b, v3.16b

	// If bit #6 is 0 (i.e. idx_ctr < 64) loop back
	tbz w9, #6, .Lselect_w7_loop

	// Write [v16-v19] to memory at the output pointer
	st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0]

	ret
.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
#endif
#endif // !OPENSSL_NO_ASM
.section .note.GNU-stack,"",%progbits