/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *          http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CURVE_SM2
#include "crypt_arm.h"
.file "ecp_sm2_armv8.S"

#define s0 x7
#define s1 x8
#define s2 x9
#define s3 x10
#define s4 x11
#define s5 x12
#define s6 x13
#define s7 x14

.section .rodata
# The SM2 prime p
.align 4
.Lpoly:
.quad 0xffffffffffffffff, 0xffffffff00000000, 0xffffffffffffffff, 0xfffffffeffffffff
# The order n of the SM2 group
.Lord:
.quad 0x53bbf40939d54123, 0x7203df6b21c6052b, 0xffffffffffffffff, 0xfffffffeffffffff

.Lpoly_div_2:
.quad 0x8000000000000000, 0xffffffff80000000, 0xffffffffffffffff, 0x7fffffff7fffffff
.Lord_div_2:
.quad 0xa9ddfa049ceaa092, 0xb901efb590e30295, 0xffffffffffffffff, 0x7fffffff7fffffff

.Lzero:
.quad 0, 0, 0, 0
.Lord_1div4:
.quad 0xd4eefd024e755049, 0xdc80f7dac871814a, 0xffffffffffffffff, 0x3fffffffbfffffff
.Lord_2div4:
.quad 0xa9ddfa049ceaa092, 0xb901efb590e30295, 0xffffffffffffffff, 0x7fffffff7fffffff
.Lord_3div4:
.quad 0x7eccf706eb5ff0db, 0x9582e790595483e0, 0xffffffffffffffff, 0xbfffffff3fffffff

.Lpoly_1div4:
.quad 0x4000000000000000, 0xffffffffc0000000, 0xffffffffffffffff, 0x3fffffffbfffffff
.Lpoly_2div4:
.quad 0x8000000000000000, 0xffffffff80000000, 0xffffffffffffffff, 0x7fffffff7fffffff
.Lpoly_3div4:
.quad 0xc000000000000000, 0xffffffff40000000, 0xffffffffffffffff, 0xbfffffff3fffffff

.LRR: // 2^512 mod p, precomputed for the SM2 prime
.quad 0x0000000200000003, 0x00000002ffffffff, 0x0000000100000001, 0x0000000400000002
.Lone_mont:
.quad 0x0000000000000001, 0x00000000ffffffff, 0x0000000000000000, 0x0000000100000000
.Lone:
.quad 1, 0, 0, 0

.text
### Right shift: in >> 1 ###
# void ECP_Sm2Div2(BN_UINT *r, BN_UINT *a);
# 1-bit right shift
.globl ECP_Sm2Div2
.type ECP_Sm2Div2, %function
.align 4
ECP_Sm2Div2:
AARCH64_PACIASP
    # Load inputs
    ldp x9, x10, [x1]
    ldp x11, x12, [x1, #16]

    # Right shift
    extr x9, x10, x9, #1
    extr x10, x11, x10, #1
    extr x11, x12, x11, #1
    lsr x12, x12, #1

    # Store results
    stp x9, x10, [x0]
    stp x11, x12, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2Div2, .-ECP_Sm2Div2

### Right shift: in >> 2 ###

# void ECP_Sm2Div4(BN_UINT *r, BN_UINT *a);
# 2-bit right shift
.globl ECP_Sm2Div4
.type ECP_Sm2Div4, %function
.align 4
ECP_Sm2Div4:
AARCH64_PACIASP
    # Load inputs
    ldp x7, x8, [x1]
    ldp x9, x10, [x1, #16]

    # Right shift
    extr x7, x8, x7, #2
    extr x8, x9, x8, #2
    extr x9, x10, x9, #2
    lsr x10, x10, #2

    # Store results
    stp x7, x8, [x0]
    stp x9, x10, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2Div4, .-ECP_Sm2Div4
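// --------------------------------------------------------------------------
// Note: every 256-bit operand in this file is four 64-bit limbs stored
// least-significant limb first, matching the ldp/stp pairs above. As an
// illustration only (not part of the build, and assuming BN_UINT is
// uint64_t), the 1-bit shift above is equivalent to this C sketch:
//
//   #include <stdint.h>
//   typedef uint64_t BN_UINT;
//   static void sm2_div2_sketch(BN_UINT r[4], const BN_UINT a[4])
//   {
//       /* shift each limb right and pull in the low bit of the next limb */
//       r[0] = (a[0] >> 1) | (a[1] << 63);
//       r[1] = (a[1] >> 1) | (a[2] << 63);
//       r[2] = (a[2] >> 1) | (a[3] << 63);
//       r[3] = a[3] >> 1;
//   }
// --------------------------------------------------------------------------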
### Sub: r = a-b ###
.globl ECP_Sm2BnSub
.type ECP_Sm2BnSub, %function
.align 4
ECP_Sm2BnSub:
AARCH64_PACIASP
    # Load inputs
    ldp x7, x8, [x1]
    ldp x11, x12, [x2]
    ldp x9, x10, [x1, #16]
    ldp x13, x14, [x2, #16]

    # Sub
    subs x7, x7, x11
    sbcs x8, x8, x12
    sbcs x9, x9, x13
    sbc x10, x10, x14

    # Store results
    stp x7, x8, [x0]
    stp x9, x10, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2BnSub, .-ECP_Sm2BnSub

### Add: r = a+b ###
.globl ECP_Sm2BnAdd
.type ECP_Sm2BnAdd, %function
.align 4
ECP_Sm2BnAdd:
AARCH64_PACIASP
    # Load inputs
    ldp x7, x8, [x1]
    ldp x11, x12, [x2]
    ldp x9, x10, [x1, #16]
    ldp x13, x14, [x2, #16]

    # Add
    adds x7, x7, x11
    adcs x8, x8, x12
    adcs x9, x9, x13
    adc x10, x10, x14

    # Store results
    stp x7, x8, [x0]
    stp x9, x10, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2BnAdd, .-ECP_Sm2BnAdd

### Modular div by 2: res = in/2 mod p ###
# void ECP_Sm2Div2ModP(BN_UINT *r, BN_UINT *a)
.globl ECP_Sm2Div2ModP
.type ECP_Sm2Div2ModP, %function
.align 4
ECP_Sm2Div2ModP:
AARCH64_PACIASP
    # Load inputs
    ldp x3, x4, [x1]
    ldp x5, x6, [x1, #16]

    # Save last bit
    mov x11, x3

    # Right shift 1
    extr x3, x4, x3, #1
    extr x4, x5, x4, #1
    extr x5, x6, x5, #1
    lsr x6, x6, #1

    # Load (p+1)/2
    adrp x1, .Lpoly_div_2
    add x1, x1, :lo12:.Lpoly_div_2

    ldp x7, x8, [x1]
    ldp x9, x10, [x1, #16]

    # Parity check
    tst x11, #1
    csel x7, xzr, x7, eq
    csel x8, xzr, x8, eq
    csel x9, xzr, x9, eq
    csel x10, xzr, x10, eq

    # Add
    adds x3, x3, x7
    adcs x4, x4, x8
    adcs x5, x5, x9
    adc x6, x6, x10

    # Store results
    stp x3, x4, [x0]
    stp x5, x6, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2Div2ModP, .-ECP_Sm2Div2ModP

### Modular div by 2: res = in/2 mod n, where n is the group order ###
# void ECP_Sm2Div2ModOrd(BN_UINT *r, BN_UINT *a)
.globl ECP_Sm2Div2ModOrd
.type ECP_Sm2Div2ModOrd, %function
.align 4
ECP_Sm2Div2ModOrd:
AARCH64_PACIASP
    # Load inputs
    ldp x3, x4, [x1]
    ldp x5, x6, [x1, #16]

    # Save last bit
    mov x11, x3

    # Right shift 1
    extr x3, x4, x3, #1
    extr x4, x5, x4, #1
    extr x5, x6, x5, #1
    lsr x6, x6, #1

    # Load (n+1)/2
    adrp x1, .Lord_div_2
    add x1, x1, :lo12:.Lord_div_2
    ldp x7, x8, [x1]
    ldp x9, x10, [x1, #16]

    # Parity check
    tst x11, #1
    csel x7, xzr, x7, eq
    csel x8, xzr, x8, eq
    csel x9, xzr, x9, eq
    csel x10, xzr, x10, eq

    # Add
    adds x3, x3, x7
    adcs x4, x4, x8
    adcs x5, x5, x9
    adc x6, x6, x10

    # Store results
    stp x3, x4, [x0]
    stp x5, x6, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2Div2ModOrd, .-ECP_Sm2Div2ModOrd
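// --------------------------------------------------------------------------
// Halving trick used above: for an odd modulus m and a reduced input a,
// a/2 mod m can be computed without a division as
// (a >> 1) + (a odd ? (m+1)/2 : 0), which is why .Lpoly_div_2 and
// .Lord_div_2 hold (p+1)/2 and (n+1)/2. A minimal C sketch (illustrative
// only; assumes a < m and BN_UINT is uint64_t):
//
//   #include <stdint.h>
//   typedef uint64_t BN_UINT;
//   static void mod_div2_sketch(BN_UINT r[4], const BN_UINT a[4],
//                               const BN_UINT m_plus1_div2[4])
//   {
//       BN_UINT odd = a[0] & 1;          /* saved parity bit */
//       unsigned __int128 acc = 0;
//       BN_UINT t[4];
//       t[0] = (a[0] >> 1) | (a[1] << 63);
//       t[1] = (a[1] >> 1) | (a[2] << 63);
//       t[2] = (a[2] >> 1) | (a[3] << 63);
//       t[3] = a[3] >> 1;
//       for (int i = 0; i < 4; i++) {    /* add (m+1)/2 when a was odd */
//           acc += (unsigned __int128)t[i] + (odd ? m_plus1_div2[i] : 0);
//           r[i] = (BN_UINT)acc;
//           acc >>= 64;
//       }
//   }
// --------------------------------------------------------------------------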
### Modular div by 4: res = in/4 mod p ###
# void ECP_Sm2Div4ModP(BN_UINT *r, BN_UINT *a)
.globl ECP_Sm2Div4ModP
.type ECP_Sm2Div4ModP, %function
.align 4

ECP_Sm2Div4ModP:
AARCH64_PACIASP
    # Load inputs
    ldp x3, x4, [x1]
    ldp x5, x6, [x1, #16]

    # Save last 2 bits
    and x11, x3, 0x3

    # Right shift 2
    extr x3, x4, x3, #2
    extr x4, x5, x4, #2
    extr x5, x6, x5, #2
    lsr x6, x6, #2

    # Select the correction table for (a mod 4) in {0, 1, 2, 3}
    adrp x12, .Lzero
    add x12, x12, :lo12:.Lzero
    adrp x13, .Lpoly_1div4
    add x13, x13, :lo12:.Lpoly_1div4
    adrp x14, .Lpoly_2div4
    add x14, x14, :lo12:.Lpoly_2div4
    adrp x15, .Lpoly_3div4
    add x15, x15, :lo12:.Lpoly_3div4
    cmp x11, #1
    csel x1, x12, x13, cc
    cmp x11, #2
    csel x1, x1, x14, cc
    cmp x11, #3
    csel x1, x1, x15, cc

    ldp x7, x8, [x1]
    ldp x9, x10, [x1, #16]

    # Add
    adds x3, x3, x7
    adcs x4, x4, x8
    adcs x5, x5, x9
    adc x6, x6, x10

    # Store results
    stp x3, x4, [x0]
    stp x5, x6, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2Div4ModP, .-ECP_Sm2Div4ModP

### Modular div by 4: res = in/4 mod n, where n is the group order ###
# void ECP_Sm2Div4ModOrd(BN_UINT *r, BN_UINT *a)
.globl ECP_Sm2Div4ModOrd
.type ECP_Sm2Div4ModOrd, %function
.align 4

ECP_Sm2Div4ModOrd:
AARCH64_PACIASP
    # Load inputs
    ldp x3, x4, [x1]
    ldp x5, x6, [x1, #16]

    # Save last 2 bits
    and x11, x3, 0x3

    # Right shift 2
    extr x3, x4, x3, #2
    extr x4, x5, x4, #2
    extr x5, x6, x5, #2
    lsr x6, x6, #2

    # Select the correction table for (a mod 4) in {0, 1, 2, 3}
    adrp x12, .Lzero
    add x12, x12, :lo12:.Lzero
    adrp x13, .Lord_1div4
    add x13, x13, :lo12:.Lord_1div4
    adrp x14, .Lord_2div4
    add x14, x14, :lo12:.Lord_2div4
    adrp x15, .Lord_3div4
    add x15, x15, :lo12:.Lord_3div4
    cmp x11, #1
    csel x1, x12, x13, cc
    cmp x11, #2
    csel x1, x1, x14, cc
    cmp x11, #3
    csel x1, x1, x15, cc

    ldp x7, x8, [x1]
    ldp x9, x10, [x1, #16]

    # Add
    adds x3, x3, x7
    adcs x4, x4, x8
    adcs x5, x5, x9
    adc x6, x6, x10

    # Store results
    stp x3, x4, [x0]
    stp x5, x6, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2Div4ModOrd, .-ECP_Sm2Div4ModOrd

#define bn_mod_add(mod) \
    /* Load inputs */ \
    ldp x3, x4, [x1]; \
    ldp x5, x6, [x1, #0x10]; \
    /* Addition */ \
    ldp x7, x8, [x2]; \
    ldp x9, x10, [x2, #0x10]; \
    adds x3, x3, x7; \
    adcs x4, x4, x8; \
    adcs x5, x5, x9; \
    adcs x6, x6, x10; \
    adc x15, xzr, xzr; \
    mov x11, x3; \
    mov x12, x4; \
    mov x13, x5; \
    mov x14, x6; \
    /* Sub modulus */ \
    adrp x2, mod; \
    add x2, x2, :lo12:mod; \
    ldp x7, x8, [x2]; \
    ldp x9, x10, [x2, #0x10]; \
    subs x11, x11, x7; \
    sbcs x12, x12, x8; \
    sbcs x13, x13, x9; \
    sbcs x14, x14, x10; \
    sbcs x15, x15, xzr; \
    csel x3, x3, x11, cc; \
    csel x4, x4, x12, cc; \
    csel x5, x5, x13, cc; \
    csel x6, x6, x14, cc; \
    /* Store results */ \
    stp x3, x4, [x0]; \
    stp x5, x6, [x0, #0x10];

#define bn_mod_sub(mod) \
    /* Load inputs */ \
    ldp x3, x4, [x1]; \
    ldp x5, x6, [x1, #0x10]; \
    /* Subtraction */ \
    ldp x7, x8, [x2]; \
    ldp x9, x10, [x2, #0x10]; \
    subs x3, x3, x7; \
    sbcs x4, x4, x8; \
    sbcs x5, x5, x9; \
    sbcs x6, x6, x10; \
    sbc x15, xzr, xzr; \
    mov x11, x3; \
    mov x12, x4; \
    mov x13, x5; \
    mov x14, x6; \
    /* Add modulus */ \
    adrp x2, mod; \
    add x2, x2, :lo12:mod; \
    ldp x7, x8, [x2]; \
    ldp x9, x10, [x2, #0x10]; \
    adds x11, x11, x7; \
    adcs x12, x12, x8; \
    adcs x13, x13, x9; \
    adcs x14, x14, x10; \
    tst x15, x15; \
    csel x3, x3, x11, eq; \
    csel x4, x4, x12, eq; \
    csel x5, x5, x13, eq; \
    csel x6, x6, x14, eq; \
    /* Store results */ \
    stp x3, x4, [x0]; \
    stp x5, x6, [x0, #0x10];

### Modular add: r = a+b mod p ###
.globl ECP_Sm2AddModP
.type ECP_Sm2AddModP, @function
.align 4

ECP_Sm2AddModP:

AARCH64_PACIASP
    bn_mod_add(.Lpoly);
AARCH64_AUTIASP
    ret
.size ECP_Sm2AddModP, .-ECP_Sm2AddModP

### Modular neg: r = p - a ###
.globl ECP_Sm2Neg
.type ECP_Sm2Neg, @function
.align 4

ECP_Sm2Neg:
AARCH64_PACIASP
    ldp x11, x12, [x1]
    mov x7, #0xffffffff00000000
    ldp x13, x14, [x1, #16]
    mov x8, #0xfffffffeffffffff

    mov x10, #-1
    subs x9, x10, x11
    sbcs x7, x7, x12
    sbcs x10, x10, x13
    sbc x8, x8, x14
    stp x9, x7, [x0]
    stp x10, x8, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2Neg, .-ECP_Sm2Neg
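// --------------------------------------------------------------------------
// The bn_mod_add/bn_mod_sub macros above (used by ECP_Sm2AddModP above and
// by ECP_Sm2SubModP/ECP_Sm2AddModOrd/ECP_Sm2SubModOrd below) follow the
// usual add-then-conditionally-subtract pattern: add with carry, tentatively
// subtract the modulus, and keep the difference only when it does not
// borrow. A minimal C sketch of the addition case (illustrative only;
// assumes both inputs are already reduced and BN_UINT is uint64_t):
//
//   #include <stdint.h>
//   typedef uint64_t BN_UINT;
//   static void mod_add_sketch(BN_UINT r[4], const BN_UINT a[4],
//                              const BN_UINT b[4], const BN_UINT m[4])
//   {
//       BN_UINT sum[4], dif[4];
//       unsigned __int128 acc = 0;
//       for (int i = 0; i < 4; i++) {        /* sum = a + b, carry in acc */
//           acc += (unsigned __int128)a[i] + b[i];
//           sum[i] = (BN_UINT)acc;
//           acc >>= 64;
//       }
//       BN_UINT carry = (BN_UINT)acc;        /* 0 or 1 */
//       BN_UINT borrow = 0;
//       for (int i = 0; i < 4; i++) {        /* dif = sum - m, track borrow */
//           BN_UINT t = sum[i] - m[i] - borrow;
//           borrow = (sum[i] < m[i]) || (sum[i] == m[i] && borrow);
//           dif[i] = t;
//       }
//       int use_sum = (carry == 0 && borrow); /* a+b < m: keep the raw sum */
//       for (int i = 0; i < 4; i++)
//           r[i] = use_sum ? sum[i] : dif[i];
//   }
// --------------------------------------------------------------------------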
### Modular sub: r = a-b mod p ###
.globl ECP_Sm2SubModP
.type ECP_Sm2SubModP, @function
.align 4

ECP_Sm2SubModP:

AARCH64_PACIASP
    bn_mod_sub(.Lpoly);
AARCH64_AUTIASP
    ret
.size ECP_Sm2SubModP, .-ECP_Sm2SubModP

### Modular add: r = a+b mod n, where n is the group order ###
.globl ECP_Sm2AddModOrd
.type ECP_Sm2AddModOrd, @function
.align 4
ECP_Sm2AddModOrd:

AARCH64_PACIASP
    bn_mod_add(.Lord);
AARCH64_AUTIASP
    ret
.size ECP_Sm2AddModOrd, .-ECP_Sm2AddModOrd

### Modular sub: r = a-b mod n, where n is the group order ###
.globl ECP_Sm2SubModOrd
.type ECP_Sm2SubModOrd, @function
.align 4
ECP_Sm2SubModOrd:

AARCH64_PACIASP
    bn_mod_sub(.Lord);
AARCH64_AUTIASP
    ret
.size ECP_Sm2SubModOrd, .-ECP_Sm2SubModOrd
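// --------------------------------------------------------------------------
// Background for the RDC reduction macro below: the SM2 prime has the
// special form
//
//     p = 2^256 - 2^224 - 2^96 + 2^64 - 1
//
// so 2^256 mod p = 2^224 + 2^96 - 2^64 + 1. Every 32-bit word a8..a15 of
// the high half of a 512-bit product can therefore be folded back into the
// low 256 bits as a small signed combination of word-aligned copies, which
// is exactly the add/subtract table spelled out in the comments of RDC.
// --------------------------------------------------------------------------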
.macro RDC
    # registers map
    # x3    x4    x5    x6    x15
    # rsi   rax   rcx   rdx   rbx

    # r = a mod sm2
    # a = a15 | a14 | ... | a0, where ai are 32-bit quantities
    #  | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  | (+)
    #  | a8  | a11 | a10 | a9  | a8  | 0   | a9  | a8  | (+)
    #  | a9  | a14 | a13 | a12 | a11 | 0   | a10 | a9  | (+)
    #  | a10 | a15 | a14 | a13 | a12 | 0   | a11 | a10 | (+)
    #  | a11 | 0   | a15 | a14 | a13 | 0   | a12 | a11 | (+)
    #  | a12 | 0   | a15 | a14 | a13 | 0   | a13 | a12 | (+)
    #  | a12 | 0   | 0   | a15 | a14 | 0   | a14 | a13 | (+)
    #  | a13 | 0   | 0   | 0   | a15 | 0   | a14 | a13 | (+)
    #  | a13 | 0   | 0   | 0   | 0   | 0   | a15 | a14 | (+)
    #  | a14 | 0   | 0   | 0   | 0   | 0   | a15 | a14 | (+)
    #  | a14 | 0   | 0   | 0   | 0   | 0   | 0   | a15 | (+)
    #  | a15 | 0   | 0   | 0   | 0   | 0   | 0   | a15 | (+)
    #  | a15 | 0   | 0   | 0   | 0   | 0   | 0   | 0   | (+)
    #  | a15 | 0   | 0   | 0   | 0   | 0   | 0   | 0   | (+)
    #  | 0   | 0   | 0   | 0   | 0   | a8  | 0   | 0   | (-)
    #  | 0   | 0   | 0   | 0   | 0   | a9  | 0   | 0   | (-)
    #  | 0   | 0   | 0   | 0   | 0   | a13 | 0   | 0   | (-)
    #  | 0   | 0   | 0   | 0   | 0   | a14 | 0   | 0   | (-)
    #  | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
    #  |    V[3]   |    V[2]   |    V[1]   |    V[0]   |
    # until r < sm2
    # s7 (a15|a14), s6 (a13|a12), s5 (a11|a10), s4 (a9|a8)
    # s3 (a7|a6),   s2 (a5|a4),   s1 (a3|a2),   s0 (a1|a0)

    # 1. 64-bit addition
    eor x3, x3, x3          // to store all carry
    eor x4, x4, x4
    mov x5, s6              // rcx <- s6
    mov x6, s4              // rdx <- s4
    # a13 | a12
    adds x5, x5, s7         // rcx <- s6 + s7
    adcs x4, xzr, xzr       // rax <- carry(s6+s7)
    adds x5, x5, s7         // rcx <- s6 + 2*s7
    adcs x4, x4, xzr
    # a9 | a8
    mov x15, x4             // rbx <- carry (rax)
    adds x6, x6, x5         // rdx <- s4 + s6 + 2*s7
    adcs x15, x15, xzr
    adds x6, x6, s5         // rdx <- s4 + s5 + s6 + 2*s7
    adcs x15, x15, xzr
    # sum
    adds s0, s0, x6         // s0 <- s0 + s4 + s5 + s6 + 2*s7
    adcs s1, s1, x15        // s1 <- s1 + rbx + carry
    adcs s2, s2, x5         // s2 <- s2 + s6 + 2*s7 + carry
    adcs s3, s3, s7
    adcs x3, xzr, xzr
    # add carry
    adds s3, s3, x4
    adcs x3, x3, xzr        // all carry

    stp s0, s1, [sp, #32]
    stp s2, s3, [sp, #48]
    # 2. 4 -> 8 64-bit to 32-bit spread
    mov x4, #0xffffffff
    mov s0, s4
    mov s1, s5
    mov s2, s6
    mov s3, s7
    and s0, s0, x4          // a8
    and s1, s1, x4          // a10
    and s2, s2, x4          // a12
    and s3, s3, x4          // a14
    lsr s4, s4, #32         // a9
    lsr s5, s5, #32         // a11
    lsr s6, s6, #32         // a13
    lsr s7, s7, #32         // a15
    # 3. 32-bit addition
    mov x4, s3
    add x4, x4, s2          // rax <- a12 + a14
    mov x15, s3
    add x15, x15, s1        // rbx <- a10 + a14
    mov x5, s7
    add x5, x5, s6          // rcx <- a13 + a15
    mov x6, s0
    add x6, x6, s4          // rdx <- a8 + a9
    add s7, s7, s5          // s7 <- a11 + a15
    mov s2, x5              // s2 <- a13 + a15
    add s2, s2, x4          // s2 <- a12 + a13 + a14 + a15
    add s1, s1, s2          // s1 <- a10 + a12 + a13 + a14 + a15
    add s1, s1, s2          // s1 <- a10 + 2*(a12 + a13 + a14 + a15)
    add s1, s1, x6          // s1 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
    add s1, s1, s5          // s1 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
    add s2, s2, s6          // s2 <- a12 + 2*a13 + a14 + a15
    add s2, s2, s5          // s2 <- a11 + a12 + 2*a13 + a14 + a15
    add s2, s2, s0          // s2 <- a8 + a11 + a12 + 2*a13 + a14 + a15
    add x6, x6, s3          // rdx <- a8 + a9 + a14
    add x6, x6, s6          // rdx <- a8 + a9 + a13 + a14
    add s4, s4, x5          // s4 <- a9 + a13 + a15
    add s5, s5, s4          // s5 <- a9 + a11 + a13 + a15
    add s5, s5, x5          // s5 <- a9 + a11 + 2*(a13 + a15)
    add x4, x4, x15         // rax <- a10 + a12 + 2*a14

    # U[0]  s5    a9 + a11 + 2*(a13 + a15)
    # U[1]  %rax  a10 + a12 + 2*a14
    # U[2]
    # U[3]  s2    a8 + a11 + a12 + 2*a13 + a14 + a15
    # U[4]  s4    a9 + a13 + a15
    # U[5]  %rbx  a10 + a14
    # U[6]  s7    a11 + a15
    # U[7]  s1    a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
    # sub   %rdx  a8 + a9 + a13 + a14

    # s0 s3 s6 %rcx

    # 4. 8 -> 4 32-bit to 64-bit
    # sub %rdx
    mov s0, x4
    lsl s0, s0, #32
    extr x4, s2, x4, #32
    extr s2, x15, s2, #32
    extr x15, s1, x15, #32
    lsr s1, s1, #32

    # 5. 64-bit addition
    adds s5, s5, s0
    adcs x4, x4, xzr
    adcs s4, s4, s2
    adcs s7, s7, x15
    adcs x3, x3, s1

    # V[0]  s5
    # V[1]  %rax
    # V[2]  s4
    # V[3]  s7
    # carry %rsi
    # sub   %rdx

    # 6. ADD & SUB
    ldp s0, s1, [sp, #32]
    ldp s2, s3, [sp, #48]

    # ADD
    adds s0, s0, s5
    adcs s1, s1, x4
    adcs s2, s2, s4
    adcs s3, s3, s7
    adcs x3, x3, xzr
    # SUB
    subs s1, s1, x6
    sbcs s2, s2, xzr
    sbcs s3, s3, xzr
    sbcs x3, x3, xzr

    # 7. MOD
    # First Mod
    mov x4, x3
    lsl x4, x4, #32
    mov x5, x4
    subs x4, x4, x3

    adds s0, s0, x3
    adcs s1, s1, x4
    adcs s2, s2, xzr
    adcs s3, s3, x5

    # Last Mod
    # return y - p if y > p else y
    mov s4, s0
    mov s5, s1
    mov s6, s2
    mov s7, s3

    adrp x3, .Lpoly
    add x3, x3, :lo12:.Lpoly
    ldp x4, x15, [x3]
    ldp x16, x17, [x3, #16]

    eor x5, x5, x5
    adcs x5, xzr, xzr

    subs s0, s0, x4
    sbcs s1, s1, x15
    sbcs s2, s2, x16
    sbcs s3, s3, x17
    sbcs x5, x5, xzr

    csel s0, s0, s4, cs
    csel s1, s1, s5, cs
    csel s2, s2, s6, cs
    csel s3, s3, s7, cs

    stp s0, s1, [x0]
    stp s2, s3, [x0, #16]
.endm
### Modular mul: r = a*b mod p ###
# void ECP_Sm2Mul(uint64_t *r, const uint64_t *a, const uint64_t *b)
# 256-bit modular multiplication in SM2
# r %rdi
# a %rsi
# b %rdx
# registers map
# s0  s1  s2   s3   s4   s5   s6   s7
# x7  x8  x9   x10  x11  x12  x13  x14  x3   x4   x5   x6   x15
# r8  r9  r10  r11  r12  r13  r14  r15  rax  rdx  rbx  rcx  rsi
.globl ECP_Sm2Mul
.type ECP_Sm2Mul, @function
.align 4
ECP_Sm2Mul:
AARCH64_PACIASP
    # Store scalar registers
    stp x29, x30, [sp, #-80]!
    add x29, sp, #0
    stp x16, x17, [sp, #16]
    stp x18, x19, [sp, #64]

    # Load inputs
    ldp s0, s1, [x1]
    ldp s2, s3, [x1, #16]
    ldp s4, s5, [x2]
    ldp s6, s7, [x2, #16]

### multiplication ###

    # ========================
    #             s7 s6 s5 s4
    #  *          s3 s2 s1 s0
    # ------------------------
    # +           s0 s0 s0 s0
    #              *  *  *  *
    #             s7 s6 s5 s4
    #          s1 s1 s1 s1
    #           *  *  *  *
    #          s7 s6 s5 s4
    #       s2 s2 s2 s2
    #        *  *  *  *
    #       s7 s6 s5 s4
    #    s3 s3 s3 s3
    #     *  *  *  *
    #    s7 s6 s5 s4
    # ------------------------
    # s7 s6 s5 s4 s3 s2 s1 s0
    # ========================

### s0*s4 ###
    mul x16, s0, s4
    umulh x5, s0, s4
    eor x6, x6, x6

### s1*s4 + s0*s5 ###
    mul x3, s1, s4
    umulh x4, s1, s4
    adds x5, x5, x3
    adcs x6, x6, x4
    eor x15, x15, x15

    mul x3, s0, s5
    umulh x4, s0, s5
    adds x5, x5, x3
    adcs x6, x6, x4
    adcs x15, x15, xzr
    mov x17, x5
    eor x5, x5, x5

### s2*s4 + s1*s5 + s0*s6 ###
    mul x3, s2, s4
    umulh x4, s2, s4
    adds x6, x6, x3
    adcs x15, x15, x4

    mul x3, s1, s5
    umulh x4, s1, s5
    adds x6, x6, x3
    adcs x15, x15, x4
    adcs x5, x5, xzr

    mul x3, s0, s6
    umulh x4, s0, s6
    adds x6, x6, x3
    adcs x15, x15, x4
    adcs x5, x5, xzr
    mov x18, x6
    eor x6, x6, x6

### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
    mul x3, s3, s4
    umulh x4, s3, s4
    adds x15, x15, x3
    adcs x5, x5, x4
    adcs x6, x6, xzr

    mul x3, s2, s5
    umulh x4, s2, s5
    adds x15, x15, x3
    adcs x5, x5, x4
    adcs x6, x6, xzr

    mul x3, s1, s6
    umulh x4, s1, s6
    adds x15, x15, x3
    adcs x5, x5, x4
    adcs x6, x6, xzr

    mul x3, s0, s7
    umulh x4, s0, s7
    adds x15, x15, x3
    adcs x5, x5, x4
    adcs x6, x6, xzr
    mov x19, x15
    eor x15, x15, x15

### s3*s5 + s2*s6 + s1*s7 ###
    mul x3, s3, s5
    umulh x4, s3, s5
    adds x5, x5, x3
    adcs x6, x6, x4
    # carry
    adcs x15, x15, xzr

    mul x3, s2, s6
    umulh x4, s2, s6
    adds x5, x5, x3
    adcs x6, x6, x4
    adcs x15, x15, xzr

    mul x3, s1, s7
    umulh x4, s1, s7
    adds x5, x5, x3
    adcs x6, x6, x4
    adcs x15, x15, xzr
    mov s4, x5
    eor x5, x5, x5

### s3*s6 + s2*s7 ###
    mul x3, s3, s6
    umulh x4, s3, s6
    adds x6, x6, x3
    adcs x15, x15, x4
    adcs x5, x5, xzr

    mul x3, s2, s7
    umulh x4, s2, s7
    adds x6, x6, x3
    adcs x15, x15, x4
    adcs x5, x5, xzr
    mov s5, x6

### s3*s7 ###
    mul x3, s3, s7
    umulh x4, s3, s7
    adds x15, x15, x3
    adcs x5, x5, x4
    mov s6, x15
    mov s7, x5

    mov s0, x16
    mov s1, x17
    mov s2, x18
    mov s3, x19

    # result of mul: s7 s6 s5 s4 s3 s2 s1 s0

### Reduction ###
    RDC

    # Restore scalar registers
    ldp x16, x17, [sp, #16]
    ldp x18, x19, [sp, #64]
    ldp x29, x30, [sp], #80
AARCH64_AUTIASP
    ret
.size ECP_Sm2Mul, .-ECP_Sm2Mul
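// --------------------------------------------------------------------------
// For reference, the product accumulation above is plain 4x4-limb schoolbook
// multiplication: every mul/umulh pair contributes the low/high half of one
// 64x64 -> 128-bit partial product to the proper column. A minimal C sketch
// of the same computation, before the RDC reduction (illustrative only):
//
//   #include <stdint.h>
//   static void mul256_sketch(uint64_t prod[8],
//                             const uint64_t a[4], const uint64_t b[4])
//   {
//       for (int i = 0; i < 8; i++)
//           prod[i] = 0;
//       for (int i = 0; i < 4; i++) {
//           uint64_t carry = 0;
//           for (int j = 0; j < 4; j++) {
//               unsigned __int128 t = (unsigned __int128)a[i] * b[j]
//                                   + prod[i + j] + carry;
//               prod[i + j] = (uint64_t)t;
//               carry = (uint64_t)(t >> 64);
//           }
//           prod[i + 4] = carry;
//       }
//   }
// --------------------------------------------------------------------------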
### Modular sqr: r = a^2 mod p ###
# void ECP_Sm2Sqr(uint64_t *r, const uint64_t *a)
# 256-bit modular squaring in SM2 ###
# r %rdi
# a %rsi
# registers map
# s0  s1  s2   s3   s4   s5   s6   s7
# x7  x8  x9   x10  x11  x12  x13  x14  x3   x4   x5   x6   x15  x16  x17
# r8  r9  r10  r11  r12  r13  r14  r15  rax  rdx  rbx  rcx  rsi  rbp  rdi
.globl ECP_Sm2Sqr
.type ECP_Sm2Sqr, @function
.align 4
ECP_Sm2Sqr:
AARCH64_PACIASP
    # Store scalar registers
    stp x29, x30, [sp, #-64]!
    add x29, sp, #0
    stp x16, x17, [sp, #16]

    # Load inputs
    ldp s4, s5, [x1]
    ldp s6, s7, [x1, #16]

### square ###

    # ========================
    #             s7 s6 s5 s4
    #  *          s7 s6 s5 s4
    # ------------------------
    # +           s4 s4 s4 s4
    #              *  *  *  *
    #             s7 s6 s5 s4
    #          s5 s5 s5 s5
    #           *  *  *  *
    #          s7 s6 s5 s4
    #       s6 s6 s6 s6
    #        *  *  *  *
    #       s7 s6 s5 s4
    #    s7 s7 s7 s7
    #     *  *  *  *
    #    s7 s6 s5 s4
    # ------------------------
    # s7 s6 s5 s4 s3 s2 s1 s0
    # ========================

### s1 <- s4*s5, s2 <- carry ###
    mul s1, s4, s5
    umulh s2, s4, s5
    eor s3, s3, s3

### s2 <- s4*s6 + carry(s2), s3 <- carry ###
    mul x3, s6, s4
    umulh s3, s6, s4
    adds s2, s2, x3
    adcs s3, s3, xzr
    eor s0, s0, s0

### s3 <- s4*s7 + s5*s6 + carry(s3), s0 <- carry ###
    mul x3, s7, s4
    umulh x4, s7, s4
    adds s3, s3, x3
    adcs s0, s0, x4
    eor x5, x5, x5

    mul x3, s6, s5
    umulh x4, s6, s5
    adds s3, s3, x3
    adcs s0, s0, x4
    adcs x5, xzr, xzr

### s0 <- s5*s7 + carry(s0), rbx <- carry ###
    mul x3, s7, s5
    umulh x4, s7, s5
    adds s0, s0, x3
    adcs x5, x5, x4
    eor x6, x6, x6

### rbx <- s6*s7 + carry(rbx), rcx <- carry ###
    mul x3, s7, s6
    umulh x4, s7, s6
    adds x5, x5, x3
    adcs x6, x6, x4
    eor x15, x15, x15

### 2*s0|1|2|3 ###
    adds s1, s1, s1
    adcs s2, s2, s2
    adcs s3, s3, s3
    adcs s0, s0, s0
    adcs x5, x5, x5
    # update carry
    adcs x6, x6, x6
    adcs x15, xzr, xzr

### rbp <- s4*s4, carry <- rdi ###
    mul x16, s4, s4
    umulh x17, s4, s4

### s4 <- s5*s5, carry <- s5 ###
    mul s4, s5, s5
    umulh s5, s5, s5

### s6*s6 ###
    mul x3, s6, s6
    umulh x4, s6, s6

    # s1 += carry(s4*s4)
    adds s1, s1, x17
    # s2 += s5*s5
    adcs s2, s2, s4
    # s3 += carry(s5*s5)
    adcs s3, s3, s5
    # s4(s0) += s6*s6
    adcs s0, s0, x3
    # s5(rbx) += carry(s6*s6)
    adcs x5, x5, x4
    adcs x6, x6, xzr
    adcs x15, x15, xzr

### s7*s7 ###
    mul x3, s7, s7
    umulh x4, s7, s7
    # s6(rcx) += s7*s7
    adds x6, x6, x3
    # s7(rsi) += carry(s7*s7)
    adcs x15, x15, x4

    mov s4, s0
    mov s0, x16
    mov s5, x5
    mov s6, x6
    mov s7, x15
    # result of sqr: s7 s6 s5 s4 s3 s2 s1 s0
### Reduction ###
    RDC

    # Restore scalar registers
    ldp x16, x17, [sp, #16]
    ldp x29, x30, [sp], #64
AARCH64_AUTIASP
    ret
.size ECP_Sm2Sqr, .-ECP_Sm2Sqr

.globl ECP_Sm2ToMont
.type ECP_Sm2ToMont, %function
.align 4
ECP_Sm2ToMont:
AARCH64_PACIASP
    stp x29, x30, [sp, #-32]!
    add x29, sp, #0
    stp x19, x20, [sp, #16]

    adrp x3, .LRR               // bp[0]
    add x3, x3, :lo12:.LRR
    ldr x3, [x3]

    ldp x4, x5, [x1]
    ldp x6, x7, [x1, #16]

    adrp x14, .Lpoly+8
    add x14, x14, :lo12:.Lpoly+8
    ldr x14, [x14]

    adrp x15, .Lpoly+24
    add x15, x15, :lo12:.Lpoly+24
    ldr x15, [x15]

    adrp x2, .LRR
    add x2, x2, :lo12:.LRR

    bl ECP_Sm2MulMont

    ldp x19, x20, [sp, #16]
    ldp x29, x30, [sp], #32
AARCH64_AUTIASP
    ret
.size ECP_Sm2ToMont, .-ECP_Sm2ToMont
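// --------------------------------------------------------------------------
// Montgomery-domain conversion used here: with R = 2^256, ECP_Sm2ToMont
// computes MontMul(a, R^2 mod p) = a*R mod p (the .LRR table holds
// R^2 = 2^512 mod p), and ECP_Sm2FromMont below computes
// MontMul(aR, 1) = a mod p, since MontMul(x, y) = x*y*R^(-1) mod p.
// --------------------------------------------------------------------------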
.globl ECP_Sm2FromMont
.type ECP_Sm2FromMont, %function
.align 4
ECP_Sm2FromMont:
AARCH64_PACIASP
    stp x29, x30, [sp, #-32]!
    add x29, sp, #0
    stp x19, x20, [sp, #16]

    adrp x2, .Lone
    add x2, x2, :lo12:.Lone
    ldr x3, [x2]

    ldp x4, x5, [x1]
    ldp x6, x7, [x1, #16]

    adrp x14, .Lpoly+8
    add x14, x14, :lo12:.Lpoly+8
    ldr x14, [x14]

    adrp x15, .Lpoly+24
    add x15, x15, :lo12:.Lpoly+24
    ldr x15, [x15]

    bl ECP_Sm2MulMont

    ldp x19, x20, [sp, #16]
    ldp x29, x30, [sp], #32
AARCH64_AUTIASP
    ret
.size ECP_Sm2FromMont, .-ECP_Sm2FromMont

.type ECP_Sm2MulMont, %function
.align 4
ECP_Sm2MulMont:
AARCH64_PACIASP

    // a[0~3] * b[0]
    mul x8, x4, x3
    umulh x16, x4, x3
    mul x9, x5, x3
    umulh x17, x5, x3
    mul x10, x6, x3
    umulh x19, x6, x3
    mul x11, x7, x3
    umulh x20, x7, x3

    adds x9, x9, x16
    adcs x10, x10, x17
    adcs x11, x11, x19
    adc x12, xzr, x20
    ldr x3, [x2, #8]        // get b[1]

    // begin 1st reduce
    lsl x19, x8, #32
    lsr x20, x8, #32

    subs x16, x8, x19
    sbcs x17, xzr, x20
    sbcs x19, xzr, x19
    sbc x20, x8, x20

    mov x13, xzr
    adds x8, x9, x16
    adcs x9, x10, x17
    adcs x10, x11, x19
    adcs x11, x12, x20
    adc x12, x13, xzr

    // lo(a[0~3]) * b[1]
    mul x16, x4, x3
    mul x17, x5, x3
    mul x19, x6, x3
    mul x20, x7, x3

    adds x8, x8, x16
    adcs x9, x9, x17
    adcs x10, x10, x19
    adcs x11, x11, x20
    adc x12, x12, xzr

    // hi(a[0~3]) * b[1]
    umulh x16, x4, x3
    umulh x17, x5, x3
    umulh x19, x6, x3
    umulh x20, x7, x3

    adds x9, x9, x16
    adcs x10, x10, x17
    adcs x11, x11, x19
    adcs x12, x12, x20
    adc x13, xzr, xzr

    ldr x3, [x2, #8*2]      // get b[2]

    // begin 2nd reduce
    lsl x19, x8, #32
    lsr x20, x8, #32
    subs x16, x8, x19
    sbcs x17, xzr, x20
    sbcs x19, xzr, x19
    sbc x20, x8, x20

    adds x8, x9, x16
    adcs x9, x10, x17
    adcs x10, x11, x19
    adcs x11, x12, x20
    adc x12, x13, xzr

    // lo(a[0~3] * b[2])
    mul x16, x4, x3
    mul x17, x5, x3
    mul x19, x6, x3
    mul x20, x7, x3

    adds x8, x8, x16
    adcs x9, x9, x17
    adcs x10, x10, x19
    adcs x11, x11, x20
    adc x12, x12, xzr

    // hi(a[0~3] * b[2])
    umulh x16, x4, x3
    umulh x17, x5, x3
    umulh x19, x6, x3
    umulh x20, x7, x3

    adds x9, x9, x16
    adcs x10, x10, x17
    adcs x11, x11, x19
    adcs x12, x12, x20
    adc x13, xzr, xzr

    ldr x3, [x2, #8*3]      // get b[3]

    // begin 3rd reduce
    lsl x19, x8, #32
    lsr x20, x8, #32
    subs x16, x8, x19
    sbcs x17, xzr, x20
    sbcs x19, xzr, x19
    sbc x20, x8, x20

    adds x8, x9, x16
    adcs x9, x10, x17
    adcs x10, x11, x19
    adcs x11, x12, x20
    adc x12, x13, xzr

    // lo(a[0~3] * b[3])
    mul x16, x4, x3
    mul x17, x5, x3
    mul x19, x6, x3
    mul x20, x7, x3

    adds x8, x8, x16
    adcs x9, x9, x17
    adcs x10, x10, x19
    adcs x11, x11, x20
    adc x12, x12, xzr

    // hi(a[0~3] * b[3])
    umulh x16, x4, x3
    umulh x17, x5, x3
    umulh x19, x6, x3
    umulh x20, x7, x3

    adds x9, x9, x16
    adcs x10, x10, x17
    adcs x11, x11, x19
    adcs x12, x12, x20
    adc x13, xzr, xzr

    lsl x19, x8, #32
    lsr x20, x8, #32

    // begin 4th reduce
    subs x16, x8, x19
    sbcs x17, xzr, x20
    sbcs x19, xzr, x19
    sbc x20, x8, x20

    adds x8, x9, x16
    adcs x9, x10, x17
    adcs x10, x11, x19
    adcs x11, x12, x20
    adc x12, x13, xzr

    // compute res - p
    adds x16, x8, #1        // - (0xffffffffffffffff) = (+1)
    sbcs x17, x9, x14
    adcs x19, x10, xzr
    sbcs x20, x11, x15
    sbcs xzr, x12, xzr

    csel x8, x8, x16, lo
    csel x9, x9, x17, lo
    csel x10, x10, x19, lo
    csel x11, x11, x20, lo
    stp x8, x9, [x0]
    stp x10, x11, [x0, #8*2]

AARCH64_AUTIASP
    ret
.size ECP_Sm2MulMont, .-ECP_Sm2MulMont
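// --------------------------------------------------------------------------
// Why the reduce blocks above need no explicit n0' multiply: the low limb of
// p is 0xffffffffffffffff, so p = -1 (mod 2^64) and the Montgomery constant
// n0' = -p^(-1) mod 2^64 equals 1. Each per-word step therefore uses the
// current low limb t[0] directly as m and computes (t + m*p) / 2^64; the
// lsl/lsr/subs sequence is just a cheap way to form that quantity from p's
// special shape. A generic one-word Montgomery step as a C sketch
// (illustrative only; t is a 5-limb running accumulator):
//
//   #include <stdint.h>
//   static void mont_word_step_sketch(uint64_t t[5], const uint64_t p[4])
//   {
//       uint64_t m = t[0];                  /* n0' == 1 for SM2's p */
//       unsigned __int128 acc = 0;
//       for (int i = 0; i < 4; i++) {       /* t += m * p */
//           acc += (unsigned __int128)m * p[i] + t[i];
//           t[i] = (uint64_t)acc;
//           acc >>= 64;
//       }
//       acc += t[4];
//       /* divide by 2^64: t[0] is now 0, shift limbs down */
//       t[0] = t[1]; t[1] = t[2]; t[2] = t[3];
//       t[3] = (uint64_t)acc;
//       t[4] = (uint64_t)(acc >> 64);
//   }
// --------------------------------------------------------------------------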
.type ECP_Sm2SqrMont, %function
.align 4
ECP_Sm2SqrMont:
AARCH64_PACIASP

    // a[1~3] * a[0]
    mul x9, x5, x4
    umulh x17, x5, x4
    mul x10, x6, x4
    umulh x19, x6, x4
    mul x11, x7, x4
    umulh x12, x7, x4

    adds x10, x10, x17
    adcs x11, x11, x19
    adc x12, x12, xzr

    // a[2~3] * a[1]
    mul x16, x6, x5
    umulh x17, x6, x5
    mul x19, x7, x5
    umulh x20, x7, x5

    // a[3] * a[2]
    mul x13, x7, x6
    umulh x1, x7, x6

    adds x17, x17, x19
    adc x19, x20, xzr

    adds x11, x11, x16
    adcs x12, x12, x17
    adcs x13, x13, x19
    adc x1, x1, xzr

    // a[0] * a[0]
    mul x8, x4, x4
    umulh x4, x4, x4
    // a[1] * a[1]
    mul x17, x5, x5
    umulh x5, x5, x5

    adds x9, x9, x9
    adcs x10, x10, x10

    adcs x11, x11, x11
    adcs x12, x12, x12
    adcs x13, x13, x13
    adcs x1, x1, x1
    adc x2, xzr, xzr

    // a[2] * a[2]
    mul x19, x6, x6
    umulh x6, x6, x6
    // a[3] * a[3]
    mul x20, x7, x7
    umulh x7, x7, x7

    adds x9, x9, x4
    adcs x10, x10, x17
    adcs x11, x11, x5
    adcs x12, x12, x19
    adcs x13, x13, x6
    adcs x1, x1, x20
    adc x2, x2, x7

    // begin 1st reduce
    lsl x19, x8, #32
    lsr x20, x8, #32
    subs x16, x8, x19
    sbcs x17, xzr, x20
    sbcs x19, xzr, x19
    sbc x20, x8, x20

    adds x8, x9, x16
    adcs x9, x10, x17
    adcs x10, x11, x19
    adc x11, xzr, x20

    // begin 2nd reduce
    lsl x19, x8, #32
    lsr x20, x8, #32
    subs x16, x8, x19
    sbcs x17, xzr, x20
    sbcs x19, xzr, x19
    sbc x20, x8, x20

    adds x8, x9, x16
    adcs x9, x10, x17
    adcs x10, x11, x19
    adc x11, xzr, x20

    // begin 3rd reduce
    lsl x19, x8, #32
    lsr x20, x8, #32
    subs x16, x8, x19
    sbcs x17, xzr, x20
    sbcs x19, xzr, x19
    sbc x20, x8, x20

    adds x8, x9, x16
    adcs x9, x10, x17
    adcs x10, x11, x19
    adc x11, xzr, x20

    // begin 4th reduce
    lsl x19, x8, #32
    lsr x20, x8, #32
    subs x16, x8, x19
    sbcs x17, xzr, x20
    sbcs x19, xzr, x19
    sbc x20, x8, x20

    adds x8, x9, x16
    adcs x9, x10, x17
    adcs x10, x11, x19
    adc x11, xzr, x20

    adds x8, x8, x12
    adcs x9, x9, x13
    adcs x10, x10, x1
    adcs x11, x11, x2
    adc x12, xzr, xzr

    // compute res - p
    adds x16, x8, #1
    sbcs x17, x9, x14
    adcs x19, x10, xzr
    sbcs x20, x11, x15
    sbcs xzr, x12, xzr

    csel x8, x8, x16, lo
    csel x9, x9, x17, lo
    csel x10, x10, x19, lo
    csel x11, x11, x20, lo
    stp x8, x9, [x0]
    stp x10, x11, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2SqrMont, .-ECP_Sm2SqrMont
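// --------------------------------------------------------------------------
// Internal helpers below use a register-based convention rather than AAPCS
// argument loads: the result pointer stays in x0, one 256-bit operand lives
// in x8~x11, the second operand is either preloaded in x16/x17/x19/x20 or
// read from [x2], and callers keep p[1] in x14 and p[3] in x15 (p[0] and
// p[2] are all-ones, so they never need a register). ECP_Sm2MulMont and
// ECP_Sm2SqrMont above likewise expect a in x4~x7, b (for MulMont) at [x2]
// with b[0] already in x3, and x14/x15 loaded as above.
// --------------------------------------------------------------------------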
.type ECP_Sm2AddCore, %function
.align 4
ECP_Sm2AddCore:
AARCH64_PACIASP
    adds x8, x8, x16
    adcs x9, x9, x17
    adcs x10, x10, x19
    adcs x11, x11, x20
    adc x1, xzr, xzr

    // sum - p
    adds x16, x8, #1
    sbcs x17, x9, x14       // x14 = 0xffffffff00000000
    adcs x19, x10, xzr
    sbcs x20, x11, x15      // x15 = 0xfffffffeffffffff
    sbcs xzr, x1, xzr

    csel x8, x8, x16, lo
    csel x9, x9, x17, lo
    csel x10, x10, x19, lo
    csel x11, x11, x20, lo
    stp x8, x9, [x0]
    stp x10, x11, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2AddCore, .-ECP_Sm2AddCore

.type ECP_Sm2DivBy2Core, %function
.align 4
ECP_Sm2DivBy2Core:
AARCH64_PACIASP
    subs x16, x8, #1
    adcs x17, x9, x14
    sbcs x19, x10, xzr
    adcs x20, x11, x15
    adc x1, xzr, xzr
    tst x8, #1

    csel x8, x8, x16, eq
    csel x9, x9, x17, eq
    csel x10, x10, x19, eq
    csel x11, x11, x20, eq
    csel x1, xzr, x1, eq

    lsr x8, x8, #1
    orr x8, x8, x9, lsl #63
    lsr x9, x9, #1
    orr x9, x9, x10, lsl #63
    lsr x10, x10, #1
    orr x10, x10, x11, lsl #63
    lsr x11, x11, #1
    orr x11, x11, x1, lsl #63
    stp x8, x9, [x0]
    stp x10, x11, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2DivBy2Core, .-ECP_Sm2DivBy2Core

.type ECP_Sm2SubAB, %function
.align 4
ECP_Sm2SubAB:

AARCH64_PACIASP
    ldp x16, x17, [x2]
    ldp x19, x20, [x2, #16]
    subs x8, x8, x16
    sbcs x9, x9, x17
    sbcs x10, x10, x19
    sbcs x11, x11, x20
    csetm x16, cc

    adds x8, x8, x16
    and x17, x16, x14
    adcs x9, x9, x17
    adcs x10, x10, x16
    and x19, x16, x15
    adc x11, x11, x19
    stp x8, x9, [x0]
    stp x10, x11, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2SubAB, .-ECP_Sm2SubAB

.type ECP_Sm2SubBA, %function
.align 4
ECP_Sm2SubBA:
AARCH64_PACIASP
    ldp x16, x17, [x2]
    ldp x19, x20, [x2, #16]
    subs x8, x16, x8
    sbcs x9, x17, x9
    sbcs x10, x19, x10
    sbcs x11, x20, x11
    csetm x16, cc

    adds x8, x8, x16
    and x17, x16, x14
    adcs x9, x9, x17
    adcs x10, x10, x16
    and x19, x16, x15
    adc x11, x11, x19
    stp x8, x9, [x0]
    stp x10, x11, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP_Sm2SubBA, .-ECP_Sm2SubBA
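// --------------------------------------------------------------------------
// The point routines below work on Jacobian projective coordinates with all
// field elements in Montgomery form: a point (X, Y, Z) represents the affine
// point
//
//     x = X / Z^2,  y = Y / Z^3
//
// and Z = 0 encodes the point at infinity. The formulas referenced next are
// the standard dbl-2001-b, add-1998-cmo and madd-2007-bl formulas from the
// EFD.
// --------------------------------------------------------------------------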
# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
# Procedure:
# delta = Z1^2
# gamma = Y1^2
# beta = X1*gamma
# alpha = 3*(X1-delta)*(X1+delta)
# X3 = alpha^2-8*beta
# Z3 = (Y1+Z1)^2-gamma-delta
# Y3 = alpha*(4*beta-X3)-8*gamma^2
.globl ECP_Sm2PointDoubleMont
.type ECP_Sm2PointDoubleMont, %function
.align 4
ECP_Sm2PointDoubleMont:
AARCH64_PACIASP
    stp x29, x30, [sp, #-80]!
    mov x29, sp
    stp x19, x20, [sp, #16]
    stp x21, x22, [sp, #32]
    sub sp, sp, #32*4

.Lpoint_double:
    ldp x8, x9, [x1, #32]           // a->y
    ldp x10, x11, [x1, #32+16]

    mov x21, x0
    mov x22, x1                     // backup point a

    adrp x14, .Lpoly+8
    add x14, x14, :lo12:.Lpoly+8    // p[1]
    ldr x14, [x14]

    adrp x15, .Lpoly+24
    add x15, x15, :lo12:.Lpoly+24   // p[3]
    ldr x15, [x15]

    mov x16, x8
    mov x17, x9
    mov x19, x10
    mov x20, x11
    ldp x4, x5, [x22, #64]          // a->z
    ldp x6, x7, [x22, #64+16]
    mov x0, sp
    bl ECP_Sm2AddCore               // s = 2 * a->y

    add x0, sp, #64
    bl ECP_Sm2SqrMont               // zsqr = (a->z)^2

    ldp x16, x17, [x22]
    ldp x19, x20, [x22, #16]
    mov x4, x8
    mov x5, x9
    mov x6, x10
    mov x7, x11
    add x0, sp, #32
    bl ECP_Sm2AddCore               // m = a->x + zsqr

    add x2, x22, #0
    mov x8, x4
    mov x9, x5
    ldp x4, x5, [sp, #0]
    mov x10, x6
    mov x11, x7
    ldp x6, x7, [sp, #16]
    add x0, sp, #64
    bl ECP_Sm2SubBA                 // zsqr = a->x - zsqr

    add x0, sp, #0
    bl ECP_Sm2SqrMont               // s = s^2

    ldr x3, [x22, #32]
    ldp x4, x5, [x22, #64]
    ldp x6, x7, [x22, #64+16]
    add x2, x22, #32                // a->y
    add x0, sp, #96
    bl ECP_Sm2MulMont               // res_z = a->z * a->y

    mov x16, x8
    mov x17, x9
    ldp x4, x5, [sp, #0]
    mov x19, x10
    mov x20, x11
    ldp x6, x7, [sp, #16]
    add x0, x21, #64
    bl ECP_Sm2AddCore               // res_z = 2 * res_z

    add x0, sp, #96
    bl ECP_Sm2SqrMont               // res_y = s^2

    ldr x3, [sp, #64]
    ldp x4, x5, [sp, #32]
    ldp x6, x7, [sp, #32+16]
    add x0, x21, #32
    bl ECP_Sm2DivBy2Core            // res_y = res_y / 2

    add x2, sp, #64
    add x0, sp, #32
    bl ECP_Sm2MulMont               // m = m * zsqr

    mov x16, x8
    mov x17, x9
    mov x19, x10
    mov x20, x11
    mov x4, x8
    mov x5, x9
    mov x6, x10
    mov x7, x11
    add x0, sp, #32
    bl ECP_Sm2AddCore
    mov x16, x4
    mov x17, x5
    ldr x3, [x22]
    mov x19, x6
    ldp x4, x5, [sp, #0]
    mov x20, x7
    ldp x6, x7, [sp, #16]
    bl ECP_Sm2AddCore               // m = 3 * m

    mov x2, x22
    add x0, sp, #0
    bl ECP_Sm2MulMont               // s = s * a->x

    mov x16, x8
    mov x17, x9
    ldp x4, x5, [sp, #32]
    mov x19, x10
    mov x20, x11
    ldp x6, x7, [sp, #32+16]
    add x0, sp, #96
    bl ECP_Sm2AddCore               // tmp = 2 * s

    mov x0, x21
    bl ECP_Sm2SqrMont               // res_x = m^2

    add x2, sp, #96
    bl ECP_Sm2SubAB                 // res_x = res_x - tmp

    add x2, sp, #0
    add x0, sp, #0
    bl ECP_Sm2SubBA                 // s = s - res_x

    ldr x3, [sp, #32]
    mov x4, x8
    mov x5, x9
    mov x6, x10
    mov x7, x11
    add x2, sp, #32
    bl ECP_Sm2MulMont               // s = s * m

    add x2, x21, #32
    add x0, x21, #32
    bl ECP_Sm2SubAB                 // res_y = s - res_y

    mov sp, x29
    ldp x19, x20, [x29, #16]
    ldp x21, x22, [x29, #32]
    ldp x29, x30, [sp], #80
AARCH64_AUTIASP
    ret
.size ECP_Sm2PointDoubleMont, .-ECP_Sm2PointDoubleMont
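// --------------------------------------------------------------------------
// Note on the addition routine below: before committing to the generic
// formulas it builds all-ones masks recording whether Z1 and Z2 are nonzero
// (x24/x25) so that an infinity input selects the other operand at the end,
// and if both H = U2-U1 and r = S2-S1 turn out to be zero for finite inputs
// it branches to .Lpoint_double, since the generic addition formulas do not
// work when a point is added to itself.
// --------------------------------------------------------------------------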
# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo
# Procedure:
# U1 = X1*Z2^2
# U2 = X2*Z1^2
# S1 = Y1*Z2^3
# S2 = Y2*Z1^3
# H = U2-U1
# r = S2-S1
# X3 = r^2-H^3-2*U1*H^2
# Y3 = r*(U1*H^2-X3)-S1*H^3
# Z3 = Z1*Z2*H
.globl ECP_Sm2PointAddMont
.type ECP_Sm2PointAddMont, %function
.align 4
ECP_Sm2PointAddMont:
AARCH64_PACIASP
    stp x29, x30, [sp, #-80]!
    mov x29, sp
    stp x19, x20, [sp, #16]
    stp x21, x22, [sp, #32]
    stp x23, x24, [sp, #48]
    stp x25, x26, [sp, #64]
    sub sp, sp, #32*12

    ldp x4, x5, [x2, #64]
    ldp x6, x7, [x2, #64+16]
    mov x21, x0
    mov x22, x1                     // backup points
    mov x23, x2

    adrp x14, .Lpoly+8
    add x14, x14, :lo12:.Lpoly+8    // p[1]
    ldr x14, [x14]

    adrp x15, .Lpoly+24
    add x15, x15, :lo12:.Lpoly+24   // p[3]
    ldr x15, [x15]


    orr x16, x4, x5
    orr x19, x6, x7
    orr x25, x16, x19
    cmp x25, #0
    csetm x25, ne                   // check whether the point is (x, y, 0)
    add x0, sp, #128
    bl ECP_Sm2SqrMont               // z1sqr = z1^2

    ldp x4, x5, [x22, #64]
    ldp x6, x7, [x22, #64+16]
    orr x16, x4, x5
    orr x19, x6, x7
    orr x24, x16, x19
    cmp x24, #0
    csetm x24, ne                   // check whether the point is (x, y, 0)

    add x0, sp, #224
    bl ECP_Sm2SqrMont               // z2sqr = z2^2

    ldr x3, [x23, #64]
    ldp x4, x5, [sp, #128]
    ldp x6, x7, [sp, #128+16]
    add x2, x23, #64
    add x0, sp, #320
    bl ECP_Sm2MulMont               // s2 = z1^3

    ldr x3, [x22, #64]
    ldp x4, x5, [sp, #224]
    ldp x6, x7, [sp, #224+16]
    add x2, x22, #64
    add x0, sp, #352
    bl ECP_Sm2MulMont               // s2 = y2 * z1^3

    ldr x3, [x22, #32]
    ldp x4, x5, [sp, #320]
    ldp x6, x7, [sp, #320+16]
    add x2, x22, #32
    add x0, sp, #320
    bl ECP_Sm2MulMont

    ldr x3, [x23, #32]
    ldp x4, x5, [sp, #352]
    ldp x6, x7, [sp, #352+16]
    add x2, x23, #32
    add x0, sp, #352
    bl ECP_Sm2MulMont

    add x2, sp, #320
    ldr x3, [sp, #128]
    ldp x4, x5, [x22]
    ldp x6, x7, [x22, #16]
    add x0, sp, #96
    bl ECP_Sm2SubAB

    orr x8, x8, x9
    orr x10, x10, x11
    orr x26, x8, x10

    add x2, sp, #128
    add x0, sp, #256
    bl ECP_Sm2MulMont

    ldr x3, [sp, #224]
    ldp x4, x5, [x23]
    ldp x6, x7, [x23, #16]
    add x2, sp, #224
    add x0, sp, #288
    bl ECP_Sm2MulMont

    add x2, sp, #256
    ldp x4, x5, [sp, #96]
    ldp x6, x7, [sp, #96+16]
    add x0, sp, #192
    bl ECP_Sm2SubAB

    orr x8, x8, x9
    orr x10, x10, x11
    orr x8, x8, x10
    tst x8, x8
    b.ne .Ladd_proceed

    tst x24, x25
    b.eq .Ladd_proceed

    tst x26, x26
    b.eq .Ladd_double

    stp xzr, xzr, [x21]
    stp xzr, xzr, [x21, #16]
    stp xzr, xzr, [x21, #32]
    stp xzr, xzr, [x21, #48]
    stp xzr, xzr, [x21, #64]
    stp xzr, xzr, [x21, #80]
    b .Ladd_done

.align 4
.Ladd_double:
    mov x1, x22
    mov x0, x21
    ldp x23, x24, [x29, #48]
    ldp x25, x26, [x29, #64]
    add sp, sp, #32*(12-4)
    b .Lpoint_double

.align 4
.Ladd_proceed:
    add x0, sp, #128
    bl ECP_Sm2SqrMont

    ldr x3, [x22, #64]
    ldp x4, x5, [sp, #192]
    ldp x6, x7, [sp, #192+16]
    add x2, x22, #64
    add x0, sp, #64
    bl ECP_Sm2MulMont

    ldp x4, x5, [sp, #192]
    ldp x6, x7, [sp, #192+16]
    add x0, sp, #224
    bl ECP_Sm2SqrMont

    ldr x3, [x23, #64]
    ldp x4, x5, [sp, #64]
    ldp x6, x7, [sp, #64+16]
    add x2, x23, #64
    add x0, sp, #64
    bl ECP_Sm2MulMont

    ldr x3, [sp, #192]
    ldp x4, x5, [sp, #224]
    ldp x6, x7, [sp, #224+16]
    add x2, sp, #192
    add x0, sp, #160
    bl ECP_Sm2MulMont

    ldr x3, [sp, #224]
    ldp x4, x5, [sp, #256]
    ldp x6, x7, [sp, #256+16]
    add x2, sp, #224
    add x0, sp, #288
    bl ECP_Sm2MulMont

    mov x16, x8
    mov x17, x9
    mov x19, x10
    mov x20, x11
    add x0, sp, #224
    bl ECP_Sm2AddCore

    add x2, sp, #128
    add x0, sp, #0
    bl ECP_Sm2SubBA

    add x2, sp, #160
    bl ECP_Sm2SubAB

    add x2, sp, #288
    ldr x3, [sp, #160]
    ldp x4, x5, [sp, #320]
    ldp x6, x7, [sp, #320+16]
    add x0, sp, #32
    bl ECP_Sm2SubBA

    add x2, sp, #160
    add x0, sp, #352
    bl ECP_Sm2MulMont

    ldr x3, [sp, #96]
    ldp x4, x5, [sp, #32]
    ldp x6, x7, [sp, #32+16]
    add x2, sp, #96
    add x0, sp, #32
    bl ECP_Sm2MulMont

    add x2, sp, #352
    bl ECP_Sm2SubAB

    ldp x4, x5, [sp, #0]
    ldp x6, x7, [sp, #16]
    ldp x16, x17, [x23]
    ldp x19, x20, [x23, #16]
    ldp x8, x9, [x22, #0]

    cmp x24, #0
    csel x16, x4, x16, ne
    csel x17, x5, x17, ne
    csel x19, x6, x19, ne
    csel x20, x7, x20, ne

    cmp x25, #0
    csel x8, x16, x8, ne
    csel x9, x17, x9, ne
    csel x10, x19, x10, ne
    csel x11, x20, x11, ne

    stp x8, x9, [x21, #0]
    stp x10, x11, [x21, #16]

    ldp x10, x11, [x22, #16]
    ldp x4, x5, [sp, #32]
    ldp x6, x7, [sp, #48]
    ldp x16, x17, [x23, #32]
    ldp x19, x20, [x23, #48]
    ldp x8, x9, [x22, #32]

    cmp x24, #0
    csel x16, x4, x16, ne
    csel x17, x5, x17, ne
    csel x19, x6, x19, ne
    csel x20, x7, x20, ne

    cmp x25, #0
    csel x8, x16, x8, ne
    csel x9, x17, x9, ne
    csel x10, x19, x10, ne
    csel x11, x20, x11, ne

    stp x8, x9, [x21, #32]
    stp x10, x11, [x21, #32+16]

    ldp x10, x11, [x22, #32+16]
    ldp x8, x9, [x22, #64]

    ldp x16, x17, [x23, #32+32]
    ldp x19, x20, [x23, #32+48]
    ldp x4, x5, [sp, #32+32]
    ldp x6, x7, [sp, #32+48]

    cmp x24, #0
    ldp x10, x11, [x22, #64+16]
    csel x16, x4, x16, ne
    csel x17, x5, x17, ne
    csel x19, x6, x19, ne
    csel x20, x7, x20, ne

    cmp x25, #0
    csel x8, x16, x8, ne
    csel x9, x17, x9, ne
    csel x10, x19, x10, ne
    csel x11, x20, x11, ne

    stp x8, x9, [x21, #64]
    stp x10, x11, [x21, #64+16]

.Ladd_done:
    mov sp, x29
    ldp x19, x20, [x29, #16]
    ldp x21, x22, [x29, #32]
    ldp x23, x24, [x29, #48]
    ldp x25, x26, [x29, #64]
    ldp x29, x30, [sp], #80
AARCH64_AUTIASP
    ret
.size ECP_Sm2PointAddMont, .-ECP_Sm2PointAddMont
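// --------------------------------------------------------------------------
// The routine below is the mixed-addition variant: the second operand b is
// an affine point (implicitly Z2 = 1), which is why no b->z is ever read and
// why, when a is the point at infinity, the result's Z coordinate is taken
// from .Lone_mont (the Montgomery representation of 1) rather than from b.
// --------------------------------------------------------------------------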
# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-madd-2007-bl
# Procedure:
# Z1Z1 = Z1^2
# U2 = X2*Z1Z1
# S2 = Y2*Z1*Z1Z1
# H = U2-X1
# HH = H^2
# I = 4*HH
# J = H*I
# r = 2*(S2-Y1)
# V = X1*I
# X3 = r^2-J-2*V
# Y3 = r*(V-X3)-2*Y1*J
# Z3 = (Z1+H)^2-Z1Z1-HH
.globl ECP_Sm2PointAddAffineMont
.type ECP_Sm2PointAddAffineMont, %function
.align 4
ECP_Sm2PointAddAffineMont:
AARCH64_PACIASP
    stp x29, x30, [sp, #-80]!
    mov x29, sp
    stp x19, x20, [sp, #16]
    stp x21, x22, [sp, #32]
    stp x23, x24, [sp, #48]
    stp x25, x26, [sp, #64]
    sub sp, sp, #32*10

    mov x21, x0                     // backup r
    mov x22, x1                     // point a
    mov x23, x2                     // point b

    adrp x14, .Lpoly+8
    add x14, x14, :lo12:.Lpoly+8
    ldr x14, [x14]

    adrp x15, .Lpoly+24
    add x15, x15, :lo12:.Lpoly+24
    ldr x15, [x15]

    ldp x4, x5, [x1, #64]           // &(a->z[0]), a->z[0] is marked as z1[0]
    ldp x6, x7, [x1, #64+16]        // &(a->z[2])

    orr x16, x4, x5
    orr x19, x6, x7
    orr x24, x16, x19
    cmp x24, #0
    csetm x24, ne                   // check whether the point is (x, y, 0)

    ldp x8, x9, [x2]                // &(b->x[0])
    ldp x10, x11, [x2, #16]         // &(b->x[2])
    ldp x16, x17, [x2, #32]         // &(b->y[0])
    ldp x19, x20, [x2, #48]         // &(b->y[2])

    orr x8, x8, x9
    orr x10, x10, x11
    orr x16, x16, x17
    orr x19, x19, x20
    orr x8, x8, x10
    orr x16, x16, x19
    orr x25, x8, x16
    cmp x25, #0
    csetm x25, ne                   // check whether the point is (x, y, 0)

    add x0, sp, #128
    bl ECP_Sm2SqrMont               // zsqr = z1^2

    mov x4, x8
    mov x5, x9
    mov x6, x10
    mov x7, x11

    ldr x3, [x23]
    mov x2, x23
    add x0, sp, #96
    bl ECP_Sm2MulMont               // u2 = z1^2 * x2

    mov x2, x22
    ldr x3, [x22, #64]
    ldp x4, x5, [sp, #128]
    ldp x6, x7, [sp, #128+16]
    add x0, sp, #160
    bl ECP_Sm2SubAB

    add x2, x22, #64
    add x0, sp, #128
    bl ECP_Sm2MulMont

    ldr x3, [x22, #64]
    ldp x4, x5, [sp, #160]
    ldp x6, x7, [sp, #160+16]
    add x2, x22, #64
    add x0, sp, #64
    bl ECP_Sm2MulMont

    ldr x3, [x23, #32]
    ldp x4, x5, [sp, #128]
    ldp x6, x7, [sp, #128+16]
    add x2, x23, #32
    add x0, sp, #128
    bl ECP_Sm2MulMont

    add x2, x22, #32
    ldp x4, x5, [sp, #160]
    ldp x6, x7, [sp, #160+16]
    add x0, sp, #192
    bl ECP_Sm2SubAB

    add x0, sp, #224
    bl ECP_Sm2SqrMont

    ldp x4, x5, [sp, #192]
    ldp x6, x7, [sp, #192+16]
    add x0, sp, #288
    bl ECP_Sm2SqrMont

    ldr x3, [sp, #160]
    ldp x4, x5, [sp, #224]
    ldp x6, x7, [sp, #224+16]
    add x2, sp, #160
    add x0, sp, #256
    bl ECP_Sm2MulMont

    ldr x3, [x22]
    ldp x4, x5, [sp, #224]
    ldp x6, x7, [sp, #224+16]
    mov x2, x22
    add x0, sp, #96
    bl ECP_Sm2MulMont

    mov x16, x8
    mov x17, x9
    mov x19, x10
    mov x20, x11
    add x0, sp, #224
    bl ECP_Sm2AddCore

    add x2, sp, #288
    add x0, sp, #0
    bl ECP_Sm2SubBA

    add x2, sp, #256
    bl ECP_Sm2SubAB

    add x2, sp, #96
    ldr x3, [x22, #32]
    ldp x4, x5, [sp, #256]
    ldp x6, x7, [sp, #256+16]
    add x0, sp, #32
    bl ECP_Sm2SubBA

    add x2, x22, #32
    add x0, sp, #128
    bl ECP_Sm2MulMont

    ldr x3, [sp, #192]
    ldp x4, x5, [sp, #32]
    ldp x6, x7, [sp, #32+16]
    add x2, sp, #192
    add x0, sp, #32
    bl ECP_Sm2MulMont

    add x2, sp, #128
    bl ECP_Sm2SubAB

    ldp x4, x5, [sp, #0]
    ldp x6, x7, [sp, #16]
    ldp x16, x17, [x23]
    ldp x19, x20, [x23, #16]

    ldp x8, x9, [x22, #0]
    cmp x24, #0
    ldp x10, x11, [x22, #16]

    csel x16, x4, x16, ne
    csel x17, x5, x17, ne
    csel x19, x6, x19, ne
    csel x20, x7, x20, ne

    cmp x25, #0
    csel x8, x16, x8, ne
    csel x9, x17, x9, ne
    csel x10, x19, x10, ne
    csel x11, x20, x11, ne

    ldp x4, x5, [sp, #32]
    ldp x6, x7, [sp, #48]
    ldp x16, x17, [x23, #32]
    ldp x19, x20, [x23, #48]
    stp x8, x9, [x21, #0]
    stp x10, x11, [x21, #16]

    ldp x8, x9, [x22, #32]
    cmp x24, #0
    ldp x10, x11, [x22, #32+16]
    csel x16, x4, x16, ne
    csel x17, x5, x17, ne
    csel x19, x6, x19, ne
    csel x20, x7, x20, ne

    cmp x25, #0
    csel x8, x16, x8, ne
    csel x9, x17, x9, ne
    csel x10, x19, x10, ne
    csel x11, x20, x11, ne
    stp x8, x9, [x21, #32]
    stp x10, x11, [x21, #32+16]

    ldp x4, x5, [sp, #32+32]
    ldp x6, x7, [sp, #32+48]

    adrp x23, .Lone_mont
    add x23, x23, :lo12:.Lone_mont
    ldp x16, x17, [x23]
    ldp x19, x20, [x23, #16]

    ldp x8, x9, [x22, #64]
    ldp x10, x11, [x22, #64+16]

    cmp x24, #0
    csel x16, x4, x16, ne
    csel x17, x5, x17, ne
    csel x19, x6, x19, ne
    csel x20, x7, x20, ne

    cmp x25, #0
    csel x8, x16, x8, ne
    csel x9, x17, x9, ne
    csel x10, x19, x10, ne
    csel x11, x20, x11, ne

    stp x8, x9, [x21, #64]
    stp x10, x11, [x21, #64+16]

    mov sp, x29
    ldp x19, x20, [x29, #16]
    ldp x21, x22, [x29, #32]
    ldp x23, x24, [x29, #48]
    ldp x25, x26, [x29, #64]
    ldp x29, x30, [sp], #80
AARCH64_AUTIASP
    ret
.size ECP_Sm2PointAddAffineMont, .-ECP_Sm2PointAddAffineMont
#endif