/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_CURVE_SM2

.file "ecp_sm2_x86_64.S"
.text

.set s0,%r8
.set s1,%r9
.set s2,%r10
.set s3,%r11
.set s4,%r12
.set s5,%r13
.set s6,%r14
.set s7,%r15

.macro REGISTER_SAVE
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    pushq %rbx
    pushq %rbp
.endm

.macro REGISTER_POP
    popq %rbp
    popq %rbx
    popq %r15
    popq %r14
    popq %r13
    popq %r12
.endm

# The SM2 prime polynomial
.align 64
.Lpoly:
.quad 0xffffffffffffffff, 0xffffffff00000000, 0xffffffffffffffff, 0xfffffffeffffffff
# The order of the SM2 group
.Lord:
.quad 0x53bbf40939d54123, 0x7203df6b21c6052b, 0xffffffffffffffff, 0xfffffffeffffffff

.Lpoly_div_2:
.quad 0x8000000000000000, 0xffffffff80000000, 0xffffffffffffffff, 0x7fffffff7fffffff
.Lord_div_2:
.quad 0xa9ddfa049ceaa092, 0xb901efb590e30295, 0xffffffffffffffff, 0x7fffffff7fffffff

.Lzero:
.quad 0, 0, 0, 0
.Lord_1div4:
.quad 0xd4eefd024e755049, 0xdc80f7dac871814a, 0xffffffffffffffff, 0x3fffffffbfffffff
.Lord_2div4:
.quad 0xa9ddfa049ceaa092, 0xb901efb590e30295, 0xffffffffffffffff, 0x7fffffff7fffffff
.Lord_3div4:
.quad 0x7eccf706eb5ff0db, 0x9582e790595483e0, 0xffffffffffffffff, 0xbfffffff3fffffff

.Lpoly_1div4:
.quad 0x4000000000000000, 0xffffffffc0000000, 0xffffffffffffffff, 0x3fffffffbfffffff
.Lpoly_2div4:
.quad 0x8000000000000000, 0xffffffff80000000, 0xffffffffffffffff, 0x7fffffff7fffffff
.Lpoly_3div4:
.quad 0xc000000000000000, 0xffffffff40000000, 0xffffffffffffffff, 0xbfffffff3fffffff

.LRR:    // 2^512 mod P precomputed for sm2 polynomial
.quad 0x0000000200000003, 0x00000002ffffffff, 0x0000000100000001, 0x0000000400000002
.Lone_mont:
.quad 0x0000000000000001, 0x00000000ffffffff, 0x0000000000000000, 0x0000000100000000
.Lone:
.quad 1,0,0,0
.LOne:
.long 1,1,1,1,1,1,1,1

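/*
 * Informational summary of the constant tables above:
 *   .Lpoly                       p = 2^256 - 2^224 - 2^96 + 2^64 - 1, the SM2 prime, least-significant quadword first
 *   .Lord                        n, the order of the SM2 base point
 *   .Lpoly_div_2, .Lord_div_2    (p + 1)/2 and (n + 1)/2, the corrections used by the divide-by-2 routines
 *   .L{poly,ord}_{1,2,3}div4     k*(p + 1)/4 and k*(n + 1)/4 for k = 1,2,3, used by the divide-by-4 routines
 *   .LRR                         2^512 mod p, used to enter the Montgomery domain with R = 2^256
 *   .Lone_mont                   2^256 mod p, i.e. the value 1 in Montgomery form
 */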
.globl ECP_Sm2Div2
.type ECP_Sm2Div2,@function
.align 64

ECP_Sm2Div2:

    movq (%rdi),%r8
    movq 8(%rdi),%r9
    movq 16(%rdi),%r10
    movq 24(%rdi),%r11

    shrdq $1,%r9,%r8
    shrdq $1,%r10,%r9
    shrdq $1,%r11,%r10
    shrq $1,%r11

    movq %r8,(%rdi)
    movq %r9,8(%rdi)
    movq %r10,16(%rdi)
    movq %r11,24(%rdi)

    ret
.size ECP_Sm2Div2, .-ECP_Sm2Div2

.globl ECP_Sm2Div4
.type ECP_Sm2Div4,@function
.align 64

ECP_Sm2Div4:

    movq (%rdi),%r8
    movq 8(%rdi),%r9
    movq 16(%rdi),%r10
    movq 24(%rdi),%r11

    shrdq $2,%r9,%r8
    shrdq $2,%r10,%r9
    shrdq $2,%r11,%r10
    shrq $2,%r11
    movq %r8,(%rdi)
    movq %r9,8(%rdi)
    movq %r10,16(%rdi)
    movq %r11,24(%rdi)

    ret
.size ECP_Sm2Div4, .-ECP_Sm2Div4

.globl ECP_Sm2Neg
.type ECP_Sm2Neg,@function
.align 64

ECP_Sm2Neg:
    movq (%rdi),%r8
    xorq %rax,%rax

    movq $-1,%r8
    movq $0xffffffff00000000,%r9
    movq $0xfffffffeffffffff,%r11
    movq $-1,%r10

    subq 0(%rsi),%r8
    sbbq 8(%rsi),%r9
    sbbq 16(%rsi),%r10
    sbbq 24(%rsi),%r11

    movq %r8,(%rdi)
    movq %r9,8(%rdi)
    movq %r10,16(%rdi)
    movq %r11,24(%rdi)

    ret
.size ECP_Sm2Neg, .-ECP_Sm2Neg

.globl ECP_Sm2BnSub
.type ECP_Sm2BnSub,@function
.align 64

ECP_Sm2BnSub:

    movq (%rsi),%r8
    movq 8(%rsi),%r9
    movq 16(%rsi),%r10
    movq 24(%rsi),%r11

    subq (%rdx),%r8
    sbbq 8(%rdx),%r9
    sbbq 16(%rdx),%r10
    sbbq 24(%rdx),%r11

    movq %r8,(%rdi)
    movq %r9,8(%rdi)
    movq %r10,16(%rdi)
    movq %r11,24(%rdi)
    ret
.size ECP_Sm2BnSub, .-ECP_Sm2BnSub

.globl ECP_Sm2BnAdd
.type ECP_Sm2BnAdd,@function
.align 64

ECP_Sm2BnAdd:

    movq (%rsi),%r8
    movq 8(%rsi),%r9
    movq 16(%rsi),%r10
    movq 24(%rsi),%r11

    addq (%rdx),%r8
    adcq 8(%rdx),%r9
    adcq 16(%rdx),%r10
    adcq 24(%rdx),%r11

    movq %r8,(%rdi)
    movq %r9,8(%rdi)
    movq %r10,16(%rdi)
    movq %r11,24(%rdi)
    ret
.size ECP_Sm2BnAdd, .-ECP_Sm2BnAdd

.globl ECP_Sm2Div2ModP
.type ECP_Sm2Div2ModP,@function
.align 64

ECP_Sm2Div2ModP:

    subq $24,%rsp
    movq %rbx,(%rsp)
    movq %r12,8(%rsp)
    movq %r13,16(%rsp)
    xorq %r12,%r12

    movq (%rsi),%r8
    movq 8(%rsi),%r9
    movq 16(%rsi),%r10
    movq 24(%rsi),%r11

    movq %r8,%r13
    andq $1,%r13
    shrdq $1,%r9,%r8
    shrdq $1,%r10,%r9
    shrdq $1,%r11,%r10
    shrdq $1,%r12,%r11

    leaq .Lzero(%rip),%rax
    leaq .Lpoly_div_2(%rip),%rbx
    cmpq $1,%r13
    cmoveq %rbx,%rax

    addq (%rax),%r8
    adcq 8(%rax),%r9
    adcq 16(%rax),%r10
    adcq 24(%rax),%r11

    movq %r8,(%rdi)
    movq %r9,8(%rdi)
    movq %r10,16(%rdi)
    movq %r11,24(%rdi)

    movq (%rsp),%rbx
    movq 8(%rsp),%r12
    movq 16(%rsp),%r13
    addq $24,%rsp
    ret
.size ECP_Sm2Div2ModP, .-ECP_Sm2Div2ModP

.globl ECP_Sm2Div2ModOrd
.type ECP_Sm2Div2ModOrd,@function
.align 64

ECP_Sm2Div2ModOrd:

    subq $24,%rsp
    movq %rbx,(%rsp)
    movq %r12,8(%rsp)
    movq %r13,16(%rsp)
    xorq %r12,%r12

    movq (%rsi),%r8
    movq 8(%rsi),%r9
    movq 16(%rsi),%r10
    movq 24(%rsi),%r11

    movq %r8,%r13
    andq $1,%r13
    shrdq $1,%r9,%r8
    shrdq $1,%r10,%r9
    shrdq $1,%r11,%r10
    shrdq $1,%r12,%r11

    leaq .Lzero(%rip),%rax
    leaq .Lord_div_2(%rip),%rbx
    cmpq $1,%r13
    cmoveq %rbx,%rax

    addq (%rax),%r8
    adcq 8(%rax),%r9
    adcq 16(%rax),%r10
    adcq 24(%rax),%r11

    movq %r8,(%rdi)
    movq %r9,8(%rdi)
    movq %r10,16(%rdi)
    movq %r11,24(%rdi)

    movq (%rsp),%rbx
    movq 8(%rsp),%r12
    movq 16(%rsp),%r13
    addq $24,%rsp
    ret
.size ECP_Sm2Div2ModOrd, .-ECP_Sm2Div2ModOrd

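/*
 * The ECP_Sm2Div2ModP/ECP_Sm2Div2ModOrd routines above and the Div4 variants
 * below share one idea: shift the value right, then add a correction selected
 * from the precomputed (m + 1)/2 or k*(m + 1)/4 tables according to the bits
 * that were shifted out; the table entry is picked with cmov so no
 * secret-dependent branch is taken. A minimal C sketch of the divide-by-2 case
 * (the helper name and prototype are illustrative, not part of this file):
 *
 *   #include <stdint.h>
 *   // r = a / 2 mod m for a < m and odd m; half[] holds (m + 1) / 2.
 *   static void div2_mod(uint64_t r[4], const uint64_t a[4], const uint64_t half[4])
 *   {
 *       uint64_t lsb = a[0] & 1;
 *       for (int i = 0; i < 4; i++)                     // logical shift right by one
 *           r[i] = (a[i] >> 1) | (i < 3 ? (a[i + 1] << 63) : 0);
 *       unsigned __int128 acc = 0;
 *       for (int i = 0; i < 4; i++) {                   // add (m + 1)/2 only for odd inputs
 *           acc += (unsigned __int128)r[i] + (lsb ? half[i] : 0);
 *           r[i] = (uint64_t)acc;
 *           acc >>= 64;
 *       }
 *   }
 */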
.globl ECP_Sm2Div4ModP
.type ECP_Sm2Div4ModP,@function
.align 64

ECP_Sm2Div4ModP:

    subq $24,%rsp
    movq %rbx,(%rsp)
    movq %r12,8(%rsp)
    movq %r13,16(%rsp)
    xorq %r12,%r12

    movq (%rsi),%r8
    movq 8(%rsi),%r9
    movq 16(%rsi),%r10
    movq 24(%rsi),%r11

    movq %r8,%r13
    andq $3,%r13
    shrdq $2,%r9,%r8
    shrdq $2,%r10,%r9
    shrdq $2,%r11,%r10
    shrdq $2,%r12,%r11

    leaq .Lzero(%rip),%rax
    leaq .Lpoly_1div4(%rip),%rbx
    leaq .Lpoly_2div4(%rip),%rcx
    leaq .Lpoly_3div4(%rip),%rdx

    cmpq $1,%r13
    cmoveq %rbx,%rax
    cmpq $2,%r13
    cmoveq %rcx,%rax
    cmpq $3,%r13
    cmoveq %rdx,%rax

    addq (%rax),%r8
    adcq 8(%rax),%r9
    adcq 16(%rax),%r10
    adcq 24(%rax),%r11

    movq %r8,(%rdi)
    movq %r9,8(%rdi)
    movq %r10,16(%rdi)
    movq %r11,24(%rdi)

    movq (%rsp),%rbx
    movq 8(%rsp),%r12
    movq 16(%rsp),%r13
    addq $24,%rsp
    ret
.size ECP_Sm2Div4ModP, .-ECP_Sm2Div4ModP

.globl ECP_Sm2Div4ModOrd
.type ECP_Sm2Div4ModOrd,@function
.align 64

ECP_Sm2Div4ModOrd:

    subq $24,%rsp
    movq %rbx,(%rsp)
    movq %r12,8(%rsp)
    movq %r13,16(%rsp)
    xorq %r12,%r12

    movq (%rsi),%r8
    movq 8(%rsi),%r9
    movq 16(%rsi),%r10
    movq 24(%rsi),%r11

    movq %r8,%r13
    andq $3,%r13
    shrdq $2,%r9,%r8
    shrdq $2,%r10,%r9
    shrdq $2,%r11,%r10
    shrdq $2,%r12,%r11

    leaq .Lzero(%rip),%rax
    leaq .Lord_1div4(%rip),%rbx
    leaq .Lord_2div4(%rip),%rcx
    leaq .Lord_3div4(%rip),%rdx

    cmpq $1,%r13
    cmoveq %rbx,%rax
    cmpq $2,%r13
    cmoveq %rcx,%rax
    cmpq $3,%r13
    cmoveq %rdx,%rax

    addq (%rax),%r8
    adcq 8(%rax),%r9
    adcq 16(%rax),%r10
    adcq 24(%rax),%r11

    movq %r8,(%rdi)
    movq %r9,8(%rdi)
    movq %r10,16(%rdi)
    movq %r11,24(%rdi)

    movq (%rsp),%rbx
    movq 8(%rsp),%r12
    movq 16(%rsp),%r13
    addq $24,%rsp
    ret
.size ECP_Sm2Div4ModOrd, .-ECP_Sm2Div4ModOrd

#define bn_mod_add(mod) \
    /* Store scalar registers */ \
    subq $32, %rsp; \
    movq %r12, (%rsp); \
    movq %r13, 8(%rsp); \
    movq %r14, 16(%rsp); \
    movq %r15, 24(%rsp); \
    xorq %rax, %rax; \
    /* Load inputs */ \
    movq (%rsi), %r8; \
    movq 8(%rsi), %r9; \
    movq 16(%rsi), %r10; \
    movq 24(%rsi), %r11; \
    /* Addition */ \
    addq (%rdx), %r8; \
    adcq 8(%rdx), %r9; \
    adcq 16(%rdx), %r10; \
    adcq 24(%rdx), %r11; \
    /* Store carry */ \
    adcq $0, %rax; \
    movq %r8, %r12; \
    movq %r9, %r13; \
    movq %r10, %r14; \
    movq %r11, %r15; \
    /* Sub polynomial */ \
    leaq mod, %rsi; \
    subq 0(%rsi), %r8; \
    sbbq 8(%rsi), %r9; \
    sbbq 16(%rsi), %r10; \
    sbbq 24(%rsi), %r11; \
    sbbq $0, %rax; \
    cmovcq %r12, %r8; \
    cmovcq %r13, %r9; \
    cmovcq %r14, %r10; \
    cmovcq %r15, %r11; \
    /* Store results */ \
    movq %r8, (%rdi); \
    movq %r9, 8(%rdi); \
    movq %r10, 16(%rdi); \
    movq %r11, 24(%rdi); \
    /* Restore scalar registers */ \
    movq (%rsp), %r12; \
    movq 8(%rsp), %r13; \
    movq 16(%rsp), %r14; \
    movq 24(%rsp), %r15; \
    addq $32, %rsp;

#define bn_mod_sub(mod) \
    /* Store scalar registers */ \
    subq $32, %rsp; \
    movq %r12, (%rsp); \
    movq %r13, 8(%rsp); \
    movq %r14, 16(%rsp); \
    movq %r15, 24(%rsp); \
    xorq %rax, %rax; \
    /* Load inputs */ \
    movq (%rsi), %r8; \
    movq 8(%rsi), %r9; \
    movq 16(%rsi), %r10; \
    movq 24(%rsi), %r11; \
    /* Subtraction */ \
    subq (%rdx), %r8; \
    sbbq 8(%rdx), %r9; \
    sbbq 16(%rdx), %r10; \
    sbbq 24(%rdx), %r11; \
    sbbq $0, %rax; \
    movq %r8, %r12; \
    movq %r9, %r13; \
    movq %r10, %r14; \
    movq %r11, %r15; \
    /* Add polynomial */ \
    leaq mod, %rsi; \
    addq 0(%rsi), %r8; \
    adcq 8(%rsi), %r9; \
    adcq 16(%rsi), %r10; \
    adcq 24(%rsi), %r11; \
    testq %rax, %rax; \
    cmovzq %r12, %r8; \
    cmovzq %r13, %r9; \
    cmovzq %r14, %r10; \
    cmovzq %r15, %r11; \
    /* Store results */ \
    movq %r8, (%rdi); \
    movq %r9, 8(%rdi); \
    movq %r10, 16(%rdi); \
    movq %r11, 24(%rdi); \
    /* Restore scalar registers */ \
    movq (%rsp), %r12; \
    movq 8(%rsp), %r13; \
    movq 16(%rsp), %r14; \
    movq 24(%rsp), %r15; \
    addq $32, %rsp;

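/*
 * bn_mod_add and bn_mod_sub above follow the usual branch-free pattern: do the
 * raw 256-bit add (or sub), then conditionally subtract (or add back) the
 * modulus with cmov. A functional C sketch of the addition case for inputs
 * already reduced below mod (illustrative only, and not constant time because
 * of the ternary selections):
 *
 *   #include <stdint.h>
 *   static void mod_add(uint64_t r[4], const uint64_t a[4],
 *                       const uint64_t b[4], const uint64_t mod[4])
 *   {
 *       uint64_t t[4], s[4];
 *       unsigned __int128 acc = 0;
 *       for (int i = 0; i < 4; i++) {                   // t = a + b, remember the carry
 *           acc += (unsigned __int128)a[i] + b[i];
 *           t[i] = (uint64_t)acc;
 *           acc >>= 64;
 *       }
 *       uint64_t carry = (uint64_t)acc, borrow = 0;
 *       for (int i = 0; i < 4; i++) {                   // s = t - mod
 *           unsigned __int128 d = (unsigned __int128)t[i] - mod[i] - borrow;
 *           s[i] = (uint64_t)d;
 *           borrow = (uint64_t)(d >> 64) & 1;
 *       }
 *       int keep_sum = carry < borrow;                  // (carry:t) < mod, keep the unreduced sum
 *       for (int i = 0; i < 4; i++)
 *           r[i] = keep_sum ? t[i] : s[i];
 *   }
 */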
### Modular add: r = a+b mod m, where m is p or the group order n ###
    # void ECP_Sm2AddModP(uint64_t *r, const uint64_t *a, const uint64_t *b)
    # Modular poly add
    # r    %rdi
    # a    %rsi
    # b    %rdx
    .globl ECP_Sm2AddModP
    .type ECP_Sm2AddModP, @function
    .align 64

ECP_Sm2AddModP:

    bn_mod_add(.Lpoly(%rip))

    ret
    .size ECP_Sm2AddModP, .-ECP_Sm2AddModP

    # void ECP_Sm2AddModOrd(uint64_t *r, const uint64_t *a, const uint64_t *b)
    # Modular order add
    # r    %rdi
    # a    %rsi
    # b    %rdx
    .globl ECP_Sm2AddModOrd
    .type ECP_Sm2AddModOrd, @function
    .align 64

ECP_Sm2AddModOrd:

    bn_mod_add(.Lord(%rip))

    ret
    .size ECP_Sm2AddModOrd, .-ECP_Sm2AddModOrd

### Modular sub: r = a-b mod m, where m is p or the group order n ###
    # void ECP_Sm2SubModP(uint64_t *r, const uint64_t *a, const uint64_t *b)
    # Modular poly sub
    # r    %rdi
    # a    %rsi
    # b    %rdx
    .globl ECP_Sm2SubModP
    .type ECP_Sm2SubModP, @function
    .align 64

ECP_Sm2SubModP:

    bn_mod_sub(.Lpoly(%rip))

    ret
    .size ECP_Sm2SubModP, .-ECP_Sm2SubModP

    # void ECP_Sm2SubModOrd(uint64_t *r, const uint64_t *a, const uint64_t *b)
    # Modular order sub
    # r    %rdi
    # a    %rsi
    # b    %rdx
    .globl ECP_Sm2SubModOrd
    .type ECP_Sm2SubModOrd, @function
    .align 64

ECP_Sm2SubModOrd:

    bn_mod_sub(.Lord(%rip))

    ret
    .size ECP_Sm2SubModOrd, .-ECP_Sm2SubModOrd

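/*
 * The RDC macro below reduces a 512-bit product modulo the SM2 prime
 * p = 2^256 - 2^224 - 2^96 + 2^64 - 1 using the congruence
 *
 *     2^256 = 2^224 + 2^96 - 2^64 + 1 (mod p),
 *
 * applied repeatedly to the eight high 32-bit words a8..a15 until everything
 * fits in 256 bits. The signed rows of the table at the top of the macro list
 * exactly which a_i is added to or subtracted from each 32-bit column; the
 * accumulated word-level carry is folded once more with the same congruence
 * ("First Mod"), and a conditional subtraction of p ("Last Mod") brings the
 * result into range.
 */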
.macro RDC
    # r = a mod p (the 256-bit SM2 prime)
    # a = a15 | a14 | ... | a0, where ai are 32-bit quantities
    # | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  | (+)
    # | a8  | a11 | a10 | a9  | a8  | 0   | a9  | a8  | (+)
    # | a9  | a14 | a13 | a12 | a11 | 0   | a10 | a9  | (+)
    # | a10 | a15 | a14 | a13 | a12 | 0   | a11 | a10 | (+)
    # | a11 | 0   | a15 | a14 | a13 | 0   | a12 | a11 | (+)
    # | a12 | 0   | a15 | a14 | a13 | 0   | a13 | a12 | (+)
    # | a12 | 0   | 0   | a15 | a14 | 0   | a14 | a13 | (+)
    # | a13 | 0   | 0   | 0   | a15 | 0   | a14 | a13 | (+)
    # | a13 | 0   | 0   | 0   | 0   | 0   | a15 | a14 | (+)
    # | a14 | 0   | 0   | 0   | 0   | 0   | a15 | a14 | (+)
    # | a14 | 0   | 0   | 0   | 0   | 0   | 0   | a15 | (+)
    # | a15 | 0   | 0   | 0   | 0   | 0   | 0   | a15 | (+)
    # | a15 | 0   | 0   | 0   | 0   | 0   | 0   | 0   | (+)
    # | a15 | 0   | 0   | 0   | 0   | 0   | 0   | 0   | (+)
    # | 0   | 0   | 0   | 0   | 0   | a8  | 0   | 0   | (-)
    # | 0   | 0   | 0   | 0   | 0   | a9  | 0   | 0   | (-)
    # | 0   | 0   | 0   | 0   | 0   | a13 | 0   | 0   | (-)
    # | 0   | 0   | 0   | 0   | 0   | a14 | 0   | 0   | (-)
    # | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
    # |    V[3]   |    V[2]   |    V[1]   |    V[0]   |
    # until r < p
    # s7 (a15|a14), s6 (a13|a12), s5 (a11|a10), s4 (a9|a8)
    # s3 (a7|a6),   s2 (a5|a4),   s1 (a3|a2),   s0 (a1|a0)

    # 1. 64-bit addition
    xorq %rsi, %rsi             # to store all carry
    xorq %rax, %rax
    movq s6, %rcx               # rcx <- s6
    movq s4, %rdx               # rdx <- s4
    # a13 | a12
    addq s7, %rcx               # rcx <- s6 + s7
    adcq $0, %rax               # rax <- carry(s6+s7)
    addq s7, %rcx               # rcx <- s6 + 2*s7
    adcq $0, %rax
    # a9 | a8
    movq %rax, %rbx             # rbx <- carry (rax)
    addq %rcx, %rdx             # rdx <- s4 + s6 + 2*s7
    adcq $0, %rbx
    addq s5, %rdx               # rdx <- s4 + s5 + s6 + 2*s7
    adcq $0, %rbx
    # sum
    addq %rdx, s0               # s0 <- s0 + s4 + s5 + s6 + 2*s7
    adcq %rbx, s1               # s1 <- s1 + rbx + carry
    adcq %rcx, s2               # s2 <- s2 + s6 + 2*s7 + carry
    adcq s7, s3                 # s3 <- s3 + s7 + carry
    adcq $0, %rsi
    # add carry
    addq %rax, s3
    adcq $0, %rsi               # rsi <- carry
    # store registers
    movq s0, (%rsp)
    movq s1, 8(%rsp)
    movq s2, 16(%rsp)
    movq s3, 24(%rsp)
    # 2. 4 -> 8 64-bit to 32-bit spread
    movq $0xffffffff, %rax
    movq s4, s0
    movq s5, s1
    movq s6, s2
    movq s7, s3
    andq %rax, s0               # a8
    andq %rax, s1               # a10
    andq %rax, s2               # a12
    andq %rax, s3               # a14
    shrq $32, s4                # a9
    shrq $32, s5                # a11
    shrq $32, s6                # a13
    shrq $32, s7                # a15
    # 3. 32-bit addition
    movq s3, %rax
    addq s2, %rax               # rax <- a12 + a14
    movq s3, %rbx
    addq s1, %rbx               # rbx <- a10 + a14
    movq s7, %rcx
    addq s6, %rcx               # rcx <- a13 + a15
    movq s0, %rdx
    addq s4, %rdx               # rdx <- a8 + a9
    addq s5, s7                 # s7 <- a11 + a15
    movq %rcx, s2               # s2 <- a13 + a15
    addq %rax, s2               # s2 <- a12 + a13 + a14 + a15
    addq s2, s1                 # s1 <- a10 + a12 + a13 + a14 + a15
    addq s2, s1                 # s1 <- a10 + 2*(a12 + a13 + a14 + a15)
    addq %rdx, s1               # s1 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
    addq s5, s1                 # s1 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
    addq s6, s2                 # s2 <- a12 + 2*a13 + a14 + a15
    addq s5, s2                 # s2 <- a11 + a12 + 2*a13 + a14 + a15
    addq s0, s2                 # s2 <- a8 + a11 + a12 + 2*a13 + a14 + a15
    addq s3, %rdx               # rdx <- a8 + a9 + a14
    addq s6, %rdx               # rdx <- a8 + a9 + a13 + a14
    addq %rcx, s4               # s4 <- a9 + a13 + a15
    addq s4, s5                 # s5 <- a9 + a11 + a13 + a15
    addq %rcx, s5               # s5 <- a9 + a11 + 2*(a13 + a15)
    addq %rbx, %rax             # rax <- a10 + a12 + 2*a14

    # U[0]  s5    a9 + a11 + 2*(a13 + a15)
    # U[1]  %rax  a10 + a12 + 2*a14
    # U[2]
    # U[3]  s2    a8 + a11 + a12 + 2*a13 + a14 + a15
    # U[4]  s4    a9 + a13 + a15
    # U[5]  %rbx  a10 + a14
    # U[6]  s7    a11 + a15
    # U[7]  s1    a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
    # sub   %rdx  a8 + a9 + a13 + a14

    # vacant registers: s0 s3 s6 %rcx

    # 4. 8 -> 4 32-bit to 64-bit
    # sub %rdx
    movq %rax, s0
    shlq $32, s0                # U[1]'(s0) <- U[1] << 32
    shrd $32, s2, %rax          # U[3]'(%rax) <- U[3]U[1] >> 32
    shrd $32, %rbx, s2          # U[5]'(s2) <- U[5]U[3] >> 32
    shrd $32, s1, %rbx          # U[7]'(%rbx) <- U[7]U[5] >> 32
    shrq $32, s1                # U[7](s1) <- U[7] >> 32 (carry)

    # 5. 64-bit addition
    addq s0, s5                 # U[0] <- U[1]' + U[0]
    adcq $0, %rax               # U[3]' <- 0 + U[3]'
    adcq s2, s4                 # U[4] <- U[5]' + U[4]
    adcq %rbx, s7               # U[6] <- U[7]' + U[6]
    adcq s1, %rsi               # rsi <- U[7]carry + carry

    # V[0]  s5
    # V[1]  %rax
    # V[2]  s4
    # V[3]  s7
    # carry %rsi
    # sub   %rdx

    # 6. ADD & SUB
    movq (%rsp), s0
    movq 8(%rsp), s1
    movq 16(%rsp), s2
    movq 24(%rsp), s3
    # ADD
    addq s5, s0
    adcq %rax, s1
    adcq s4, s2
    adcq s7, s3
    adcq $0, %rsi
    # SUB
    subq %rdx, s1
    sbbq $0, s2
    sbbq $0, s3
    sbbq $0, %rsi

    # 7. MOD
    # First Mod
    movq %rsi, %rax             # rax <- carry (rsi)            +out[0]
    shlq $32, %rax              # rax <- carry << 32
    movq %rax, %rcx             # rcx <- rax                    +out[3]
    subq %rsi, %rax             # rax <- carry << 32 - carry    +out[1]

    addq %rsi, s0
    adcq %rax, s1
    adcq $0, s2
    adcq %rcx, s3

    # Last Mod
    # return r - p if r > p else r
    movq s0, s4
    movq s1, s5
    movq s2, s6
    movq s3, s7

    leaq .Lpoly(%rip), %rsi

    movq $0, %rcx
    adcq $0, %rcx

    subq 0(%rsi), s0
    sbbq 8(%rsi), s1
    sbbq 16(%rsi), s2
    sbbq 24(%rsi), s3
    sbbq $0, %rcx

    cmovcq s4, s0
    cmovcq s5, s1
    cmovcq s6, s2
    cmovcq s7, s3

    movq s0, (%rdi)
    movq s1, 8(%rdi)
    movq s2, 16(%rdi)
    movq s3, 24(%rdi)
.endm

### Modular mul: r = a*b mod p ###
    # void ECP_Sm2Mul(uint64_t *r, const uint64_t *a, const uint64_t *b)
    # 256-bit modular multiplication in SM2
    # r    %rdi
    # a    %rsi
    # b    %rdx
    .globl ECP_Sm2Mul
    .type ECP_Sm2Mul, @function
    .align 64

ECP_Sm2Mul:

    # Store scalar registers
    subq $72, %rsp
    movq %rbx, 32(%rsp)
    movq %r12, 40(%rsp)
    movq %r13, 48(%rsp)
    movq %r14, 56(%rsp)
    movq %r15, 64(%rsp)

    # Load inputs
    movq (%rsi), s0
    movq 8(%rsi), s1
    movq 16(%rsi), s2
    movq 24(%rsi), s3
    movq (%rdx), s4
    movq 8(%rdx), s5
    movq 16(%rdx), s6
    movq 24(%rdx), s7

### multiplication ###

    # ========================
    #              s7 s6 s5 s4
    #  *           s3 s2 s1 s0
    # ------------------------
    # +            s0 s0 s0 s0
    #               *  *  *  *
    #              s7 s6 s5 s4
    #           s1 s1 s1 s1
    #            *  *  *  *
    #           s7 s6 s5 s4
    #        s2 s2 s2 s2
    #         *  *  *  *
    #        s7 s6 s5 s4
    #     s3 s3 s3 s3
    #      *  *  *  *
    #     s7 s6 s5 s4
    # ------------------------
    #  s7 s6 s5 s4 s3 s2 s1 s0
    # ========================

### s0*s4 ###
    movq s0, %rax
    mulq s4
    movq %rax, (%rsp)
    movq %rdx, %rbx
    xorq %rcx, %rcx

### s1*s4 + s0*s5 ###
    movq s1, %rax
    mulq s4
    addq %rax, %rbx
    adcq %rdx, %rcx
    xorq %rsi, %rsi

    movq s0, %rax
    mulq s5
    addq %rax, %rbx
    adcq %rdx, %rcx
    adcq $0, %rsi
    movq %rbx, 8(%rsp)
    xorq %rbx, %rbx

### s2*s4 + s1*s5 + s0*s6 ###
    movq s2, %rax
    mulq s4
    addq %rax, %rcx
    adcq %rdx, %rsi

    movq s1, %rax
    mulq s5
    addq %rax, %rcx
    adcq %rdx, %rsi
    adcq $0, %rbx

    movq s0, %rax
    mulq s6
    addq %rax, %rcx
    adcq %rdx, %rsi
    adcq $0, %rbx
    movq %rcx, 16(%rsp)
    xorq %rcx, %rcx

### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
    movq s3, %rax
    mulq s4
    addq %rax, %rsi
    adcq %rdx, %rbx
    adcq $0, %rcx

    movq s2, %rax
    mulq s5
    addq %rax, %rsi
    adcq %rdx, %rbx
    adcq $0, %rcx

    movq s1, %rax
    mulq s6
    addq %rax, %rsi
    adcq %rdx, %rbx
    adcq $0, %rcx

    movq s0, %rax
    mulq s7
    addq %rax, %rsi
    adcq %rdx, %rbx
    adcq $0, %rcx
    movq %rsi, 24(%rsp)
    xorq %rsi, %rsi

### s3*s5 + s2*s6 + s1*s7 ###
    movq s3, %rax
    mulq s5
    addq %rax, %rbx
    adcq %rdx, %rcx
    # carry
    adcq $0, %rsi

    movq s2, %rax
    mulq s6
    addq %rax, %rbx
    adcq %rdx, %rcx
    adcq $0, %rsi

    movq s1, %rax
    mulq s7
    addq %rax, %rbx
    adcq %rdx, %rcx
    adcq $0, %rsi
    movq %rbx, s4
    xorq %rbx, %rbx

### s3*s6 + s2*s7 ###
    movq s3, %rax
    mulq s6
    addq %rax, %rcx
    adcq %rdx, %rsi
    # carry
    adcq $0, %rbx

    movq s2, %rax
    mulq s7
    addq %rax, %rcx
    adcq %rdx, %rsi
    adcq $0, %rbx
    movq %rcx, s5

### s3*s7 ###
    movq s3, %rax
    mulq s7
    addq %rax, %rsi
    adcq %rdx, %rbx
    movq %rsi, s6
    movq %rbx, s7

    movq (%rsp), s0
    movq 8(%rsp), s1
    movq 16(%rsp), s2
    movq 24(%rsp), s3

    # result of mul: s7 s6 s5 s4 s3 s2 s1 s0

### Reduction ###
    RDC

    # Restore scalar registers
    movq 32(%rsp), %rbx
    movq 40(%rsp), %r12
    movq 48(%rsp), %r13
    movq 56(%rsp), %r14
    movq 64(%rsp), %r15
    addq $72, %rsp

    ret
    .size ECP_Sm2Mul, .-ECP_Sm2Mul

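/*
 * ECP_Sm2Sqr below uses the standard squaring shortcut: writing a = sum_i a_i*2^(64*i),
 *
 *     a^2 = sum_i a_i^2 * 2^(128*i)  +  2 * sum_{i<j} a_i*a_j * 2^(64*(i+j)),
 *
 * so the six cross products a_i*a_j (i < j) are computed once and doubled with
 * a single add/adc chain, then the four squares a_i^2 are added in. The
 * 512-bit result goes through the same RDC reduction as ECP_Sm2Mul.
 */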
### Modular sqr: r = a^2 mod p ###
    # void ECP_Sm2Sqr(uint64_t *r, const uint64_t *a)
    # 256-bit modular squaring in SM2
    # r    %rdi
    # a    %rsi
    .globl ECP_Sm2Sqr
    .type ECP_Sm2Sqr, @function
    .align 64

ECP_Sm2Sqr:

    # Store scalar registers
    subq $88, %rsp
    movq %rbx, 32(%rsp)
    movq %r12, 40(%rsp)
    movq %r13, 48(%rsp)
    movq %r14, 56(%rsp)
    movq %r15, 64(%rsp)
    movq %rbp, 72(%rsp)
    movq %rdi, 80(%rsp)

    # Load inputs
    movq (%rsi), s4
    movq 8(%rsi), s5
    movq 16(%rsi), s6
    movq 24(%rsi), s7

### square ###

    # ========================
    #              s7 s6 s5 s4
    #  *           s7 s6 s5 s4
    # ------------------------
    # +            s4 s4 s4 s4
    #               *  *  *  *
    #              s7 s6 s5 s4
    #           s5 s5 s5 s5
    #            *  *  *  *
    #           s7 s6 s5 s4
    #        s6 s6 s6 s6
    #         *  *  *  *
    #        s7 s6 s5 s4
    #     s7 s7 s7 s7
    #      *  *  *  *
    #     s7 s6 s5 s4
    # ------------------------
    #  s7 s6 s5 s4 s3 s2 s1 s0
    # ========================

### s1 <- s4*s5, s2 <- carry ###
    movq s5, %rax
    mulq s4
    movq %rax, s1
    movq %rdx, s2
    xorq s3, s3

### s2 <- s4*s6 + carry(s2), s3 <- carry ###
    movq s6, %rax
    mulq s4
    addq %rax, s2
    adcq %rdx, s3
    xorq s0, s0

### s3 <- s4*s7 + s5*s6 + carry(s3), s0 <- carry ###
    movq s7, %rax
    mulq s4
    addq %rax, s3
    adcq %rdx, s0
    xorq %rbx, %rbx

    movq s6, %rax
    mulq s5
    addq %rax, s3
    adcq %rdx, s0
    adcq $0, %rbx

### s0 <- s5*s7 + carry(s0), rbx <- carry ###
    movq s7, %rax
    mulq s5
    addq %rax, s0
    adcq %rdx, %rbx
    xorq %rcx, %rcx

### rbx <- s6*s7 + carry(rbx), rcx <- carry ###
    movq s7, %rax
    mulq s6
    addq %rax, %rbx
    adcq %rdx, %rcx
    xorq %rsi, %rsi

### double the cross products s1|s2|s3|s0|rbx|rcx ###
    addq s1, s1
    adcq s2, s2
    adcq s3, s3
    adcq s0, s0
    adcq %rbx, %rbx
    # update carry
    adcq %rcx, %rcx
    adcq $0, %rsi
### rbp <- s4*s4, rdi <- carry ###
    movq s4, %rax
    mulq s4
    movq %rax, %rbp
    movq %rdx, %rdi

### s4 <- s5*s5, s5 <- carry ###
    movq s5, %rax
    mulq s5
    movq %rax, s4
    movq %rdx, s5

### s6*s6 ###
    movq s6, %rax
    mulq s6

    # s1 += carry(s4*s4)
    addq %rdi, s1
    # s2 += s5*s5
    adcq s4, s2
    # s3 += carry(s5*s5)
    adcq s5, s3
    # s4(s0) += s6*s6
    adcq %rax, s0
    # s5(rbx) += carry(s6*s6)
    adcq %rdx, %rbx
    adcq $0, %rcx
    adcq $0, %rsi

### s7*s7 ###
    movq s7, %rax
    mulq s7
    # s6(rcx) += s7*s7
    addq %rax, %rcx
    # s7(rsi) += carry(s7*s7)
    adcq %rdx, %rsi

    movq s0, s4
    movq %rbp, s0
    movq %rbx, s5
    movq %rcx, s6
    movq %rsi, s7

    # Restore rdi
    movq 80(%rsp), %rdi

    # result of square: s7 s6 s5 s4 s3 s2 s1 s0

### Reduction ###
    RDC

    # Restore scalar registers
    movq 32(%rsp), %rbx
    movq 40(%rsp), %r12
    movq 48(%rsp), %r13
    movq 56(%rsp), %r14
    movq 64(%rsp), %r15
    movq 72(%rsp), %rbp
    addq $88, %rsp

    ret
    .size ECP_Sm2Sqr, .-ECP_Sm2Sqr

.globl ECP_Sm2ToMont
.type ECP_Sm2ToMont,@function
.align 32
ECP_Sm2ToMont:
    leaq .LRR(%rip), %rdx
    REGISTER_SAVE
    movq 0(%rsi), %r9
    movq 8(%rsi), %r10
    movq 16(%rsi), %r11
    movq 24(%rsi), %r12
    movq %rdx, %rbx
    movq 0(%rdx), %rax

    call ECP_Sm2MulMont

    REGISTER_POP
    ret
.size ECP_Sm2ToMont,.-ECP_Sm2ToMont

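/*
 * ECP_Sm2MulMont below computes r = a*b*2^(-256) mod p, i.e. word-by-word
 * Montgomery multiplication. Because the low limb of p is 2^64 - 1, the
 * Montgomery constant n0 = -p^(-1) mod 2^64 is exactly 1, so each reduction
 * round can fold in t*p (t being the current low limb) using only t << 32,
 * t >> 32 and a short add/sbb chain instead of another multiplication.
 * ECP_Sm2ToMont above enters the Montgomery domain by multiplying with
 * .LRR = 2^512 mod p; ECP_Sm2FromMont leaves it by multiplying with 1.
 */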
.type ECP_Sm2MulMont,@function
.align 32
ECP_Sm2MulMont:

    // a[0~3] * b[0]
    movq %rax, %rbp
    mulq %r9
    movq %rax, %r8
    movq %rdx, %r9
    movq %rbp, %rax

    mulq %r10
    addq %rax, %r9
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %r10

    mulq %r11
    addq %rax, %r10
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %r11

    mulq %r12
    addq %rax, %r11
    adcq $0, %rdx
    movq %rdx, %r12
    movq %r8, %rax
    movq %r8, %r14
    xorq %r13, %r13

    // begin 1st reduce
    shlq $32, %rax
    shrq $32, %r14

    movq %r8, %rcx
    subq %rax, %rcx
    movq $0, %rdx
    sbbq %r14, %rdx
    movq %rdx, %rbp
    movq $0, %rdx
    sbbq %rax, %rdx
    movq %rdx, %rax
    sbbq %r14, %r8
    movq %r8, %rdx

    movq %rcx, %r8
    addq %r9, %r8
    movq %rbp, %r9
    adcq %r10, %r9
    movq %rax, %r10
    adcq %r11, %r10
    movq %rdx, %r11
    adcq %r12, %r11
    movq $0, %r12
    adcq %r13, %r12
    movq 8(%rbx), %rax          // b[1]

    movq %rax, %rbp
    mulq 0(%rsi)
    addq %rax, %r8
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rcx

    mulq 8(%rsi)
    addq %rcx, %r9
    adcq $0, %rdx
    addq %rax, %r9
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rcx

    mulq 16(%rsi)
    addq %rcx, %r10
    adcq $0, %rdx
    addq %rax, %r10
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rcx

    mulq 24(%rsi)
    addq %rcx, %r11
    adcq $0, %rdx
    addq %rax, %r11
    movq %r9, %rax
    adcq %rdx, %r12
    adcq $0, %r13

    movq %r8, %rax
    movq %r8, %r14

    // begin 2nd reduce
    shlq $32, %rax
    shrq $32, %r14

    movq %r8, %rcx
    subq %rax, %rcx
    movq $0, %rdx
    sbbq %r14, %rdx
    movq %rdx, %rbp
    movq $0, %rdx
    sbbq %rax, %rdx
    movq %rdx, %rax
    sbbq %r14, %r8
    movq %r8, %rdx

    movq %rcx, %r8
    addq %r9, %r8
    movq %rbp, %r9
    adcq %r10, %r9
    movq %rax, %r10
    adcq %r11, %r10
    movq %rdx, %r11
    adcq %r12, %r11
    movq $0, %r12
    adcq %r13, %r12
    movq 16(%rbx), %rax         // b[2]

    movq %rax, %rbp
    mulq 0(%rsi)
    addq %rax, %r8
    movq %rbp, %rax
    adcq $0, %rdx
    movq %rdx, %rcx

    mulq 8(%rsi)
    addq %rcx, %r9
    adcq $0, %rdx
    addq %rax, %r9
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rcx

    mulq 16(%rsi)
    addq %rcx, %r10
    adcq $0, %rdx
    addq %rax, %r10
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rcx

    mulq 24(%rsi)
    addq %rcx, %r11
    adcq $0, %rdx
    addq %rax, %r11
    movq %r9, %rax
    adcq %rdx, %r12
    adcq $0, %r13

    movq %r8, %rax
    movq %r8, %r14

    // begin 3rd reduce
    shlq $32, %rax
    shrq $32, %r14

    movq %r8, %rcx
    movq $0, %rdx
    subq %rax, %rcx
    sbbq %r14, %rdx
    movq %rdx, %rbp
    movq $0, %rdx
    sbbq %rax, %rdx
    sbbq %r14, %r8
    movq %rdx, %rax
    movq %r8, %rdx

    movq %rcx, %r8
    addq %r9, %r8
    movq %rbp, %r9
    adcq %r10, %r9
    movq %rax, %r10
    adcq %r11, %r10
    movq %rdx, %r11
    adcq %r12, %r11
    movq $0, %r12
    adcq %r13, %r12
    movq 24(%rbx), %rax         // b[3]

    movq %rax, %rbp
    mulq 0(%rsi)
    addq %rax, %r8
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rcx

    mulq 8(%rsi)
    addq %rcx, %r9
    adcq $0, %rdx
    addq %rax, %r9
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rcx

    mulq 16(%rsi)
    addq %rcx, %r10
    adcq $0, %rdx
    addq %rax, %r10
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rcx

    mulq 24(%rsi)
    addq %rcx, %r11
    adcq $0, %rdx
    addq %rax, %r11
    adcq %rdx, %r12
    adcq $0, %r13
    movq %r9, %rax

    movq %r8, %rax
    movq %r8, %r14

    // last reduction begin
    shlq $32, %rax
    shrq $32, %r14

    movq %r8, %rcx
    subq %rax, %rcx
    movq $0, %rdx
    sbbq %r14, %rdx
    movq %rdx, %rbp
    movq $0, %rdx
    sbbq %rax, %rdx
    movq %rdx, %rax
    sbbq %r14, %r8
    movq %r8, %rdx
    movq %rcx, %r8

    addq %r9, %r8
    movq %rbp, %r9
    adcq %r10, %r9
    movq %rax, %r10
    adcq %r11, %r10
    movq %rdx, %r11
    adcq %r12, %r11
    movq $0, %rcx
    adcq %r13, %rcx
    // last reduction end

    // ret - p
    movq %r8, %r12
    subq $-1, %r12
    movq .Lpoly+8(%rip), %r14
    movq %r9, %r13
    sbbq %r14, %r13

    movq %r10, %rbp
    sbbq $-1, %rbp

    movq .Lpoly+24(%rip), %r15
    movq %r11, %rdx
    sbbq %r15, %rdx
    sbbq $0, %rcx

    cmovcq %r8, %r12
    cmovcq %r9, %r13
    cmovcq %r10, %rbp
    movq %r12,(%rdi)
    movq %r13,8(%rdi)
    cmovcq %r11, %rdx
    movq %rbp,16(%rdi)
    movq %rdx,24(%rdi)

    movq %rbp, %r8
    movq %rdx, %r9
    ret
.size ECP_Sm2MulMont, .-ECP_Sm2MulMont

.globl ECP_Sm2FromMont
.type ECP_Sm2FromMont,@function
.align 32
ECP_Sm2FromMont:

    leaq .Lone(%rip), %rdx
    REGISTER_SAVE
    movq %rdx, %rbx
    movq 0(%rsi), %r9
    movq 8(%rsi), %r10
    movq 16(%rsi), %r11
    movq 24(%rsi), %r12
    movq 0(%rdx), %rax

    call ECP_Sm2MulMont

    REGISTER_POP
    ret
.size ECP_Sm2FromMont,.-ECP_Sm2FromMont

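/*
 * ECP_Sm2SqrMont below is the squaring counterpart of ECP_Sm2MulMont: it forms
 * the full 512-bit square with the doubled-cross-product trick, runs four of
 * the shift-based reduction rounds on the low half, adds the high half and
 * finishes with one conditional subtraction of p. Conceptually the Montgomery
 * helpers compose as follows (illustrative pseudo-code, not this file's API):
 *
 *   // mont_mul(a, b) = a*b*R^(-1) mod p with R = 2^256
 *   // to_mont(x)     = mont_mul(x, RR)     because RR = R^2 mod p
 *   // from_mont(x)   = mont_mul(x, 1)
 *   // from_mont(mont_mul(to_mont(a), to_mont(b))) == a*b mod p
 */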
.type ECP_Sm2SqrMont,@function
.align 32
ECP_Sm2SqrMont:

    movq %rax, %r13
    mulq %r14                   // a[0] * a[1]
    movq %rax, %r9
    movq %rdx, %r10
    movq %r15, %rax

    mulq %r13                   // a[0] * a[2]
    addq %rax, %r10
    adcq $0, %rdx
    movq %r8, %rax
    movq %rdx, %r11

    mulq %r13                   // a[0] * a[3]
    addq %rax, %r11
    adcq $0, %rdx
    movq %r15, %rax
    movq %rdx, %r12

    mulq %r14                   // a[1] * a[2]
    addq %rax, %r11
    adcq $0, %rdx
    movq %r8, %rax
    movq %rdx, %rbp

    mulq %r14                   // a[1] * a[3]
    addq %rax, %r12
    adcq $0, %rdx
    addq %rbp, %r12
    movq %rdx, %r13
    movq %r8, %rax
    adcq $0, %r13

    mulq %r15                   // a[2] * a[3]
    addq %rax, %r13
    movq (%rsi), %rax
    movq %rdx, %r14
    adcq $0, %r14

    movq $0, %r15
    addq %r9, %r9
    adcq %r10, %r10
    adcq %r11, %r11
    adcq %r12, %r12
    adcq %r13, %r13
    adcq %r14, %r14
    adcq $0, %r15

    mulq %rax                   // a[0] * a[0]
    movq %rax, %r8
    movq 8(%rsi), %rax          // get a[1]
    movq %rdx, %rcx

    mulq %rax                   // a[1] * a[1]
    addq %rcx, %r9
    adcq %rax, %r10
    adcq $0, %rdx
    movq 16(%rsi), %rax
    movq %rdx, %rcx

    mulq %rax                   // a[2] * a[2]
    addq %rcx, %r11
    adcq %rax, %r12
    adcq $0, %rdx
    movq 24(%rsi), %rax
    movq %rdx, %rcx

    mulq %rax                   // a[3] * a[3]
    addq %rcx, %r13
    adcq %rax, %r14
    movq %r8, %rax
    adcq %rdx, %r15

    movq %r8, %rax
    movq %r8, %rsi

    // begin 1st reduce
    shlq $32, %rax
    shrq $32, %rsi
    movq %r8, %rcx
    subq %rax, %rcx

    movq $0, %rdx
    sbbq %rsi, %rdx
    movq %rdx, %rbp
    movq $0, %rdx
    sbbq %rax, %rdx
    movq %rdx, %rax
    sbbq %rsi, %r8
    movq %r8, %rdx

    movq %rcx, %r8
    addq %r9, %r8
    movq %rbp, %r9
    adcq %r10, %r9
    movq %rax, %r10
    adcq %r11, %r10
    movq %rdx, %r11
    adcq $0, %r11

    movq %r8, %rax
    movq %r8, %rsi

    // begin 2nd reduce
    shlq $32, %rax
    shrq $32, %rsi
    movq %r8, %rcx
    subq %rax, %rcx

    movq $0, %rdx
    sbbq %rsi, %rdx
    movq %rdx, %rbp
    movq $0, %rdx
    sbbq %rax, %rdx
    movq %rdx, %rax
    sbbq %rsi, %r8
    movq %r8, %rdx

    movq %rcx, %r8
    addq %r9, %r8
    movq %rbp, %r9
    adcq %r10, %r9
    movq %rax, %r10
    adcq %r11, %r10
    movq %rdx, %r11
    adcq $0, %r11

    movq %r8, %rax
    movq %r8, %rsi

    // begin 3rd reduce
    shlq $32, %rax
    shrq $32, %rsi
    movq %r8, %rcx
    subq %rax, %rcx

    movq $0, %rdx
    sbbq %rsi, %rdx
    movq %rdx, %rbp
    movq $0, %rdx
    sbbq %rax, %rdx
    movq %rdx, %rax
    sbbq %rsi, %r8
    movq %r8, %rdx

    movq %rcx, %r8
    addq %r9, %r8
    movq %rbp, %r9
    adcq %r10, %r9
    movq %rax, %r10
    adcq %r11, %r10
    movq %rdx, %r11
    adcq $0, %r11

    movq %r8, %rax
    movq %r8, %rsi

    // begin 4th reduce
    shlq $32, %rax
    shrq $32, %rsi
    movq %r8, %rcx
    subq %rax, %rcx

    movq $0, %rdx
    sbbq %rsi, %rdx
    movq %rdx, %rbp
    movq $0, %rdx
    sbbq %rax, %rdx
    movq %rdx, %rax
    sbbq %rsi, %r8
    movq %r8, %rdx

    movq %rcx, %r8
    addq %r9, %r8
    movq %rbp, %r9
    adcq %r10, %r9
    movq %rax, %r10
    adcq %r11, %r10
    movq %rdx, %r11
    adcq $0, %r11

    movq .Lpoly+8(%rip), %rsi
    movq .Lpoly+24(%rip), %rbp

    addq %r8, %r12
    adcq %r9, %r13
    adcq %r10, %r14
    adcq %r11, %r15
    movq $0, %r11
    adcq $0, %r11

    // ret - p
    movq %r12, %rax
    subq $-1, %rax
    movq %r13, %rcx
    sbbq %rsi, %rcx
    movq %r14, %r8
    sbbq $-1, %r8
    movq %r15, %rdx
    sbbq %rbp, %rdx
    sbbq $0, %r11

    cmovncq %rax, %r12
    cmovncq %rcx, %r13
    cmovncq %r8, %r14
    movq %r12,(%rdi)
    movq %r13,8(%rdi)
    cmovncq %rdx, %r15
    movq %r14,16(%rdi)
    movq %r15,24(%rdi)
    ret
.size ECP_Sm2SqrMont,.-ECP_Sm2SqrMont

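/*
 * The helpers below (ECP_Sm2AddCore, ECP_Sm2SubBA, ECP_Sm2SubAB,
 * ECP_Sm2MulBy2Core) are internal subroutines with a register convention of
 * their own rather than the SysV ABI: one operand lives in %r12,%r13,%r8,%r9
 * (least-significant limb first), the other is addressed through %rbx (or
 * passed in %rcx,%rbp,%rax,%r10 for SubBA), %r14 and %r15 cache .Lpoly+8 and
 * .Lpoly+24, and results are written to (%rdi) and/or left in
 * %r12,%r13,%r8,%r9 for the next call.
 */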
.type ECP_Sm2AddCore,@function
.align 32
ECP_Sm2AddCore:

    addq (%rbx), %r12
    adcq 8(%rbx), %r13
    movq %r12, %rcx
    adcq 16(%rbx), %r8
    adcq 24(%rbx), %r9
    movq $0, %r11
    movq %r13, %rbp
    adcq $0, %r11

    subq $-1, %r12              // + 0xffffffffffffffff = -(-1)
    movq %r8, %rax
    sbbq %r14, %r13
    sbbq $-1, %r8
    movq %r9, %r10
    sbbq %r15, %r9
    sbbq $0, %r11

    cmovcq %rcx, %r12
    cmovcq %rbp, %r13
    movq %r12, 0(%rdi)
    cmovcq %rax, %r8
    movq %r13, 8(%rdi)
    cmovcq %r10, %r9
    movq %r8, 16(%rdi)
    movq %r9, 24(%rdi)

    ret
.size ECP_Sm2AddCore,.-ECP_Sm2AddCore

.type ECP_Sm2SubBA,@function
.align 32
ECP_Sm2SubBA:
    subq %r12, %rcx
    sbbq %r13, %rbp
    movq %rcx, %r12
    sbbq %r8, %rax
    sbbq %r9, %r10
    movq %rbp, %r13
    sbbq %r11, %r11

    addq $-1, %rcx
    movq %rax, %r8
    adcq %r14, %rbp
    adcq $-1, %rax
    movq %r10, %r9
    adcq %r15, %r10
    testq %r11, %r11

    cmovnzq %rcx, %r12
    cmovnzq %rbp, %r13
    cmovnzq %rax, %r8
    cmovnzq %r10, %r9
    ret
.size ECP_Sm2SubBA,.-ECP_Sm2SubBA

.type ECP_Sm2SubAB,@function
.align 32
ECP_Sm2SubAB:
    subq 0(%rbx), %r12
    sbbq 8(%rbx), %r13
    sbbq 16(%rbx), %r8
    sbbq 24(%rbx), %r9
    sbbq %r11, %r11

    movq %r14, %rbp
    andq %r11, %rbp
    movq %r11, %rax
    btrq $32, %rax

    addq %r11, %r12
    adcq %rbp, %r13
    adcq %r11, %r8
    adcq %rax, %r9

    movq %r12, (%rdi)
    movq %r13, 8(%rdi)
    movq %r8, 16(%rdi)
    movq %r9, 24(%rdi)

    ret
.size ECP_Sm2SubAB,.-ECP_Sm2SubAB

.type ECP_Sm2MulBy2Core,@function
.align 32
ECP_Sm2MulBy2Core:
    addq %r12, %r12
    adcq %r13, %r13
    movq %r12, %rcx
    adcq %r8, %r8
    adcq %r9, %r9
    movq $0, %r11
    movq %r13, %rbp
    adcq $0, %r11

    subq $-1, %r12              // + 0xffffffffffffffff = -(-1)
    movq %r8, %rax
    sbbq %r14, %r13
    sbbq $-1, %r8
    movq %r9, %r10
    sbbq %r15, %r9
    sbbq $0, %r11

    cmovcq %rcx, %r12
    cmovcq %rbp, %r13
    cmovcq %rax, %r8
    cmovcq %r10, %r9

    movq %r12, (%rdi)
    movq %r13, 8(%rdi)
    movq %r8, 16(%rdi)
    movq %r9, 24(%rdi)
    ret
.size ECP_Sm2MulBy2Core,.-ECP_Sm2MulBy2Core

# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
# Computation steps:
#   delta = Z1^2
#   gamma = Y1^2
#   beta  = X1*gamma
#   alpha = 3*(X1-delta)*(X1+delta)
#   X3 = alpha^2 - 8*beta
#   Z3 = (Y1+Z1)^2 - gamma - delta
#   Y3 = alpha*(4*beta-X3) - 8*gamma^2
# (a C-level sketch of these steps follows)
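/*
 * A field-level C sketch of the formulas above, to make the call sequence in
 * ECP_Sm2PointDoubleMont easier to follow; the fe type and fe_* helpers are
 * illustrative stand-ins for 256-bit field arithmetic in Montgomery form
 * (assumed to tolerate aliased arguments), not functions defined in this file;
 * the assembly also reschedules and rescales some of the steps:
 *
 *   #include <stdint.h>
 *   typedef struct { uint64_t v[4]; } fe;
 *   void fe_mul(fe *r, const fe *a, const fe *b);   // r = a*b mod p
 *   void fe_sqr(fe *r, const fe *a);                // r = a^2 mod p
 *   void fe_add(fe *r, const fe *a, const fe *b);   // r = a+b mod p
 *   void fe_sub(fe *r, const fe *a, const fe *b);   // r = a-b mod p
 *
 *   static void point_double(fe *x3, fe *y3, fe *z3,
 *                            const fe *x1, const fe *y1, const fe *z1)
 *   {
 *       fe delta, gamma, beta, alpha, t1, t2;
 *       fe_sqr(&delta, z1);                          // delta = Z1^2
 *       fe_sqr(&gamma, y1);                          // gamma = Y1^2
 *       fe_mul(&beta, x1, &gamma);                   // beta  = X1*gamma
 *       fe_sub(&t1, x1, &delta);
 *       fe_add(&t2, x1, &delta);
 *       fe_mul(&alpha, &t1, &t2);
 *       fe_add(&t1, &alpha, &alpha);
 *       fe_add(&alpha, &t1, &alpha);                 // alpha = 3*(X1-delta)*(X1+delta)
 *       fe_add(z3, y1, z1);
 *       fe_sqr(z3, z3);
 *       fe_sub(z3, z3, &gamma);
 *       fe_sub(z3, z3, &delta);                      // Z3 = (Y1+Z1)^2 - gamma - delta
 *       fe_add(&t1, &beta, &beta);
 *       fe_add(&t1, &t1, &t1);                       // t1 = 4*beta
 *       fe_sqr(x3, &alpha);
 *       fe_sub(x3, x3, &t1);
 *       fe_sub(x3, x3, &t1);                         // X3 = alpha^2 - 8*beta
 *       fe_sub(&t1, &t1, x3);
 *       fe_mul(&t1, &alpha, &t1);                    // alpha*(4*beta - X3)
 *       fe_sqr(&t2, &gamma);
 *       fe_add(&t2, &t2, &t2);
 *       fe_add(&t2, &t2, &t2);
 *       fe_add(&t2, &t2, &t2);                       // 8*gamma^2
 *       fe_sub(y3, &t1, &t2);                        // Y3 = alpha*(4*beta-X3) - 8*gamma^2
 *   }
 */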
.globl ECP_Sm2PointDoubleMont
.type ECP_Sm2PointDoubleMont,@function
.align 32
ECP_Sm2PointDoubleMont:
    REGISTER_SAVE
    subq $168, %rsp

.Lpoint_double:
    vmovdqu 0(%rsi), %xmm0
    vmovdqu 16(%rsi), %xmm1
    vmovdqa %xmm0,96(%rsp)
    vmovdqa %xmm1,96+16(%rsp)

    movq %rsi, %rbx
    leaq 32(%rdi), %r10
    leaq 64(%rdi), %r11
    vmovq %rdi, %xmm0
    vmovq %r10, %xmm1
    vmovq %r11, %xmm2

    movq 32(%rsi), %r12
    movq 40(%rsi), %r13
    movq 48(%rsi), %r8
    movq 56(%rsi), %r9

    movq .Lpoly+8(%rip), %r14
    movq .Lpoly+24(%rip), %r15
    leaq (%rsp), %rdi
    call ECP_Sm2MulBy2Core

    movq 64(%rsi), %rax
    movq 72(%rsi), %r14
    movq 80(%rsi), %r15
    movq 88(%rsi), %r8

    leaq 64(%rsi), %rsi         // set input parameters
    leaq 64(%rsp), %rdi         // store the result
    call ECP_Sm2SqrMont

    movq (%rsp), %rax
    movq 8(%rsp), %r14
    movq 16(%rsp), %r15
    movq 24(%rsp), %r8
    leaq (%rsp), %rsi
    leaq (%rsp), %rdi
    call ECP_Sm2SqrMont

    movq 32(%rbx), %rax
    movq 64(%rbx), %r9
    movq 72(%rbx), %r10
    movq 80(%rbx), %r11
    movq 88(%rbx), %r12

    leaq 64(%rbx), %rsi
    leaq 32(%rbx), %rbx
    vmovq %xmm2, %rdi
    call ECP_Sm2MulMont
    call ECP_Sm2MulBy2Core

    movq 96(%rsp), %r12
    movq 104(%rsp), %r13
    movq 112(%rsp), %r8
    movq 120(%rsp), %r9

    leaq 32(%rsp), %rdi
    leaq 64(%rsp), %rbx
    call ECP_Sm2AddCore

    movq 96(%rsp), %r12
    movq 104(%rsp), %r13
    movq 112(%rsp), %r8
    movq 120(%rsp), %r9

    leaq 64(%rsp), %rbx         // input
    leaq 64(%rsp), %rdi         // output
    call ECP_Sm2SubAB

    movq (%rsp), %rax
    movq 8(%rsp), %r14
    movq 16(%rsp), %r15
    movq 24(%rsp), %r8
    leaq (%rsp), %rsi
    vmovq %xmm1, %rdi

    call ECP_Sm2SqrMont

    movq %r12, %rcx
    addq $-1, %r12
    movq %r13, %r10
    adcq %rsi, %r13
    movq %r14, %rax
    adcq $-1, %r14
    movq $0, %r9
    movq %r15, %r8
    adcq %rbp, %r15
    adcq $0, %r9
    xorq %rsi, %rsi
    testq $1, %rcx

    cmovzq %rcx, %r12
    cmovzq %r10, %r13
    cmovzq %rax, %r14
    cmovzq %r8, %r15
    cmovzq %rsi, %r9

    movq %r13, %rcx
    shrq $1, %r12
    shlq $63, %rcx
    shrq $1, %r13
    movq %r14, %r10
    orq %rcx, %r12
    shlq $63, %r10
    movq %r15, %rax
    shrq $1, %r14
    orq %r10, %r13
    shlq $63, %rax
    movq %r12,0(%rdi)
    shrq $1, %r15
    movq %r13,8(%rdi)
    shlq $63, %r9
    orq %rax, %r14
    orq %r9, %r15

    movq %r14,16(%rdi)
    movq %r15,24(%rdi)

    movq 64(%rsp), %rax
    leaq 64(%rsp), %rbx
    movq 32(%rsp), %r9
    movq 40(%rsp), %r10
    leaq 32(%rsp), %rsi
    movq 48(%rsp), %r11
    movq 56(%rsp), %r12
    leaq 32(%rsp), %rdi
    call ECP_Sm2MulMont

    leaq 128(%rsp), %rdi
    call ECP_Sm2MulBy2Core

    leaq 32(%rsp), %rbx
    leaq 32(%rsp), %rdi
    call ECP_Sm2AddCore

    movq 96(%rsp), %rax
    leaq 96(%rsp), %rbx
    movq (%rsp), %r9
    movq 8(%rsp), %r10
    leaq (%rsp), %rsi
    movq 16(%rsp), %r11
    movq 24(%rsp), %r12
    leaq 0(%rsp), %rdi
    call ECP_Sm2MulMont

    leaq 128(%rsp), %rdi
    call ECP_Sm2MulBy2Core

    movq 32(%rsp), %rax
    movq 40(%rsp), %r14
    leaq 32(%rsp), %rsi
    movq 48(%rsp), %r15
    movq 56(%rsp), %r8
    vmovq %xmm0, %rdi
    call ECP_Sm2SqrMont

    leaq 128(%rsp), %rbx
    movq %r14, %r8
    movq %r15, %r9
    movq %rsi, %r14
    movq %rbp, %r15
    call ECP_Sm2SubAB

    movq (%rsp), %rcx
    movq 8(%rsp), %rbp
    movq 16(%rsp), %rax
    movq 24(%rsp), %r10
    leaq 0(%rsp), %rdi
    call ECP_Sm2SubBA

    movq 32(%rsp), %rax
    leaq 32(%rsp), %rbx
    movq %r12, %r14
    xorl %ecx, %ecx
    movq %r12,(%rsp)
    movq %r13, %r10
    movq %r13,8(%rsp)
    cmovzq %r8, %r11
    movq %r8,16(%rsp)
    cmovzq %r9, %r12
    movq %r9,24(%rsp)
    movq %r14, %r9

    leaq 0(%rsp), %rsi
    leaq 0(%rsp), %rdi
    call ECP_Sm2MulMont

    vmovq %xmm1, %rbx
    vmovq %xmm1, %rdi
    call ECP_Sm2SubAB

    leaq 168(%rsp), %rsp
    REGISTER_POP
    ret
.size ECP_Sm2PointDoubleMont,.-ECP_Sm2PointDoubleMont

# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo
# Computation steps:
#   U1 = X1*Z2^2
#   U2 = X2*Z1^2
#   S1 = Y1*Z2^3
#   S2 = Y2*Z1^3
#   H = U2-U1
#   r = S2-S1
#   X3 = r^2 - H^3 - 2*U1*H^2
#   Y3 = r*(U1*H^2-X3) - S1*H^3
#   Z3 = Z1*Z2*H
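/*
 * Besides the generic formulas above, ECP_Sm2PointAddMont handles the special
 * cases without data-dependent branches on the coordinates: the vector masks
 * kept in %xmm5 and %xmm4 record whether Z1 or Z2 is zero (an input is the
 * point at infinity), and the masked vpand/vpandn selection at the end then
 * returns the other input unchanged; if H == 0 and r == 0 and neither input
 * is infinity, the two inputs describe the same point and the code branches
 * to .Ladd_double to reuse the doubling path.
 */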
.globl ECP_Sm2PointAddMont
.type ECP_Sm2PointAddMont,@function
.align 32
ECP_Sm2PointAddMont:
    REGISTER_SAVE
    subq $584, %rsp

    vmovdqu 0(%rsi), %xmm0
    vmovdqu 16(%rsi), %xmm1
    vmovdqu 32(%rsi), %xmm2
    vmovdqu 48(%rsi), %xmm3
    vmovdqu 64(%rsi), %xmm4
    vmovdqu 80(%rsi), %xmm5
    movq %rsi, %rbx
    movq %rdx, %rsi
    vmovdqa %xmm0,384(%rsp)
    vmovdqa %xmm1,384+16(%rsp)
    vmovdqa %xmm2,416(%rsp)
    vmovdqa %xmm3,416+16(%rsp)
    vmovdqa %xmm4,448(%rsp)
    vmovdqa %xmm5,448+16(%rsp)
    vpor %xmm4, %xmm5, %xmm5

    vmovdqu 0(%rsi), %xmm0
    vpshufd $0xb1, %xmm5, %xmm3
    vmovdqu 16(%rsi), %xmm1
    vmovdqu 32(%rsi), %xmm2
    vpor %xmm3, %xmm5, %xmm5
    vmovdqu 48(%rsi), %xmm3

    movq 64(%rsi), %rax
    movq 72(%rsi), %r14
    movq 80(%rsi), %r15
    movq 88(%rsi), %r8

    vmovdqa %xmm0,480(%rsp)
    vpshufd $0x1e, %xmm5, %xmm4
    vmovdqa %xmm1,480+16(%rsp)
    vmovdqu 64(%rsi), %xmm0
    vmovdqu 80(%rsi), %xmm1
    vmovdqa %xmm2,512(%rsp)
    vmovdqa %xmm3,512+16(%rsp)
    vpor %xmm4, %xmm5, %xmm5
    vpxor %xmm4, %xmm4, %xmm4
    vpor %xmm0, %xmm1, %xmm1
    vmovq %rdi, %xmm0

    leaq 64(%rsi), %rsi
    movq %rax,544(%rsp)
    movq %r14,544+8(%rsp)
    movq %r15,544+16(%rsp)
    movq %r8,544+24(%rsp)
    leaq 96(%rsp), %rdi
    call ECP_Sm2SqrMont

    vpcmpeqd %xmm4, %xmm5, %xmm5
    vpshufd $0xb1, %xmm1, %xmm4
    vpor %xmm1, %xmm4, %xmm4
    vpshufd $0, %xmm5, %xmm5
    vpshufd $0x1e, %xmm4, %xmm3
    vpor %xmm3, %xmm4, %xmm4
    vpxor %xmm3, %xmm3, %xmm3
    vpcmpeqd %xmm3, %xmm4, %xmm4
    vpshufd $0, %xmm4, %xmm4

    movq 64(%rbx), %rax
    movq 72(%rbx), %r14
    movq 80(%rbx), %r15
    movq 88(%rbx), %r8
    vmovq %rbx, %xmm1

    leaq 64(%rbx), %rsi
    leaq 32(%rsp), %rdi
    call ECP_Sm2SqrMont

    movq 544(%rsp), %rax
    leaq 544(%rsp), %rbx
    movq 96(%rsp), %r9
    movq 104(%rsp), %r10
    leaq 96(%rsp), %rsi
    movq 112(%rsp), %r11
    movq 120(%rsp), %r12
    leaq 224(%rsp), %rdi
    call ECP_Sm2MulMont

    movq 448(%rsp), %rax
    leaq 448(%rsp), %rbx
    movq 32(%rsp), %r9
    movq 40(%rsp), %r10
    leaq 32(%rsp), %rsi
    movq 48(%rsp), %r11
    movq 56(%rsp), %r12
    leaq 256(%rsp), %rdi
    call ECP_Sm2MulMont

    movq 416(%rsp), %rax
    leaq 416(%rsp), %rbx
    movq 224(%rsp), %r9
    movq 232(%rsp), %r10
    leaq 224(%rsp), %rsi
    movq 240(%rsp), %r11
    movq 248(%rsp), %r12
    leaq 224(%rsp), %rdi
    call ECP_Sm2MulMont

    movq 512(%rsp), %rax
    leaq 512(%rsp), %rbx
    movq 256(%rsp), %r9
    movq 264(%rsp), %r10
    leaq 256(%rsp), %rsi
    movq 272(%rsp), %r11
    movq 280(%rsp), %r12
    leaq 256(%rsp), %rdi
    call ECP_Sm2MulMont

    leaq 224(%rsp), %rbx
    leaq 64(%rsp), %rdi
    call ECP_Sm2SubAB

    orq %r13, %r12
    vmovdqa %xmm4, %xmm2
    orq %r8, %r12
    orq %r9, %r12
    vpor %xmm5, %xmm2, %xmm2
    vmovq %r12, %xmm3

    movq 384(%rsp), %rax
    leaq 384(%rsp), %rbx
    movq 96(%rsp), %r9
    movq 104(%rsp), %r10
    leaq 96(%rsp), %rsi
    movq 112(%rsp), %r11
    movq 120(%rsp), %r12
    leaq 160(%rsp), %rdi
    call ECP_Sm2MulMont

    movq 480(%rsp), %rax
    leaq 480(%rsp), %rbx
    movq 32(%rsp), %r9
    movq 40(%rsp), %r10
    leaq 32(%rsp), %rsi
    movq 48(%rsp), %r11
    movq 56(%rsp), %r12
    leaq 192(%rsp), %rdi
    call ECP_Sm2MulMont

    leaq 160(%rsp), %rbx
    leaq 0(%rsp), %rdi
    call ECP_Sm2SubAB

    orq %r13, %r12
    orq %r8, %r12
    orq %r9, %r12

    vmovq %xmm2, %r8
    vmovq %xmm3, %r9

    orq %r8, %r12
    orq %r9, %r12
    jnz .Lpoint_add

.Ladd_double:
    vmovq %xmm1, %rsi
    vmovq %xmm0, %rdi
    addq $416, %rsp
    jmp .Lpoint_double

.align 32
.Lpoint_add:
    movq 64(%rsp), %rax
    movq 72(%rsp), %r14
    leaq 64(%rsp), %rsi
    movq 80(%rsp), %r15
    movq 88(%rsp), %r8
    leaq 96(%rsp), %rdi
    call ECP_Sm2SqrMont

    movq 448(%rsp), %rax
    leaq 448(%rsp), %rbx
    movq (%rsp), %r9
    movq 8(%rsp), %r10
    leaq (%rsp), %rsi
    movq 16(%rsp), %r11
    movq 24(%rsp), %r12
    leaq 352(%rsp), %rdi
    call ECP_Sm2MulMont

    movq (%rsp), %rax
    movq 8(%rsp), %r14
    leaq (%rsp), %rsi
    movq 16(%rsp), %r15
    movq 24(%rsp), %r8
    leaq 32(%rsp), %rdi
    call ECP_Sm2SqrMont

    movq 544(%rsp), %rax
    leaq 544(%rsp), %rbx
    movq 352(%rsp), %r9
    movq 360(%rsp), %r10
    leaq 352(%rsp), %rsi
    movq 368(%rsp), %r11
    movq 24+352(%rsp), %r12
    leaq 352(%rsp), %rdi
    call ECP_Sm2MulMont

    movq (%rsp), %rax
    leaq (%rsp), %rbx
    movq 32(%rsp), %r9
    movq 40(%rsp), %r10
    leaq 32(%rsp), %rsi
    movq 48(%rsp), %r11
    movq 56(%rsp), %r12
    leaq 128(%rsp), %rdi
    call ECP_Sm2MulMont

    movq 160(%rsp), %rax
    leaq 160(%rsp), %rbx
    movq 32(%rsp), %r9
    movq 40(%rsp), %r10
    leaq 32(%rsp), %rsi
    movq 48(%rsp), %r11
    movq 56(%rsp), %r12
    leaq 192(%rsp), %rdi
    call ECP_Sm2MulMont

    leaq 96(%rsp), %rsi
    movq $0, %r11
    addq %r12, %r12
    adcq %r13, %r13
    movq %r12, %rcx
    adcq %r8, %r8
    adcq %r9, %r9
    movq %r13, %rbp
    adcq $0, %r11

    subq $-1, %r12
    movq %r8, %rax
    sbbq %r14, %r13
    sbbq $-1, %r8
    movq %r9, %r10
    sbbq %r15, %r9
    sbbq $0, %r11

    cmovcq %rcx, %r12
    movq (%rsi), %rcx
    cmovcq %rbp, %r13
    movq 8(%rsi), %rbp
    cmovcq %rax, %r8
    movq 16(%rsi), %rax
    cmovcq %r10, %r9
    movq 24(%rsi), %r10

    call ECP_Sm2SubBA

    leaq 128(%rsp), %rbx
    leaq 288(%rsp), %rdi
    call ECP_Sm2SubAB

    movq 192(%rsp), %rcx
    movq 200(%rsp), %rbp
    movq 208(%rsp), %rax
    movq 216(%rsp), %r10
    leaq 320(%rsp), %rdi

    call ECP_Sm2SubBA

    movq %r12,(%rdi)
    movq %r13,8(%rdi)
    movq %r8,16(%rdi)
    movq %r9,24(%rdi)

    movq 128(%rsp), %rax
    leaq 128(%rsp), %rbx
    movq 224(%rsp), %r9
    movq 232(%rsp), %r10
    leaq 224(%rsp), %rsi
    movq 240(%rsp), %r11
    movq 248(%rsp), %r12
    leaq 256(%rsp), %rdi
    call ECP_Sm2MulMont

    movq 320(%rsp), %rax
    leaq 320(%rsp), %rbx
    movq 64(%rsp), %r9
    movq 72(%rsp), %r10
    leaq 64(%rsp), %rsi
    movq 80(%rsp), %r11
    movq 88(%rsp), %r12
    leaq 320(%rsp), %rdi
    call ECP_Sm2MulMont

    leaq 256(%rsp), %rbx
    leaq 320(%rsp), %rdi
    call ECP_Sm2SubAB

    vmovq %xmm0, %rdi
    vmovdqa %xmm5, %xmm0
    vmovdqa %xmm5, %xmm1
    vpandn 352(%rsp), %xmm0, %xmm0
    vmovdqa %xmm5, %xmm2
    vpandn 368(%rsp), %xmm1, %xmm1
    vmovdqa %xmm5, %xmm3
    vpand 544(%rsp), %xmm2, %xmm2
    vpand 560(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3

    vmovdqa %xmm4, %xmm0
    vmovdqa %xmm4, %xmm1
    vpandn %xmm2, %xmm0, %xmm0
    vmovdqa %xmm4, %xmm2
    vpandn %xmm3, %xmm1, %xmm1
    vmovdqa %xmm4, %xmm3
    vpand 448(%rsp), %xmm2, %xmm2
    vpand 464(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3
    vmovdqu %xmm2,64(%rdi)
    vmovdqu %xmm3,80(%rdi)

    vmovdqa %xmm5, %xmm0
    vmovdqa %xmm5, %xmm1
    vpandn 288(%rsp), %xmm0, %xmm0
    vmovdqa %xmm5, %xmm2
    vpandn 304(%rsp), %xmm1, %xmm1
    vmovdqa %xmm5, %xmm3
    vpand 480(%rsp), %xmm2, %xmm2
    vpand 496(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3

    vmovdqa %xmm4, %xmm0
    vmovdqa %xmm4, %xmm1
    vpandn %xmm2, %xmm0, %xmm0
    vmovdqa %xmm4, %xmm2
    vpandn %xmm3, %xmm1, %xmm1
    vmovdqa %xmm4, %xmm3
    vpand 384(%rsp), %xmm2, %xmm2
    vpand 400(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3
    vmovdqu %xmm2,(%rdi)
    vmovdqu %xmm3,16(%rdi)

    vmovdqa %xmm5, %xmm0
    vmovdqa %xmm5, %xmm1
    vpandn 320(%rsp), %xmm0, %xmm0
    vmovdqa %xmm5, %xmm2
    vpandn 336(%rsp), %xmm1, %xmm1
    vmovdqa %xmm5, %xmm3
    vpand 512(%rsp), %xmm2, %xmm2
    vpand 528(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3

    vmovdqa %xmm4, %xmm0
    vmovdqa %xmm4, %xmm1
    vpandn %xmm2, %xmm0, %xmm0
    vmovdqa %xmm4, %xmm2
    vpandn %xmm3, %xmm1, %xmm1
    vmovdqa %xmm4, %xmm3
    vpand 416(%rsp), %xmm2, %xmm2
    vpand 432(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3
    vmovdqu %xmm2,32(%rdi)
    vmovdqu %xmm3,48(%rdi)

.Ladd_done:
    leaq 584(%rsp), %rsp
    REGISTER_POP
    ret
.size ECP_Sm2PointAddMont,.-ECP_Sm2PointAddMont

# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-madd-2007-bl
# Computation steps:
#   Z1Z1 = Z1^2
#   U2 = X2*Z1Z1
#   S2 = Y2*Z1*Z1Z1
#   H = U2-X1
#   HH = H^2
#   I = 4*HH
#   J = H*I
#   r = 2*(S2-Y1)
#   V = X1*I
#   X3 = r^2 - J - 2*V
#   Y3 = r*(V-X3) - 2*Y1*J
#   Z3 = (Z1+H)^2 - Z1Z1 - HH
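/*
 * In ECP_Sm2PointAddAffineMont the second operand is an affine point, so its Z
 * coordinate is implicitly 1 (in Montgomery form this is .Lone_mont); that is
 * why the masked selection at the end uses .Lone_mont where the projective
 * version uses the stored Z2, and why no Z2^2 or Z2^3 products are needed.
 */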
.globl ECP_Sm2PointAddAffineMont
.type ECP_Sm2PointAddAffineMont,@function
.align 32
ECP_Sm2PointAddAffineMont:
    REGISTER_SAVE
    subq $488, %rsp
    vmovdqu (%rsi), %xmm0
    vmovdqu 16(%rsi), %xmm1
    vmovdqu 32(%rsi), %xmm2
    vmovdqu 48(%rsi), %xmm3
    vmovdqu 64(%rsi), %xmm4
    vmovdqu 80(%rsi), %xmm5
    movq %rdx, %rbx
    movq 64(%rsi), %rax
    movq 72(%rsi), %r14
    movq 80(%rsi), %r15
    movq 88(%rsi), %r8

    vmovdqa %xmm0,320(%rsp)
    vmovdqa %xmm1,336(%rsp)
    vmovdqa %xmm2,352(%rsp)
    vmovdqa %xmm3,368(%rsp)
    vmovdqa %xmm4,384(%rsp)
    vmovdqa %xmm5,400(%rsp)
    vpor %xmm4, %xmm5, %xmm5

    vmovdqu (%rbx), %xmm0
    vpshufd $0xb1, %xmm5, %xmm3
    vmovdqu 16(%rbx), %xmm1
    vmovdqu 32(%rbx), %xmm2
    vpor %xmm3, %xmm5, %xmm5
    vmovdqu 48(%rbx), %xmm3
    vmovdqa %xmm0, 416(%rsp)
    vpshufd $0x1e, %xmm5, %xmm4
    vmovdqa %xmm1, 416+16(%rsp)
    vpor %xmm0, %xmm1, %xmm1

    vmovq %rdi, %xmm0
    vmovdqa %xmm2, 448(%rsp)
    vmovdqa %xmm3, 464(%rsp)
    vpor %xmm2, %xmm3, %xmm3
    vpor %xmm4, %xmm5, %xmm5
    vpxor %xmm4, %xmm4, %xmm4
    vpor %xmm1, %xmm3, %xmm3

    leaq 64(%rsi), %rsi
    leaq 32(%rsp), %rdi
    call ECP_Sm2SqrMont

    vpcmpeqd %xmm4, %xmm5, %xmm5
    vpshufd $0xb1, %xmm3, %xmm4
    vpor %xmm3, %xmm4, %xmm4
    vpshufd $0, %xmm5, %xmm5
    vpshufd $0x1e, %xmm4, %xmm3
    vpor %xmm3, %xmm4, %xmm4
    vpxor %xmm3, %xmm3, %xmm3
    vpcmpeqd %xmm3, %xmm4, %xmm4
    vpshufd $0, %xmm4, %xmm4

    movq (%rbx), %rax
    movq %r12, %r9
    movq %r13, %r10
    movq %r14, %r11

    leaq 32(%rsp), %rsi
    movq %r15, %r12
    leaq (%rsp), %rdi
    call ECP_Sm2MulMont

    leaq 320(%rsp), %rbx
    leaq 64(%rsp), %rdi
    call ECP_Sm2SubAB

    movq 384(%rsp), %rax
    leaq 384(%rsp), %rbx
    movq 32(%rsp), %r9
    movq 40(%rsp), %r10
    leaq 32(%rsp), %rsi
    movq 48(%rsp), %r11
    movq 56(%rsp), %r12
    leaq 32(%rsp), %rdi
    call ECP_Sm2MulMont

    movq 384(%rsp), %rax
    leaq 384(%rsp), %rbx
    movq 64(%rsp), %r9
    movq 72(%rsp), %r10
    leaq 64(%rsp), %rsi
    movq 80(%rsp), %r11
    movq 88(%rsp), %r12
    leaq 288(%rsp), %rdi
    call ECP_Sm2MulMont

    movq 448(%rsp), %rax
    leaq 448(%rsp), %rbx
    movq 32(%rsp), %r9
    movq 40(%rsp), %r10
    leaq 32(%rsp), %rsi
    movq 48(%rsp), %r11
    movq 56(%rsp), %r12
    leaq 32(%rsp), %rdi
    call ECP_Sm2MulMont

    leaq 352(%rsp), %rbx
    leaq 96(%rsp), %rdi
    call ECP_Sm2SubAB

    movq 64(%rsp), %rax
    movq 72(%rsp), %r14
    leaq 64(%rsp), %rsi
    movq 80(%rsp), %r15
    movq 88(%rsp), %r8
    leaq 128(%rsp), %rdi
    call ECP_Sm2SqrMont

    movq 96(%rsp), %rax
    movq 104(%rsp), %r14
    leaq 96(%rsp), %rsi
    movq 112(%rsp), %r15
    movq 120(%rsp), %r8
    leaq 192(%rsp), %rdi
    call ECP_Sm2SqrMont

    movq 128(%rsp), %rax
    leaq 128(%rsp), %rbx
    movq 64(%rsp), %r9
    movq 72(%rsp), %r10
    leaq 64(%rsp), %rsi
    movq 80(%rsp), %r11
    movq 88(%rsp), %r12
    leaq 160(%rsp), %rdi
    call ECP_Sm2MulMont

    movq 320(%rsp), %rax
    leaq 320(%rsp), %rbx
    movq 128(%rsp), %r9
    movq 136(%rsp), %r10
    leaq 128(%rsp), %rsi
    movq 144(%rsp), %r11
    movq 152(%rsp), %r12
    leaq (%rsp), %rdi
    call ECP_Sm2MulMont

    leaq 192(%rsp), %rsi
    movq $0, %r11
    addq %r12, %r12
    adcq %r13, %r13
    movq %r12, %rcx
    adcq %r8, %r8
    adcq %r9, %r9
    movq %r13, %rbp
    adcq $0, %r11

    subq $-1, %r12
    movq %r8, %rax
    sbbq %r14, %r13
    sbbq $-1, %r8
    movq %r9, %r10
    sbbq %r15, %r9
    sbbq $0, %r11

    cmovcq %rcx, %r12
    movq (%rsi), %rcx
    cmovcq %rbp, %r13
    movq 8(%rsi), %rbp
    cmovcq %rax, %r8
    movq 16(%rsi), %rax
    cmovcq %r10, %r9
    movq 24(%rsi), %r10

    call ECP_Sm2SubBA

    leaq 160(%rsp), %rbx
    leaq 224(%rsp), %rdi
    call ECP_Sm2SubAB

    movq (%rsp), %rcx
    movq 8(%rsp), %rbp
    movq 16(%rsp), %rax
    movq 24(%rsp), %r10
    leaq 64(%rsp), %rdi

    call ECP_Sm2SubBA

    movq %r12,(%rdi)
    movq %r13,8(%rdi)
    movq %r8,16(%rdi)
    movq %r9,24(%rdi)

    movq 352(%rsp), %rax
    leaq 352(%rsp), %rbx
    movq 160(%rsp), %r9
    movq 168(%rsp), %r10
    leaq 160(%rsp), %rsi
    movq 176(%rsp), %r11
    movq 184(%rsp), %r12
    leaq 32(%rsp), %rdi
    call ECP_Sm2MulMont

    movq 96(%rsp), %rax
    leaq 96(%rsp), %rbx
    movq 64(%rsp), %r9
    movq 72(%rsp), %r10
    leaq 64(%rsp), %rsi
    movq 80(%rsp), %r11
    movq 88(%rsp), %r12
    leaq 64(%rsp), %rdi
    call ECP_Sm2MulMont

    leaq 32(%rsp), %rbx
    leaq 256(%rsp), %rdi
    call ECP_Sm2SubAB

    vmovq %xmm0, %rdi
    vmovdqa %xmm5, %xmm0
    vmovdqa %xmm5, %xmm1
    vpandn 288(%rsp), %xmm0, %xmm0
    vmovdqa %xmm5, %xmm2
    vpandn 304(%rsp), %xmm1, %xmm1
    vmovdqa %xmm5, %xmm3
    vpand .Lone_mont(%rip), %xmm2, %xmm2
    vpand .Lone_mont+16(%rip), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3

    vmovdqa %xmm4, %xmm0
    vmovdqa %xmm4, %xmm1
    vpandn %xmm2, %xmm0, %xmm0
    vmovdqa %xmm4, %xmm2
    vpandn %xmm3, %xmm1, %xmm1
    vmovdqa %xmm4, %xmm3
    vpand 384(%rsp), %xmm2, %xmm2
    vpand 400(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3
    vmovdqu %xmm2,64(%rdi)
    vmovdqu %xmm3,80(%rdi)

    vmovdqa %xmm5, %xmm0
    vmovdqa %xmm5, %xmm1
    vpandn 224(%rsp), %xmm0, %xmm0
    vmovdqa %xmm5, %xmm2
    vpandn 240(%rsp), %xmm1, %xmm1
    vmovdqa %xmm5, %xmm3
    vpand 416(%rsp), %xmm2, %xmm2
    vpand 432(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3

    vmovdqa %xmm4, %xmm0
    vmovdqa %xmm4, %xmm1
    vpandn %xmm2, %xmm0, %xmm0
    vmovdqa %xmm4, %xmm2
    vpandn %xmm3, %xmm1, %xmm1
    vmovdqa %xmm4, %xmm3
    vpand 320(%rsp), %xmm2, %xmm2
    vpand 336(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3
    vmovdqu %xmm2,(%rdi)
    vmovdqu %xmm3,16(%rdi)

    vmovdqa %xmm5, %xmm0
    vmovdqa %xmm5, %xmm1
    vpandn 256(%rsp), %xmm0, %xmm0
    vmovdqa %xmm5, %xmm2
    vpandn 272(%rsp), %xmm1, %xmm1
    vmovdqa %xmm5, %xmm3
    vpand 448(%rsp), %xmm2, %xmm2
    vpand 464(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3

    vmovdqa %xmm4, %xmm0
    vmovdqa %xmm4, %xmm1
    vpandn %xmm2, %xmm0, %xmm0
    vmovdqa %xmm4, %xmm2
    vpandn %xmm3, %xmm1, %xmm1
    vmovdqa %xmm4, %xmm3
    vpand 352(%rsp), %xmm2, %xmm2
    vpand 368(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3
    vmovdqu %xmm2,32(%rdi)
    vmovdqu %xmm3,48(%rdi)

    leaq 488(%rsp), %rsp
    REGISTER_POP
    ret
.size ECP_Sm2PointAddAffineMont,.-ECP_Sm2PointAddAffineMont
#endif