/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#if defined(HITLS_CRYPTO_CURVE_NISTP256) && defined(HITLS_CRYPTO_NIST_USE_ACCEL)

#include "ecp256_pre_comp_table.s"
.file "ecp256_x86.S"

.data
.align 64
.Lpoly: // P
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
.LrrModP: // R * R mod P, used in Montgomery modular multiplication.
.quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
.Lone_mont: // R mod P, R = 2^256, = 2^256 - P
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
.Lord: // order, n
.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
.LordK: // (2^64 - ord[0]) * LordK = 1 (mod 2^64), i.e. LordK = -(ord[0])^(-1) (mod 2^64);
        // the lower 64 bits of LordK * Lord are all Fs.
.quad 0xccd1c8aaee00bc4f
.LOne:
.quad 0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000

.text
/**
 * Function description: Returns the address of the precomputed point table for ECP256.
 * Function prototype: const ECP256_TableRow *ECP256_GetPreCompTable(void);
 * Input register: None
 * Change register: None
 * Output register: rax
 * Function/Macro Call: None
 */
.globl ECP256_GetPreCompTable
.type ECP256_GetPreCompTable,@function
.align 32
ECP256_GetPreCompTable:
.cfi_startproc

    leaq g_preCompTable(%rip), %rax

    ret
.cfi_endproc
.size ECP256_GetPreCompTable, .-ECP256_GetPreCompTable

/**
 * Function description: Addition in the ECP256 field: res = a + b mod P
 * Function prototype: void ECP256_Add(Coord *r, const Coord *a, const Coord *b);
 * Input register:
 *     rdi: Pointer to the output Coord structure
 *     rsi: Address pointing to input data a
 *     rdx: Address pointing to input data b
 * Change register: rsi, rdx, rcx, rax, r8, r9, r10, r11, r12, r13
 * Output register: None
 * Function/Macro call: Addition can be implemented by calling ECP256_AddCore.
 */
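/*
 * For reference, a C-style sketch of the add-then-conditionally-subtract pattern used by
 * ECP256_Add and ECP256_AddCore below (illustrative only, not part of the build;
 * add_256/sub_256/cselect_256 are hypothetical helpers on little-endian uint64_t[4] limbs):
 *
 *     carry  = add_256(sum, a, b);     // sum = a + b, 257th bit in carry
 *     borrow = sub_256(red, sum, P);   // red = sum - P
 *     // full value is carry * 2^256 + sum; it is >= P exactly when the
 *     // 257-bit subtraction does not borrow (carry >= borrow)
 *     cselect_256(res, red, sum, carry >= borrow);  // res = cond ? red : sum
 *
 * The assembly keeps the raw sum in rax/rcx/rdx/r12 and the reduced candidate in r8-r11,
 * then picks the result with cmovc, so the selection is branch-free.
 */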
.globl ECP256_Add
.type ECP256_Add,@function
.align 32
ECP256_Add:
.cfi_startproc
    pushq %r12
    pushq %r13

    movq (%rsi), %r8 // a[0]
    movq 8(%rsi), %r9 // a[1]
    xorq %r13, %r13 // Save carry
    movq 16(%rsi), %r10 // a[2]
    movq 24(%rsi), %r11 // a[3]
    leaq .Lpoly(%rip), %rsi // P

    addq (%rdx), %r8 // c[0] = a[0] + b[0]
    adcq 8(%rdx), %r9 // c[1] = a[1] + b[1] + carry
    movq %r8, %rax // save c[0]
    adcq 16(%rdx), %r10 // c[2] = a[2] + b[2] + carry
    adcq 24(%rdx), %r11 // c[3] = a[3] + b[3] + carry
    movq %r9, %rcx // save c[1]
    adcq $0, %r13 // save carry value to r13

    subq $-1, %r8 // d[0] = c[0] - P[0]
    movq %r10, %rdx // save c[2]
    sbbq 8(%rsi), %r9 // d[1] = c[1] - P[1] - borrow
    sbbq $0, %r10 // d[2] = c[2] - P[2] - borrow
    movq %r11, %r12 // save c[3]
    sbbq 24(%rsi), %r11 // d[3] = c[3] - P[3] - borrow
    sbbq $0, %r13 // r13 = 0 + carry - borrow

    cmovcq %rax, %r8 // res[0] = (r13 < 0) ? c[0] : d[0]
    cmovcq %rcx, %r9 // res[1] = (r13 < 0) ? c[1] : d[1]
    movq %r8, (%rdi)
    cmovcq %rdx, %r10 // res[2] = (r13 < 0) ? c[2] : d[2]
    movq %r9, 8(%rdi)
    cmovcq %r12, %r11 // res[3] = (r13 < 0) ? c[3] : d[3]
    movq %r10, 16(%rdi)

    movq (%rsp), %r13
    movq %r11, 24(%rdi)
    movq 8(%rsp), %r12
    leaq 16(%rsp), %rsp
    ret
.cfi_endproc
.size ECP256_Add, .-ECP256_Add

/**
 * Function description: Core of the ECP256 field addition: a + b mod P.
 *     The result is written to (%rdi) and also left in r8-r11; r14 and r15 hold P[1] and P[3].
 * Input register:
 *     rdi: Pointer to the output Coord structure
 *     r8-r11: 256-bit input data a
 *     rdx: Points to the input 256-bit data b
 *     r14: P[1]
 *     r15: P[3]
 * Change register: rdx, rcx, rax, r8, r9, r10, r11, r12, r13
 * Output register: r8-r11
 */
.type ECP256_AddCore,@function
.align 32
ECP256_AddCore:
.cfi_startproc
    xorq %r13, %r13
    addq (%rdx), %r8 // Addition result.
    adcq 8(%rdx), %r9
    movq %r8, %rax
    adcq 16(%rdx), %r10
    adcq 24(%rdx), %r11
    movq %r9, %rcx
    adcq $0, %r13 // Save carry value to r13.

    subq $-1, %r8 // Mod P.
    movq %r10, %rdx
    sbbq %r14, %r9
    sbbq $0, %r10
    movq %r11, %r12
    sbbq %r15, %r11
    sbbq $0, %r13

    cmovcq %rax, %r8 // Obtain mod P result.
    cmovcq %rcx, %r9
    movq %r8, (%rdi)
    cmovcq %rdx, %r10
    movq %r9, 8(%rdi)
    cmovcq %r12, %r11
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)
    ret
.cfi_endproc
.size ECP256_AddCore, .-ECP256_AddCore

/**
 * Function description: Subtraction in the ECP256 field: res = a - b mod P
 * Function prototype: void ECP256_Sub(Coord *r, const Coord *a, const Coord *b);
 * Input register:
 *     rdi: Pointer to the output Coord structure
 *     rsi: Address pointing to input data a
 *     rdx: Address pointing to input data b
 * Change register: rsi, rdx, rcx, rax, r8, r9, r10, r11, r12, r13
 * Output register: None
 * Function/Macro call: Subtraction can be implemented by calling ECP256_SubCore.
 */
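/*
 * Unlike the addition above, the reduction here selects between the raw difference a - b
 * and a - b + P: both are computed, and the P-added value is kept only when the subtraction
 * borrowed (r13 != 0); the choice is again made branch-free, with cmovz.
 */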
.globl ECP256_Sub
.type ECP256_Sub,@function
.align 32
ECP256_Sub:
.cfi_startproc
    pushq %r12
    pushq %r13

    movq (%rsi), %r8 // a[0]
    movq 8(%rsi), %r9 // a[1]
    xorq %r13, %r13 // Save borrow
    movq 16(%rsi), %r10 // a[2]
    movq 24(%rsi), %r11 // a[3]
    leaq .Lpoly(%rip), %rsi // P

    subq (%rdx), %r8 // c[0] = a[0] - b[0]
    sbbq 8(%rdx), %r9 // c[1] = a[1] - b[1] - borrow
    movq %r8, %rax // save c[0]
    sbbq 16(%rdx), %r10 // c[2] = a[2] - b[2] - borrow
    sbbq 24(%rdx), %r11 // c[3] = a[3] - b[3] - borrow
    movq %r9, %rcx // save c[1]
    sbbq $0, %r13 // save borrow value to r13

    addq $-1, %r8 // d[0] = c[0] + P[0]
    movq %r10, %rdx // save c[2]
    adcq 8(%rsi), %r9 // d[1] = c[1] + P[1] + carry
    adcq $0, %r10 // d[2] = c[2] + P[2] + carry
    movq %r11, %r12 // save c[3]
    adcq 24(%rsi), %r11 // d[3] = c[3] + P[3] + carry
    testq %r13, %r13

    cmovzq %rax, %r8 // res[0] = (r13 == 0) ? c[0] : d[0]
    cmovzq %rcx, %r9 // res[1] = (r13 == 0) ? c[1] : d[1]
    movq %r8, (%rdi)
    cmovzq %rdx, %r10 // res[2] = (r13 == 0) ? c[2] : d[2]
    movq %r9, 8(%rdi)
    cmovzq %r12, %r11 // res[3] = (r13 == 0) ? c[3] : d[3]
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)

    movq (%rsp), %r13
    movq 8(%rsp), %r12
    leaq 16(%rsp), %rsp
    ret
.cfi_endproc
.size ECP256_Sub, .-ECP256_Sub

/**
 * Function description: Core of the ECP256 field subtraction: a - b mod P; no write-back to memory.
 * Input register:
 *     r8-r11: 256-bit input data a
 *     rdx: Points to the input 256-bit data b
 *     r14: P[1]
 *     r15: P[3]
 * Change register: rdx, rcx, rax, r8, r9, r10, r11, r12, r13
 * Output register: r8-r11
 */
.type ECP256_SubCore,@function
.align 32
ECP256_SubCore:
.cfi_startproc
    xorq %r13, %r13
    subq (%rdx), %r8 // Subtraction result.
    sbbq 8(%rdx), %r9
    movq %r8, %rax // Save result.
    sbbq 16(%rdx), %r10
    sbbq 24(%rdx), %r11
    movq %r9, %rcx
    sbbq $0, %r13 // Borrow saved in r13.

    addq $-1, %r8 // a - b + P
    movq %r10, %rdx
    adcq %r14, %r9
    adcq $0, %r10
    movq %r11, %r12
    adcq %r15, %r11
    testq %r13, %r13

    cmovzq %rax, %r8 // If r13 is 0, use a - b; otherwise use a - b + P.
    cmovzq %rcx, %r9
    cmovzq %rdx, %r10
    cmovzq %r12, %r11

    ret
.cfi_endproc
.size ECP256_SubCore, .-ECP256_SubCore

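/*
 * Note: ECP256_SubCore leaves its result only in r8-r11 (there is no store through rdi);
 * the point routines further down that call it write those registers back themselves.
 */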
/**
 * Function description: Negation in the ECP256 field: res = -a mod P
 * Function prototype: void ECP256_Neg(Coord *r, const Coord *a);
 * Input register:
 *     rdi: Pointer to the output Coord structure
 *     rsi: Address pointing to input data a
 * Change register: rsi, rdx, rcx, rax, r8, r9, r10, r11, r12, r13
 * Output register: None
 * Function/Macro Call:
 */
.globl ECP256_Neg
.type ECP256_Neg,@function
.align 32
ECP256_Neg:
.cfi_startproc
    pushq %r12
    pushq %r13

    xorq %r8, %r8 // -a = 0 - a
    xorq %r9, %r9
    xorq %r13, %r13
    leaq .Lpoly(%rip), %rdx
    xorq %r10, %r10
    xorq %r11, %r11

    subq (%rsi), %r8
    sbbq 8(%rsi), %r9
    movq %r8, %rax
    sbbq 16(%rsi), %r10
    sbbq 24(%rsi), %r11
    movq %r9, %rcx
    sbbq $0, %r13

    addq $-1, %r8
    movq %r10, %rsi
    adcq 8(%rdx), %r9
    adcq $0, %r10
    movq %r11, %r12
    adcq 24(%rdx), %r11
    testq %r13, %r13 // Choose result

    cmovzq %rax, %r8
    cmovzq %rcx, %r9
    movq %r8, (%rdi)
    cmovzq %rsi, %r10
    movq %r9, 8(%rdi)
    cmovzq %r12, %r11
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)

    movq (%rsp), %r13
    movq 8(%rsp), %r12
    leaq 16(%rsp), %rsp
    ret
.cfi_endproc
.size ECP256_Neg, .-ECP256_Neg

/**
 * Function description: Multiplication in the ECP256 field (Montgomery): res = a * b * 2^-256 mod P
 * Function prototype: void ECP256_Mul(Coord *r, const Coord *a, const Coord *b);
 * Input register:
 *     rdi: Pointer to the output Coord structure
 *     rsi: Address pointing to input data a
 *     rdx: Address pointing to input data b
 * Change register: rax, rbx, rcx, rdx, rbp, r8, r9, r10, r11, r12, r13, r14, r15
 * Output register: None
 * Function/macro call: Multiplication is implemented by calling ECP256_MulCore_q.
 */
.globl ECP256_Mul
.type ECP256_Mul,@function
.align 32
ECP256_Mul:
.cfi_startproc
    pushq %rbx
    pushq %rbp
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15

    movq %rdx, %rcx // rdx is used by mul
    movq .Lpoly+8(%rip), %r14
    movq .Lpoly+24(%rip), %r15
    call ECP256_MulCore_q

    movq (%rsp), %r15
    movq 8(%rsp), %r14
    movq 16(%rsp), %r13
    movq 24(%rsp), %r12
    movq 32(%rsp), %rbp
    movq 40(%rsp), %rbx
    leaq 48(%rsp), %rsp
    ret
.cfi_endproc
.size ECP256_Mul, .-ECP256_Mul

/**
 * Function description: Montgomery multiplication in the ECP256 field
 * Input register:
 *     rdi: Address of the result.
 *     rsi: Address of the first factor.
 *     rcx: Address of the second factor.
 * Change register: rax, rbx, rcx, rdx, rbp, r8-r13
 * Output register: None.
 */
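/*
 * Reduction trick used in ECP256_MulCore_q and ECP256_SqrCore_q: because P[0] = 2^64 - 1,
 * the Montgomery quotient of each step is simply the current low limb m = acc[0]
 * (m = acc[0] * (-P^-1 mod 2^64) = acc[0]). Adding m * P then clears acc[0], and since
 * P = 2^256 - 2^224 + 2^192 + 2^96 - 1, the whole update folds into adding (m << 32) into
 * acc[1..2] and m * 0xffffffff00000001 into acc[3..4], which is exactly what each
 * shlq/shrq/mulq "reduction" block below computes.
 */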
.type ECP256_MulCore_q,@function
.align 32
ECP256_MulCore_q:
.cfi_startproc
    movq (%rcx), %rax // b[0]

    movq %rax, %rbp // save b[0]
    mulq (%rsi) // a[0] * b[0]
    movq %rax, %r10
    movq %rbp, %rax // b[0]
    movq %rdx, %r11

    mulq 8(%rsi) // a[1] * b[0]
    addq %rax, %r11
    movq %rbp, %rax
    adcq $0, %rdx // a[1:0] * b[0] < 2^192, no overflow
    movq %rdx, %r12

    mulq 16(%rsi) // a[2] * b[0]
    addq %rax, %r12
    movq %rbp, %rax
    adcq $0, %rdx
    movq %rdx, %r13

    mulq 24(%rsi) // a[3] * b[0]
    addq %rax, %r13
    adcq $0, %rdx
    movq %rdx, %r8 // result: r8 r13 r12 r11 r10
    xorq %r9, %r9
    movq %r10, %rax // first reduction
    movq %r10, %rbp
    shlq $32, %r10 // r10 * 2^96 low
    mulq %r15 // r10 * 0xffffffff00000001
    shrq $32, %rbp // r10 * 2^96 high
    addq %r10, %r11
    adcq %rbp, %r12
    adcq %rax, %r13
    adcq %rdx, %r8
    movq 8(%rcx), %rax
    adcq $0, %r9
    xorq %r10, %r10
    movq %rax, %rbp

    mulq (%rsi)
    addq %rax, %r11
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    mulq 8(%rsi)
    addq %rbx, %r12
    adcq $0, %rdx
    addq %rax, %r12
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    mulq 16(%rsi)
    addq %rbx, %r13
    adcq $0, %rdx
    addq %rax, %r13
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    mulq 24(%rsi)
    addq %rbx, %r8
    adcq $0, %rdx
    addq %rax, %r8
    adcq %rdx, %r9
    adcq $0, %r10

    movq %r11, %rbp
    movq %r11, %rax
    shlq $32, %r11 // r11 * 2^96 low
    mulq %r15 // r11 * 0xffffffff00000001
    shrq $32, %rbp // r11 * 2^96 high
    addq %r11, %r12
    adcq %rbp, %r13
    movq 16(%rcx), %rbp
    adcq %rax, %r8
    adcq %rdx, %r9
    movq %rbp, %rax
    adcq $0, %r10
    xorq %r11, %r11

    mulq (%rsi) // a[0] * b[2]
    addq %rax, %r12
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    mulq 8(%rsi)
    addq %rbx, %r13
    adcq $0, %rdx
    addq %rax, %r13
    movq %rbp, %rax
    adcq $0, %rdx
    movq %rdx, %rbx

    mulq 16(%rsi)
    addq %rbx, %r8
    adcq $0, %rdx
    addq %rax, %r8
    movq %rbp, %rax
    adcq $0, %rdx
    movq %rdx, %rbx

    mulq 24(%rsi)
    addq %rbx, %r9
    adcq $0, %rdx
    addq %rax, %r9
    adcq %rdx, %r10
    movq %r12, %rbp
    adcq $0, %r11

    movq %r12, %rax // third reduction
    shlq $32, %r12 // r12 * 2^96 low
    mulq %r15 // r12 * 0xffffffff00000001
    shrq $32, %rbp // r12 * 2^96 high
    addq %r12, %r13
    adcq %rbp, %r8
    movq 24(%rcx), %rbp
    adcq %rax, %r9
    adcq %rdx, %r10
    movq %rbp, %rax
    adcq $0, %r11
    xorq %r12, %r12

    mulq (%rsi) // a[0] * b[3]
    addq %rax, %r13
    adcq $0, %rdx
    movq %rdx, %rbx

    movq %rbp, %rax
    mulq 8(%rsi)
    addq %rbx, %r8
    adcq $0, %rdx
    addq %rax, %r8
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    mulq 16(%rsi)
    addq %rbx, %r9
    adcq $0, %rdx
    addq %rax, %r9
    movq %rbp, %rax
    adcq $0, %rdx
    movq %rdx, %rbx

    mulq 24(%rsi)
    addq %rbx, %r10
    adcq $0, %rdx
    addq %rax, %r10
    adcq %rdx, %r11
    adcq $0, %r12

    movq %r13, %rbp
    movq %r13, %rax // last reduction
    shlq $32, %r13 // r13 * 2^96 low
    mulq %r15 // r13 * 0xffffffff00000001
    shrq $32, %rbp // r13 * 2^96 high

    addq %r13, %r8
    adcq %rbp, %r9
    adcq %rax, %r10
    adcq %rdx, %r11
    movq %r8, %rbx
    movq %r9, %rbp
    adcq $0, %r12
    movq %r10, %rax
    movq %r11, %rdx
    subq $-1, %r8
    sbbq %r14, %r9
538 sbbq $0, %r10 539 sbbq %r15, %r11 540 sbbq $0, %r12 541 542 cmovcq %rbx, %r8 543 cmovcq %rbp, %r9 544 cmovcq %rax, %r10 545 movq %r8, (%rdi) 546 movq %r9, 8(%rdi) 547 cmovcq %rdx, %r11 548 movq %r10, 16(%rdi) 549 movq %r11, 24(%rdi) 550 551 ret 552.cfi_endproc 553.size ECP256_MulCore_q, .-ECP256_MulCore_q 554 555/** 556 * Function description: ECP256 Montgomery form 557 * Function prototype: void ECP256_ToMont(Coord *r, const Coord *a); 558 * Input register: 559 * rdi: pointer to the output Coord structure 560 * rsi: address pointing to input data a 561 * Change register: rax,rbx,rcx,rdx,rbp,r8-r13 562 * Output register: None 563 * Function/Macro invoking: This function can be implemented by calling ECP256_Mul. 564 */ 565.globl ECP256_ToMont 566.type ECP256_ToMont,@function 567.align 32 568ECP256_ToMont: 569.cfi_startproc 570 leaq .LrrModP(%rip),%rcx 571 pushq %rbx 572 pushq %rbp 573 pushq %r12 574 pushq %r13 575 pushq %r14 576 pushq %r15 577 578 movq .Lpoly+8(%rip), %r14 579 movq .Lpoly+24(%rip), %r15 580 call ECP256_MulCore_q 581 582 movq (%rsp), %r15 583 movq 8(%rsp), %r14 584 movq 16(%rsp), %r13 585 movq 24(%rsp), %r12 586 movq 32(%rsp), %rbp 587 movq 40(%rsp), %rbx 588 leaq 48(%rsp), %rsp 589 590 ret 591.cfi_endproc 592.size ECP256_ToMont, .-ECP256_ToMont 593 594/** 595 * Function description: ECP256 Montgomery form converted to normal form 596 * Function prototype: void ECP256_FromMont(Coord *r, const Coord *a); 597 * Input register: 598 * rdi: Pointer to the output Coord structure. 599 * rsi: Address pointing to input data a. 600 * Change register: rax,rcx,rdx,r8-r13 601 * Output register: None. 602 * Function/Macro Call: 603 */ 604.globl ECP256_FromMont 605.type ECP256_FromMont,@function 606.align 32 607ECP256_FromMont: 608.cfi_startproc 609 pushq %r12 610 pushq %r13 611 612 movq .Lpoly+8(%rip), %r12 613 movq .Lpoly+24(%rip), %r13 614 615 movq (%rsi), %r8 616 movq 8(%rsi), %r9 617 movq 16(%rsi), %r10 618 movq 24(%rsi), %r11 619 620 movq %r8, %rax 621 movq %r8, %rcx 622 shlq $32, %r8 623 mulq %r13 // 0xff * 0xff = 0xfe01 624 shrq $32, %rcx 625 addq %r8, %r9 626 adcq %rcx, %r10 627 movq %r9, %rcx 628 adcq %rax, %r11 629 adcq $0, %rdx // rdx + 1 <= 0xff 630 movq %r9, %rax 631 movq %rdx, %r8 632 633 shlq $32, %r9 634 mulq %r13 635 shrq $32, %rcx 636 addq %r9, %r10 637 adcq %rcx, %r11 638 movq %r10, %rcx 639 adcq %rax, %r8 640 adcq $0, %rdx 641 movq %r10, %rax 642 movq %rdx, %r9 643 644 shlq $32, %r10 645 mulq %r13 646 shrq $32, %rcx 647 addq %r10, %r11 648 adcq %rcx, %r8 649 movq %r11, %rcx 650 adcq %rax, %r9 651 adcq $0, %rdx 652 movq %r11, %rax 653 movq %rdx, %r10 654 655 shlq $32, %r11 656 mulq %r13 657 shrq $32, %rcx 658 addq %r11, %r8 659 adcq %rcx, %r9 660 movq %r8, %rsi 661 adcq %rax, %r10 662 adcq $0, %rdx 663 movq %rdx, %r11 // r8 r9 r10 r11 664 665 movq %r9, %rdx 666 subq $-1, %r8 667 movq %r10, %rcx 668 sbbq %r12, %r9 669 movq %r11, %rax 670 sbbq $0, %r10 671 sbbq %r13, %r11 672 673 cmovcq %rsi, %r8 // < P 674 cmovcq %rdx, %r9 675 movq %r8, (%rdi) 676 cmovcq %rcx, %r10 677 movq %r9, 8(%rdi) 678 cmovcq %rax, %r11 679 movq %r10, 16(%rdi) 680 movq %r11, 24(%rdi) 681 682 movq (%rsp), %r13 683 movq 8(%rsp), %r12 684 leaq 16(%rsp), %rsp 685 ret 686.cfi_endproc 687.size ECP256_FromMont, .-ECP256_FromMont 688 689/** 690 * Function description: Multiplication of the ECP256 field:res = a*b*2^-256 mod P 691 * Function prototype: void ECP256_Sqr(Coord *r, const Coord *a); 692 * Input register: 693 * rdi: pointer to the output Coord structure 694 * rsi: address pointing to input data 
a 695 * Change register: rax, rbx, rcx, rdx, rsi, rdi, rbp, r8, r9, r10, r11, r12, r13, r14, r15 696 * Output register: None 697 * Function/Macro Call: Multiplication can be implemented by calling ECP256_SqrCore_q. 698 */ 699.globl ECP256_Sqr 700.type ECP256_Sqr,@function 701.align 32 702ECP256_Sqr: 703.cfi_startproc 704 pushq %rbx 705 pushq %rbp 706 pushq %r12 707 pushq %r13 708 pushq %r14 709 pushq %r15 710 711 nop // add this instruction to improve performance, movq %rsi, %rdx is ok 712 movq (%rsi), %rax // a[0] 713 movq 8(%rsi), %r14 // a[1] 714 movq 16(%rsi), %rbp // a[2] 715 movq 24(%rsi), %r15 // a[3] 716 call ECP256_SqrCore_q 717 718 movq (%rsp), %r15 719 movq 8(%rsp), %r14 720 movq 16(%rsp), %r13 721 movq 24(%rsp), %r12 722 movq 32(%rsp), %rbp 723 movq 40(%rsp), %rbx 724 leaq 48(%rsp), %rsp 725 ret 726.cfi_endproc 727.size ECP256_Sqr, .-ECP256_Sqr 728 729/** 730 * Function description: Montgomery square of the ECP256 field 731 * Input register: 732 * rdi: Return address 733 * rsi: Factor address 734 * Change register: rax, rbx, rcx, rdx, rbp, rsi, r8-r15 735 * Output register: None 736 */ 737.type ECP256_SqrCore_q,@function 738.align 32 739ECP256_SqrCore_q: 740.cfi_startproc 741 movq %rax, %r8 742 mulq %r14 // rdx:rax = a[0] * a[1] 743 xorq %rcx, %rcx 744 movq %rax, %r9 // r9 = rax 745 xorq %r11, %r11 746 xorq %r12, %r12 747 movq %r8, %rax 748 xorq %r13, %r13 749 xorq %rbx, %rbx 750 movq %rdx, %r10 // r10:r9 = a[0] * a[1] 751 752 mulq %rbp // rdx:rax = a[0] * a[2] 753 addq %rax, %r10 // r10 += rax 754 movq %r8, %rax // a[0] --> rax 755 adcq %rdx, %r11 // a[0] * (a[2] * 2^64 + a[1]) < 2^196, no overflow 756 757 mulq %r15 // rdx:rax = a[0] * a[3] 758 addq %rax, %r11 // r11 += rax 759 movq %r14, %rax // a[0] --> rax 760 adcq %rdx, %r12 // a[0] * (a[3] * 2^128 + a[2] * 2^64 + a[1]) < 2^256, no overflow 761 762 mulq %rbp // rdx:rax = a[1] * a[2] 763 addq %rax, %r11 764 movq %r14, %rax 765 adcq %rdx, %r12 766 adcq $0, %r13 767 768 mulq %r15 // rdx:rax = a[1] * a[3] 769 addq %rax, %r12 770 movq %rbp, %rax 771 adcq %rdx, %r13 772 adcq $0, %rbx 773 774 mulq %r15 // rdx:rax = a[2] * a[3] 775 addq %rax, %r13 776 adcq %rdx, %rbx // rbx not overflow 777 778 movq %r8, %rax 779 addq %r9, %r9 // twice 780 adcq %r10, %r10 781 adcq %r11, %r11 782 adcq %r12, %r12 783 adcq %r13, %r13 784 adcq %rbx, %rbx 785 adcq $0, %rcx 786 787 mulq %rax // rdx:rax = a[0] * a[0] 788 movq %rax, %r8 789 movq %r14, %rax 790 movq %rdx, %rsi 791 792 mulq %rax // rdx:rax = a[1] * a[1] 793 addq %rsi, %r9 794 adcq %rax, %r10 795 movq %rbp, %rax 796 adcq $0, %rdx 797 movq %rdx, %rsi 798 799 mulq %rax // rdx:rax = a[2] * a[2] 800 addq %rsi, %r11 801 adcq %rax, %r12 802 movq %r15, %rax 803 adcq $0, %rdx 804 movq %rdx, %rsi 805 806 mulq %rax // rdx:rax = a[3] * a[3] 807 addq %rsi, %r13 808 adcq %rax, %rbx 809 movq %r8, %rax 810 adcq %rdx, %rcx // rcx not overflow 811 812 movq .Lpoly+8(%rip), %r14 813 movq .Lpoly+24(%rip), %r15 814 movq %r8, %rbp // First reduction 815 shlq $32, %r8 // l32[r8 << 96] 816 mulq %r15 817 shrq $32, %rbp // h32[r8 << 96] 818 addq %r8, %r9 819 adcq %rbp, %r10 820 adcq %rax, %r11 821 adcq $0, %rdx 822 movq %r9, %rax 823 movq %r9, %rbp 824 movq %rdx, %r8 // r8 r11 r10 r9 0 825 826 shlq $32, %r9 // Second reduction 827 mulq %r15 828 shrq $32, %rbp 829 addq %r9, %r10 830 adcq %rbp, %r11 831 adcq %rax, %r8 832 adcq $0, %rdx 833 movq %r10, %rax 834 movq %r10, %rbp 835 movq %rdx, %r9 // r9 r8 r11 r10 0 836 837 shlq $32, %r10 // Third reduction 838 mulq %r15 839 shrq $32, %rbp 840 addq %r10, %r11 841 adcq 
%rbp, %r8 842 adcq %rax, %r9 843 adcq $0, %rdx 844 movq %r11, %rax 845 movq %r11, %rbp 846 movq %rdx, %r10 // r10 r9 r8 r11 0 847 848 shlq $32, %r11 // Last reduction 849 mulq %r15 850 shrq $32, %rbp 851 addq %r11, %r8 852 adcq %rbp, %r9 853 adcq %rax, %r10 854 adcq $0, %rdx // rdx r10 r9 r8 0 855 856 xorq %rsi, %rsi // Add the reduction result 857 addq %r8, %r12 858 adcq %r9, %r13 859 movq %r12, %r8 860 adcq %r10, %rbx 861 adcq %rdx, %rcx 862 movq %r13, %r9 863 adcq $0, %rsi // Reserve carry value 864 865 subq $-1, %r8 866 movq %rbx, %r10 867 sbbq %r14, %r9 868 sbbq $0, %r10 869 movq %rcx, %r11 870 sbbq %r15, %r11 871 sbbq $0, %rsi 872 873 cmovcq %r12, %r8 874 cmovcq %r13, %r9 875 movq %r8, (%rdi) 876 cmovcq %rbx, %r10 877 movq %r9, 8(%rdi) 878 cmovcq %rcx, %r11 879 movq %r10, 16(%rdi) 880 movq %r11, 24(%rdi) 881 882 ret 883.cfi_endproc 884.size ECP256_SqrCore_q, .-ECP256_SqrCore_q 885 886/** 887 * Function description: Multiplication of the ECP256 field: res = a*b*2^-256 mod Order(P) 888 * Function prototype: void ECP256_OrdSqr(Coord *r, const Coord *a, int32_t repeat); 889 * Input register: 890 * rdi: Pointer to the output Coord structure 891 * rsi: Address pointing to input data a 892 * rdx:Repeat 893 * Change register: rax, rbx, rcx, rdx, rsi, rbp, r8, r9, r10, r11, r12, r13, r14, r15 894 * Output register: None 895 * Function/Macro Call: 896 */ 897.globl ECP256_OrdSqr 898.type ECP256_OrdSqr,@function 899.align 32 900ECP256_OrdSqr: 901.cfi_startproc 902 pushq %rbx 903 pushq %rbp 904 pushq %r12 905 pushq %r13 906 pushq %r14 907 pushq %r15 908 909 movq (%rsi), %r8 910 movq 8(%rsi), %rax 911 movq 16(%rsi), %r14 912 movq 24(%rsi), %r15 913 leaq .Lord(%rip), %rbp // ptr(N) --> rbp 914 movq %rdx, %rbx 915.align 32 916.Lord_sqr_loop: 917 movq %rax, %rsi 918 mulq %r8 // rdx:rax = acc[0] * acc[1] 919 movq %rax, %r9 // r9 = rax 920 vmovq %rsi, %xmm1 // save acc[1] -> xmm1 921 movq %r14, %rax // acc[2] --> rax 922 movq %rdx, %r10 // r10:r9 = acc[0] * acc[1] 923 924 mulq %r8 // rdx:rax = acc[0] * acc[2] 925 addq %rax, %r10 // r10 += rax 926 vmovq %r14, %xmm2 // save acc[2] -> xmm2 927 adcq $0, %rdx // acc[0] * (acc[2] * 2^64 + acc[1]) < 2^196, no overflow 928 movq %r15, %rax // acc[3] --> rax 929 movq %rdx, %r11 930 931 mulq %r8 // rdx:rax = a[0] * a[3] 932 addq %rax, %r11 // r11 += rax 933 vmovq %r15, %xmm3 // Save acc[3] -> xmm3 934 adcq $0, %rdx // acc[0] * (acc[3] * 2^128 + acc[2] * 2^64 + acc[1]) < 2^256, no overflow 935 movq %r15, %rax // acc[1] --> rax 936 movq %rdx, %r12 937 938 mulq %r14 939 movq %rax, %r13 940 movq %r14, %rax 941 movq %rdx, %r14 942 943 mulq %rsi 944 addq %rax, %r11 945 movq %r15, %rax 946 adcq $0, %rdx 947 movq %rdx, %r15 948 949 mulq %rsi 950 addq %rax, %r12 951 adcq $0, %rdx 952 addq %r15, %r12 953 adcq %rdx, %r13 954 movq %r8, %rax // acc[0] --> rax 955 adcq $0, %r14 // r14 r13 r12 r11 r10 r9 956 957 xorq %r15, %r15 // 0 --> r15 958 addq %r9, %r9 // twice 959 adcq %r10, %r10 960 adcq %r11, %r11 961 adcq %r12, %r12 962 adcq %r13, %r13 963 adcq %r14, %r14 964 adcq $0, %r15 // result: r15 r14 r13 r12 r11 r10 r9 965 966 mulq %rax // rdx:rax = acc[0] * acc[0] 967 movq %rax, %r8 // rax --> r8 968 vmovq %xmm1, %rax // acc[1] --> rax 969 movq %rdx, %rcx // save rdx to rcx 970 971 mulq %rax // rdx:rax = acc[1] * acc[1] 972 addq %rcx, %r9 // r9 += rcx 973 adcq %rax, %r10 // r10 += rax 974 adcq $0, %rdx // no overflow 975 vmovq %xmm2, %rax // acc[2] --> rax 976 movq %rdx, %rcx // save rdx to rcx 977 978 mulq %rax // rdx:rax = a[2] * a[2] 979 addq %rcx, %r11 // r11 += rcx 980 
    adcq %rax, %r12 // r12 += rax
    movq %r8, %rsi // acc[0] --> rsi
    adcq $0, %rdx // no overflow
    vmovq %xmm3, %rax // acc[3] --> rax
    movq %rdx, %rcx // save rdx to rcx

    imulq 32(%rbp), %r8 // m = acc[0] * LordK (mod 2^64) --> r8

    mulq %rax // rdx:rax = a[3] * a[3]
    addq %rcx, %r13 // r13 += rcx
    adcq %rax, %r14 // r14 += rax
    movq (%rbp), %rax // N[0] --> rax
    adcq %rdx, %r15 // r15 does not overflow

    /* Result acc[7:0] = r15 r14 r13 r12 r11 r10 r9 r8 */
    /* The first reduction */
    mulq %r8 // rdx:rax = m * N[0]
    addq %rax, %rsi // rsi = 0
    movq %r8, %rcx // m --> rcx
    adcq %rdx, %rsi // rsi = rdx + carry --> acc[1]

    subq %r8, %r10 // acc[2] - m
    sbbq $0, %rcx // m - borrow, to acc[3]

    movq 8(%rbp), %rax // N[1] --> rax
    mulq %r8 // rdx:rax = m * N[1]
    addq %rsi, %r9 // acc[1] += high[m * N[0]]
    adcq $0, %rdx // save carry
    addq %rax, %r9 // acc[1] += low[m * N[1]]
    movq %r8, %rax // m --> rax
    adcq %rdx, %r10 // acc[2] += high[m * N[1]]
    movq %r9, %rsi // acc[1] --> rsi
    movq %r8, %rdx // m --> rdx
    adcq $0, %rcx // m - borrow + carry, to acc[3]

    imulq 32(%rbp), %r9 // m = acc[1] * LordK --> r9

    shlq $32, %rax // low(m << 32) --> rax, low part of m * 2^224
    shrq $32, %rdx // high(m << 32) --> rdx, high part of m * 2^224
    subq %rax, %r11 // acc[3] - low(m * 2^224)
    movq (%rbp), %rax // N[0] --> rax
    sbbq %rdx, %r8 // m - high(m * 2^224), to acc[4]

    addq %rcx, %r11 // acc[3] += m + carry - borrow
    adcq $0, %r8 // to acc[4]

    /* Second reduction */
    mulq %r9 // rdx:rax = m * N[0]
    addq %rax, %rsi // acc[1] += rax --> 0
    movq %r9, %rcx // m --> rcx
    adcq %rdx, %rsi // rsi = high[m * N[0]] + carry --> acc[2]

    movq 8(%rbp), %rax // N[1] --> rax
    subq %r9, %r11 // acc[3] -= m
    sbbq $0, %rcx // m - borrow --> rcx

    mulq %r9 // rdx:rax = m * N[1]
    addq %rsi, %r10 // acc[2] += high[m * N[0]] + carry
    adcq $0, %rdx // rdx += carry, no overflow
    addq %rax, %r10 // acc[2] += rax
    movq %r9, %rax // m --> rax
    adcq %rdx, %r11 // acc[3] += rdx
    movq %r10, %rsi // acc[2] --> rsi
    movq %r9, %rdx // m --> rdx
    adcq $0, %rcx // m - borrow + carry --> rcx

    imulq 32(%rbp), %r10 // m = acc[2] * LordK --> r10

    shlq $32, %rax // low(m * 2^224)
    shrq $32, %rdx // high(m * 2^224)
    subq %rax, %r8 // to acc[4]: - low(m * 2^224)
    movq (%rbp), %rax // N[0] --> rax
    sbbq %rdx, %r9 // m - high(m * 2^224), to acc[5]

    addq %rcx, %r8 // to acc[4]
    adcq $0, %r9 // to acc[5]

    /* Third reduction */
    mulq %r10 // rdx:rax = m * N[0]
    movq %r10, %rcx // m --> rcx
    addq %rax, %rsi // acc[2] += rax --> 0
    adcq %rdx, %rsi // rsi = high[m * N[0]] + carry --> acc[3]
    movq 8(%rbp), %rax // N[1] --> rax
    subq %r10, %r8 // to acc[4]: -= m
    sbbq $0, %rcx // m - borrow --> rcx

    mulq %r10 // rdx:rax = m * N[1]
    addq %rsi, %r11 // acc[3] += high[m * N[0]] + carry
    adcq $0, %rdx // rdx += carry, no overflow
    addq %rax, %r11 // acc[3] += rax
    movq %r10, %rax // m --> rax
    adcq %rdx, %r8 // to acc[4]: += rdx
    movq %r11, %rsi // acc[3] --> rsi
    movq %r10, %rdx // m --> rdx
    adcq $0, %rcx // m - borrow + carry --> rcx

    imulq 32(%rbp), %r11 // m = acc[3] * LordK --> r11

    shlq $32, %rax // low(m * 2^224)
    shrq $32, %rdx // high(m * 2^224)
    subq %rax, %r9 // to acc[5]: - low(m * 2^224)
    sbbq %rdx, %r10 // to acc[6]: m - high(m * 2^224)
    movq (%rbp), %rax // N[0] --> rax
    addq %rcx, %r9 // to acc[5]
    adcq $0, %r10 // to acc[6]

    /* Last reduction */
    mulq %r11 // rdx:rax = m * N[0]
    addq %rax, %rsi // acc[3] += rax --> 0
    movq %r11, %rcx // m --> rcx
    adcq %rdx, %rsi // rsi = high[m * N[0]] + carry --> acc[4]
    movq 8(%rbp), %rax // N[1] --> rax
    subq %r11, %r9 // to acc[5]: -= m
    sbbq $0, %rcx // to acc[6]: m - borrow

    mulq %r11 // rdx:rax = m * N[1]
    addq %rsi, %r8 // to acc[4]: += high[m * N[0]] + carry
    adcq $0, %rdx // rdx += carry, no overflow
    addq %rax, %r8 // to acc[4]: += rax
    movq %r11, %rax // m --> rax
    adcq %rdx, %r9 // to acc[5]: += rdx
    movq %r11, %rdx // m --> rdx
    adcq $0, %rcx // m - borrow + carry --> rcx

    shlq $32, %rax // low(m * 2^224)
    shrq $32, %rdx // high(m * 2^224)

    subq %rax, %r10 // to acc[6]: - low(m * 2^224)
    sbbq %rdx, %r11 // to acc[7]: m - high(m * 2^224)

    addq %rcx, %r10 // to acc[6]
    adcq $0, %r11 // to acc[7]

    /* r15 r14 r13 r12 + r11 r10 r9 r8 */
    xorq %rdx, %rdx
    addq %r12, %r8
    adcq %r13, %r9
    movq %r8, %r12
    adcq %r14, %r10
    adcq %r15, %r11
    movq %r9, %rax
    adcq $0, %rdx

    subq (%rbp), %r8
    movq %r10, %r14
    sbbq 8(%rbp), %r9
    sbbq 16(%rbp), %r10
    movq %r11, %r15
    sbbq 24(%rbp), %r11
    sbbq $0, %rdx

    cmovcq %r12, %r8
    cmovncq %r9, %rax
    cmovncq %r10, %r14
    cmovncq %r11, %r15 // r8 rax r14 r15

    decq %rbx
    jnz .Lord_sqr_loop

    movq %r8, (%rdi)
    movq %rax, 8(%rdi)
    vpxor %xmm2, %xmm2, %xmm2
    movq %r14, 16(%rdi)
    vpxor %xmm3, %xmm3, %xmm3
    movq %r15, 24(%rdi)
    vpxor %xmm1, %xmm1, %xmm1

    movq (%rsp), %r15
    movq 8(%rsp), %r14
    movq 16(%rsp), %r13
    movq 24(%rsp), %r12
    movq 32(%rsp), %rbp
    movq 40(%rsp), %rbx
    leaq 48(%rsp), %rsp

    ret
.cfi_endproc
.size ECP256_OrdSqr, .-ECP256_OrdSqr

/**
 * Function description: Halving in the ECP256 field: res = a/2 mod P
 * Input register:
 *     rdi: Pointer to the output Coord structure
 *     r8: a[0]
 *     r9: a[1]
 *     r10: a[2]
 *     r11: a[3]
 *     r14: P[1]
 *     r15: P[3]
 * Change register: rax, rcx, rdx, rsi, r8, r9, r10, r11, r12, r13
 * Output register: r8, r9, r10, r11
 */
.type ECP256_DivBy2Core,@function
ECP256_DivBy2Core:
.cfi_startproc
    xorq %r13, %r13
    movq %r8, %rax
    movq %r9, %rcx

    addq $-1, %r8
    movq %r10, %rdx
    adcq %r14, %r9
    movq %r11, %r12
    adcq $0, %r10
    adcq %r15, %r11
    adcq $0, %r13
    xorq %rsi, %rsi

    testq $1, %rax
    cmovzq %rax, %r8
    cmovzq %rcx, %r9
    cmovzq %rdx, %r10
    cmovzq %r12, %r11
    movq %r9, %rcx
    cmovzq %rsi, %r13
    movq %r10, %rdx
    movq %r11, %r12

    shrq $1, %r8
    shlq $63, %rcx
    shrq $1, %r9
    shlq $63, %rdx
    shrq $1, %r10
    orq %rcx, %r8
    shlq $63, %r12

    shrq $1, %r11
    shlq $63, %r13
    orq %rdx, %r9
    orq %r12, %r10
    orq %r13, %r11

    movq %r8, (%rdi)
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)
    ret
.cfi_endproc
.size ECP256_DivBy2Core, .-ECP256_DivBy2Core

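/*
 * Halving trick used above: P is odd, so if a is odd then a + P is even. The code therefore
 * computes a + P unconditionally, keeps it only when bit 0 of a is set, and then shifts the
 * (up to 257-bit) value right by one, pulling the saved carry in r13 into bit 255 of the result.
 */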
/**
 * Function description: Halving in the ECP256 field: res = a/2 mod P
 * Function prototype: void ECP256_DivBy2(Coord *r, const Coord *a);
 * Input register:
 *     rdi: Pointer to the output Coord structure
 *     rsi: Address pointing to input data a
 * Change register: rax, rcx, rdx, rsi, r8, r9, r10, r11, r12, r13, r14, r15
 * Output register: None
 * Function/macro call: Calls ECP256_DivBy2Core to implement the halving.
 */
.globl ECP256_DivBy2
.type ECP256_DivBy2, @function
ECP256_DivBy2:
.cfi_startproc
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15

    movq (%rsi), %r8
    movq 8(%rsi), %r9
    movq 16(%rsi), %r10
    movq 24(%rsi), %r11
    movq 8+.Lpoly(%rip), %r14
    movq 24+.Lpoly(%rip), %r15
    call ECP256_DivBy2Core

    movq (%rsp), %r15
    movq 8(%rsp), %r14
    movq 16(%rsp), %r13
    movq 24(%rsp), %r12
    leaq 32(%rsp), %rsp
    ret
.cfi_endproc
.size ECP256_DivBy2, .-ECP256_DivBy2

/* r14 = .Lpoly[1], r15 = .Lpoly[3] */
.type ECP256_MulBy2Core,@function
.align 32
ECP256_MulBy2Core:
.cfi_startproc
    xorq %r13, %r13

    addq %r8, %r8
    adcq %r9, %r9
    movq %r8, %rax
    adcq %r10, %r10
    adcq %r11, %r11
    movq %r9, %rcx
    adcq $0, %r13

    subq $-1, %r8
    movq %r10, %rdx
    sbbq %r14, %r9
    movq %r11, %r12
    sbbq $0, %r10
    sbbq %r15, %r11
    sbbq $0, %r13

    cmovcq %rax, %r8 // Obtain mod P result
    cmovcq %rcx, %r9
    movq %r8, (%rdi)
    cmovcq %rdx, %r10
    movq %r9, 8(%rdi)
    cmovcq %r12, %r11
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)
    ret
.cfi_endproc
.size ECP256_MulBy2Core, .-ECP256_MulBy2Core

.globl ECP256_MulBy2
.type ECP256_MulBy2,@function
.align 32
ECP256_MulBy2:
.cfi_startproc
    pushq %r12
    pushq %r13

    movq (%rsi), %r8
    movq 8(%rsi), %r9
    movq 16(%rsi), %r10
    movq 24(%rsi), %r11
    xorq %r13, %r13

    leaq .Lpoly(%rip), %rsi

    addq %r8, %r8
    adcq %r9, %r9
    movq %r8, %rax
    adcq %r10, %r10
    adcq %r11, %r11
    movq %r9, %rcx
    adcq $0, %r13

    subq $-1, %r8
    movq %r10, %rdx
    sbbq 8(%rsi), %r9
    movq %r11, %r12
    sbbq $0, %r10
    sbbq 24(%rsi), %r11
    sbbq $0, %r13

    cmovcq %rax, %r8 // Obtain mod P result
    cmovcq %rcx, %r9
    movq %r8, (%rdi)
    cmovcq %rdx, %r10
    movq %r9, 8(%rdi)
    cmovcq %r12, %r11
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)

    movq 0(%rsp), %r13
    movq 8(%rsp), %r12
    leaq 16(%rsp), %rsp
    ret
.cfi_endproc
.size ECP256_MulBy2, .-ECP256_MulBy2

/* r14 = .Lpoly[1], r15 = .Lpoly[3] */
.type ECP256_MulBy3Core,@function
.align 32
ECP256_MulBy3Core:
.cfi_startproc
    xorq %r13, %r13
    addq %r8, %r8
    adcq %r9, %r9
    movq %r8, %rax
    adcq %r10, %r10
    adcq %r11, %r11
    movq %r9, %rcx
    adcq $0, %r13
    subq $-1, %r8
    movq %r10, %rdx
    sbbq %r14, %r9
    movq %r11, %r12
    sbbq $0, %r10
    sbbq %r15, %r11
    sbbq $0, %r13

    cmovcq %rax, %r8 // Obtain mod P result
    cmovcq %rcx, %r9
    cmovcq %rdx, %r10
    cmovcq %r12, %r11

    xorq %r13, %r13
    addq (%rsi), %r8
    adcq 8(%rsi), %r9
    movq %r8, %rax
    adcq 16(%rsi), %r10
    adcq 24(%rsi), %r11
    movq %r9, %rcx
    adcq $0, %r13
    subq $-1, %r8
    movq %r10, %rdx
    sbbq %r14, %r9
    sbbq $0, %r10
    movq %r11, %r12
    sbbq %r15, %r11
    sbbq $0, %r13

    cmovcq %rax, %r8 // Obtain mod P result
    cmovcq %rcx, %r9
    movq %r8, (%rdi)
    cmovcq %rdx, %r10
    movq %r9, 8(%rdi)
    cmovcq %r12, %r11
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)
    ret
.cfi_endproc
.size ECP256_MulBy3Core, .-ECP256_MulBy3Core

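/*
 * ECP256_MulBy3Core above computes 3*a mod P as (2*a mod P) + a mod P; it reloads the
 * original a from (%rsi) for the second addition, so rsi must still point at the input
 * when it is called.
 */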
.globl ECP256_MulBy3
.type ECP256_MulBy3,@function
.align 32
ECP256_MulBy3:
.cfi_startproc
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15

    movq (%rsi), %r8
    movq 8(%rsi), %r9
    movq 16(%rsi), %r10
    movq 24(%rsi), %r11
    movq 8+.Lpoly(%rip), %r14
    movq 24+.Lpoly(%rip), %r15
    call ECP256_MulBy3Core

    movq (%rsp), %r15
    movq 8(%rsp), %r14
    movq 16(%rsp), %r13
    movq 24(%rsp), %r12
    leaq 32(%rsp), %rsp
    ret
.cfi_endproc
.size ECP256_MulBy3, .-ECP256_MulBy3

/**
 * Function description: Montgomery multiplication modulo the order of P-256:
 *     res = a * b * 2^(-256) mod ord(P)
 * Input register:
 *     rdi: Pointer to the output Coord structure
 *     rsi: Address pointing to input data a
 *     rdx: Address pointing to input data b
 * Change register: rax, rbx, rcx, rdx, rbp, r8, r9, r10, r11, r12, r13, r14, r15
 * Function/Macro invoking: None
 */
.globl ECP256_OrdMul
.type ECP256_OrdMul,@function
.align 32
ECP256_OrdMul:
.cfi_startproc
    pushq %rbx
    pushq %rbp
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15

    movq %rdx, %rcx // rdx is clobbered by mul,
                    // so the address of b is kept in rcx.
    leaq .Lord(%rip), %r15
    movq .LordK(%rip), %r14
    movq (%rdx), %rax // b[0]

    // a[0-3] * b[0]
    movq %rax, %rbp // save b[0]
    mulq (%rsi) // a[0] * b[0]
    movq %rax, %r8
    movq %rbp, %rax // b[0]
    movq %rdx, %r9

    mulq 8(%rsi) // a[1] * b[0]
    addq %rax, %r9
    movq %rbp, %rax
    adcq $0, %rdx // a[1:0] * b[0] < 2^192,
                  // so no further carry is required.
    movq %rdx, %r10

    mulq 16(%rsi) // a[2] * b[0]
    addq %rax, %r10
    movq %rbp, %rax
    adcq $0, %rdx
    movq %rdx, %r11

    mulq 24(%rsi) // a[3] * b[0]
    addq %rax, %r11
    adcq $0, %rdx
    movq %r8, %r13
    movq %rdx, %r12 // First round multiplication results: r12 r11 r10 r9 r8

    // First round of reduction
    // n[0] = 0xf3b9cac2fc632551
    // n[1] = 0xbce6faada7179e84
    // n[2] = 0xffffffffffffffff, lo(q*n[2]) = -q, hi(q*n[2]) = q
    // n[3] = 0xffffffff00000000, lo(q*n[3]) = -lq<<32, hi(q*n[3]) = q-hq
    imulq %r14, %r8 // r8 = r8 * ordK = q

    movq %r8, %rax
    mulq (%r15) // n[0] * q
    addq %rax, %r13 // %r13 must be 0.
    adcq $0, %rdx // hi(n[0]*q) + 1 < 2^32 + 1 < 2^64, no further carry is required.
    movq %r8, %rax
    movq %rdx, %rbp // %rbp = hi(n[0]*q)

    mulq 8(%r15) // n[1] * q
    addq %rbp, %rax // %rax = lo(n[1]*q) + hi(n[0]*q)
    adcq $0, %rdx // %rdx = hi(n[1]*q); as above, hi(n[1]*q) + 1 < 2^64,
                  // so no further carry is required.

    movq %r8, %rbx
    subq %r8, %r10 // r10 = r[2] - q
    sbbq $0, %rbx // When q > 0, rbx - 1 = q - 1 >= 0; when q = 0, (r[2] - q) does not borrow
                  // and rbx = rbx - 0 >= 0, so the following formula does not borrow.

    addq %rax, %r9 // r9 = r[1] + lo(n[1]*q) + hi(n[0]*q)
    adcq %rdx, %r10 // r10 = r[2] - q + hi(n[1]*q)
    movq %r8, %rax
    adcq $0, %rbx // Overflow is not possible.

    movq %r8, %rdx
    shrq $32, %rax // rax = hq
    shlq $32, %rdx // rdx = lq<<32

    subq %rdx, %rbx // q - lq<<32
    sbbq %rax, %r8 // r8 = q - hq = hq * 2^32 + lq - hq >= lq;
                   // when lq != 0, the following formula does not borrow,
                   // and when lq == 0, the preceding formula does not borrow.

    adcq %rbx, %r11
    movq 8(%rcx), %rax
    adcq %r8, %r12
    movq %rax, %rbp
    adcq $0, %r13

    // a[0-3] * b[1]
    mulq (%rsi)
    addq %rax, %r9
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    mulq 8(%rsi)
    addq %rbx, %r10
    adcq $0, %rdx
    addq %rax, %r10
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    mulq 16(%rsi)
    addq %rbx, %r11
    adcq $0, %rdx
    addq %rax, %r11
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    xorq %r8, %r8 // r8 = 0
    mulq 24(%rsi)
    addq %rbx, %r12
    adcq $0, %rdx
    addq %rax, %r12
    adcq %rdx, %r13
    movq %r9, %rbx
    adcq $0, %r8 // Second round multiplication results: r9, r10, r11, r12, r13, r8

    // Second round of reduction
    imulq %r14, %r9 // r9 = r9 * ordK = q
    movq %r9, %rax

    mulq (%r15) // n[0] * q
    addq %rax, %rbx // %rbx must be 0
    adcq $0, %rdx // hi(n[0]*q) + 1 < 2^32 + 1 < 2^64, no further carry is required
    movq %r9, %rax
    movq %rdx, %rbp // %rbp = hi(n[0]*q)

    mulq 8(%r15) // n[1] * q
    addq %rbp, %rax // %rax = lo(n[1]*q) + hi(n[0]*q)
    adcq $0, %rdx // %rdx = hi(n[1]*q); as above, hi(n[1]*q) + 1 < 2^64,
                  // so no further carry is required

    movq %r9, %rbx
    subq %r9, %r11 // r11 = r[2] - q
    sbbq $0, %rbx // When q > 0, rbx - 1 = q - 1 >= 0; when q = 0, (r[2] - q) does not borrow
                  // and rbx = rbx - 0 >= 0, so the following formula does not borrow

    addq %rax, %r10 // r10 = r[1] + lo(n[1]*q) + hi(n[0]*q)
    adcq %rdx, %r11 // r11 = r[2] - q + hi(n[1]*q)
    movq %r9, %rax
    adcq $0, %rbx // Overflow is not possible.

    movq %r9, %rdx
    shrq $32, %rax // rax = hq
    shlq $32, %rdx // rdx = lq<<32

    subq %rdx, %rbx // q - lq<<32
    sbbq %rax, %r9 // r9 = q - hq = hq * 2^32 + lq - hq >= lq;
                   // when lq != 0, the following formula does not borrow,
                   // and when lq == 0, the preceding formula does not borrow.

    movq 16(%rcx), %rax
    adcq %rbx, %r12
    adcq %r9, %r13
    adcq $0, %r8

    // a[0-3] * b[2]
    movq %rax, %rbp
    mulq (%rsi) // a[0] * b[2]
    addq %rax, %r10
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    mulq 8(%rsi)
    addq %rbx, %r11
    adcq $0, %rdx
    addq %rax, %r11
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    mulq 16(%rsi)
    addq %rbx, %r12
    adcq $0, %rdx
    addq %rax, %r12
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    xorq %r9, %r9
    mulq 24(%rsi)
    addq %rbx, %r13
    adcq $0, %rdx
    addq %rax, %r13
    adcq %rdx, %r8
    movq %r10, %rbx
    adcq $0, %r9

    // Third round of reduction
    imulq %r14, %r10 // r10 = r10 * ordK = q

    movq %r10, %rax
    mulq (%r15) // n[0] * q
    addq %rax, %rbx // %rbx must be 0
    adcq $0, %rdx // hi(n[0]*q) + 1 < 2^32 + 1 < 2^64, no further carry is required.
    movq %r10, %rax
    movq %rdx, %rbp // %rbp = hi(n[0]*q)

    mulq 8(%r15) // n[1] * q
    addq %rbp, %rax // %rax = lo(n[1]*q) + hi(n[0]*q)
    adcq $0, %rdx // %rdx = hi(n[1]*q); as above, hi(n[1]*q) + 1 < 2^64, no further carry is required.

    movq %r10, %rbx
    subq %r10, %r12 // r12 = r[2] - q
    sbbq $0, %rbx // When q > 0, rbx - 1 = q - 1 >= 0; when q = 0, (r[2] - q) does not borrow
                  // and rbx = rbx - 0 >= 0, so the following formula does not borrow.

    addq %rax, %r11 // r11 = r[1] + lo(n[1]*q) + hi(n[0]*q)
    adcq %rdx, %r12 // r12 = r[2] - q + hi(n[1]*q)
    movq %r10, %rax
    adcq $0, %rbx // Overflow is not possible.

    movq %r10, %rdx
    shrq $32, %rax // rax = hq
    shlq $32, %rdx // rdx = lq<<32

    subq %rdx, %rbx // q - lq<<32
    sbbq %rax, %r10 // r10 = q - hq = hq * 2^32 + lq - hq >= lq;
                    // when lq != 0, the following formula does not borrow,
                    // and when lq == 0, the preceding formula does not borrow.

    movq 24(%rcx), %rax
    adcq %rbx, %r13
    adcq %r10, %r8
    adcq $0, %r9

    // a[0-3] * b[3]
    movq %rax, %rbp
    mulq (%rsi) // a[0] * b[3]
    addq %rax, %r11
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    mulq 8(%rsi)
    addq %rbx, %r12
    adcq $0, %rdx
    addq %rax, %r12
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    mulq 16(%rsi)
    addq %rbx, %r13
    adcq $0, %rdx
    addq %rax, %r13
    adcq $0, %rdx
    movq %rbp, %rax
    movq %rdx, %rbx

    xorq %r10, %r10
    mulq 24(%rsi)
    addq %rbx, %r8
    adcq $0, %rdx
    addq %rax, %r8
    adcq %rdx, %r9
    movq %r11, %rbx
    adcq $0, %r10

    // Last round of reduction
    imulq %r14, %r11 // r11 = r11 * ordK = q

    movq %r11, %rax
    mulq (%r15) // n[0] * q
    addq %rax, %rbx // %rbx must be 0
    adcq $0, %rdx // hi(n[0]*q) + 1 < 2^32 + 1 < 2^64, no further carry is required.
    movq %r11, %rax
    movq %rdx, %rbp // %rbp = hi(n[0]*q)

    mulq 8(%r15) // n[1] * q
    addq %rbp, %rax // %rax = lo(n[1]*q) + hi(n[0]*q)
    adcq $0, %rdx // %rdx = hi(n[1]*q); as above,
                  // hi(n[1]*q) + 1 < 2^64, no further carry is required.

    movq %r11, %rbx
    subq %r11, %r13 // r13 = r[2] - q
    sbbq $0, %rbx // When q > 0, rbx - 1 = q - 1 >= 0; when q = 0, (r[2] - q) does not borrow
                  // and rbx = rbx - 0 >= 0, so the following formula does not borrow.

    addq %rax, %r12 // r12 = r[1] + lo(n[1]*q) + hi(n[0]*q)
    adcq %rdx, %r13 // r13 = r[2] - q + hi(n[1]*q)
    movq %r11, %rax
    adcq $0, %rbx // Overflow is not possible.

    movq %r11, %rdx
    shrq $32, %rax // rax = hq
    shlq $32, %rdx // rdx = lq<<32

    subq %rdx, %rbx // q - lq<<32
    sbbq %rax, %r11 // r11 = q - hq = hq * 2^32 + lq - hq >= lq;
                    // when lq != 0, the following formula does not borrow,
                    // and when lq == 0, the preceding formula does not borrow.

    adcq %rbx, %r8
    adcq %r11, %r9
    adcq $0, %r10

    // mod n
    movq %r12, %rbx
    movq %r13, %rbp
    movq %r8, %rax
    movq %r9, %rdx

    subq (%r15), %r12
    sbbq 8(%r15), %r13
    sbbq 16(%r15), %r8
    sbbq 24(%r15), %r9
    sbbq $0, %r10

    cmovcq %rbx, %r12
    cmovcq %rbp, %r13
    cmovcq %rax, %r8
    cmovcq %rdx, %r9

    movq %r12, (%rdi)
    movq %r13, 8(%rdi)
    movq %r8, 16(%rdi)
    movq %r9, 24(%rdi)

    movq (%rsp), %r15
    movq 8(%rsp), %r14
    movq 16(%rsp), %r13
    movq 24(%rsp), %r12
    movq 32(%rsp), %rbp
    movq 40(%rsp), %rbx
    leaq 48(%rsp), %rsp
    ret
.cfi_endproc
.size ECP256_OrdMul, .-ECP256_OrdMul

/**
 * Function description: Point doubling on the elliptic curve: res = 2*a
 * Function prototype: void ECP256_PointDouble(P256_Point *r, const P256_Point *a);
 * Input register:
 *     rdi: Pointer to the output P256_Point structure
 *     rsi: Address pointing to input data a
 * Change register: rax, rbx, rcx, rdx, rbp, rsi, r8, r9, r10, r11, r12, r13, r14, r15
 * Function/Macro Call: ECP256_MulBy2Core, ECP256_SqrCore_q, ECP256_AddCore, ECP256_MulCore_q, ECP256_SubCore
 * ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 * Process:
 *     delta = Z1^2
 *     gamma = Y1^2
 *     beta = X1*gamma
 *     alpha = 3*(X1-delta)*(X1+delta)
 *     X3 = alpha^2-8*beta
 *     Z3 = (Y1+Z1)^2-gamma-delta
 *     Y3 = alpha*(4*beta-X3)-8*gamma^2
 */
.globl ECP256_PointDouble
.type ECP256_PointDouble,@function
.align 32
ECP256_PointDouble:
.cfi_startproc
    pushq %rbx
    pushq %rbp
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    subq $168, %rsp // Create 32 x 5 + 8 bytes of stack space.
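/*
 * Stack frame used below (five 32-byte temporaries plus 8 bytes that keep %rsp 16-byte
 * aligned for the vmovdqa stores):
 *     0(%rsp): copy of a->x, 32(%rsp): S, 64(%rsp): Z2 (= Z^2), 96(%rsp): M, 128(%rsp): T.
 * xmm0-xmm2 keep &r->x, &r->y and &r->z, and xmm3 keeps the address of a, across the core calls.
 */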
1797.Lpoint_double_core: 1798 vmovdqu (%rsi), %xmm0 // Save x to stack 1799 vmovdqu 16(%rsi), %xmm1 1800 vmovdqa %xmm0, (%rsp) 1801 vmovdqa %xmm1, 16(%rsp) 1802 1803 vmovq %rsi, %xmm3 // Backup a 1804 vmovq %rdi, %xmm0 // Backup &r->x, &r->y, &r->z 1805 leaq 32(%rdi), %r12 1806 leaq 64(%rdi), %r13 1807 vmovq %r12, %xmm1 1808 vmovq %r13, %xmm2 1809 1810 movq 32(%rsi), %r8 // Read a->y 1811 movq 40(%rsi), %r9 1812 movq 48(%rsi), %r10 1813 movq 56(%rsi), %r11 1814 1815 movq 8+.Lpoly(%rip), %r14 // Read P[1], P[3] 1816 movq 24+.Lpoly(%rip), %r15 1817 1818 leaq 32(%rsp), %rdi 1819 call ECP256_MulBy2Core // ECP256_MulBy2(S, &a->y), Not overwritten rsi 1820 1821 movq 64(%rsi), %rax 1822 movq 72(%rsi), %r14 1823 movq 80(%rsi), %rbp 1824 movq 88(%rsi), %r15 1825 leaq 64(%rsi), %rsi // Setting Input Parameters 1826 leaq 64(%rsp), %rdi // Z2 = rsp + 64 1827 call ECP256_SqrCore_q // ECP256_Sqr(Z2, &a->z) 1828 1829 leaq (%rsp), %rdx 1830 leaq 96(%rsp), %rdi // M = rsp + 96 1831 call ECP256_AddCore // ECP256_Add(M, a->x, Z2) 1832 1833 movq 32(%rsp), %rax 1834 movq 40(%rsp), %r14 1835 movq 48(%rsp), %rbp 1836 movq 56(%rsp), %r15 1837 leaq 32(%rsp), %rdi // S = rsp + 32 1838 call ECP256_SqrCore_q // ECP256_Sqr(S, S) 1839 1840 vmovq %xmm3, %rcx 1841 leaq 32(%rcx), %rsi 1842 leaq 64(%rcx), %rcx 1843 vmovq %xmm2, %rdi 1844 call ECP256_MulCore_q // ECP256_Mul(r->z, a->y, a->z) 1845 call ECP256_MulBy2Core // ECP256_MulBy2(r->z, r->z) 1846 1847 movq (%rsp), %r8 1848 movq 8(%rsp), %r9 1849 movq 16(%rsp), %r10 1850 movq 24(%rsp), %r11 1851 leaq 64(%rsp), %rdx 1852 call ECP256_SubCore // ECP256_SubCore(Z2,a->x,Z2) 1853 movq %r8, 64(%rsp) 1854 movq %r9, 72(%rsp) 1855 movq %r10, 80(%rsp) 1856 movq %r11, 88(%rsp) 1857 1858 movq 32(%rsp), %rax 1859 movq 40(%rsp), %r14 1860 movq 48(%rsp), %rbp 1861 movq 56(%rsp), %r15 1862 vmovq %xmm1, %rdi 1863 call ECP256_SqrCore_q // ECP256_Sqr(r->y,S) 1864 1865 call ECP256_DivBy2Core // ECP256_Div(r->y,r->y) 1866 1867 leaq 96(%rsp), %rdi 1868 leaq 96(%rsp), %rsi 1869 leaq 64(%rsp), %rcx 1870 call ECP256_MulCore_q // ECP256_MulCore_q(M,M,Z2) 1871 call ECP256_MulBy3Core // ECP256_MulBy3Core(M,M) 1872 1873 leaq (%rsp), %rcx 1874 leaq 32(%rsp), %rsi 1875 leaq 32(%rsp), %rdi 1876 call ECP256_MulCore_q // ECP256_MulCore_q(S, S, a->x) 1877 1878 leaq 128(%rsp), %rdi // T = 128 + rsp 1879 call ECP256_MulBy2Core // ECP256_MulBy2Core(T, S) 1880 1881 movq 96(%rsp), %rax 1882 movq 104(%rsp), %r14 1883 movq 112(%rsp), %rbp 1884 movq 120(%rsp), %r15 1885 vmovq %xmm0, %rdi 1886 call ECP256_SqrCore_q // ECP256_Sqr(r->x, M) 1887 1888 leaq 128(%rsp), %rdx 1889 call ECP256_SubCore // ECP256_SubCore(r->x, r->x, T) 1890 movq %r8, (%rdi) 1891 movq %r9, 8(%rdi) 1892 movq %r10, 16(%rdi) 1893 movq %r11, 24(%rdi) 1894 1895 xorq %rsi, %rsi // ECP256_SubCore(S, S, r->x), output %r12, %r13, %r8, %r9 1896 movq 32(%rsp), %rax 1897 movq 40(%rsp), %rbx 1898 movq 48(%rsp), %rcx 1899 subq %r8, %rax 1900 sbbq %r9, %rbx 1901 movq 56(%rsp), %rdx 1902 movq %rax, %r12 1903 sbbq %r10, %rcx 1904 sbbq %r11, %rdx 1905 movq %rbx, %r13 1906 movq %rcx, %r8 1907 sbbq $0, %rsi 1908 addq $-1, %rax 1909 movq %rdx, %r9 1910 adcq %r14, %rbx 1911 adcq $0, %rcx 1912 adcq %r15, %rdx 1913 testq %rsi, %rsi 1914 cmovnzq %rax, %r12 1915 cmovnzq %rbx, %r13 1916 cmovnzq %rcx, %r8 1917 cmovnzq %rdx, %r9 1918 movq %r12, 32(%rsp) 1919 movq %r13, 40(%rsp) 1920 movq %r8, 48(%rsp) 1921 movq %r9, 56(%rsp) 1922 1923 leaq 32(%rsp), %rdi 1924 leaq 32(%rsp), %rsi 1925 leaq 96(%rsp), %rcx 1926 call ECP256_MulCore_q // ECP256_MulCore_q(S, S, M) 1927 1928 
    vmovq %xmm1, %rdx
    vmovq %xmm1, %rdi
    call ECP256_SubCore // ECP256_SubCore(r->y, S, r->y)
    movq %r8, (%rdi)
    movq %r9, 8(%rdi)
    leaq 168+48(%rsp), %rsi
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)

    movq -48(%rsi), %r15
    movq -40(%rsi), %r14
    movq -32(%rsi), %r13
    movq -24(%rsi), %r12
    movq -16(%rsi), %rbp
    movq -8(%rsi), %rbx
    leaq (%rsi), %rsp
    ret
.cfi_endproc
.size ECP256_PointDouble, .-ECP256_PointDouble

/**
 * Function description: Point addition on the elliptic curve: res = a + b
 * Function prototype: void ECP256_PointAdd(P256_Point *r, const P256_Point *a, const P256_Point *b);
 * Input register:
 *     rdi: Pointer to the output P256_Point structure
 *     rsi: Address pointing to input data a
 *     rdx: Address pointing to input data b
 * Change register: rax, rbx, rcx, rdx, rbp, rsi, r8, r9, r10, r11, r12, r13, r14, r15
 * Function/Macro Call: ECP256_PointDouble, ECP256_SqrCore_q, ECP256_MulCore_q, ECP256_SubCore, ECP256_MulBy2Core
 * ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo
 * Process:
 *     U1 = X1*Z2^2
 *     U2 = X2*Z1^2
 *     S1 = Y1*Z2^3
 *     S2 = Y2*Z1^3
 *     H = U2-U1
 *     r = S2-S1
 *     X3 = r^2-H^3-2*U1*H^2
 *     Y3 = r*(U1*H^2-X3)-S1*H^3
 *     Z3 = Z1*Z2*H
 */
.globl ECP256_PointAdd
.type ECP256_PointAdd,@function
.align 32
ECP256_PointAdd:
.cfi_startproc
    pushq %rbx
    pushq %rbp
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    subq $640+8, %rsp // Create 32 x 20 + 8 bytes of stack space.

    vmovdqu (%rsi), %xmm0
    vmovdqu 16(%rsi), %xmm1
    vmovdqu 32(%rsi), %xmm2
    vmovdqu 48(%rsi), %xmm3
    vmovdqu 64(%rsi), %xmm4
    vmovdqu 80(%rsi), %xmm5
    movq %rsi, %rcx
    movq %rdx, %rsi
    vmovdqu %xmm0, (%rsp) // Save a on the stack, a_cpy: 0~96(%rsp)
    vmovdqu %xmm1, 16(%rsp)
    vmovdqu %xmm2, 32(%rsp)
    vmovdqu %xmm3, 48(%rsp)
    vmovdqu %xmm4, 64(%rsp)
    vmovdqu %xmm5, 80(%rsp)
    vpor %xmm4, %xmm5, %xmm5 // xmm5 = (Za[3]|Za[1], Za[2]|Za[0])

    vmovdqu (%rsi), %xmm0
    vpshufd $0xb1, %xmm5, %xmm3 // xmm3 = ((lo(Za[3]|Za[1])<<32) | hi(Za[3]|Za[1]), (lo(Za[2]|Za[0])<<32) | hi(Za[2]|Za[0]))
    vmovdqu 16(%rsi), %xmm1
    vmovdqu 32(%rsi), %xmm2
    vpor %xmm3, %xmm5, %xmm5 // xmm5 = ((lo(Za[3]|Za[1])|hi(Za[3]|Za[1]))##~, (lo(Za[2]|Za[0])|hi(Za[2]|Za[0]))##~)
    vmovdqu 48(%rsi), %xmm3

    movq 64(%rsi), %rax // Read b.z, then calculate (b.z)^2
    movq 72(%rsi), %r14
    movq 80(%rsi), %rbp
    movq 88(%rsi), %r15

    vmovdqu %xmm0, 96(%rsp) // Save b on the stack.
b_cpy: 96–192(%rsp) 2011 vpshufd $0x1e, %xmm5, %xmm4 // xmm4 = ((lo(Za[2]|Za[0])|hi(Za[2]|Za[0]))##~, (lo(Za[3]|Za[1])|hi(Za[3]|Za[1]))##~) 2012 vmovdqu %xmm1, 112(%rsp) 2013 vmovdqu 64(%rsi), %xmm0 2014 vmovdqu 80(%rsi), %xmm1 2015 vmovdqu %xmm2, 128(%rsp) 2016 vmovdqu %xmm3, 144(%rsp) 2017 vpor %xmm4, %xmm5, %xmm5 // xmm5 = ((lo(Za[0]|Za[1]|Za[2]|Za[3])|hi(Za[0]|Za[1]|Za[2]|Za[3]))##~##~##~) 2018 vpxor %xmm4, %xmm4, %xmm4 2019 vpor %xmm0, %xmm1, %xmm1 2020 vmovq %rdi, %xmm0 // Backup rdi 2021 movq %rax, 160(%rsp) 2022 movq %r14, 168(%rsp) 2023 leaq 192(%rsp), %rdi // Zb^2: 192~224(%rsp) 2024 movq %rbp, 176(%rsp) 2025 movq %r15, 184(%rsp) 2026 2027 vmovq %rcx, %xmm2 // Backup a 2028 call ECP256_SqrCore_q // sqr(Zb^2, Zb) 2029 2030 vpcmpeqd %xmm4, %xmm5, %xmm5 // a_infty, Whether a is an infinity point (Za == 0) 2031 vpshufd $0xb1, %xmm1, %xmm4 2032 vpor %xmm1, %xmm4, %xmm4 2033 vpshufd $0x1e, %xmm4, %xmm3 2034 vpor %xmm3, %xmm4, %xmm4 2035 vpxor %xmm3, %xmm3, %xmm3 2036 vpcmpeqd %xmm3, %xmm4, %xmm4 // b_infty, Whether b is an infinity point (Zb == 0) 2037 2038 movq 64(%rsp), %rax 2039 movq 72(%rsp), %r14 2040 leaq 224(%rsp), %rdi // Za^2: 224~256(%rsp) 2041 movq 80(%rsp), %rbp 2042 movq 88(%rsp), %r15 2043 call ECP256_SqrCore_q // sqr(Za^2, Za) 2044 2045 leaq 160(%rsp), %rsi // Zb 2046 leaq 192(%rsp), %rcx // Zb^2 2047 leaq 256(%rsp), %rdi // S1: 256~288(%rsp) 2048 call ECP256_MulCore_q // mul(S1, Zb, Zb^2) 2049 2050 leaq 64(%rsp), %rsi // Za 2051 leaq 224(%rsp), %rcx // Za^2 2052 leaq 288(%rsp), %rdi // S2: 288~320(%rsp) 2053 call ECP256_MulCore_q // mul(S2, Za, Za^2) 2054 2055 leaq 32(%rsp), %rsi // Ya 2056 leaq 256(%rsp), %rcx // S1 2057 leaq 256(%rsp), %rdi // S1 2058 call ECP256_MulCore_q // mul(S1,S1,Ya) 2059 2060 leaq 128(%rsp), %rsi // Yb 2061 leaq 288(%rsp), %rcx // S2 2062 leaq 288(%rsp), %rdi // S2 2063 call ECP256_MulCore_q // mul(S2,S2,Yb) 2064 2065 leaq 256(%rsp), %rdx // S1 2066 call ECP256_SubCore // sub(R,S2,S1) 2067 movq %r8, 320(%rsp) // R: 320~352(%rsp) 2068 movq %r9, 328(%rsp) 2069 movq %r10, 336(%rsp) 2070 movq %r11, 344(%rsp) 2071 2072 orq %r9, %r8 2073 vmovdqa %xmm4, %xmm1 2074 orq %r10, %r8 2075 orq %r11, %r8 2076 vpor %xmm5, %xmm1, %xmm1 // a_infty | b_infty 2077 vmovq %r8, %xmm3 2078 2079 leaq (%rsp), %rsi // Xa 2080 leaq 192(%rsp), %rcx // Zb^2 2081 leaq 352(%rsp), %rdi // U1: 352~384(%rsp) 2082 call ECP256_MulCore_q // Mul(U1, Xa, Zb^2) 2083 2084 leaq 96(%rsp), %rsi // Xb 2085 leaq 224(%rsp), %rcx // Za^2 2086 leaq 384(%rsp), %rdi // U2: 384~416(%rsp) 2087 call ECP256_MulCore_q // Mul(U2, Xb, Za^2) 2088 2089 leaq 352(%rsp), %rdx // U1 2090 leaq 416(%rsp), %rdi // H: 416~448(%rsp) 2091 call ECP256_SubCore // sub(H,U2,U1) 2092 movq %r8, 416(%rsp) 2093 movq %r9, 424(%rsp) 2094 movq %r10, 432(%rsp) 2095 movq %r11, 440(%rsp) 2096 2097 orq %r9, %r8 2098 vmovq %xmm1, %r12 2099 orq %r10, %r8 2100 vmovq %xmm3, %r13 2101 orq %r11, %r8 2102 2103 orq %r12, %r8 2104 orq %r13, %r8 2105 2106 jnz .Lpoint_add 2107 2108.Lequal_point: 2109 vmovq %xmm0, %rdi 2110 vmovq %xmm2, %rsi 2111 addq $640-32*5, %rsp 2112 jmp .Lpoint_double_core 2113 2114.align 32 2115.Lpoint_add: 2116 movq 320(%rsp), %rax // R 2117 movq 328(%rsp), %r14 2118 leaq 448(%rsp), %rdi // R^2: 448~480(%rsp) 2119 movq 336(%rsp), %rbp 2120 movq 344(%rsp), %r15 2121 call ECP256_SqrCore_q // sqr(R^2,R) 2122 2123 leaq 64(%rsp), %rsi // Za 2124 leaq 416(%rsp), %rcx // H 2125 leaq 480(%rsp), %rdi // Zr:480~512(%rsp) 2126 call ECP256_MulCore_q // Mul(Zr,H,Za) 2127 2128 movq 416(%rsp), %rax // H 2129 movq 424(%rsp), %r14 
.align 32
.Lpoint_add:
    movq 320(%rsp), %rax           // R
    movq 328(%rsp), %r14
    leaq 448(%rsp), %rdi           // R^2: 448~480(%rsp)
    movq 336(%rsp), %rbp
    movq 344(%rsp), %r15
    call ECP256_SqrCore_q          // sqr(R^2, R)

    leaq 64(%rsp), %rsi            // Za
    leaq 416(%rsp), %rcx           // H
    leaq 480(%rsp), %rdi           // Zr: 480~512(%rsp)
    call ECP256_MulCore_q          // mul(Zr, H, Za)

    movq 416(%rsp), %rax           // H
    movq 424(%rsp), %r14
    leaq 512(%rsp), %rdi           // H^2: 512~544(%rsp)
    movq 432(%rsp), %rbp
    movq 440(%rsp), %r15
    call ECP256_SqrCore_q          // sqr(H^2, H)

    leaq 480(%rsp), %rdi           // Zr
    leaq 480(%rsp), %rsi           // Zr
    leaq 160(%rsp), %rcx           // Zb
    call ECP256_MulCore_q          // mul(Zr, Zr, Zb)

    leaq 544(%rsp), %rdi           // H^3: 544~576(%rsp)
    leaq 512(%rsp), %rsi           // H^2
    leaq 416(%rsp), %rcx           // H
    call ECP256_MulCore_q          // mul(H^3, H, H^2)

    leaq 384(%rsp), %rdi           // U2
    leaq 352(%rsp), %rsi           // U1
    leaq 512(%rsp), %rcx           // H^2
    call ECP256_MulCore_q          // mul(U2, U1, H^2)

    leaq 512(%rsp), %rdi           // H^2
    call ECP256_MulBy2Core         // mulby2(H^2, U2)

    movq 448(%rsp), %rax           // sub(Xr, R^2, H^2)
    movq 456(%rsp), %rbx
    xorq %rsi, %rsi
    movq 464(%rsp), %rcx
    movq 472(%rsp), %rdx
    subq %r8, %rax
    sbbq %r9, %rbx
    movq %rax, %r8
    sbbq %r10, %rcx
    sbbq %r11, %rdx
    movq %rbx, %r9
    sbbq $0, %rsi

    addq $-1, %r8
    movq %rcx, %r10
    adcq %r14, %r9
    adcq $0, %r10
    movq %rdx, %r11
    adcq %r15, %r11
    testq %rsi, %rsi
    cmovzq %rax, %r8
    cmovzq %rbx, %r9
    cmovzq %rcx, %r10
    cmovzq %rdx, %r11

    leaq 576(%rsp), %rdi           // Xr: 576~608(%rsp)
    leaq 544(%rsp), %rdx           // H^3
    call ECP256_SubCore            // sub(Xr, Xr, H^3)
    movq %r8, 576(%rsp)
    movq %r9, 584(%rsp)
    movq %r10, 592(%rsp)
    movq %r11, 600(%rsp)

    movq 384(%rsp), %rax           // sub(Yr, U2, Xr)
    movq 392(%rsp), %rbx
    xorq %rsi, %rsi
    movq 400(%rsp), %rcx
    movq 408(%rsp), %rdx
    subq %r8, %rax
    sbbq %r9, %rbx
    movq %rax, %r8
    sbbq %r10, %rcx
    sbbq %r11, %rdx
    movq %rbx, %r9
    sbbq $0, %rsi

    addq $-1, %r8
    movq %rcx, %r10
    adcq %r14, %r9
    adcq $0, %r10
    movq %rdx, %r11
    adcq %r15, %r11
    testq %rsi, %rsi
    cmovzq %rax, %r8
    cmovzq %rbx, %r9
    cmovzq %rcx, %r10
    cmovzq %rdx, %r11
    movq %r8, 608(%rsp)            // Yr: 608~640(%rsp)
    movq %r9, 616(%rsp)
    movq %r10, 624(%rsp)
    movq %r11, 632(%rsp)

    leaq 288(%rsp), %rdi           // S2
    leaq 256(%rsp), %rsi           // S1
    leaq 544(%rsp), %rcx           // H^3
    call ECP256_MulCore_q          // mul(S2, S1, H^3)

    leaq 608(%rsp), %rdi           // Yr
    leaq 608(%rsp), %rsi
    leaq 320(%rsp), %rcx           // R
    call ECP256_MulCore_q          // mul(Yr, Yr, R)

    leaq 608(%rsp), %rdi           // Yr
    leaq 288(%rsp), %rdx           // S2
    call ECP256_SubCore            // sub(Yr, Yr, S2)
    movq %r8, 608(%rsp)
    movq %r9, 616(%rsp)
    movq %r10, 624(%rsp)
    movq %r11, 632(%rsp)
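/*
 * Descriptive comment (added for readability): xmm5 is all-ones iff a is the
 * point at infinity (Za == 0) and xmm4 is all-ones iff b is the point at
 * infinity (Zb == 0). Each output coordinate below is selected branchlessly as
 *     res = b_infty ? a : (a_infty ? b : computed)
 * using (mask & x) | (~mask & y), so the executed instruction sequence does
 * not depend on whether either input is the identity element.
 */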

    vmovq %xmm0, %rdi

    vmovdqa %xmm5, %xmm0           // a_infty
    vmovdqa %xmm5, %xmm1
    vpandn 576(%rsp), %xmm0, %xmm0 // !a_infty & Xr
    vpandn 592(%rsp), %xmm1, %xmm1
    vmovdqa %xmm5, %xmm2
    vmovdqa %xmm5, %xmm3
    vpand 96(%rsp), %xmm2, %xmm2   // a_infty & Xb
    vpand 112(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2       // a_infty ? Xb : Xr
    vpor %xmm1, %xmm3, %xmm3

    vmovdqa %xmm4, %xmm0           // b_infty
    vmovdqa %xmm4, %xmm1
    vpandn %xmm2, %xmm0, %xmm0     // !b_infty & (a_infty ? Xb : Xr)
    vpandn %xmm3, %xmm1, %xmm1
    vmovdqa %xmm4, %xmm2
    vmovdqa %xmm4, %xmm3
    vpand (%rsp), %xmm2, %xmm2     // b_infty & Xa
    vpand 16(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2       // b_infty ? Xa : (a_infty ? Xb : Xr)
    vpor %xmm1, %xmm3, %xmm3
    vmovdqu %xmm2, (%rdi)
    vmovdqu %xmm3, 16(%rdi)

    vmovdqa %xmm5, %xmm0           // a_infty
    vmovdqa %xmm5, %xmm1
    vpandn 608(%rsp), %xmm0, %xmm0 // !a_infty & Yr
    vpandn 624(%rsp), %xmm1, %xmm1
    vmovdqa %xmm5, %xmm2
    vmovdqa %xmm5, %xmm3
    vpand 128(%rsp), %xmm2, %xmm2  // a_infty & Yb
    vpand 144(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2       // a_infty ? Yb : Yr
    vpor %xmm1, %xmm3, %xmm3

    vmovdqa %xmm4, %xmm0           // b_infty
    vmovdqa %xmm4, %xmm1
    vpandn %xmm2, %xmm0, %xmm0     // !b_infty & (a_infty ? Yb : Yr)
    vpandn %xmm3, %xmm1, %xmm1
    vmovdqa %xmm4, %xmm2
    vmovdqa %xmm4, %xmm3
    vpand 32(%rsp), %xmm2, %xmm2   // b_infty & Ya
    vpand 48(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2       // b_infty ? Ya : (a_infty ? Yb : Yr)
    vpor %xmm1, %xmm3, %xmm3
    vmovdqu %xmm2, 32(%rdi)
    vmovdqu %xmm3, 48(%rdi)

    vmovdqa %xmm5, %xmm0           // a_infty
    vmovdqa %xmm5, %xmm1
    vpandn 480(%rsp), %xmm0, %xmm0 // !a_infty & Zr
    vpandn 496(%rsp), %xmm1, %xmm1
    vmovdqa %xmm5, %xmm2
    vmovdqa %xmm5, %xmm3
    vpand 160(%rsp), %xmm2, %xmm2  // a_infty & Zb
    vpand 176(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2       // a_infty ? Zb : Zr
    vpor %xmm1, %xmm3, %xmm3

    vmovdqa %xmm4, %xmm0           // b_infty
    vmovdqa %xmm4, %xmm1
    vpandn %xmm2, %xmm0, %xmm0     // !b_infty & (a_infty ? Zb : Zr)
    vpandn %xmm3, %xmm1, %xmm1
    vmovdqa %xmm4, %xmm2
    vmovdqa %xmm4, %xmm3
    vpand 64(%rsp), %xmm2, %xmm2   // b_infty & Za
    vpand 80(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2       // b_infty ? Za : (a_infty ? Zb : Zr)
    vpor %xmm1, %xmm3, %xmm3
    vmovdqu %xmm2, 64(%rdi)
    vmovdqu %xmm3, 80(%rdi)

    leaq 640+56(%rsp), %rsi
    movq -48(%rsi), %r15
    movq -40(%rsi), %r14
    movq -32(%rsi), %r13
    movq -24(%rsi), %r12
    movq -16(%rsi), %rbp
    movq -8(%rsi), %rbx
    leaq (%rsi), %rsp
    ret
.cfi_endproc
.size ECP256_PointAdd, .-ECP256_PointAdd

/**
 * Function description: Point addition of a Jacobian-coordinate point and an affine-coordinate point, assembly implementation
 * Function prototype: void ECP256_AddAffine(P256_Point *r, const P256_Point *a, const P256_AffinePoint *b);
 * Input register:
 * rdi: Points to the returned P256_Point.
 * rsi: Points to the input P256_Point.
 * rdx: Points to the input P256_AffinePoint.
 * Change register: rax, rbx, rcx, rdx, rsi, rdi, rbp, r8, r9, r10, r11, r12, r13, r14, r15
 * Output register: None
 * Function/Macro Call: ECP256_MulBy2Core, ECP256_SqrCore_q, ECP256_AddCore, ECP256_MulCore_q, ECP256_SubCore
 * ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-madd-2007-bl
 * Deal process:
 * Z1Z1 = Z1^2
 * U2 = X2*Z1Z1
 * S2 = Y2*Z1*Z1Z1
 * H = U2-X1
 * HH = H^2
 * I = 4*HH
 * J = H*I
 * r = 2*(S2-Y1)
 * V = X1*I
 * X3 = r^2-J-2*V
 * Y3 = r*(V-X3)-2*Y1*J
 * Z3 = (Z1+H)^2-Z1Z1-HH
 */
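/*
 * Descriptive comment (added for readability): the body below keeps the
 * madd-2007-bl intermediates at fixed power-of-two fractions of their nominal
 * values (H, r/2, I/4, J/4, V/4, X3/4, Y3/8, Z3/2, as named in the inline
 * comments). Because (X, Y, Z) and (X/4, Y/8, Z/2) describe the same affine
 * point in Jacobian coordinates (lambda = 1/2), no rescaling is needed at the
 * end.
 */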
.globl ECP256_AddAffine
.type ECP256_AddAffine,@function
.align 32
ECP256_AddAffine:
.cfi_startproc
    pushq %rbp
    pushq %rbx
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15

    subq $488, %rsp                // open up stack space 32 * 15 + 8 = 488

    vmovdqu 0(%rsi), %xmm0         // X1[1]X1[0] --> xmm0
    vmovdqu 16(%rsi), %xmm1        // X1[3]X1[2]
    vmovdqu 32(%rsi), %xmm2        // Y1[1]Y1[0]
    vmovdqu 48(%rsi), %xmm3        // Y1[3]Y1[2]
    movq 72(%rsi), %r14            // Z1[1] 64 + 8
    movq 64(%rsi), %rax            // Z1[0] 64 + 0
    vmovdqu 64(%rsi), %xmm4        // Z1[1]Z1[0]
    movq 88(%rsi), %r15            // Z1[3] 64 + 24
    movq 80(%rsi), %rbp            // Z1[2] 64 + 16
    vmovdqu 80(%rsi), %xmm5        // Z1[3]Z1[2]

    vmovdqa %xmm0, 320(%rsp)       // save X1[1]X1[0] to stack
    vmovdqa %xmm1, 336(%rsp)       // save X1[3]X1[2] to stack
    vmovdqa %xmm2, 352(%rsp)       // save Y1[1]Y1[0] to stack
    vmovdqa %xmm3, 368(%rsp)       // save Y1[3]Y1[2] to stack
    vmovdqa %xmm4, 384(%rsp)       // save Z1[1]Z1[0] to stack
    vmovdqa %xmm5, 400(%rsp)       // save Z1[3]Z1[2] to stack
    vpor %xmm4, %xmm5, %xmm5       // Z1[1]Z1[0] | Z1[3]Z1[2]

    vmovdqu (%rdx), %xmm0          // X2[1]X2[0] --> xmm0
    vpshufd $0xb1, %xmm5, %xmm3    // Order(10 11 00 01) --> [2 3 0 1] with 32bit
    vmovdqu 16(%rdx), %xmm1        // X2[3]X2[2] --> xmm1
    vmovdqu 32(%rdx), %xmm2        // Y2[1]Y2[0] --> xmm2
    vpor %xmm3, %xmm5, %xmm5       // [2 3 0 1] | [3 2 1 0]
    vmovdqu 48(%rdx), %xmm3        // Y2[3]Y2[2] --> xmm3
    vmovdqa %xmm0, 416(%rsp)       // save X2[1]X2[0] to stack
    vpshufd $0x1e, %xmm5, %xmm4    // Order(00 01 11 10) --> [0 1 3 2]
    vmovdqa %xmm1, 432(%rsp)       // save X2[3]X2[2] to stack
    vpor %xmm0, %xmm1, %xmm1       // X2[1]X2[0] | X2[3]X2[2]

    vmovq %rdi, %xmm0              // save rdi to xmm0
    vmovdqa %xmm2, 448(%rsp)       // save Y2[1]Y2[0] to stack
    vmovdqa %xmm3, 464(%rsp)       // save Y2[3]Y2[2] to stack
    vpor %xmm2, %xmm3, %xmm3       // Y2[1]Y2[0] | Y2[3]Y2[2]
    vpor %xmm4, %xmm5, %xmm5
    vpxor %xmm4, %xmm4, %xmm4      // 0
    vpor %xmm1, %xmm3, %xmm3       // X2[1]X2[0] | X2[3]X2[2] | Y2[1]Y2[0] | Y2[3]Y2[2]

.balign 32
    /* Z1Z1 = Z1 ^ 2 */
    leaq 64(%rsi), %rsi            // addr(z)
    leaq 32(%rsp), %rdi            // save Z1Z1 to stack
    call ECP256_SqrCore_q          // Output: r8 - r11, P[1] --> r14, P[3] --> r15

    vpcmpeqd %xmm4, %xmm5, %xmm5
    vpshufd $0xb1, %xmm3, %xmm4    // Order(10 11 00 01)
    vpor %xmm3, %xmm4, %xmm4
    vpshufd $0, %xmm5, %xmm5       // Order(00 00 00 00)
    vpshufd $0x1e, %xmm4, %xmm3    // Order(00 01 11 10)
    vpor %xmm3, %xmm4, %xmm4
    vpxor %xmm3, %xmm3, %xmm3
    vpcmpeqd %xmm3, %xmm4, %xmm4
    vpshufd $0, %xmm4, %xmm4

    /* U2 = X2 * Z1Z1 */
    leaq 416(%rsp), %rcx           // addr of X2 in stack
    leaq 32(%rsp), %rsi            // read Z1Z1 from stack
    leaq (%rsp), %rdi              // save U2 to stack
    call ECP256_MulCore_q          // output: r8 - r11, P[1] --> r14, P[3] --> r15

    /* H = U2 - X1 */
    leaq 320(%rsp), %rdx           // read X1 from stack
    leaq 64(%rsp), %rdi            // save H to stack
    call ECP256_SubCore            // input: rdx, r8 - r11, P[1] --> r14, P[3] --> r15
    movq %r8, (%rdi)
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)

    /* Z1Z1Z1 = Z1Z1 * Z1 */
    leaq 384(%rsp), %rcx           // read Z1 from stack
    leaq 32(%rsp), %rsi            // read Z1Z1 from stack
    leaq 32(%rsp), %rdi            // save Z1Z1Z1 to stack
    call ECP256_MulCore_q          // output: r8-r11, P[1] --> r14, P[3] --> r15
    movq %r8, (%rdi)
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)

    /* Z3/2 = H * Z1 */
    leaq 384(%rsp), %rcx           // read Z1 from stack
    leaq 64(%rsp), %rsi            // read H from stack
    leaq 288(%rsp), %rdi           // save Z3/2 to stack
    call ECP256_MulCore_q          // P[1] --> r14, P[3] --> r15

    /* S2 = Y2 * Z1Z1Z1 */
    leaq 448(%rsp), %rcx           // read Y2 from stack
    leaq 32(%rsp), %rsi            // read Z1Z1Z1 from stack
    leaq 32(%rsp), %rdi            // save S2 to stack
    call ECP256_MulCore_q          // output: r8-r11, P[1] --> r14, P[3] --> r15

    /* r/2 = S2 - Y1 */
    leaq 352(%rsp), %rdx           // read Y1 from stack
    leaq 96(%rsp), %rdi            // save r/2 to stack
    call ECP256_SubCore            // output: r8-r11, P[1] --> r14, P[3] --> r15
    movq %r8, (%rdi)               // save r/2 to stack
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)

    /* I/4 = H ^ 2 */
    leaq 64(%rsp), %rsi            // read H from stack
    movq (%rsi), %rax              // a[0]
    movq 8(%rsi), %r14             // a[1]
    movq 16(%rsi), %rbp            // a[2]
    movq 24(%rsi), %r15            // a[3]
    leaq 128(%rsp), %rdi           // save I/4 to stack
    call ECP256_SqrCore_q          // output: r8-r11, P[1] --> r14, P[3] --> r15

    /* (r/2)^2 = (r^2)/4 */
    leaq 96(%rsp), %rsi            // read r/2 from stack
    movq (%rsi), %rax              // a[0]
    movq 8(%rsi), %r14             // a[1]
    movq 16(%rsi), %rbp            // a[2]
    movq 24(%rsi), %r15            // a[3]
    leaq 192(%rsp), %rdi           // save (r^2)/4 to stack
    call ECP256_SqrCore_q          // output: r8-r11, P[1] --> r14, P[3] --> r15

    /* J/4 = I/4 * H */
    leaq 128(%rsp), %rcx           // read I/4 from stack
    leaq 64(%rsp), %rsi            // read H from stack
    leaq 160(%rsp), %rdi           // save J/4 to stack
    call ECP256_MulCore_q          // output: r8-r11, P[1] --> r14, P[3] --> r15

    /* V/4 = X1 * I/4 */
    leaq 320(%rsp), %rcx           // read X1 from stack
    leaq 128(%rsp), %rsi           // read I/4 from stack
    leaq (%rsp), %rdi              // save V/4 to stack
    call ECP256_MulCore_q          // output: r8-r11, P[1] --> r14, P[3] --> r15
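/*
 * Descriptive comment (added for readability): the next block doubles V/4
 * modulo P without a helper call: add/adc doubles the value and collects the
 * carry in r12, P is then subtracted, and the cmovc instructions restore the
 * pre-subtraction words when the subtraction borrowed (i.e. 2*(V/4) < P).
 * The raw subtractions further down use the matching pattern: subtract, add P
 * back, and keep the uncorrected difference via cmovz when no borrow occurred.
 */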
    xorq %r12, %r12
    addq %r8, %r8
    adcq %r9, %r9
    movq %r8, %rax
    adcq %r10, %r10
    adcq %r11, %r11
    movq %r9, %rbp
    adcq $0, %r12

    subq $-1, %r8
    movq %r10, %rcx
    sbbq %r14, %r9
    sbbq $0, %r10
    movq %r11, %r13
    sbbq %r15, %r11
    sbbq $0, %r12
    leaq 192(%rsp), %rsi           // read (r^2)/4 from stack
    cmovcq %rax, %r8               // b[0] V/2 --> r8-r11
    cmovcq %rbp, %r9               // b[1]
    cmovcq %rcx, %r10              // b[2]
    cmovcq %r13, %r11              // b[3]

    /* (r^2 - 2 * V)/4 = (r^2)/4 - V/2 */
    xorq %r13, %r13
    movq (%rsi), %rax              // a[0]
    movq 8(%rsi), %rcx             // a[1]
    movq 16(%rsi), %rdx            // a[2]
    movq 24(%rsi), %r12            // a[3]

    subq %r8, %rax
    sbbq %r9, %rcx
    movq %rax, %r8
    sbbq %r10, %rdx
    sbbq %r11, %r12
    movq %rcx, %r9
    sbbq $0, %r13

    addq $-1, %r8                  // a - b + P
    movq %rdx, %r10
    adcq %r14, %r9
    adcq $0, %r10
    movq %r12, %r11
    adcq %r15, %r11
    testq %r13, %r13

    cmovzq %rax, %r8
    cmovzq %rcx, %r9
    cmovzq %rdx, %r10
    cmovzq %r12, %r11              // output: r8-r11, P[1] --> r14, P[3] --> r15

    /* X3/4 = (r^2 - 2 * V - J)/4 = (r^2 - 2 * V)/4 - J/4 */
    leaq 160(%rsp), %rdx           // read J/4 from stack
    leaq 224(%rsp), %rdi           // save (r^2 - 2 * V - J)/4 to stack
    call ECP256_SubCore            // output: r8-r11, P[1] --> r14, P[3] --> r15
    movq %r8, (%rdi)               // b[0]
    movq %r9, 8(%rdi)              // b[1]
    movq %r10, 16(%rdi)            // b[2]
    movq %r11, 24(%rdi)            // b[3]

    /* (V - X3)/4 = V/4 - X3/4 */
    leaq (%rsp), %rsi              // read V/4 from stack
    xorq %r13, %r13
    movq (%rsi), %rax              // a[0]
    movq 8(%rsi), %rcx             // a[1]
    movq 16(%rsi), %rdx            // a[2]
    movq 24(%rsi), %r12            // a[3]

    subq %r8, %rax
    sbbq %r9, %rcx
    movq %rax, %r8
    sbbq %r10, %rdx
    sbbq %r11, %r12
    movq %rcx, %r9
    sbbq $0, %r13
    movq %rdx, %r10
    movq %r12, %r11

    addq $-1, %r8                  // a - b + P
    adcq %r14, %r9
    adcq $0, %r10
    adcq %r15, %r11
    testq %r13, %r13

    cmovzq %rax, %r8
    cmovzq %rcx, %r9
    cmovzq %rdx, %r10
    cmovzq %r12, %r11

    leaq 64(%rsp), %rdi            // save (V - X3)/4 to stack
    movq %r8, (%rdi)
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)            // output: r8-r11, P[1] --> r14, P[3] --> r15

    /* (J * Y1)/4 = Y1 * J/4 */
    leaq 352(%rsp), %rcx           // read Y1 from stack
    leaq 160(%rsp), %rsi           // read J/4 from stack
    leaq 32(%rsp), %rdi            // save (J * Y1)/4 to stack
    call ECP256_MulCore_q          // output: r8-r11, P[1] --> r14, P[3] --> r15

    /* (r * (V - X3))/8 = (V - X3)/4 * r/2 */
    leaq 96(%rsp), %rcx            // read r/2 from stack
    leaq 64(%rsp), %rsi            // read (V - X3)/4 from stack
    leaq 64(%rsp), %rdi            // save (r * (V - X3))/8 to stack
    call ECP256_MulCore_q          // output: r8-r11, P[1] --> r14, P[3] --> r15

    /* Y3/8 = (r * (V - X3))/8 - (J * Y1)/4 */
    leaq 32(%rsp), %rdx            // read (J * Y1)/4 from stack
    leaq 256(%rsp), %rdi           // save Y3/8
    call ECP256_SubCore
    movq %r8, (%rdi)
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)
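/*
 * Descriptive comment (added for readability): xmm5 is all-ones iff Z1 == 0
 * (a is the point at infinity) and xmm4 is all-ones iff the affine input b is
 * encoded as all zeroes (treated as infinity). The masked stores below select,
 * without branching,
 *     Z: b_infty ? Z1 : (a_infty ? 1 in Montgomery form (.Lone_mont) : Z3/2)
 *     X: b_infty ? X1 : (a_infty ? X2 : X3/4)
 *     Y: b_infty ? Y1 : (a_infty ? Y2 : Y3/8)
 */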

    vmovq %xmm0, %rdi

    vmovdqa %xmm5, %xmm0
    vmovdqa %xmm5, %xmm1
    vpandn 288(%rsp), %xmm0, %xmm0
    vmovdqa %xmm5, %xmm2
    vpandn 304(%rsp), %xmm1, %xmm1
    vmovdqa %xmm5, %xmm3
    vpand .Lone_mont(%rip), %xmm2, %xmm2
    vpand .Lone_mont+16(%rip), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3

    vmovdqa %xmm4, %xmm0
    vmovdqa %xmm4, %xmm1
    vpandn %xmm2, %xmm0, %xmm0
    vmovdqa %xmm4, %xmm2
    vpandn %xmm3, %xmm1, %xmm1
    vmovdqa %xmm4, %xmm3
    vpand 384(%rsp), %xmm2, %xmm2
    vpand 400(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3
    vmovdqu %xmm2, 64(%rdi)
    vmovdqu %xmm3, 80(%rdi)

    vmovdqa %xmm5, %xmm0
    vmovdqa %xmm5, %xmm1
    vpandn 224(%rsp), %xmm0, %xmm0
    vmovdqa %xmm5, %xmm2
    vpandn 224+16(%rsp), %xmm1, %xmm1
    vmovdqa %xmm5, %xmm3
    vpand 416(%rsp), %xmm2, %xmm2
    vpand 416+16(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3

    vmovdqa %xmm4, %xmm0
    vmovdqa %xmm4, %xmm1
    vpandn %xmm2, %xmm0, %xmm0
    vmovdqa %xmm4, %xmm2
    vpandn %xmm3, %xmm1, %xmm1
    vmovdqa %xmm4, %xmm3
    vpand 320(%rsp), %xmm2, %xmm2
    vpand 336(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3
    vmovdqu %xmm2, 0(%rdi)
    vmovdqu %xmm3, 16(%rdi)

    vmovdqa %xmm5, %xmm0
    vmovdqa %xmm5, %xmm1
    vpandn 256(%rsp), %xmm0, %xmm0
    vmovdqa %xmm5, %xmm2
    vpandn 272(%rsp), %xmm1, %xmm1
    vmovdqa %xmm5, %xmm3
    vpand 448(%rsp), %xmm2, %xmm2
    vpand 464(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3

    vmovdqa %xmm4, %xmm0
    vmovdqa %xmm4, %xmm1
    vpandn %xmm2, %xmm0, %xmm0
    vmovdqa %xmm4, %xmm2
    vpandn %xmm3, %xmm1, %xmm1
    vmovdqa %xmm4, %xmm3
    vpand 352(%rsp), %xmm2, %xmm2
    vpand 368(%rsp), %xmm3, %xmm3
    vpor %xmm0, %xmm2, %xmm2
    vpor %xmm1, %xmm3, %xmm3
    vmovdqu %xmm2, 32(%rdi)
    vmovdqu %xmm3, 48(%rdi)

    addq $488, %rsp
    popq %r15
    popq %r14
    popq %r13
    popq %r12
    popq %rbx
    popq %rbp
    ret
.cfi_endproc
.size ECP256_AddAffine, .-ECP256_AddAffine

/**
 * Function description: Scatter-stores a point into the G-16G pre-computation table.
 * Function prototype: void ECP256_Scatterw5(P256_Point *table, const P256_Point *point, uint32_t index);
 * Input register:
 * rdi: Points to the base address of the pre-computation table.
 * rsi: Points to the input P256_Point.
 * rdx: Index value. The value ranges from 1 to 16.
 * Change register: rdx, rsi, rdi, r8, r9, r10, r11
 * Output register: None
 * Function/Macro Call: None
 */
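/*
 * Descriptive comment (added for readability): the w=5 table is stored
 * column-wise, 16 entries wide: 64-bit word k of a coordinate lives at
 * base + k * 128, so the words of the 16 entries are interleaved.
 * ECP256_Scatterw5 writes one point into column (index - 1);
 * ECP256_Gatherw5 reads one column back.
 */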
.globl ECP256_Scatterw5
.type ECP256_Scatterw5,@function
.align 32
ECP256_Scatterw5:
.cfi_startproc
    subq $1, %rdx                  // index - 1
    movq (%rsi), %r8               // x[0]
    movq 8(%rsi), %r9              // x[1]
    movq 16(%rsi), %r10            // x[2]
    movq 24(%rsi), %r11            // x[3]
    leaq (%rdi, %rdx, 8), %rdi     // base = base + (index - 1) * 8, offset into the table

    movq %r8, (%rdi)               // x[0] --> base + 0 * 128
    movq %r9, 128(%rdi)            // x[1] --> base + 1 * 128
    movq %r10, 256(%rdi)           // x[2] --> base + 2 * 128
    movq %r11, 384(%rdi)           // x[3] --> base + 3 * 128
    leaq 512(%rdi), %rdi           // base + 128 * 4

    leaq 32(%rsi), %rsi            // addr(y) --> rsi

    movq (%rsi), %r8               // y[0]
    movq 8(%rsi), %r9              // y[1]
    movq 16(%rsi), %r10            // y[2]
    movq 24(%rsi), %r11            // y[3]

    movq %r8, (%rdi)               // y[0] --> base + 4 * 128
    movq %r9, 128(%rdi)            // y[1] --> base + 5 * 128
    movq %r10, 256(%rdi)           // y[2] --> base + 6 * 128
    movq %r11, 384(%rdi)           // y[3] --> base + 7 * 128
    leaq 512(%rdi), %rdi           // base + 128 * 8

    leaq 32(%rsi), %rsi            // addr(z) --> rsi

    movq (%rsi), %r8               // z[0]
    movq 8(%rsi), %r9              // z[1]
    movq 16(%rsi), %r10            // z[2]
    movq 24(%rsi), %r11            // z[3]

    movq %r8, (%rdi)               // z[0] --> base + 8 * 128
    movq %r9, 128(%rdi)            // z[1] --> base + 9 * 128
    movq %r10, 256(%rdi)           // z[2] --> base + 10 * 128
    movq %r11, 384(%rdi)           // z[3] --> base + 11 * 128

    ret
.cfi_endproc
.size ECP256_Scatterw5, .-ECP256_Scatterw5

/**
 * Function description: Gathers a point from the G-16G pre-computation table.
 * Function prototype: void ECP256_Gatherw5(P256_Point *point, const P256_Point *table, uint32_t index);
 * Input register:
 * rdi: Points to the returned P256_Point.
 * rsi: Points to the base address of the pre-computation table.
 * rdx: Index value
 * Change register: rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11
 * Output register: None
 * Function/Macro Call: None
 */
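/*
 * Descriptive comment (added for readability): index 0 means "no entry".
 * The routine builds an all-ones mask for index != 0 and an all-zero mask for
 * index == 0, always performs the loads, and ANDs the loaded words with the
 * mask, so the returned point is zeroed for index 0 without branching on the
 * index.
 */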
.globl ECP256_Gatherw5
.type ECP256_Gatherw5,@function
.align 32
ECP256_Gatherw5:
.cfi_startproc
    movq $-1, %rax
    xorq %rcx, %rcx
    cmp $0, %rdx
    cmovzq %rcx, %rax              // rax = (index == 0) ? 0 : -1
    add %rax, %rdx                 // rdx = (index == 0) ? index : (index - 1)

    leaq (%rsi, %rdx, 8), %rsi     // Calculate offset: base = base + (index - 1) * 8

    movq (%rsi), %r8               // x[0]
    movq 128(%rsi), %r9            // x[1]
    movq 256(%rsi), %r10           // x[2]
    movq 384(%rsi), %r11           // x[3]
    leaq 512(%rsi), %rsi           // base += 512

    andq %rax, %r8
    andq %rax, %r9
    andq %rax, %r10
    andq %rax, %r11

    movq %r8, (%rdi)               // Write back
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)

    leaq 32(%rdi), %rdi            // Advance the write-back pointer

    movq (%rsi), %r8               // y[0]
    movq 128(%rsi), %r9            // y[1]
    movq 256(%rsi), %r10           // y[2]
    movq 384(%rsi), %r11           // y[3]
    leaq 512(%rsi), %rsi           // base += 512

    andq %rax, %r8
    andq %rax, %r9
    andq %rax, %r10
    andq %rax, %r11

    movq %r8, (%rdi)               // Write back
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)

    leaq 32(%rdi), %rdi            // Advance the write-back pointer

    movq (%rsi), %r8               // z[0]
    movq 128(%rsi), %r9            // z[1]
    movq 256(%rsi), %r10           // z[2]
    movq 384(%rsi), %r11           // z[3]

    andq %rax, %r8
    andq %rax, %r9
    andq %rax, %r10
    andq %rax, %r11

    movq %r8, (%rdi)               // Write back
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)

    ret
.cfi_endproc
.size ECP256_Gatherw5, .-ECP256_Gatherw5

/**
 * Function description: Gathers an affine point from the pre-computation table.
 * Function prototype: void ECP256_Gatherw7(P256_AffinePoint *point, const P256_AffinePoint *table, uint32_t index);
 * Input register:
 * rdi: Points to the returned P256_AffinePoint.
 * rsi: Points to the base address of the pre-computation table.
 * rdx: Index value
 * Change register: rax, rcx, rdx, rsi, rdi, rbp, r8, r9, r10
 * Output register: None
 * Function/Macro Call: None
 */
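/*
 * Descriptive comment (added for readability): in the w=7 table the bytes of
 * the stored affine points are interleaved with a stride of 64. The loop below
 * collects one byte from each of eight 64-byte rows, packs them into a 64-bit
 * word, and repeats eight times to rebuild the 64-byte affine point, masking
 * the result to zero when index == 0 (same convention as ECP256_Gatherw5).
 */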
.globl ECP256_Gatherw7
.type ECP256_Gatherw7,@function
.align 32
ECP256_Gatherw7:
.cfi_startproc
    movq $-1, %rax
    xorq %rcx, %rcx
    cmp $0, %rdx
    cmovzq %rcx, %rax              // rax = (index == 0) ? 0 : -1
    addq %rax, %rdx                // rdx = (index == 0) ? index : (index - 1)
    subq $63, %rdx                 // rdx = (index == 0) ? (index - 63) : (index - 1 - 63)
    subq %rdx, %rsi                // rsi = (index == 0) ? (rsi + 63) : (rsi + 64 - index)
    movq $8, %r10                  // Loop counter: 8 quadwords = 64 bytes

.Lgather_w7_loop:
    xorq %r8, %r8                  // Empty reg for data low 32
    xorq %r9, %r9                  // Empty reg for data high 32
    movb 192(%rsi), %r8b           // r8 = [0 0 0 byte(3)]
    movb 448(%rsi), %r9b           // r9 = [0 0 0 byte(7)]
    shlq $8, %r8                   // r8 = [0 0 byte(3) 0]
    shlq $8, %r9                   // r9 = [0 0 byte(7) 0]
    movb 128(%rsi), %r8b           // r8 = [0 0 byte(3) byte(2)]
    movb 384(%rsi), %r9b           // r9 = [0 0 byte(7) byte(6)]
    shlq $8, %r8                   // r8 = [0 byte(3) byte(2) 0]
    shlq $8, %r9                   // r9 = [0 byte(7) byte(6) 0]
    movb 64(%rsi), %r8b            // r8 = [0 byte(3) byte(2) byte(1)]
    movb 320(%rsi), %r9b           // r9 = [0 byte(7) byte(6) byte(5)]
    shlq $8, %r8                   // r8 = [byte(3) byte(2) byte(1) 0]
    shlq $8, %r9                   // r9 = [byte(7) byte(6) byte(5) 0]
    movb (%rsi), %r8b              // r8 = [byte(3) byte(2) byte(1) byte(0)]
    movb 256(%rsi), %r9b           // r9 = [byte(7) byte(6) byte(5) byte(4)]
    leaq 512(%rsi), %rsi           // base += 64 * 8
    shlq $32, %r9                  // r9 = [byte(7) byte(6) byte(5) byte(4) 0 0 0 0]
    orq %r9, %r8                   // r8 = [byte(7) byte(6) byte(5) byte(4) byte(3) byte(2) byte(1) byte(0)]

    andq %rax, %r8
    movq %r8, (%rdi)
    leaq 8(%rdi), %rdi

    subq $1, %r10
    jnz .Lgather_w7_loop

    ret
.cfi_endproc
.size ECP256_Gatherw7, .-ECP256_Gatherw7

#endif