/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_X25519

.file "x25519_x86_64.S"
.text

.macro push_stack
    /* Save the callee-saved registers; they must be restored before the function returns. */
    pushq %rbx
    pushq %rbp
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15

    /* Allocate 32 bytes of stack scratch space for spilled values. */
    leaq -32(%rsp), %rsp
.endm

.macro pop_stack
    /* Restore the callee-saved registers. */
    movq 32(%rsp), %r15
    movq 40(%rsp), %r14
    movq 48(%rsp), %r13
    movq 56(%rsp), %r12
    movq 64(%rsp), %rbp
    movq 72(%rsp), %rbx

    /* Restore the stack pointer: 32 bytes of scratch space plus 6 saved
       registers, 80 bytes in total. */
    leaq 80(%rsp), %rsp
.endm

/* Multiply-accumulate: (\low, \high) += rax * \cur as a 128-bit product,
   then load the next multiplier into rax. */
.macro u51mul cur, low, high, next
    mulq \cur
    addq %rax, \low
    movq \next, %rax
    adcq %rdx, \high
.endm

.macro reduce
    /* Mask that keeps the low 51 bits. */
    movq $0x7ffffffffffff, %rbp

    /* Calculate h2' */
    movq %r12, %rax
    shrq $51, %r12
    shlq $13, %r13

    /* Calculate h0' */
    movq %r8, %rsi
    shrq $51, %r8
    shlq $13, %r9

    /* Calculate h2' */
    andq %rbp, %rax                 // h2' = rax = h2 & (2^51 - 1) = r12 & (2^51 - 1)
    orq %r12, %r13                  // r13 = (h2 >> 51)
    addq %r13, %r14                 // h3 += (h2 >> 51)
    adcq $0, %r15

    /* Calculate h0' */
    andq %rbp, %rsi                 // h0' = rsi = h0 & (2^51 - 1) = r8 & (2^51 - 1)
    orq %r8, %r9                    // r9 = (h0 >> 51)
    addq %r9, %r10                  // h1 += (h0 >> 51)
    adcq $0, %r11

    /* Calculate h3' */
    movq %r14, %r8
    shrq $51, %r14
    shlq $13, %r15
    andq %rbp, %r8                  // h3' = r8 = h3 & (2^51 - 1) = r14 & (2^51 - 1)
    orq %r14, %r15                  // r15 = (h3 >> 51)
    addq %r15, %rbx                 // h4 += (h3 >> 51)
    adcq $0, %rcx

    /* Calculate h1' */
    movq %r10, %rdx
    shrq $51, %r10
    shlq $13, %r11
    andq %rbp, %rdx                 // h1' = rdx = h1 & (2^51 - 1) = r10 & (2^51 - 1)
    orq %r10, %r11                  // r11 = (h1 >> 51)
    addq %r11, %rax                 // h2 += (h1 >> 51)

    /* Calculate h4' */
    movq %rbx, %r9
    shrq $51, %rbx
    shlq $13, %rcx
    andq %rbp, %r9                  // h4' = r9 = h4 & (2^51 - 1) = rbx & (2^51 - 1)
    orq %rbx, %rcx                  // rcx = (h4 >> 51)

    /* out[0] = out[0] + 19 * carry */
    leaq (%rcx, %rcx, 8), %r10      // r10 = 9 * rcx
    leaq (%rcx, %r10, 2), %rcx      // rcx = 2 * (9 * rcx) + rcx = 19 * rcx
    addq %rcx, %rsi

    /* Carry h2's overflow into h3' */
    movq %rax, %r10
    andq %rbp, %rax                 // h2 &= (2^51 - 1)
    shrq $51, %r10
    addq %r10, %r8

    /* out[1] += out[0] >> 51 */
    movq %rsi, %r10

    /* out[0] &= (2^51 - 1) */
    andq %rbp, %rsi
    shrq $51, %r10
    addq %r10, %rdx

    /* Store the results. */
    movq %rsi, (%rdi)               // h0'
    movq %rdx, 8(%rdi)              // h1'
    movq %rax, 16(%rdi)             // h2'
    movq %r8, 24(%rdi)              // h3'
    movq %r9, 32(%rdi)              // h4'
.endm
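
/* Notes on u51mul/reduce above:
 * An element x is held in radix 2^51: x = h0 + h1*2^51 + h2*2^102 + h3*2^153 + h4*2^204.
 * After a multiplication, each limb hi is a 128-bit sum kept as a (low, high)
 * register pair, so the carry hi >> 51 is assembled from the pair as
 * (low >> 51) | (high << 13); that is the shrq $51 / shlq $13 / orq pattern in
 * the reduce macro. The carry out of h4 has weight 2^255 and is folded back
 * into h0 as 19 * carry, because 2^255 mod (2^255 - 19) = 19.
 */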

#############################################################
# void Fp51Mul(Fp51 *out, const Fp51 *f, const Fp51 *g);
#############################################################

.globl Fp51Mul
.type Fp51Mul, @function
.align 32
Fp51Mul:
.cfi_startproc
    /* Save registers. */
    push_stack

    /* The parameters are passed in registers rdi, rsi and rdx.
     * rdi: out; rsi: f; rdx: g; Fp51 is a [u64; 5] array.
     * rdx is overwritten by mulq in the calculation below,
     * so the data it points to must be loaded in advance.
     */
    movq (%rsi), %rax               // f0
    movq (%rdx), %rbx               // g0
    movq 8(%rdx), %r14              // g1
    movq 16(%rdx), %r15             // g2
    movq 24(%rdx), %rbp             // g3, kept in a register that is not clobbered early
    movq 32(%rdx), %rcx             // g4

    /* Spill the out pointer so rdi can be reused in the calculation (it will hold 19 * g4). */
    movq %rdi, 24(%rsp)
    movq %rax, %rdi                 // f0
    /* r14, r15, rbx and rcx are overwritten later, so g0-g2 are spilled to the
     * stack. The spills are interleaved with the multiplications for performance.
     */

    /* h0 = f0g0 + 19f1g4 + 19f2g3 + 19f3g2 + 19f4g1; stored in r8, r9 */
    mulq %rbx                       // (rax, rdx) = f0 * g0; rax holds the low half
    movq %rax, %r8
    movq %rdi, %rax                 // f0
    movq %rbx, 16(%rsp)             // g0
    movq %rdx, %r9

    /* h1 = f0g1 + f1g0 + 19f2g4 + 19f3g3 + 19f4g2; stored in r10, r11 */
    mulq %r14                       // (rax, rdx) = f0 * g1
    movq %rax, %r10
    movq %rdi, %rax                 // f0
    leaq (%rcx, %rcx, 8), %rbx      // g4 * 8 + g4 = g4 * 9
    movq %r14, 8(%rsp)              // g1
    movq %rdx, %r11

    /* h2 = f0g2 + f1g1 + f2g0 + 19f3g4 + 19f4g3; stored in r12, r13 */
    mulq %r15                       // (rax, rdx) = f0 * g2
    movq %rax, %r12
    movq %rdi, %rax                 // f0
    leaq (%rcx, %rbx, 2), %rdi      // rdi = 2 * (9 * g4) + g4 = 19 * g4, computed before rcx is overwritten
    movq %r15, (%rsp)               // g2
    movq %rdx, %r13

    /* h3 = f0g3 + f1g2 + f2g1 + f3g0 + 19f4g4; stored in r14, r15 */
    mulq %rbp                       // (rax, rdx) = f0 * g3
    movq %rax, %r14
    movq (%rsi), %rax               // f0
    movq %rdx, %r15

    /* h4 = f0g4 + f1g3 + f2g2 + f3g1 + f4g0; stored in rbx, rcx */
    mulq %rcx                       // (rax, rdx) = f0 * g4
    movq %rax, %rbx
    movq 8(%rsi), %rax              // f1
    movq %rdx, %rcx

    /* Terms involving 19 * g4 */
    u51mul %rdi, %r8, %r9, 16(%rsi)     // (rax, rdx) = 19 * f1 * g4; load f2
    u51mul %rdi, %r10, %r11, 24(%rsi)   // (rax, rdx) = 19 * f2 * g4; load f3
    u51mul %rdi, %r12, %r13, 32(%rsi)   // (rax, rdx) = 19 * f3 * g4; load f4

    mulq %rdi                       // (rax, rdx) = 19 * f4 * g4
    imulq $19, %rbp, %rdi           // 19 * g3
    addq %rax, %r14
    movq 8(%rsi), %rax              // f1
    adcq %rdx, %r15

    /* Terms involving g3 */
    mulq %rbp                       // (rax, rdx) = f1 * g3
    movq (%rsp), %rbp               // g2
    addq %rax, %rbx
    movq 16(%rsi), %rax             // f2
    adcq %rdx, %rcx

    u51mul %rdi, %r8, %r9, 24(%rsi)     // (rax, rdx) = 19 * f2 * g3; load f3
    u51mul %rdi, %r10, %r11, 32(%rsi)   // (rax, rdx) = 19 * f3 * g3; load f4

    mulq %rdi                       // (rax, rdx) = 19 * f4 * g3
    imulq $19, %rbp, %rdi           // 19 * g2
    addq %rax, %r12
    movq 8(%rsi), %rax              // f1
    adcq %rdx, %r13

    /* Terms involving g2 */
    u51mul %rbp, %r14, %r15, 16(%rsi)   // (rax, rdx) = f1 * g2; load f2

    mulq %rbp                       // (rax, rdx) = f2 * g2
    movq 8(%rsp), %rbp              // g1
    addq %rax, %rbx
    movq 24(%rsi), %rax             // f3
    adcq %rdx, %rcx

    u51mul %rdi, %r8, %r9, 32(%rsi)     // (rax, rdx) = 19 * f3 * g2; load f4
    u51mul %rdi, %r10, %r11, 8(%rsi)    // (rax, rdx) = 19 * f4 * g2; load f1

    /* Terms involving g1 */
    mulq %rbp                       // (rax, rdx) = f1 * g1
    imulq $19, %rbp, %rdi           // 19 * g1
    addq %rax, %r12
    movq 16(%rsi), %rax             // f2
    adcq %rdx, %r13

    u51mul %rbp, %r14, %r15, 24(%rsi)   // (rax, rdx) = f2 * g1; load f3

    mulq %rbp                       // (rax, rdx) = f3 * g1
    movq 16(%rsp), %rbp             // g0
    addq %rax, %rbx
    movq 32(%rsi), %rax             // f4
    adcq %rdx, %rcx

    u51mul %rdi, %r8, %r9, 8(%rsi)      // (rax, rdx) = 19 * f4 * g1; load f1

    /* Terms involving g0 */
    u51mul %rbp, %r10, %r11, 16(%rsi)   // (rax, rdx) = f1 * g0; load f2
    u51mul %rbp, %r12, %r13, 24(%rsi)   // (rax, rdx) = f2 * g0; load f3
    u51mul %rbp, %r14, %r15, 32(%rsi)   // (rax, rdx) = f3 * g0; load f4

    mulq %rbp                       // (rax, rdx) = f4 * g0
    addq %rax, %rbx
    adcq %rdx, %rcx

    /* Reload the out pointer. */
    movq 24(%rsp), %rdi

    reduce

    /* Restore registers. */
    pop_stack
    ret
.cfi_endproc
.size Fp51Mul, .-Fp51Mul
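
/* Fp51Square below specializes Fp51Mul to g = f: the symmetric cross terms
 * merge, e.g. f0*f1 + f1*f0 = 2*f0*f1, so each hk needs roughly half the
 * multiplications. Wrapped cross terms pick up both the factor 2 and the
 * factor 19, giving coefficients 38 = 2 * 19, e.g.
 *     h1 = 2*f0*f1 + 19*f3^2 + 38*f2*f4.
 */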

#############################################################
# void Fp51Square(Fp51 *out, const Fp51 *f);
#############################################################

.globl Fp51Square
.type Fp51Square, @function
.align 32
Fp51Square:
.cfi_startproc
    /* Save registers. */
    push_stack

    /* The parameters are passed in registers rdi and rsi.
     * rdi: out; rsi: f; Fp51 is a [u64; 5] array.
     * Only non-adjacent limbs are loaded here, leaving registers free for the calculation.
     */
    movq (%rsi), %rax               // f0
    movq 16(%rsi), %r15             // f2
    movq 32(%rsi), %rcx             // f4

    /* The stack layout matches Fp51Mul: the out pointer is spilled so that rdi
     * can be reused in the calculation (it will hold 19 * f4).
     */
    leaq (%rax, %rax, 1), %rbp      // 2 * f0
    movq %rdi, 24(%rsp)

    /* h0 = f0^2 + 38f1f4 + 38f2f3; stored in r8, r9 */
    mulq %rax                       // (rax, rdx) = f0^2
    movq %rax, %r8
    movq 8(%rsi), %rax              // f1
    movq %rdx, %r9

    /* h1 = 19f3^2 + 2f0f1 + 38f2f4; stored in r10, r11 */
    mulq %rbp                       // (rax, rdx) = 2f0 * f1
    movq %rax, %r10
    movq %r15, %rax                 // f2
    movq %r15, 16(%rsp)             // spill f2 so its register can be reused later
    movq %rdx, %r11

    /* h2 = f1^2 + 2f0f2 + 38f3f4; stored in r12, r13 */
    mulq %rbp                       // (rax, rdx) = 2f0 * f2
    movq %rax, %r12
    movq 24(%rsi), %rax             // f3
    movq %rdx, %r13

    imulq $19, %rcx, %rdi           // store 19 * f4 in rdi before rcx is overwritten

    /* h3 = 19f4^2 + 2f0f3 + 2f1f2; stored in r14, r15 */
    mulq %rbp                       // (rax, rdx) = 2f0 * f3
    movq %rax, %r14
    movq %rcx, %rax                 // f4
    movq %rdx, %r15

    /* h4 = f2^2 + 2f0f4 + 2f1f3; stored in rbx, rcx */
    mulq %rbp                       // (rax, rdx) = 2f0 * f4
    movq %rax, %rbx
    movq %rcx, %rax                 // f4
    movq %rdx, %rcx

    /* Terms involving 19 * f4
     * h3
     */
    u51mul %rdi, %r14, %r15, 8(%rsi)    // (rax, rdx) = 19 * f4^2; load f1

    movq 24(%rsi), %rsi             // f3

    /* Terms involving f1
     * h2
     */
    leaq (%rax, %rax, 1), %rbp      // 2 * f1
    u51mul %rax, %r12, %r13, 16(%rsp)   // (rax, rdx) = f1^2; load f2

    /* h3 */
    u51mul %rbp, %r14, %r15, %rsi       // (rax, rdx) = 2 * f1 * f2; load f3

    /* h4 */
    u51mul %rbp, %rbx, %rcx, %rbp       // (rax, rdx) = 2 * f1 * f3; load 2 * f1

    imulq $19, %rsi, %rbp           // 19 * f3

    /* h0 */
    mulq %rdi                       // (rax, rdx) = 2 * f1 * 19 * f4
    addq %rax, %r8
    leaq (%rsi, %rsi, 1), %rax      // 2 * f3
    adcq %rdx, %r9

    /* Terms involving f3
     * h2
     */
    u51mul %rdi, %r12, %r13, %rsi       // (rax, rdx) = 2 * f3 * 19 * f4; load f3

    /* h1 */
    u51mul %rbp, %r10, %r11, 16(%rsp)   // (rax, rdx) = 19 * f3^2; load f2

    /* Terms involving f2
     * h4
     */
    leaq (%rax, %rax, 1), %rsi      // 2 * f2
    u51mul %rax, %rbx, %rcx, %rbp       // (rax, rdx) = f2^2; load 19 * f3

    /* h0 */
    u51mul %rsi, %r8, %r9, %rsi         // (rax, rdx) = 2 * f2 * 19 * f3; load 2 * f2

    /* h1 */
    mulq %rdi                       // (rax, rdx) = 2 * f2 * 19 * f4
    addq %rax, %r10
    adcq %rdx, %r11

    /* Reload the out pointer. */
    movq 24(%rsp), %rdi

    reduce

    /* Restore registers. */
    pop_stack
    ret
.cfi_endproc
.size Fp51Square, .-Fp51Square
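
/* The scalar 121666 used by Fp51MulScalar below is the X25519 ladder constant
 * a24 = (A + 2) / 4 = (486662 + 2) / 4, where A = 486662 is the Montgomery
 * curve coefficient of Curve25519 (see RFC 7748).
 */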

#############################################################
# void Fp51MulScalar(Fp51 *out, const Fp51 *in);
#############################################################

.globl Fp51MulScalar
.type Fp51MulScalar, @function
.align 32
Fp51MulScalar:
.cfi_startproc
    /* Save registers. */
    push_stack

    /* The parameters are passed in registers rdi and rsi.
     * rdi: out; rsi: in; Fp51 is a [u64; 5] array.
     * The scalar is the constant 121666. The stack layout matches Fp51Mul.
     */

    /* h0 */
    movl $121666, %eax
    mulq (%rsi)                     // f0 * 121666
    movq %rax, %r8
    movl $121666, %eax              // reload rax as soon as it is free
    movq %rdx, %r9

    /* h1 */
    mulq 8(%rsi)                    // f1 * 121666
    movq %rax, %r10
    movl $121666, %eax
    movq %rdx, %r11

    /* h2 */
    mulq 16(%rsi)                   // f2 * 121666
    movq %rax, %r12
    movl $121666, %eax
    movq %rdx, %r13

    /* h3 */
    mulq 24(%rsi)                   // f3 * 121666
    movq %rax, %r14
    movl $121666, %eax
    movq %rdx, %r15

    /* h4 */
    mulq 32(%rsi)                   // f4 * 121666
    movq %rax, %rbx
    movq %rdx, %rcx

    reduce

    /* Restore registers. */
    pop_stack
    ret
.cfi_endproc
.size Fp51MulScalar, .-Fp51MulScalar
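
/* The Fp64 routines below use a saturated 4 x 64-bit representation: an
 * element is x = limb0 + limb1*2^64 + limb2*2^128 + limb3*2^192, kept in
 * [0, 2^256) rather than fully reduced. Overflow beyond 2^256 is folded with
 * the constant 38, since 2^256 mod (2^255 - 19) = 2 * 19 = 38, and a final
 * 2^255 overflow folds as 19. The code relies on the BMI2 mulx and ADX
 * adcx/adox instructions, which drive two independent carry chains (CF and
 * OF) through the multiply-accumulate sequences. Judging by the register use,
 * the functions take the same (out, f[, g]) pointer arguments as the Fp51
 * routines above.
 */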

/**
 * Fp64 reduce:
 * +------+-----+-----+-----+------+
 * |      | r15 | r14 | r13 | r12  |
 * |      |     |     |     |  38  |
 * +-------------------------------+
 * |      |     |     | r12'| r12' |
 * |      |     | r13'| r13'|      |
 * |      | r14'| r14'|     |      |
 * | r15' | r15'|     |     |      |
 * +-------------------------------+
 * |      | r11'| r10'| r9' | r8'  |
 * |      |     |     |     |19r15'|
 * +-------------------------------+
 * |      | r11 | r10 | r9  | r8   |
 * +------+-----+-----+-----+------+
 */
.macro Fp64Reduce
    xorq %rsi, %rsi
    movq $38, %rdx
    mulx %r12, %rax, %rbx           // fold the high limbs: acc += 38 * (r15:r14:r13:r12)
    adcx %rax, %r8
    adox %rbx, %r9
    mulx %r13, %rax, %rbx
    adcx %rax, %r9
    adox %rbx, %r10
    mulx %r14, %rax, %rbx
    adcx %rax, %r10
    adox %rbx, %r11
    mulx %r15, %rax, %r12
    adcx %rax, %r11
    adcx %rsi, %r12
    adox %rsi, %r12

    shld $1, %r11, %r12             // r12 = carry at 2^255
    movq $0x7FFFFFFFFFFFFFFF, %rbp
    andq %rbp, %r11                 // keep the low 255 bits
    imulq $19, %r12, %r12           // 2^255 folds back as 19
    addq %r12, %r8
    adcx %rsi, %r9
    adcx %rsi, %r10
    adcx %rsi, %r11

    movq 0(%rsp), %rdi              // reload the out pointer saved by pushq %rdi
    movq %r9, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)
    movq %r8, 0(%rdi)
.endm

.globl Fp64Mul
.type Fp64Mul, @function
.align 32
Fp64Mul:
.cfi_startproc
    pushq %rbp
    pushq %rbx
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    pushq %rdi

/**
 * (f3, f2, f1, f0) * (g3, g2, g1, g0) :
 * +    +    +    +    +    +    +    +    +
 * |    |    |    |    | A3 | A2 | A1 | A0 |
 * |    |    |    |    | B3 | B2 | B1 | B0 |
 * +---------------------------------------+
 * |    |    |    |    |    |    |A0B0|A0B0|
 * |    |    |    |    |    |A1B0|A1B0|    |
 * |    |    |    |    |A2B0|A2B0|    |    |
 * |    |    |    |A3B0|A3B0|    |    |    |
 * |    |    |    |    |    |A0B1|A0B1|    |
 * |    |    |    |    |A1B1|A1B1|    |    |
 * |    |    |    |A2B1|A2B1|    |    |    |
 * |    |    |A3B1|A3B1|    |    |    |    |
 * |    |    |    |    |A0B2|A0B2|    |    |
 * |    |    |    |A1B2|A1B2|    |    |    |
 * |    |    |A2B2|A2B2|    |    |    |    |
 * |    |A3B2|A3B2|    |    |    |    |    |
 * |    |    |    |A0B3|A0B3|    |    |    |
 * |    |    |A1B3|A1B3|    |    |    |    |
 * |    |A2B3|A2B3|    |    |    |    |    |
 * |A3B3|A3B3|    |    |    |    |    |    |
 * +---------------------------------------+
 * |r15 |r14 |r13 |r12 |r11 |r10 |r9  |r8  |
 * +    +    +    +    +    +    +    +    +
 */
    movq 0(%rdx), %rcx              // g0
    movq 8(%rdx), %rbp              // g1
    movq 16(%rdx), %rdi             // g2
    movq 24(%rdx), %r15             // g3
    movq 0(%rsi), %rdx              // f0
    xorq %r14, %r14                 // r14 = 0; also clears CF and OF

    // f0 * (g3, g2, g1, g0)
    mulx %rcx, %r8, %rax
    mulx %rbp, %r9, %rbx
    adcx %rax, %r9
    mulx %rdi, %r10, %rax
    adcx %rbx, %r10
    mulx %r15, %r11, %r12
    movq 8(%rsi), %rdx              // f1
    adcx %rax, %r11
    adcx %r14, %r12

    // f1 * (g3, g2, g1, g0)
    mulx %rcx, %rax, %rbx
    adcx %rax, %r9
    adox %rbx, %r10
    mulx %rbp, %rax, %rbx
    adcx %rax, %r10
    adox %rbx, %r11
    mulx %rdi, %rax, %rbx
    adcx %rax, %r11
    adox %rbx, %r12
    mulx %r15, %rax, %r13
    movq 16(%rsi), %rdx             // f2
    adcx %rax, %r12
    adox %r14, %r13
    adcx %r14, %r13

    // f2 * (g3, g2, g1, g0)
    mulx %rcx, %rax, %rbx
    adcx %rax, %r10
    adox %rbx, %r11
    mulx %rbp, %rax, %rbx
    adcx %rax, %r11
    adox %rbx, %r12
    mulx %rdi, %rax, %rbx
    adcx %rax, %r12
    adox %rbx, %r13
    mulx %r15, %rax, %r14
    movq 24(%rsi), %rdx             // f3
    adcx %rax, %r13
    movq $0, %rsi
    adox %rsi, %r14
    adcx %rsi, %r14

    // f3 * (g3, g2, g1, g0)
    mulx %rcx, %rax, %rbx
    adcx %rax, %r11
    adox %rbx, %r12
    mulx %rbp, %rax, %rbx
    adcx %rax, %r12
    adox %rbx, %r13
    mulx %rdi, %rax, %rbx
    adcx %rax, %r13
    adox %rbx, %r14
    mulx %r15, %rax, %r15
    adcx %rax, %r14
    adox %rsi, %r15
    adcx %rsi, %r15

    // reduce modulo 2^255 - 19
    Fp64Reduce

    movq 8(%rsp), %r15
    movq 16(%rsp), %r14
    movq 24(%rsp), %r13
    movq 32(%rsp), %r12
    movq 40(%rsp), %rbx
    movq 48(%rsp), %rbp
    leaq 56(%rsp), %rsp

    ret
.cfi_endproc
.size Fp64Mul, .-Fp64Mul
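
/* Fp64Sqr below computes each off-diagonal product AiAj (i < j) only once,
 * doubles the whole partial sum with a cascade of shld instructions, and then
 * adds the diagonal squares AiAi, as laid out in the diagram that follows.
 */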

.globl Fp64Sqr
.type Fp64Sqr, @function
.align 32
Fp64Sqr:
.cfi_startproc
    pushq %rbp
    pushq %rbx
    pushq %r12
    pushq %r13
    pushq %r14
    pushq %r15
    pushq %rdi
/**
 * (f3, f2, f1, f0) ^ 2 :
 * +----+----+----+----+----+----+----+----+----+
 * |    |    |    |    |    | A3 | A2 | A1 | A0 |
 * | *  |    |    |    |    | A3 | A2 | A1 | A0 |
 * +--------------------------------------------+
 * |    |    |    |    |    |    |A0A1|A0A1|    |
 * |    |    |    |    |    |A0A2|A0A2|    |    |
 * | +  |    |    |    |A0A3|A0A3|    |    |    |
 * |    |    |    |    |A1A2|A1A2|    |    |    |
 * |    |    |    |A1A3|A1A3|    |    |    |    |
 * |    |    |A2A3|A2A3|    |    |    |    |    |
 * +--------------------------------------------+
 * | *2 |    |r14`|r13`|r12`|r11`|r10`|r9` |    |
 * +--------------------------------------------+
 * |    |r15'|r14'|r13'|r12'|r11'|r10'|r9' |    |
 * +--------------------------------------------+
 * |    |    |    |    |    |    |    |A0A0|A0A0|
 * |    |    |    |    |    |A1A1|A1A1|    |    |
 * | +  |    |    |A2A2|A2A2|    |    |    |    |
 * |    |A3A3|A3A3|    |    |    |    |    |    |
 * +--------------------------------------------+
 * |    |r15 |r14 |r13 |r12 |r11 |r10 |r9  |r8  |
 * +--------------------------------------------+
 */
    movq 0(%rsi), %rbx              // a0
    movq 8(%rsi), %rcx              // a1
    movq 16(%rsi), %rbp             // a2
    movq 24(%rsi), %rdi             // a3
    xorq %r15, %r15                 // r15 = 0; also clears CF and OF

    // (a1, a2, a3) * a0
    movq %rbx, %rdx
    mulx %rcx, %r9, %rsi
    mulx %rbp, %r10, %rax
    adcx %rsi, %r10
    mulx %rdi, %r11, %r12
    movq %rcx, %rdx
    adcx %rax, %r11
    adcx %r15, %r12

    // (a2, a3) * a1
    mulx %rbp, %rsi, %rax
    adcx %rsi, %r11
    adox %rax, %r12
    mulx %rdi, %rsi, %r13
    movq %rbp, %rdx
    adcx %rsi, %r12
    adcx %r15, %r13
    adox %r15, %r13

    // a3 * a2
    mulx %rdi, %rsi, %r14
    movq %rbx, %rdx
    adcx %rsi, %r13
    adcx %r15, %r14

    // double the off-diagonal sum (r9 ... r14)
    shld $1, %r14, %r15
    shld $1, %r13, %r14
    shld $1, %r12, %r13
    shld $1, %r11, %r12
    shld $1, %r10, %r11
    shld $1, %r9, %r10
    shlq $1, %r9
    xorq %r8, %r8                   // clear the CF flag
    // a0 * a0
    mulx %rdx, %r8, %rax
    movq %rcx, %rdx
    adcx %rax, %r9

    // a1 * a1
    mulx %rdx, %rsi, %rax
    movq %rbp, %rdx
    adcx %rsi, %r10
    adcx %rax, %r11

    // a2 * a2
    mulx %rdx, %rsi, %rax
    movq %rdi, %rdx
    adcx %rsi, %r12
    adcx %rax, %r13

    // a3 * a3
    mulx %rdx, %rsi, %rax
    adcx %rsi, %r14
    adcx %rax, %r15

    // reduce modulo 2^255 - 19
    Fp64Reduce

    movq 8(%rsp), %r15
    movq 16(%rsp), %r14
    movq 24(%rsp), %r13
    movq 32(%rsp), %r12
    movq 40(%rsp), %rbx
    movq 48(%rsp), %rbp
    leaq 56(%rsp), %rsp
    ret
.cfi_endproc
.size Fp64Sqr, .-Fp64Sqr

.globl Fp64MulScalar
.type Fp64MulScalar, @function
.align 32
Fp64MulScalar:
.cfi_startproc
    movl $121666, %edx
    mulx 0(%rsi), %r8, %rax
    mulx 8(%rsi), %r9, %rcx
    addq %rax, %r9
    mulx 16(%rsi), %r10, %rax
    adcx %rcx, %r10
    mulx 24(%rsi), %r11, %rcx
    adcx %rax, %r11
    movl $0, %edx
    adcx %rdx, %rcx
    movq $0x7FFFFFFFFFFFFFFF, %rax
    shld $1, %r11, %rcx             // rcx = carry at 2^255
    andq %rax, %r11                 // keep the low 255 bits
    imulq $19, %rcx, %rcx           // 2^255 folds back as 19

    addq %rcx, %r8
    adcx %rdx, %r9
    movq %r8, 0(%rdi)
    adcx %rdx, %r10
    movq %r9, 8(%rdi)
    adcx %rdx, %r11
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)
    ret
.cfi_endproc
.size Fp64MulScalar, .-Fp64MulScalar

.globl Fp64Add
.type Fp64Add, @function
.align 32
Fp64Add:
.cfi_startproc
    movq 0(%rsi), %r8
    movq 8(%rsi), %r9
    addq 0(%rdx), %r8
    adcx 8(%rdx), %r9
    movq 16(%rsi), %r10
    movq 24(%rsi), %r11
    adcx 16(%rdx), %r10
    adcx 24(%rdx), %r11

    movq $0, %rax
    movq $38, %rcx
    cmovae %rax, %rcx               // rcx = 38 if the addition carried out of 2^256, else 0
    addq %rcx, %r8                  // fold the carry: 2^256 folds back as 38
    adcx %rax, %r9
    adcx %rax, %r10
    movq %r9, 8(%rdi)
    adcx %rax, %r11
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)

    cmovc %rcx, %rax                // if the fold itself carried, add 38 once more
    addq %rax, %r8
    movq %r8, 0(%rdi)
    ret
.cfi_endproc
.size Fp64Add, .-Fp64Add

.globl Fp64Sub
.type Fp64Sub, @function
.align 32
Fp64Sub:
.cfi_startproc
    movq 0(%rsi), %r8
    movq 8(%rsi), %r9
    subq 0(%rdx), %r8
    sbbq 8(%rdx), %r9
    movq 16(%rsi), %r10
    movq 24(%rsi), %r11
    sbbq 16(%rdx), %r10
    sbbq 24(%rdx), %r11

    movq $0, %rax
    movq $38, %rcx
    cmovae %rax, %rcx               // rcx = 38 if the subtraction borrowed, else 0

    subq %rcx, %r8                  // fold the borrow: subtracting 2^256 is subtracting 38 mod p
    sbbq %rax, %r9
    sbbq %rax, %r10
    movq %r9, 8(%rdi)
    sbbq %rax, %r11
    movq %r10, 16(%rdi)
    cmovc %rcx, %rax                // if the fold itself borrowed, subtract 38 once more
    movq %r11, 24(%rdi)
    subq %rax, %r8
    movq %r8, 0(%rdi)

    ret
.cfi_endproc
.size Fp64Sub, .-Fp64Sub
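
/* Fp64PolyToData below converts the redundant representation in [0, 2^256) to
 * the canonical value in [0, 2^255 - 19): bit 255 is folded as +19, then a
 * further 19 is added so that bit 255 of the sum indicates x >= p. If that bit
 * is clear, the input was already below p and the extra 19 is subtracted back;
 * otherwise the sum is kept with bit 255 stripped, which yields
 * x + 19 - 2^255 = x - p.
 */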

.globl Fp64PolyToData
.type Fp64PolyToData, @function
.align 32
Fp64PolyToData:
.cfi_startproc
    movq 24(%rsi), %r11
    movq 16(%rsi), %r10
    xorq %rax, %rax

    leaq (%r11, %r11, 1), %rcx
    sarq $63, %r11
    shrq $1, %rcx                   // rcx = limb3 with bit 255 cleared
    andq $19, %r11                  // r11 = 19 if bit 255 was set, else 0
    addq $19, %r11

    movq 0(%rsi), %r8
    movq 8(%rsi), %r9

    addq %r11, %r8                  // add 19 (plus another 19 for a set bit 255)
    adcx %rax, %r9
    adcx %rax, %r10
    adcx %rax, %rcx

    leaq (%rcx, %rcx, 1), %r11
    sarq $63, %rcx
    shrq $1, %r11                   // r11 = top limb with bit 255 cleared
    notq %rcx
    andq $19, %rcx                  // rcx = 19 if bit 255 of (x + 19) is clear

    subq %rcx, %r8                  // undo the +19 when the input was already below p
    sbbq $0, %r9
    movq %r8, 0(%rdi)
    movq %r9, 8(%rdi)
    sbbq $0, %r10
    sbbq $0, %r11
    movq %r10, 16(%rdi)
    movq %r11, 24(%rdi)

    ret
.cfi_endproc
.size Fp64PolyToData, .-Fp64PolyToData

#endif