/* (code-browser navigation chrome removed from the captured source) */
/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_X25519

.file "x25519_x86_64.S"
.text

.macro push_stack
    /* Prologue shared by the Fp51* routines: save the six callee-saved
     * registers of the SysV AMD64 ABI (rbx, rbp, r12-r15). They are
     * restored by the matching pop_stack macro. */
    pushq   %rbx
    pushq   %rbp
    pushq   %r12
    pushq   %r13
    pushq   %r14
    pushq   %r15

    /* Reserve 32 bytes of scratch stack; Fp51Mul/Fp51Square use slots
     * 0/8/16/24(%rsp) for spilled limbs and the saved out pointer. */
    leaq	-32(%rsp), %rsp
.endm

.macro pop_stack
    /* Epilogue matching push_stack: reload the callee-saved registers.
     * They sit just above the 32-byte scratch area, in reverse push
     * order (r15 was pushed last, so it is nearest to rsp). */
    movq    32(%rsp),%r15
    movq    40(%rsp),%r14
    movq    48(%rsp),%r13
    movq    56(%rsp),%r12
    movq    64(%rsp),%rbp
    movq    72(%rsp),%rbx

    /* Release the frame: 32 bytes of scratch plus 6 saved registers
       (48 bytes) = 80 bytes in total. */
    leaq    80(%rsp), %rsp
.endm

.macro u51mul cur, low, high, next
    /* One multiply-accumulate step for the 51-bit-limb code:
     *   (\high:\low) += rax * \cur   (64x64 -> 128 via mulq)
     * then preload rax = \next for the following step.
     * Clobbers rdx (high half of mulq) and rax. */
    mulq    \cur
    addq    %rax, \low
    movq    \next, %rax
    adcq    %rdx, \high
.endm

.macro reduce
    /* Carry-reduce the five 128-bit accumulators
     *   h0 = r9:r8, h1 = r11:r10, h2 = r13:r12, h3 = r15:r14, h4 = rcx:rbx
     * into five 51-bit limbs and store them at the out pointer (%rdi).
     * Each (hi:lo) pair splits into lo & (2^51 - 1) plus a 128-bit
     * right-shift by 51 rebuilt with shr/shl/or; the final carry out of
     * h4 wraps into h0 multiplied by 19, since 2^255 = 19 (mod 2^255 - 19). */

    /* Mask retaining the low 51 bits of a limb. */
    movq    $0x7ffffffffffff, %rbp

    /* Calculate h2' */
    movq    %r12, %rax
    shrq    $51, %r12
    shlq    $13, %r13

    /* Calculate h0' */
    movq    %r8, %rsi
    shrq    $51, %r8
    shlq    $13, %r9

    /* Calculate h2' */
    andq    %rbp, %rax              // h2' = rax = h2 & (2^51 - 1) = r12 & (2^51 - 1)
    orq     %r12, %r13              // r13 = (h2 >> 51), rebuilt from both 64-bit halves
    addq    %r13, %r14              // h3 += (h2 >> 51)
    adcq    $0, %r15

    /* Calculate h0' */
    andq    %rbp, %rsi              // h0' = rsi = h0 & (2^51 - 1) = r8 & (2^51 - 1)
    orq     %r8, %r9                // r9 = (h0 >> 51)
    addq    %r9, %r10               // h1 += (h0 >> 51)
    adcq    $0, %r11

    /* Calculate h3' */
    movq    %r14, %r8
    shrq    $51, %r14
    shlq    $13, %r15
    andq    %rbp, %r8               // h3' = r8 = h3 & (2^51 - 1) = r14 & (2^51 - 1)
    orq     %r14, %r15              // r15 = (h3 >> 51)
    addq    %r15, %rbx              // h4 += (h3 >> 51)
    adcq    $0, %rcx

    /* Calculate h1' */
    movq    %r10, %rdx
    shrq    $51, %r10
    shlq    $13, %r11
    andq    %rbp, %rdx              // h1' = rdx = h1 & (2^51 - 1) = r10 & (2^51 - 1)
    orq     %r10, %r11              // r11 = (h1 >> 51)
    addq    %r11, %rax              // h2 += (h1 >> 51)

    /* Calculate h4' */
    movq    %rbx, %r9
    shrq    $51, %rbx
    shlq    $13, %rcx
    andq    %rbp, %r9               // h4' = r9 = h4 & (2^51 - 1) = rbx & (2^51 - 1)
    orq     %rbx, %rcx              // rcx = (h4 >> 51)

    /* out[0] = out[0] + 19 * carry; 2^255 = 19 (mod 2^255 - 19) */
    leaq    (%rcx, %rcx, 8), %r10   // r10 = 9 * rcx
    leaq    (%rcx, %r10, 2), %rcx   // rcx = 2 * (9 * rcx) + rcx = 19 * rcx
    addq    %rcx, %rsi

    /* h2 remaining */
    movq    %rax, %r10
    andq    %rbp, %rax              // h2 &= (2^51 - 1)
    shrq    $51, %r10
    addq    %r10, %r8

    /* out[1] += out[0] >> 51 */
    movq    %rsi, %r10

    /* out[0] &= (2^51 - 1) */
    andq    %rbp, %rsi
    shrq    $51, %r10
    addq    %r10, %rdx

    /* Storing Results */
    movq    %rsi, (%rdi)            // h0'
    movq    %rdx, 8(%rdi)           // h1'
    movq    %rax, 16(%rdi)          // h2'
    movq    %r8, 24(%rdi)           // h3'
    movq    %r9, 32(%rdi)           // h4'
.endm

#############################################################
# void Fp51Mul (Fp51 *out, const Fp51 *f, const Fp51 *g);
#
# Schoolbook multiplication of two field elements in 5x51-bit
# radix, reducing products of high limbs via 2^255 = 19 (mod p):
#   h0 = f0g0 + 19(f1g4 + f2g3 + f3g2 + f4g1), etc.
# ABI:  SysV AMD64. In: rdi = out, rsi = f, rdx = g.
# out may presumably alias f or g (it is only written at the end,
# in `reduce`) — TODO confirm against callers.
#############################################################

.globl  Fp51Mul
.type   Fp51Mul, @function
.align  32
Fp51Mul:
.cfi_startproc
    /* Save Register */
    push_stack

    /* The input and output parameters are transferred by registers rdi, rsi, and rdx.
     * rdi: out; rsi: f; rdx: g; fp51 is an array of [u64; 5]
     * rdx will be overwritten in subsequent calculation (mulq writes its
     * high half there), so the g limbs are loaded out of rdx in advance.
     */
    movq    (%rsi), %rax                // f0
    movq    (%rdx), %rbx                // g0
    movq    8(%rdx), %r14               // g1
    movq    16(%rdx), %r15              // g2
    movq    24(%rdx), %rbp              // g3, Store g0-g3, store g3 in unaffected registers
    movq    32(%rdx), %rcx              // g4

    /* Stores the out pointer and frees the rdi so that the rdi can be used in subsequent calculations. Stores 19 * g4. */
    movq	%rdi, 24(%rsp)
    movq    %rax, %rdi                  // f0
    /* r14, r15, rbx, and rcx will be overwritten in subsequent calculations. g0 to g2 will be stored.
     * Storage actions will be scattered in the calculation code for performance purposes.
     */

    /* h0 = f0g0 + 19f1g4 + 19f2g3 + 19f3g2 + 19f4g1; Stored in r8, r9 */
    mulq    %rbx                        // (rax, rdx) = f0 * g0, in le
    movq    %rax, %r8
    movq    %rdi, %rax                  // f0
    movq    %rbx, 16(%rsp)              // g0
    movq    %rdx, %r9

    /* h1 = f0g1 + f1g0 + 19f2g4 + 19f3g3 + 19f4g2; Stored in r10, r11 */
    mulq    %r14                        // (rax, rdx) = f0 * g1
    movq    %rax, %r10
    movq    %rdi, %rax                  // f0
    leaq    (%rcx, %rcx, 8), %rbx       // g4 * 8 + g4 = g4 * 9
    movq    %r14, 8(%rsp)               // g1
    movq    %rdx, %r11

    /* h2 = f0g2 + f1g1 + f2g0 + 19f3g4 + 19f4g3; Stored in r12, r13 */
    mulq    %r15                        // (rax, rdx) = f0 * g2
    movq    %rax, %r12
    movq    %rdi, %rax                  // f0
    leaq    (%rcx, %rbx, 2), %rdi       // rdi = 2 * (9 * g4) + g4 = 19 * g4, before rcx is overwritten
    movq    %r15, (%rsp)                // g2
    movq    %rdx, %r13

    /* h3 = f0g3 + f1g2 + f2g1 + f3g0 + 19f4g4; Stored in r14, r15 */
    mulq    %rbp                        // (rax, rdx) = f0 * g3
    movq    %rax, %r14
    movq    (%rsi), %rax                // f0
    movq    %rdx, %r15

    /* h4 = f0g4 + f1g3 + f2g2 + f3g1 + f4g0; Stored in rbx, rcx */
    mulq    %rcx                        // (rax, rdx) = f0 * g4
    movq    %rax, %rbx
    movq    8(%rsi), %rax               // f1
    movq    %rdx, %rcx

    /* Calculate 19 * g4 related */
    u51mul  %rdi, %r8, %r9, 16(%rsi)    // (rax, rdx) = 19 * f1 * g4; load f2
    u51mul  %rdi, %r10, %r11, 24(%rsi)  // (rax, rdx) = 19 * f2 * g4; load f3
    u51mul  %rdi, %r12, %r13, 32(%rsi)  // (rax, rdx) = 19 * f3 * g4; load f4

    mulq    %rdi                        // (rax, rdx) = 19 * f4 * g4
    imulq   $19, %rbp, %rdi             // 19 * g3
    addq    %rax, %r14
    movq    8(%rsi), %rax               // f1
    adcq    %rdx, %r15

    /* Calculate g3 related */
    mulq    %rbp                        // (rax, rdx) = f1 * g3
    movq    (%rsp), %rbp                // g2
    addq    %rax, %rbx
    movq    16(%rsi), %rax              // f2
    adcq    %rdx, %rcx

    u51mul  %rdi, %r8, %r9, 24(%rsi)    // (rax, rdx) = 19 * f2 * g3; load f3
    u51mul  %rdi, %r10, %r11, 32(%rsi)  // (rax, rdx) = 19 * f3 * g3; load f4

    mulq    %rdi                        // (rax, rdx) = 19 * f4 * g3
    imulq   $19, %rbp, %rdi             // 19 * g2
    addq    %rax, %r12
    movq    8(%rsi), %rax               // f1
    adcq    %rdx, %r13

    /* Calculate g2 related */
    u51mul  %rbp, %r14, %r15, 16(%rsi)  // (rax, rdx) = f1 * g2; load f2

    mulq    %rbp                    // (rax, rdx) = f2 * g2
    movq    8(%rsp), %rbp           // g1
    addq    %rax, %rbx
    movq    24(%rsi), %rax          // f3
    adcq    %rdx, %rcx

    u51mul %rdi, %r8, %r9, 32(%rsi) // (rax, rdx) = 19 * f3 * g2; load f4
    u51mul %rdi, %r10, %r11, 8(%rsi) // (rax, rdx) = 19 * f4 * g2; load f1

    /* Calculate g1 related */
    mulq    %rbp                    // (rax, rdx) = f1 * g1
    imulq   $19, %rbp, %rdi         // 19 * g1
    addq    %rax, %r12
    movq    16(%rsi), %rax          // f2
    adcq    %rdx, %r13

    u51mul %rbp, %r14, %r15, 24(%rsi) // (rax, rdx) = f2 * g1; load f3

    mulq    %rbp                    // (rax, rdx) = f3 * g1
    movq    16(%rsp), %rbp          // g0
    addq    %rax, %rbx
    movq    32(%rsi), %rax          // f4
    adcq    %rdx, %rcx

    u51mul  %rdi, %r8, %r9, 8(%rsi) // (rax, rdx) = 19 * f4 * g1; load f1

    /* Calculate g0 related */
    u51mul  %rbp, %r10, %r11, 16(%rsi) // (rax, rdx) = f1 * g0; load f2
    u51mul  %rbp, %r12, %r13, 24(%rsi) // (rax, rdx) = f2 * g0; load f3
    u51mul  %rbp, %r14, %r15, 32(%rsi) // (rax, rdx) = f3 * g0; load f4

    mulq    %rbp                    // (rax, rdx) = f4 * g0
    addq    %rax, %rbx
    adcq    %rdx, %rcx

    /* Reload the saved out pointer for `reduce`. */
    movq    24(%rsp), %rdi

    reduce

    /* Recovery register */
    pop_stack
    ret
.cfi_endproc
.size   Fp51Mul,.-Fp51Mul

#############################################################
# void Fp51Square(Fp51 *out, const Fp51 *f);
#
# Field squaring in 5x51-bit radix. Exploits symmetry
# (fifj appears twice for i != j) and 2^255 = 19 (mod p):
#   h0 = f0^2 + 38f1f4 + 38f2f3, etc.
# ABI:  SysV AMD64. In: rdi = out, rsi = f.
#############################################################

.globl  Fp51Square
.type   Fp51Square, @function
.align  32
Fp51Square:
.cfi_startproc
    /* Save Register */
    push_stack

    /* The input and output parameters are transferred by registers rdi and rsi.
     * rdi: out; rsi: f; fp51 is an array of [u64; 5]
     * Loads only non-adjacent limbs, vacating registers for the accumulators.
     */
    movq    (%rsi), %rax                // f0
    movq    16(%rsi), %r15              // f2
    movq    32(%rsi), %rcx              // f4

    /* Stack layout is consistent with Fp51Mul: spill the out pointer,
     * freeing rdi so it can later hold 19 * f4.
     */
    leaq    (%rax, %rax, 1), %rbp       // 2 * f0
    movq    %rdi, 24(%rsp)

    /* h0 = f0^2 + 38f1f4 + 38f2f3; Stored in r8, r9 */
    mulq    %rax                        // (rax, rdx) = f0^2
    movq    %rax, %r8
    movq    8(%rsi), %rax               // f1
    movq    %rdx, %r9

    /* h1 = 19f3^2 + 2f0f1 + 38f2f4; Stored in r10, r11 */
    mulq    %rbp                        // (rax, rdx) = 2f0 * f1
    movq    %rax, %r10
    movq    %r15, %rax                  // f2
    movq    %r15, 16(%rsp)              // Store f2 for later use of rsi
    movq    %rdx, %r11

    /* h2 = f1^2 + 2f0f2 + 38f3f4; Stored in r12, r13 */
    mulq    %rbp                        // (rax, rdx) = 2f0 * f2
    movq    %rax, %r12
    movq    24(%rsi), %rax              // f3
    movq    %rdx, %r13

    imulq    $19, %rcx, %rdi            // Store 19 * f4 to rdi before rcx is overwritten

    /* h3 = 19f4^2 + 2f0f3 + 2f1f2; Stored in r14, r15 */
    mulq    %rbp                        // (rax, rdx) = 2f0 * f3
    movq    %rax, %r14
    movq    %rcx, %rax                  // f4
    movq    %rdx, %r15

    /* h4 = f2^2 + 2f0f4 + 2f1f3; Stored in rbx, rcx */
    mulq    %rbp                        // (rax, rdx) = 2f0 * f4
    movq    %rax, %rbx
    movq    %rcx, %rax                  // f4
    movq    %rdx, %rcx

    /* Calculate 19 * f4 related
     * h3
     */
    u51mul  %rdi, %r14, %r15, 8(%rsi)   // (rax, rdx) = 19 * f4^2; load f1

    movq    24(%rsi), %rsi              // f3; rsi is no longer needed as the base pointer

    /* Calculate f1 related
     * h2
     */
    leaq   (%rax, %rax, 1), %rbp        // 2 * f1
    u51mul  %rax, %r12, %r13, 16(%rsp)  // (rax, rdx) = f1^2; load f2

    /* h3 */
    u51mul  %rbp, %r14, %r15, %rsi      // (rax, rdx) = 2 * f1 * f2; load f3

    /* h4 */
    u51mul  %rbp, %rbx, %rcx, %rbp      // (rax, rdx) = 2 * f1 * f3; load 2 * f1

    imulq   $19, %rsi, %rbp             // 19 * f3

    /* h0 */
    mulq    %rdi                        // (rax, rdx) = 2 * f1 * 19 * f4 = 38f1f4
    addq    %rax, %r8
    leaq    (%rsi, %rsi, 1), %rax       // 2 * f3
    adcq    %rdx, %r9

    /* Calculate f3 related
     * h2
     */
    u51mul  %rdi, %r12, %r13, %rsi       // (rax, rdx) = 2 * f3 * 19 * f4 = 38f3f4; load f3

    /* h1 */
    u51mul  %rbp, %r10, %r11, 16(%rsp)   // (rax, rdx) = 19 * f3^2; load f2

    /* Calculate f2 related
     * h4
     */
    leaq    (%rax, %rax, 1), %rsi       // 2 * f2
    u51mul  %rax, %rbx, %rcx, %rbp      // (rax, rdx) = f2^2; load 19 * f3

    /* h0 */
    u51mul  %rsi, %r8, %r9, %rsi        // (rax, rdx) = 2 * f2 * 19 * f3 = 38f2f3; load 2 * f2

    /* h1 */
    mulq    %rdi                    // (rax, rdx) = 2 * f2 * 19 * f4 = 38f2f4
    addq    %rax, %r10
    adcq    %rdx, %r11

    /* Reload the saved out pointer for `reduce`. */
    movq    24(%rsp), %rdi

    reduce

    /* Recovery register */
    pop_stack
    ret
.cfi_endproc
.size   Fp51Square,.-Fp51Square

#############################################################
# void Fp51MulScalar(Fp51 *out, const Fp51 *in);
#
# out = in * 121666, carry-reduced. 121666 = (486662 + 2) / 4
# is the fixed constant of the X25519 Montgomery ladder.
# ABI:  SysV AMD64. In: rdi = out, rsi = in.
#############################################################

.globl  Fp51MulScalar
.type   Fp51MulScalar, @function
.align  32
Fp51MulScalar:
.cfi_startproc
    /* Save Register */
    push_stack

    /* rdi: out; rsi: in; fp51 is an array of [u64; 5].
     * NOTE(review): an earlier comment mentioned an rdx scalar argument,
     * but the multiplier is hard-coded as 121666 and rdx is unused here.
     * push_stack only keeps the frame layout consistent with Fp51Mul;
     * the scratch slots are not used.
     */

    /* h0 */
    movl   $121666, %eax
    mulq   (%rsi)                    // f0 * 121666
    movq   %rax, %r8
    movl   $121666, %eax             // Reload the constant as soon as rax is vacated.
    movq   %rdx, %r9

    /* h1 */
    mulq   8(%rsi)                   // f1 * 121666
    movq   %rax, %r10
    movl   $121666, %eax
    movq   %rdx, %r11

    /* h2 */
    mulq   16(%rsi)                  // f2 * 121666
    movq   %rax, %r12
    movl   $121666, %eax
    movq   %rdx, %r13

    /* h3 */
    mulq   24(%rsi)                  // f3 * 121666
    movq   %rax, %r14
    movl   $121666, %eax
    movq   %rdx, %r15

    /* h4 */
    mulq   32(%rsi)                 // f4 * 121666
    movq   %rax, %rbx
    movq   %rdx, %rcx

    reduce

    /* Recovery register */
    pop_stack
    ret
.cfi_endproc
.size   Fp51MulScalar,.-Fp51MulScalar

/**
 * Fp64 reduce:
 *     +------+-----+-----+-----+------+
 *     |      | r15 | r14 | r13 | r12  |
 *     |      |     |     |     |  38  |
 *     +-------------------------------+
 *     |      |     |     | r12'| r12' |
 *     |      |     | r13'| r13'|      |
 *     |      | r14'| r14'|     |      |
 *     | r15' | r15'|     |     |      |
 *     +-------------------------------+
 *     |      | r11'| r10'| r9' | r8'  |
 *     |      |     |     |     |19r15'|
 *     +-------------------------------+
 *     |      | r11 | r10 | r9  | r8   |
 *     +------+-----+-----+-----+------+
 */
.macro Fp64Reduce
    /* Reduce the 512-bit product in r15:r14:r13:r12:r11:r10:r9:r8
     * modulo 2^255 - 19 and store four 64-bit limbs at the out pointer
     * saved at 0(%rsp) by the caller's prologue.
     * Step 1: fold the high 256 bits down via 2^256 = 38 (mod p).
     * Step 2: fold bit 255 (and the small overflow in r12) via 2^255 = 19.
     * The result stays in [0, 2^256); presumably callers canonicalize
     * with Fp64PolyToData when a unique encoding is needed.
     * Requires BMI2 (mulx) and ADX (adcx/adox). Clobbers rsi, rdi, rbp. */
    xorq     %rsi, %rsi             // rsi = 0; also clears CF and OF for adcx/adox
    movq     $38, %rdx              // mulx's implicit multiplicand
    mulx    %r12, %rax, %rbx
    adcx    %rax, %r8
    adox    %rbx, %r9
    mulx    %r13, %rax, %rbx
    adcx    %rax, %r9
    adox    %rbx, %r10
    mulx    %r14, %rax, %rbx
    adcx    %rax, %r10
    adox    %rbx, %r11
    mulx    %r15, %rax, %r12
    adcx    %rax, %r11
    adcx    %rsi, %r12              // drain the adcx carry chain into r12
    adox    %rsi, %r12              // drain the adox carry chain into r12

    shld     $1, %r11, %r12         // r12 = everything at or above bit 255
    movq     $0x7FFFFFFFFFFFFFFF, %rbp
    andq     %rbp, %r11             // clear bit 255 of limb 3
    imulq    $19, %r12, %r12        // 2^255 = 19 (mod p)
    addq     %r12, %r8
    adcx     %rsi, %r9
    adcx     %rsi, %r10
    adcx     %rsi, %r11

    movq    0(%rsp), %rdi           // out pointer pushed last by the caller
    movq    %r9, 8(%rdi)
    movq    %r10, 16(%rdi)
    movq    %r11, 24(%rdi)
    movq    %r8, 0(%rdi)
.endm

#############################################################
# void Fp64Mul(Fp64 *out, const Fp64 *f, const Fp64 *g);
#
# 4x64-limb field multiplication mod 2^255 - 19.
# ABI:  SysV AMD64. In: rdi = out, rsi = f, rdx = g.
# Requires BMI2 (mulx) and ADX (adcx/adox).
# rdi is pushed last so Fp64Reduce can reload it from 0(%rsp).
#############################################################
.globl    Fp64Mul
.type    Fp64Mul,@function
.align    32
Fp64Mul:
.cfi_startproc
    pushq    %rbp
    pushq    %rbx
    pushq    %r12
    pushq    %r13
    pushq    %r14
    pushq    %r15
    pushq    %rdi

/**
 * (f3, f2, f1, f0) * (g3, g2, g1, g0) :
 *         +    +    +    +    +    +    +    +    +
 *         |    |    |    |    | A3 | A2 | A1 | A0 |
 *         |    |    |    |    | B3 | B2 | B1 | B0 |
 *       +------------------------------------------+
 *         |    |    |    |    |    |    |A0B0|A0B0|
 *         |    |    |    |    |    |A1B0|A1B0|    |
 *         |    |    |    |    |A2B0|A2B0|    |    |
 *         |    |    |    |A3B0|A3B0|    |    |    |
 *         |    |    |    |    |    |A0B1|A0B1|    |
 *         |    |    |    |    |A1B1|A1B1|    |    |
 *         |    |    |    |A2B1|A2B1|    |    |    |
 *         |    |    |A3B1|A3B1|    |    |    |    |
 *         |    |    |    |    |A2B0|A2B0|    |    |
 *         |    |    |    |A2B1|A2B1|    |    |    |
 *         |    |    |A2B2|A2B2|    |    |    |    |
 *         |    |A2B3|A2B3|    |    |    |    |    |
 *         |    |    |    |A3B0|A3B0|    |    |    |
 *         |    |    |A3B1|A3B1|    |    |    |    |
 *         |    |A3B2|A3B2|    |    |    |    |    |
 *         |A3B3|A3B3|    |    |    |    |    |    |
 *       +------------------------------------------+
 *         |r15 |r14 |r13 |r12 |r11 |r10 |r9  |r8  |
 *         +    +    +    +    +    +    +    +    +
 */
    /* g is copied out of rdx first: rdx is mulx's implicit source. */
    movq    0(%rdx), %rcx           // g0
    movq    8(%rdx), %rbp           // g1
    movq    16(%rdx), %rdi          // g2
    movq    24(%rdx), %r15          // g3
    movq    0(%rsi), %rdx           // f0
    xorq    %r14, %r14              // r14 = 0; also clears CF/OF for adcx/adox

    // (f3, f2, f1, f0) * g0
    mulx     %rcx, %r8, %rax
    mulx     %rbp, %r9, %rbx
    adcx     %rax, %r9
    mulx     %rdi, %r10, %rax
    adcx     %rbx, %r10
    mulx     %r15, %r11, %r12
    movq     8(%rsi), %rdx
    adcx     %rax, %r11
    adcx     %r14, %r12

    // (f3, f2, f1, f0) * g1
    mulx     %rcx, %rax, %rbx
    adcx     %rax, %r9
    adox     %rbx, %r10
    mulx     %rbp, %rax, %rbx
    adcx     %rax, %r10
    adox     %rbx, %r11
    mulx     %rdi, %rax, %rbx
    adcx     %rax, %r11
    adox     %rbx, %r12
    mulx     %r15, %rax, %r13
    movq     16(%rsi), %rdx
    adcx     %rax, %r12
    adox     %r14, %r13
    adcx     %r14, %r13

    // (f3, f2, f1, f0) * g2
    mulx     %rcx, %rax, %rbx
    adcx     %rax, %r10
    adox     %rbx, %r11
    mulx     %rbp, %rax, %rbx
    adcx     %rax, %r11
    adox     %rbx, %r12
    mulx     %rdi, %rax, %rbx
    adcx     %rax, %r12
    adox     %rbx, %r13
    mulx     %r15, %rax, %r14
    movq     24(%rsi), %rdx
    adcx     %rax, %r13
    movq     $0, %rsi               // rsi becomes the zero register (f fully consumed)
    adox     %rsi, %r14
    adcx     %rsi, %r14

    // (f3, f2, f1, f0) * g3
    mulx    %rcx, %rax, %rbx
    adcx    %rax, %r11
    adox    %rbx, %r12
    mulx    %rbp, %rax, %rbx
    adcx    %rax, %r12
    adox    %rbx, %r13
    mulx    %rdi, %rax, %rbx
    adcx    %rax, %r13
    adox    %rbx, %r14
    mulx    %r15, %rax, %r15
    adcx    %rax, %r14
    adox    %rsi, %r15
    adcx    %rsi, %r15

    // reduce mod 2^255 - 19 and store through the pointer at 0(%rsp)
    Fp64Reduce

    /* Restore callee-saved registers (reverse push order; rdi slot at 0). */
    movq    8(%rsp), %r15
    movq    16(%rsp), %r14
    movq    24(%rsp), %r13
    movq    32(%rsp), %r12
    movq    40(%rsp), %rbx
    movq    48(%rsp), %rbp
    leaq    56(%rsp), %rsp

    ret
.cfi_endproc
.size    Fp64Mul,.-Fp64Mul

#############################################################
# void Fp64Sqr(Fp64 *out, const Fp64 *f);
#
# 4x64-limb field squaring mod 2^255 - 19: compute the six
# cross products once, double them with a shld cascade, then
# add the four square terms on the diagonal.
# ABI:  SysV AMD64. In: rdi = out, rsi = f.
# Requires BMI2 (mulx) and ADX (adcx/adox).
#############################################################
.globl    Fp64Sqr
.type    Fp64Sqr,@function
.align    32
Fp64Sqr:
.cfi_startproc
    pushq    %rbp
    pushq    %rbx
    pushq    %r12
    pushq    %r13
    pushq    %r14
    pushq    %r15
    pushq    %rdi                   // Fp64Reduce reloads out from 0(%rsp)
/**
 * (f3, f2, f1, f0) ^ 2 :
 *      +----+----+----+----+----+----+----+----+----+
 *      |    |    |    |    |    | A3 | A2 | A1 | A0 |
 *      | *  |    |    |    |    | A3 | A2 | A1 | A0 |
 *      +--------------------------------------------+
 *      |    |    |    |    |    |    |A0A1|A0A1|    |
 *      |    |    |    |    |    |A0A2|A0A2|    |    |
 *      | +  |    |    |    |A0A3|A0A3|    |    |    |
 *      |    |    |    |    |A1A2|A1A2|    |    |    |
 *      |    |    |    |A1A3|A1A3|    |    |    |    |
 *      |    |    |A2A3|A2A3|    |    |    |    |    |
 *      +--------------------------------------------+
 *      | *2 |    |r14`|r13`|r12`|r11`|r10`|r9` |    |
 *      +--------------------------------------------+
 *      |    |r15'|r14'|r13'|r12'|r11'|r10'|r9' |    |
 *      +--------------------------------------------+
 *      |    |    |    |    |    |    |    |A0A0|A0A0|
 *      |    |    |    |    |    |A1A1|A1A1|    |    |
 *      | +  |    |    |A2A2|A2A2|    |    |    |    |
 *      |    |A3A3|A3A3|    |    |    |    |    |    |
 *      +--------------------------------------------+
 *      |    |r15 |r14 |r13 |r12 |r11 |r10 |r9  |r8  |
 *      +--------------------------------------------+
 */
    movq   0(%rsi), %rbx  // a0
    movq   8(%rsi), %rcx  // a1
    movq   16(%rsi), %rbp // a2
    movq   24(%rsi), %rdi // a3
    xorq   %r15, %r15     // r15 = 0; also clears CF/OF for adcx/adox

    // (a1, a2, a3) * a0
    movq   %rbx, %rdx
    mulx   %rcx, %r9, %rsi
    mulx   %rbp, %r10, %rax
    adcx   %rsi, %r10
    mulx   %rdi, %r11, %r12
    movq   %rcx, %rdx
    adcx   %rax, %r11
    adcx   %r15, %r12

    // (a2, a3) * a1
    mulx   %rbp, %rsi, %rax
    adcx   %rsi, %r11
    adox   %rax, %r12
    mulx   %rdi, %rsi, %r13
    movq   %rbp, %rdx
    adcx   %rsi, %r12
    adcx   %r15, %r13
    adox   %r15, %r13

    // a3 * a2
    mulx   %rdi, %rsi, %r14
    movq   %rbx, %rdx
    adcx   %rsi, %r13
    adcx   %r15, %r14

    // (r9 --- r14) *2 : double the cross products, top bit into r15
    shld   $1, %r14, %r15
    shld   $1, %r13, %r14
    shld   $1, %r12, %r13
    shld   $1, %r11, %r12
    shld   $1, %r10, %r11
    shld   $1, %r9, %r10
    shlq   $1, %r9
    xorq   %r8, %r8   // clear cf flag (mulx below does not touch flags)
    // a0 * a0
    mulx   %rdx, %r8, %rax
    movq   %rcx, %rdx
    adcx   %rax, %r9

    // a1 * a1
    mulx    %rdx, %rsi, %rax
    movq    %rbp, %rdx
    adcx    %rsi, %r10
    adcx    %rax, %r11

    // a2 * a2
    mulx   %rdx, %rsi, %rax
    movq   %rdi, %rdx
    adcx   %rsi, %r12
    adcx   %rax, %r13

    // a3 * a3
    mulx   %rdx, %rsi, %rax
    adcx   %rsi, %r14
    adcx   %rax, %r15

    // reduce mod 2^255 - 19 and store through the pointer at 0(%rsp)
    Fp64Reduce

    /* Restore callee-saved registers (reverse push order; rdi slot at 0). */
    movq    8(%rsp), %r15
    movq    16(%rsp), %r14
    movq    24(%rsp), %r13
    movq    32(%rsp), %r12
    movq    40(%rsp), %rbx
    movq    48(%rsp), %rbp
    leaq    56(%rsp), %rsp
    ret
.cfi_endproc
.size   Fp64Sqr, .-Fp64Sqr

#############################################################
# void Fp64MulScalar(Fp64 *out, const Fp64 *in);
#
# out = in * 121666 (the X25519 ladder constant), folding
# overflow back in via 2^255 = 19 (mod p).
# ABI:  SysV AMD64. In: rdi = out, rsi = in.
# Requires BMI2 (mulx) and ADX (adcx); leaf, no stack frame.
#############################################################
.globl  Fp64MulScalar
.type   Fp64MulScalar, @function
.align  32
Fp64MulScalar:
.cfi_startproc
    movl    $121666, %edx           // mulx's implicit multiplicand
    mulx   0(%rsi), %r8, %rax       // (rax:r8)  = f0 * 121666
    mulx   8(%rsi), %r9, %rcx       // (rcx:r9)  = f1 * 121666
    addq    %rax, %r9
    mulx   16(%rsi), %r10, %rax     // (rax:r10) = f2 * 121666
    adcx    %rcx, %r10
    mulx   24(%rsi), %r11, %rcx     // (rcx:r11) = f3 * 121666
    adcx    %rax, %r11
    movl    $0, %edx                // mov preserves CF
    adcx    %rdx, %rcx              // rcx = bits 256.. of the product

    /* Fold bits >= 2^255 back into limb 0 as 19 * carry. */
    movq    $0x7FFFFFFFFFFFFFFF, %rax
    shld    $1, %r11, %rcx          // rcx = everything at or above bit 255
    andq    %rax, %r11              // clear bit 255
    imulq   $19, %rcx, %rcx         // 2^255 = 19 (mod p)

    addq    %rcx, %r8
    adcx    %rdx, %r9
    movq    %r8, 0(%rdi)
    adcx    %rdx, %r10
    movq    %r9, 8(%rdi)
    adcx    %rdx, %r11
    movq    %r10, 16(%rdi)
    movq    %r11, 24(%rdi)
    ret
.cfi_endproc
.size   Fp64MulScalar, .-Fp64MulScalar

#############################################################
# void Fp64Add(Fp64 *out, const Fp64 *f, const Fp64 *g);
#
# out = f + g, lazily reduced (4x64 limbs; 2^256 = 38 mod p).
# ABI:  SysV AMD64. In: rdi = out, rsi = f, rdx = g.
# Branch-free: the conditional fold uses cmov only.
#############################################################
.globl   Fp64Add
.type    Fp64Add, @function
.align   32
Fp64Add:
.cfi_startproc
    movq    0(%rsi),%r8
    movq    8(%rsi),%r9
    addq    0(%rdx),%r8
    adcx    8(%rdx),%r9
    movq    16(%rsi),%r10
    movq    24(%rsi),%r11
    adcx    16(%rdx),%r10
    adcx    24(%rdx),%r11

    /* If the 256-bit add carried out, fold 2^256 back in as +38.
     * mov does not change CF, so the cmov still sees the add's carry. */
    movq    $0,   %rax
    movq    $38,  %rcx
    cmovae  %rax, %rcx              // rcx = carry ? 38 : 0
    addq    %rcx, %r8
    adcx    %rax, %r9
    adcx    %rax, %r10
    movq    %r9,  8(%rdi)
    adcx    %rax, %r11
    movq    %r10, 16(%rdi)
    movq    %r11, 24(%rdi)

    /* A second carry is only possible when 38 was added; fold it too. */
    cmovc   %rcx, %rax
    addq    %rax, %r8
    movq    %r8,  0(%rdi)
    ret
.cfi_endproc
.size   Fp64Add, .-Fp64Add

#############################################################
# void Fp64Sub(Fp64 *out, const Fp64 *f, const Fp64 *g);
#
# out = f - g, lazily reduced (4x64 limbs). A borrow out of the
# 256-bit subtraction means the result implicitly gained 2^256;
# since 2^256 = 38 (mod p), subtract 38 to compensate.
# ABI:  SysV AMD64. In: rdi = out, rsi = f, rdx = g.
#############################################################
.globl   Fp64Sub
.type    Fp64Sub,@function
.align   32
Fp64Sub:
.cfi_startproc
    movq    0(%rsi),%r8
    movq    8(%rsi),%r9
    subq    0(%rdx),%r8
    sbbq    8(%rdx),%r9
    movq    16(%rsi),%r10
    movq    24(%rsi),%r11
    sbbq    16(%rdx),%r10
    sbbq    24(%rdx),%r11

    /* mov preserves CF, so the cmov still sees the subtraction's borrow. */
    movq    $0,   %rax
    movq    $38,  %rcx
    cmovae  %rax, %rcx              // rcx = borrow ? 38 : 0

    subq    %rcx, %r8
    sbbq    %rax, %r9
    sbbq    %rax, %r10
    movq    %r9,8(%rdi)
    sbbq    %rax, %r11
    movq    %r10,16(%rdi)
    cmovc   %rcx, %rax              // a second wrap can only follow the first
    movq    %r11,24(%rdi)
    subq    %rax, %r8
    movq    %r8,0(%rdi)

    ret
.cfi_endproc
.size    Fp64Sub,.-Fp64Sub

#############################################################
# void Fp64PolyToData(uint8_t *out, const Fp64 *in);
#
# Freeze a lazily-reduced 4x64 value to its canonical
# representative in [0, 2^255 - 19) and store it little-endian.
#  Step 1: add 19 (plus 19 more if bit 255 was set, as
#          2^255 = 19 mod p) while clearing bit 255.
#  Step 2: if the sum stayed below 2^255, the +19 was not
#          needed — take it back; otherwise clearing bit 255
#          already completed the subtraction of p.
# ABI:  SysV AMD64. In: rdi = out, rsi = in. Leaf, no frame.
#############################################################
.globl    Fp64PolyToData
.type    Fp64PolyToData,@function
.align    32
Fp64PolyToData:
.cfi_startproc
    movq    24(%rsi), %r11
    movq    16(%rsi), %r10
    xorq    %rax, %rax

    leaq    (%r11, %r11, 1), %rcx   // rcx = limb3 << 1 (bit 63 dropped)
    sarq    $63, %r11               // r11 = bit255 set ? -1 : 0
    shrq    $1, %rcx                // rcx = limb3 with bit 63 cleared
    andq    $19, %r11               // 19 if bit 255 was set, else 0
    addq    $19, %r11               // total addend: 19 or 38

    movq    0(%rsi), %r8
    movq    8(%rsi), %r9

    addq    %r11, %r8
    adcx    %rax, %r9
    adcx    %rax, %r10
    adcx    %rax, %rcx

    /* Same bit-255 extraction, on the new top limb. */
    leaq    (%rcx, %rcx, 1), %r11
    sarq    $63, %rcx
    shrq    $1, %r11                // r11 = result limb3 with bit 63 cleared
    notq    %rcx
    andq    $19, %rcx               // 19 if bit 255 is clear (undo the +19), else 0

    subq    %rcx, %r8
    sbbq    $0, %r9
    movq    %r8, 0(%rdi)
    movq    %r9, 8(%rdi)
    sbbq    $0, %r10
    sbbq    $0, %r11
    movq    %r10, 16(%rdi)
    movq    %r11, 24(%rdi)

    ret
.cfi_endproc
.size    Fp64PolyToData,.-Fp64PolyToData


#endif
