• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 *     http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16#include "hitls_build.h"
17#if defined(HITLS_CRYPTO_CURVE_NISTP256) && defined(HITLS_CRYPTO_NIST_USE_ACCEL)
18
19#include "ecp256_pre_comp_table.s"
20.file  "ecp256_x86.S"
21
22.section .rodata // Constant tables only: kept in read-only data instead of writable .data.
23.align	64
   // Every multi-limb constant below is stored as four 64-bit limbs, least-significant limb first.
24.Lpoly:         // P, the NIST P-256 field prime: 2^256 - 2^224 + 2^192 + 2^96 - 1
25.quad   0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
26.LrrModP:       // R * R mod P with R = 2^256; multiplying by it in Montgomery multiplication converts a value to Montgomery form.
27.quad   0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd
28.Lone_mont:     // The value 1 in Montgomery form: R mod P = 2^256 - P, with R = 2^256
29.quad   0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
30.Lord:          // n, the order of the P-256 base point
31.quad   0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
32.LordK:         // LordK = -(ord[0])^(-1) mod 2^64, i.e. LordK * ord[0] = -1 (mod 2^64):
33                // the lower 64 bits of LordK * ord[0] are all Fs.
34.quad   0xccd1c8aaee00bc4f
35.LOne:          // The integer 1 in ordinary (non-Montgomery) representation
36.quad   0x0000000000000001, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000
37
38.text
39/**
40 * Function description: Returns the address of g_preCompTable, the pre-computation table of the ECP256 curve.
41 * Function prototype: const ECP256_TableRow *ECP256_GetPreCompTable(void);
42 * Input register: None
43 * Change register: None (only the return register rax is written)
44 * Output register: rax
45 * Function/Macro Call: None
46 */
47.globl  ECP256_GetPreCompTable
48.type   ECP256_GetPreCompTable,@function
49.align 32
50ECP256_GetPreCompTable:
51.cfi_startproc
52
53    leaq    g_preCompTable(%rip), %rax      // RIP-relative address of the table; the symbol is not defined in this part of the file (presumably it comes from ecp256_pre_comp_table.s - confirm)
54
55    ret
56.cfi_endproc
57.size   ECP256_GetPreCompTable, .-ECP256_GetPreCompTable
58
59/**
60 * Function description: Addition of the ECP256 field. res = a + b mod P
61 * Function prototype: void ECP256_Add(Coord *r, const Coord *a, const Coord *b);
62 * Input register:
63 *     rdi: Pointer to the output Coord structure
64 *     rsi: Address pointing to input data a
65 *     rdx: Address pointing to input data b
66 * Change register: rsi, rdx, rcx, rax, r8, r9, r10, r11 (r12 and r13 are used as scratch but saved and restored)
67 * Output register: None
68 * Function/Macro call: None; the add-and-reduce sequence of ECP256_AddCore is inlined here.
69 */
70.globl  ECP256_Add
71.type   ECP256_Add,@function
72.align 32
73ECP256_Add:
74.cfi_startproc
75    pushq   %r12
    .cfi_adjust_cfa_offset 8            // keep the unwind info in step with the stack pointer
    .cfi_offset %r12, -16
76    pushq   %r13
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r13, -24
77
78    movq    (%rsi), %r8         // a[0]
79    movq    8(%rsi), %r9        // a[1]
80    xorq    %r13, %r13          // Save carry
81    movq    16(%rsi), %r10      // a[2]
82    movq    24(%rsi), %r11      // a[3]
83    leaq    .Lpoly(%rip), %rsi  // P
84
85    addq    (%rdx), %r8         // c[0] = a[0] + b[0]
86    adcq    8(%rdx), %r9        // c[1] = a[1] + b[1] + carry
87    movq    %r8, %rax           // save c[0]
88    adcq    16(%rdx), %r10      // c[2] = a[2] + b[2] + carry
89    adcq    24(%rdx), %r11      // c[3] = a[3] + b[3] + carry
90    movq    %r9, %rcx           // save c[1]
91    adcq    $0, %r13            // save carry value to r13
92
93    subq    $-1, %r8            // d[0] = c[0] - P[0]  (P[0] is all ones, i.e. -1 as an immediate)
94    movq    %r10, %rdx          // save c[2]
95    sbbq    8(%rsi), %r9        // d[1] = c[1] - P[1] - borrow
96    sbbq    $0, %r10            // d[2] = c[2] - P[2] - borrow  (P[2] is 0)
97    movq    %r11, %r12          // save c[3]
98    sbbq    24(%rsi), %r11      // d[3] = c[3] - P[3] - borrow
99    sbbq    $0, %r13            // r13 = 0 + carry - borrow; CF is set when c < P
100
101    cmovcq  %rax, %r8           // res[0] = (r13 < 0) ? c[0]: d[0]
102    cmovcq  %rcx, %r9           // res[1] = (r13 < 0) ? c[1]: d[1]
103    movq    %r8, (%rdi)
104    cmovcq  %rdx, %r10          // res[2] = (r13 < 0) ? c[2]: d[2]
105    movq    %r9, 8(%rdi)
106    cmovcq  %r12, %r11          // res[3] = (r13 < 0) ? c[3]: d[3]
107    movq    %r10, 16(%rdi)
108
109    movq    (%rsp), %r13        // restore callee-saved registers without popping
    .cfi_restore %r13
110    movq    %r11, 24(%rdi)
111    movq    8(%rsp), %r12
    .cfi_restore %r12
112    leaq    16(%rsp), %rsp      // release the two save slots; lea leaves the flags untouched
    .cfi_adjust_cfa_offset -16
113    ret
114.cfi_endproc
115.size   ECP256_Add, .-ECP256_Add
116
117/**
118 * Function description: Addition core part of the ECP256 field, a + b mod P. The result is left in r8-r11 and is
 *        also stored to the 32 bytes at the address in rdi. r14 and r15 must hold P[1] and P[3].
119 * Input register:
120 *        r8-r11:256-bit input data a
121 *        rdx: points to the input 256-bit data b.
 *        rdi: address the 256-bit result is written to
122 *        r14:P[1]
123 *        r15:P[3]
124 * Change register: rdx, rcx, rax, r8, r9, r10, r11, r12, r13
125 * Output register: r8-r11
126 */
127.type   ECP256_AddCore,@function
128.align 32
129ECP256_AddCore:
130.cfi_startproc
131    xorq    %r13, %r13          // r13 collects the carry out of the 256-bit addition
132    addq    (%rdx), %r8         // Addition result.
133    adcq    8(%rdx), %r9
134    movq    %r8, %rax           // keep the unreduced sum c[0] (mov does not disturb the carry chain)
135    adcq    16(%rdx), %r10
136    adcq    24(%rdx), %r11
137    movq    %r9, %rcx           // keep c[1]
138    adcq    $0, %r13            // Save carry value to r13.
139
140    subq    $-1, %r8            // Mod P: subtract P[0], which is all ones (-1 as an immediate).
141    movq    %r10, %rdx          // keep c[2]; the pointer to b is no longer needed
142    sbbq    %r14, %r9           // subtract P[1]
143    sbbq    $0, %r10            // subtract P[2] = 0
144    movq    %r11, %r12          // keep c[3]
145    sbbq    %r15, %r11          // subtract P[3]
146    sbbq    $0, %r13            // carry - borrow; CF set means the sum was already below P
147
148    cmovcq  %rax, %r8           // Obtain mod P result.
149    cmovcq  %rcx, %r9
150    movq    %r8, (%rdi)         // write the result back through rdi as well as returning it in r8-r11
151    cmovcq  %rdx, %r10
152    movq    %r9, 8(%rdi)
153    cmovcq  %r12, %r11
154    movq    %r10, 16(%rdi)
155    movq    %r11, 24(%rdi)
156    ret
157.cfi_endproc
158.size   ECP256_AddCore, .-ECP256_AddCore
159
160/**
161 * Function description: Subtraction of the ECP256 field. Res = a - b mod P
162 * Function prototype: void ECP256_Sub(Coord *r, const Coord *a, const Coord *b);
163 * Input register:
164 *        rdi: Pointer to the output Coord structure
165 *        rsi: Address pointing to input data a
166 *        rdx: Address pointing to input data b
167 * Change register: rsi, rdx, rcx, rax, r8, r9, r10, r11 (r12 and r13 are used as scratch but saved and restored)
168 * Output register: None
169 * Function/Macro call: None; the subtract-and-correct sequence of ECP256_SubCore is inlined here.
170 */
171.globl  ECP256_Sub
172.type   ECP256_Sub,@function
173.align 32
174ECP256_Sub:
175.cfi_startproc
176    pushq   %r12
    .cfi_adjust_cfa_offset 8            // keep the unwind info in step with the stack pointer
    .cfi_offset %r12, -16
177    pushq   %r13
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r13, -24
178
179    movq    (%rsi), %r8         // a[0]
180    movq    8(%rsi), %r9        // a[1]
181    xorq    %r13, %r13          // Save borrow
182    movq    16(%rsi), %r10      // a[2]
183    movq    24(%rsi), %r11      // a[3]
184    leaq    .Lpoly(%rip), %rsi  // P
185
186    subq    (%rdx), %r8         // c[0] = a[0] - b[0]
187    sbbq    8(%rdx), %r9        // c[1] = a[1] - b[1] - borrow
188    movq    %r8, %rax           // save c[0]
189    sbbq    16(%rdx), %r10      // c[2] = a[2] - b[2] - borrow
190    sbbq    24(%rdx), %r11      // c[3] = a[3] - b[3] - borrow
191    movq    %r9, %rcx           // save c[1]
192    sbbq    $0, %r13            // save borrow value to r13 (0 or all ones)
193
194    addq    $-1, %r8            // d[0] = c[0] + P[0]  (P[0] is all ones, i.e. -1 as an immediate)
195    movq    %r10, %rdx          // save c[2]
196    adcq    8(%rsi), %r9        // d[1] = c[1] + P[1] + carry
197    adcq    $0, %r10            // d[2] = c[2] + P[2] + carry  (P[2] is 0)
198    movq    %r11, %r12          // save c[3]
199    adcq    24(%rsi), %r11      // d[3] = c[3] + P[3] + carry
200    testq   %r13, %r13          // ZF set when the subtraction did not borrow
201
202    cmovzq  %rax, %r8           // res[0] = (r13 == 0) ? c[0] : d[0]
203    cmovzq  %rcx, %r9           // res[1] = (r13 == 0) ? c[1] : d[1]
204    movq    %r8, (%rdi)
205    cmovzq  %rdx, %r10          // res[2] = (r13 == 0) ? c[2] : d[2]
206    movq    %r9, 8(%rdi)
207    cmovzq  %r12, %r11          // res[3] = (r13 == 0) ? c[3] : d[3]
208    movq    %r10, 16(%rdi)
209    movq    %r11, 24(%rdi)
210
211    movq    (%rsp), %r13        // restore callee-saved registers without popping
    .cfi_restore %r13
212    movq    8(%rsp), %r12
    .cfi_restore %r12
213    leaq    16(%rsp), %rsp      // release the two save slots
    .cfi_adjust_cfa_offset -16
214    ret
215.cfi_endproc
216.size   ECP256_Sub, .-ECP256_Sub
217
218/**
219 * Function description: subtraction core part of the ECP256 field, a-b mod P; no writeback
 *        (the result is only returned in r8-r11, nothing is stored to memory).
220 * Input register:
221 *        r8-r11:256-bit input data a
222 *        rdx:Points to the input 256-bit data b.
223 *        r14:P[1]
224 *        r15:P[3]
225 * Change register: rdx, rcx, rax, r8, r9, r10, r11, r12, r13
226 * Output register: r8-r11
227 */
228.type   ECP256_SubCore,@function
229.align 32
230ECP256_SubCore:
231.cfi_startproc
232    xorq    %r13, %r13          // r13 collects the borrow out of the 256-bit subtraction
233    subq    (%rdx), %r8         // Subtraction results.
234    sbbq    8(%rdx), %r9
235    movq    %r8, %rax           // Save Results.
236    sbbq    16(%rdx), %r10
237    sbbq    24(%rdx), %r11
238    movq    %r9, %rcx           // keep c[1]
239    sbbq    $0, %r13            // Borrowing saved in r13.
240
241    addq    $-1, %r8            // a - b + P  (P[0] is all ones, i.e. -1 as an immediate)
242    movq    %r10, %rdx          // keep c[2]; the pointer to b is no longer needed
243    adcq    %r14, %r9           // add P[1]
244    adcq    $0, %r10            // add P[2] = 0
245    movq    %r11, %r12          // keep c[3]
246    adcq    %r15, %r11          // add P[3]
247    testq   %r13, %r13          // ZF set when a - b did not borrow
248
249    cmovzq  %rax, %r8           // If r13 is equal to 0, a-b is used. Otherwise, a-b+P is used.
250    cmovzq  %rcx, %r9
251    cmovzq  %rdx, %r10
252    cmovzq  %r12, %r11
253
254    ret
255.cfi_endproc
256.size   ECP256_SubCore, .-ECP256_SubCore
257
258/**
259 * Function description: negation of the ECP256 field. res = -a mod P
260 * Function prototype: void ECP256_Neg(Coord *r, const Coord *a);
261 * Input register:
262 *       rdi: Pointer to the output Coord structure
263 *       rsi: Address pointing to input data a
264 * Change register: rsi, rdx, rcx, rax, r8, r9, r10, r11 (r12 and r13 are used as scratch but saved and restored)
265 * Output register: None
266 * Function/Macro Call: None
267 */
268.globl  ECP256_Neg
269.type   ECP256_Neg,@function
270.align 32
271ECP256_Neg:
272.cfi_startproc
273    pushq   %r12
    .cfi_adjust_cfa_offset 8                    // keep the unwind info in step with the stack pointer
    .cfi_offset %r12, -16
274    pushq   %r13
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r13, -24
275
276    xorq    %r8, %r8                    // -a = 0 - a
277    xorq    %r9, %r9
278    xorq    %r13, %r13                  // r13 collects the borrow
279    leaq    .Lpoly(%rip), %rdx          // P
280    xorq    %r10, %r10
281    xorq    %r11, %r11
282
283    subq    (%rsi),%r8                  // c = 0 - a, limb by limb
284    sbbq    8(%rsi),%r9
285    movq    %r8, %rax                   // save c[0]
286    sbbq    16(%rsi),%r10
287    sbbq    24(%rsi),%r11
288    movq    %r9, %rcx                   // save c[1]
289    sbbq    $0, %r13                    // r13 = 0 when a is zero, all ones otherwise
290
291    addq    $-1, %r8                    // d = c + P  (P[0] is all ones, i.e. -1 as an immediate)
292    movq    %r10, %rsi                  // save c[2]; the pointer to a is no longer needed
293    adcq    8(%rdx), %r9
294    adcq    $0, %r10                    // P[2] is 0
295    movq    %r11, %r12                  // save c[3]
296    adcq    24(%rdx), %r11
297    testq   %r13, %r13                  // Choose result: c when there was no borrow, d = c + P otherwise
298
299    cmovzq  %rax, %r8
300    cmovzq  %rcx, %r9
301    movq    %r8, (%rdi)
302    cmovzq  %rsi, %r10
303    movq    %r9, 8(%rdi)
304    cmovzq  %r12, %r11
305    movq    %r10, 16(%rdi)
306    movq    %r11, 24(%rdi)
307
308    movq    (%rsp), %r13                // restore callee-saved registers without popping
    .cfi_restore %r13
309    movq    8(%rsp), %r12
    .cfi_restore %r12
310    leaq    16(%rsp), %rsp              // release the two save slots
    .cfi_adjust_cfa_offset -16
311    ret
312.cfi_endproc
313.size   ECP256_Neg, .-ECP256_Neg
314
315/**
316 *  Function description: multiplication of the ECP256 field: res = a * b * 2^-256 mod P
317 *  Function prototype: void ECP256_Mul(Coord *r, const Coord *a, const Coord *b);
318 *  Input register:
319 *        rdi: Pointer to the output Coord structure
320 *        rsi: Address pointing to input data a
321 *        rdx: Address pointing to input data b
322 *  Change register: rax, rcx, rdx, r8, r9, r10, r11 (rbx, rbp and r12-r15 are used but saved and restored)
323 *  Output register: None
324 *  Function/macro call: The multiplication is carried out by ECP256_MulCore_q.
325 */
326.globl  ECP256_Mul
327.type   ECP256_Mul,@function
328.align 32
329ECP256_Mul:
330.cfi_startproc
331    pushq   %rbx
    .cfi_adjust_cfa_offset 8                        // keep the unwind info in step with the stack pointer
    .cfi_offset %rbx, -16
332    pushq   %rbp
    .cfi_adjust_cfa_offset 8
    .cfi_offset %rbp, -24
333    pushq   %r12
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r12, -32
334    pushq   %r13
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r13, -40
335    pushq   %r14
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r14, -48
336    pushq   %r15
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r15, -56
337
338    movq    %rdx, %rcx                      // rdx is for mul
339    movq	.Lpoly+8(%rip), %r14            // P[1], expected in r14 by ECP256_MulCore_q
340    movq	.Lpoly+24(%rip), %r15           // P[3], expected in r15 by ECP256_MulCore_q
341    call    ECP256_MulCore_q                // stores the product through rdi
342
343    movq    (%rsp), %r15                    // restore callee-saved registers without popping
    .cfi_restore %r15
344    movq    8(%rsp), %r14
    .cfi_restore %r14
345    movq    16(%rsp), %r13
    .cfi_restore %r13
346    movq    24(%rsp), %r12
    .cfi_restore %r12
347    movq    32(%rsp), %rbp
    .cfi_restore %rbp
348    movq    40(%rsp), %rbx
    .cfi_restore %rbx
349    leaq    48(%rsp), %rsp                  // release the six save slots
    .cfi_adjust_cfa_offset -48
350    ret
351.cfi_endproc
352.size   ECP256_Mul, .-ECP256_Mul
353
354/**
355 *  Function description: Montgomery multiplication of the ECP256 field, res = a * b * 2^-256 mod P.
 *        The product is built one limb of b at a time; after each partial product the lowest
 *        accumulator limb is cleared by adding a multiple of P. Because P[0] is all ones, that
 *        multiple is the low limb itself, so only shifts and one multiplication by P[3] are needed.
356 *  Input register:
357 *       rdi: Address the 256-bit result is stored to.
358 *       rsi: Factor address (a).
359 *       rcx: Factor address (b).
 *       r14: P[1] (loaded from .Lpoly+8 by ECP256_Mul and ECP256_ToMont)
 *       r15: P[3] (loaded from .Lpoly+24 by ECP256_Mul and ECP256_ToMont)
360 *  Change register: rax,rbx,rdx,rbp,r8-r13 (rcx, rsi, rdi, r14 and r15 are only read)
361 *  Output register: None; the result is stored through rdi (and is also left in r8-r11).
362 */
363.type   ECP256_MulCore_q,@function
364.align 32
365ECP256_MulCore_q:
366.cfi_startproc
367    movq    (%rcx), %rax                    // b[0]
368
369    movq    %rax, %rbp                      // save b[0]
370    mulq    (%rsi)                          // a[0] * b[0]
371    movq    %rax, %r10
372    movq    %rbp, %rax                      // b[0]
373    movq    %rdx, %r11
374
375    mulq    8(%rsi)                         // a[1] * b[0]
376    addq    %rax, %r11
377    movq    %rbp, %rax
378    adcq    $0, %rdx                        // a[1:0] * b[0] < 2^192, no overflow
379    movq    %rdx, %r12
380
381    mulq    16(%rsi)                        // a[2] * b[0]
382    addq    %rax, %r12
383    movq    %rbp, %rax
384    adcq    $0, %rdx
385    movq    %rdx, %r13
386
387    mulq    24(%rsi)                        // a[3] * b[0]
388    addq    %rax, %r13
389    adcq    $0, %rdx
390    movq    %rdx, %r8                       // result: r8 r13 r12 r11 r10
391    xorq    %r9, %r9                        // r9 becomes the next top limb
392    movq    %r10, %rax                      // first reduction
393    movq    %r10, %rbp
394    shlq    $32, %r10                       // r10 * 2^96 low
395    mulq    %r15                            // r10 * 0xffffffff00000001
396    shrq    $32, %rbp                       // r10 * 2^96 high
397    addq    %r10, %r11
398    adcq    %rbp, %r12
399    adcq    %rax, %r13
400    adcq    %rdx, %r8
401    movq    8(%rcx), %rax                   // b[1] (mov leaves the pending carry intact)
402    adcq    $0, %r9                         // accumulator is now r9 r8 r13 r12 r11
403    xorq    %r10, %r10                      // r10 is recycled as the next top limb
404    movq    %rax, %rbp                      // save b[1]
405
406    mulq    (%rsi)                          // a[0] * b[1]
407    addq    %rax, %r11
408    adcq    $0, %rdx
409    movq    %rbp, %rax
410    movq    %rdx, %rbx                      // rbx carries the high half into the next limb
411
412    mulq    8(%rsi)                         // a[1] * b[1]
413    addq    %rbx, %r12
414    adcq    $0, %rdx
415    addq    %rax, %r12
416    adcq    $0, %rdx
417    movq    %rbp, %rax
418    movq    %rdx, %rbx
419
420    mulq    16(%rsi)                        // a[2] * b[1]
421    addq    %rbx, %r13
422    adcq    $0, %rdx
423    addq    %rax, %r13
424    adcq    $0, %rdx
425    movq    %rbp, %rax
426    movq    %rdx, %rbx
427
428    mulq    24(%rsi)                        // a[3] * b[1]
429    addq    %rbx, %r8
430    adcq    $0, %rdx
431    addq    %rax, %r8
432    adcq    %rdx, %r9
433    adcq    $0, %r10                        // accumulator: r10 r9 r8 r13 r12 r11
434
435    movq    %r11, %rbp                      // second reduction
436    movq    %r11, %rax
437    shlq    $32, %r11                       // r11 * 2^96 low
438    mulq    %r15                            // r11 * 0xffffffff00000001
439    shrq    $32, %rbp                       // r11 * 2^96 high
440    addq    %r11, %r12
441    adcq    %rbp, %r13
442    movq    16(%rcx), %rbp                  // b[2]
443    adcq    %rax, %r8
444    adcq    %rdx, %r9
445    movq    %rbp, %rax
446    adcq    $0, %r10                        // accumulator is now r10 r9 r8 r13 r12
447    xorq    %r11, %r11                      // r11 is recycled as the next top limb
448
449    mulq    (%rsi)                          // a[0] * b[2]
450    addq    %rax, %r12
451    adcq    $0, %rdx
452    movq    %rbp, %rax
453    movq    %rdx, %rbx
454
455    mulq    8(%rsi)                         // a[1] * b[2]
456    addq    %rbx, %r13
457    adcq    $0, %rdx
458    addq    %rax, %r13
459    movq    %rbp, %rax
460    adcq    $0, %rdx
461    movq    %rdx, %rbx
462
463    mulq    16(%rsi)                        // a[2] * b[2]
464    addq    %rbx, %r8
465    adcq    $0, %rdx
466    addq    %rax, %r8
467    movq    %rbp, %rax
468    adcq    $0, %rdx
469    movq    %rdx, %rbx
470
471    mulq    24(%rsi)                        // a[3] * b[2]
472    addq    %rbx, %r9
473    adcq    $0, %rdx
474    addq    %rax, %r9
475    adcq    %rdx, %r10
476    movq    %r12, %rbp
477    adcq    $0, %r11                        // accumulator: r11 r10 r9 r8 r13 r12
478
479    movq    %r12, %rax                      // third reduction
480    shlq    $32, %r12                       // r12 * 2^96 low
481    mulq    %r15                            // r12 * 0xffffffff00000001
482    shrq    $32, %rbp                       // r12 * 2^96 high
483    addq    %r12, %r13
484    adcq    %rbp, %r8
485    movq    24(%rcx), %rbp                  // b[3]
486    adcq    %rax, %r9
487    adcq    %rdx, %r10
488    movq    %rbp, %rax
489    adcq    $0, %r11                        // accumulator is now r11 r10 r9 r8 r13
490    xorq    %r12, %r12                      // r12 is recycled as the next top limb
491
492    mulq    (%rsi)                          // a[0] * b[3]
493    addq    %rax, %r13
494    adcq    $0, %rdx
495    movq    %rdx, %rbx
496
497    movq    %rbp, %rax
498    mulq    8(%rsi)                         // a[1] * b[3]
499    addq    %rbx, %r8
500    adcq    $0, %rdx
501    addq    %rax, %r8
502    adcq    $0, %rdx
503    movq    %rbp, %rax
504    movq    %rdx, %rbx
505
506    mulq    16(%rsi)                        // a[2] * b[3]
507    addq    %rbx, %r9
508    adcq    $0, %rdx
509    addq    %rax, %r9
510    movq    %rbp, %rax
511    adcq    $0, %rdx
512    movq    %rdx, %rbx
513
514    mulq    24(%rsi)                        // a[3] * b[3]
515    addq    %rbx, %r10
516    adcq    $0, %rdx
517    addq    %rax, %r10
518    adcq    %rdx, %r11
519    adcq    $0, %r12                        // accumulator: r12 r11 r10 r9 r8 r13
520
521    movq    %r13, %rbp
522    movq    %r13, %rax                      // last reduction
523    shlq    $32, %r13                       // r13 * 2^96 low
524    mulq    %r15                            // r13 * 0xffffffff00000001
525    shrq    $32, %rbp                       // r13 * 2^96 high
526
527    addq    %r13, %r8
528    adcq    %rbp, %r9
529    adcq    %rax, %r10
530    adcq    %rdx, %r11
531    movq    %r8, %rbx                       // keep the unreduced value t = r12:r11:r10:r9:r8 (limbs 0-3 copied)
532    movq    %r9, %rbp
533    adcq    $0, %r12                        // r12 = carry limb of t
534    movq    %r10, %rax
535    movq    %r11, %rdx
536    subq    $-1, %r8                        // t - P, starting with P[0] (all ones, i.e. -1 as an immediate)
537    sbbq    %r14, %r9                       // P[1]
538    sbbq    $0, %r10                        // P[2] = 0
539    sbbq    %r15, %r11                      // P[3]
540    sbbq    $0, %r12                        // CF set when t < P
541
542    cmovcq  %rbx, %r8                       // keep t when t < P, otherwise use t - P
543    cmovcq  %rbp, %r9
544    cmovcq  %rax, %r10
545    movq    %r8, (%rdi)
546    movq    %r9, 8(%rdi)
547    cmovcq  %rdx, %r11
548    movq    %r10, 16(%rdi)
549    movq    %r11, 24(%rdi)
550
551    ret
552.cfi_endproc
553.size   ECP256_MulCore_q, .-ECP256_MulCore_q
554
555/**
556 *  Function description: Converts a into ECP256 Montgomery form: res = a * (R * R mod P) * 2^-256 mod P, R = 2^256
557 *  Function prototype: void ECP256_ToMont(Coord *r, const Coord *a);
558 *  Input register:
559 *        rdi: pointer to the output Coord structure
560 *        rsi: address pointing to input data a
561 *  Change register: rax, rcx, rdx, r8-r11 (rbx, rbp and r12-r15 are used but saved and restored)
562 *  Output register: None
563 *  Function/Macro invoking: Implemented as a Montgomery multiplication by .LrrModP through ECP256_MulCore_q.
564 */
565.globl  ECP256_ToMont
566.type   ECP256_ToMont,@function
567.align 32
568ECP256_ToMont:
569.cfi_startproc
570    leaq	.LrrModP(%rip),%rcx             // second factor: R * R mod P
571    pushq   %rbx
    .cfi_adjust_cfa_offset 8                // keep the unwind info in step with the stack pointer
    .cfi_offset %rbx, -16
572    pushq   %rbp
    .cfi_adjust_cfa_offset 8
    .cfi_offset %rbp, -24
573    pushq   %r12
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r12, -32
574    pushq   %r13
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r13, -40
575    pushq   %r14
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r14, -48
576    pushq   %r15
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r15, -56
577
578    movq	.Lpoly+8(%rip), %r14            // P[1], expected in r14 by ECP256_MulCore_q
579    movq	.Lpoly+24(%rip), %r15           // P[3], expected in r15 by ECP256_MulCore_q
580    call    ECP256_MulCore_q                // stores the result through rdi
581
582    movq    (%rsp), %r15                    // restore callee-saved registers without popping
    .cfi_restore %r15
583    movq    8(%rsp), %r14
    .cfi_restore %r14
584    movq    16(%rsp), %r13
    .cfi_restore %r13
585    movq    24(%rsp), %r12
    .cfi_restore %r12
586    movq    32(%rsp), %rbp
    .cfi_restore %rbp
587    movq    40(%rsp), %rbx
    .cfi_restore %rbx
588    leaq    48(%rsp), %rsp                  // release the six save slots
    .cfi_adjust_cfa_offset -48
589
590    ret
591.cfi_endproc
592.size   ECP256_ToMont, .-ECP256_ToMont
593
594/**
595 * Function description: ECP256 Montgomery form converted to normal form: res = a * 2^-256 mod P.
 *        Four reduction steps each clear the lowest limb by adding a multiple of P; because P[0] is all ones
 *        the multiplier is the low limb itself. A final conditional subtraction of P follows.
596 * Function prototype: void ECP256_FromMont(Coord *r, const Coord *a);
597 * Input register:
598 *        rdi: Pointer to the output Coord structure.
599 *        rsi: Address pointing to input data a.
600 * Change register: rax,rcx,rdx,rsi,r8-r11 (r12 and r13 are used but saved and restored)
601 * Output register: None.
602 * Function/Macro Call: None
603 */
604.globl  ECP256_FromMont
605.type   ECP256_FromMont,@function
606.align 32
607ECP256_FromMont:
608.cfi_startproc
609    pushq   %r12
    .cfi_adjust_cfa_offset 8            // keep the unwind info in step with the stack pointer
    .cfi_offset %r12, -16
610    pushq   %r13
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r13, -24
611
612    movq	.Lpoly+8(%rip), %r12    // P[1]
613    movq	.Lpoly+24(%rip), %r13   // P[3]
614
615    movq    (%rsi), %r8             // a[0..3] --> r8 r9 r10 r11
616    movq    8(%rsi), %r9
617    movq    16(%rsi), %r10
618    movq    24(%rsi), %r11
619
620    movq    %r8, %rax               // First reduction, multiplier m = r8
621    movq    %r8, %rcx
622    shlq    $32, %r8                // low half of m * 2^96
623    mulq    %r13                    // rdx:rax = m * P[3]; 8-bit analogy: 0xff * 0xff = 0xfe01, the high half is never all ones
624    shrq    $32, %rcx               // high half of m * 2^96
625    addq    %r8, %r9
626    adcq    %rcx, %r10
627    movq    %r9, %rcx
628    adcq    %rax, %r11
629    adcq    $0, %rdx                // cannot overflow (analogy: 0xfe + 1 <= 0xff)
630    movq    %r9, %rax
631    movq    %rdx, %r8               // value is now r8 r11 r10 r9
632
633    shlq    $32, %r9                // Second reduction, m = r9
634    mulq    %r13
635    shrq    $32, %rcx
636    addq    %r9, %r10
637    adcq    %rcx, %r11
638    movq    %r10, %rcx
639    adcq    %rax, %r8
640    adcq    $0, %rdx
641    movq    %r10, %rax
642    movq    %rdx, %r9               // value is now r9 r8 r11 r10
643
644    shlq    $32, %r10               // Third reduction, m = r10
645    mulq    %r13
646    shrq    $32, %rcx
647    addq    %r10, %r11
648    adcq    %rcx, %r8
649    movq    %r11, %rcx
650    adcq    %rax, %r9
651    adcq    $0, %rdx
652    movq    %r11, %rax
653    movq    %rdx, %r10              // value is now r10 r9 r8 r11
654
655    shlq    $32, %r11               // Last reduction, m = r11
656    mulq    %r13
657    shrq    $32, %rcx
658    addq    %r11, %r8
659    adcq    %rcx, %r9
660    movq    %r8, %rsi               // save t[0]; the pointer to a is no longer needed
661    adcq    %rax, %r10
662    adcq    $0, %rdx
663    movq    %rdx, %r11              // t = r11 r10 r9 r8 (most significant limb first)
664
665    movq    %r9, %rdx               // save t[1]
666    subq    $-1, %r8                // t - P, starting with P[0] (all ones, i.e. -1 as an immediate)
667    movq    %r10, %rcx              // save t[2]
668    sbbq    %r12, %r9               // P[1]
669    movq    %r11, %rax              // save t[3]
670    sbbq    $0, %r10                // P[2] = 0
671    sbbq    %r13, %r11              // P[3]; CF set when t < P
672
673    cmovcq  %rsi, %r8               // < P
674    cmovcq  %rdx, %r9
675    movq    %r8, (%rdi)
676    cmovcq  %rcx, %r10
677    movq    %r9, 8(%rdi)
678    cmovcq  %rax, %r11
679    movq    %r10, 16(%rdi)
680    movq    %r11, 24(%rdi)
681
682    movq    (%rsp), %r13            // restore callee-saved registers without popping
    .cfi_restore %r13
683    movq    8(%rsp), %r12
    .cfi_restore %r12
684    leaq    16(%rsp), %rsp          // release the two save slots
    .cfi_adjust_cfa_offset -16
685    ret
686.cfi_endproc
687.size   ECP256_FromMont, .-ECP256_FromMont
688
689/**
690 *  Function description: Squaring in the ECP256 field: res = a * a * 2^-256 mod P
691 *  Function prototype: void ECP256_Sqr(Coord *r, const Coord *a);
692 *  Input register:
693 *        rdi: pointer to the output Coord structure
694 *        rsi: address pointing to input data a
695 *  Change register: rax, rcx, rdx, rsi, r8, r9, r10, r11 (rbx, rbp and r12-r15 are used but saved and restored)
696 *  Output register: None
697 *  Function/Macro Call: The squaring is carried out by ECP256_SqrCore_q.
698 */
699.globl  ECP256_Sqr
700.type   ECP256_Sqr,@function
701.align 32
702ECP256_Sqr:
703.cfi_startproc
704    pushq   %rbx
    .cfi_adjust_cfa_offset 8                    // keep the unwind info in step with the stack pointer
    .cfi_offset %rbx, -16
705    pushq   %rbp
    .cfi_adjust_cfa_offset 8
    .cfi_offset %rbp, -24
706    pushq   %r12
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r12, -32
707    pushq   %r13
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r13, -40
708    pushq   %r14
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r14, -48
709    pushq   %r15
    .cfi_adjust_cfa_offset 8
    .cfi_offset %r15, -56
710
711    nop                                 // add this instruction to improve performance, movq %rsi, %rdx is ok
712    movq    (%rsi), %rax                // a[0]
713    movq    8(%rsi), %r14               // a[1]
714    movq    16(%rsi), %rbp              // a[2]
715    movq    24(%rsi), %r15              // a[3]
716    call    ECP256_SqrCore_q            // takes a[0..3] in rax, r14, rbp, r15 and stores the result through rdi
717
718    movq    (%rsp), %r15                // restore callee-saved registers without popping
    .cfi_restore %r15
719    movq    8(%rsp), %r14
    .cfi_restore %r14
720    movq    16(%rsp), %r13
    .cfi_restore %r13
721    movq    24(%rsp), %r12
    .cfi_restore %r12
722    movq    32(%rsp), %rbp
    .cfi_restore %rbp
723    movq    40(%rsp), %rbx
    .cfi_restore %rbx
724    leaq    48(%rsp), %rsp              // release the six save slots
    .cfi_adjust_cfa_offset -48
725    ret
726.cfi_endproc
727.size   ECP256_Sqr, .-ECP256_Sqr
728
729/**
730 * Function description: Montgomery square of the ECP256 field, res = a * a * 2^-256 mod P.
 *        The cross products a[i] * a[j] (i < j) are summed and doubled, the squares a[i] * a[i] are added,
 *        the low four limbs are reduced in four steps and the outcome is added to the high four limbs.
731 * Input register:
732 *        rdi: Address the 256-bit result is stored to
733 *        rax: a[0], r14: a[1], rbp: a[2], r15: a[3] (the operand is passed in registers; rsi is not read as a pointer)
734 * Change register: rax, rbx, rcx, rdx, rbp, rsi, r8-r15
735 * Output register: None; the result is stored through rdi (and is also left in r8-r11).
736 */
737.type   ECP256_SqrCore_q,@function
738.align 32
739ECP256_SqrCore_q:
740.cfi_startproc
741    movq    %rax, %r8                   // keep a[0] in r8
742    mulq    %r14                        // rdx:rax = a[0] * a[1]
743    xorq    %rcx, %rcx
744    movq    %rax, %r9                   // r9 = rax
745    xorq    %r11, %r11
746    xorq    %r12, %r12
747    movq    %r8, %rax
748    xorq    %r13, %r13
749    xorq    %rbx, %rbx
750    movq    %rdx, %r10                  // r10:r9 = a[0] * a[1]
751
752    mulq    %rbp                        // rdx:rax = a[0] * a[2]
753    addq    %rax, %r10                  // r10 += rax
754    movq    %r8, %rax                   // a[0] --> rax
755    adcq    %rdx, %r11                  // a[0] * (a[2] * 2^64 + a[1]) < 2^192, no overflow
756
757    mulq    %r15                        // rdx:rax = a[0] * a[3]
758    addq    %rax, %r11                  // r11 += rax
759    movq    %r14, %rax                  // a[1] --> rax
760    adcq    %rdx, %r12                  // a[0] * (a[3] * 2^128 + a[2] * 2^64 + a[1]) < 2^256, no overflow
761
762    mulq    %rbp                        // rdx:rax = a[1] * a[2]
763    addq    %rax, %r11
764    movq    %r14, %rax                  // a[1] --> rax
765    adcq    %rdx, %r12
766    adcq    $0, %r13
767
768    mulq    %r15                        // rdx:rax = a[1] * a[3]
769    addq    %rax, %r12
770    movq    %rbp, %rax                  // a[2] --> rax
771    adcq    %rdx, %r13
772    adcq    $0, %rbx
773
774    mulq    %r15                        // rdx:rax = a[2] * a[3]
775    addq    %rax, %r13
776    adcq    %rdx, %rbx                  // rbx not overflow
777
778    movq    %r8, %rax                   // a[0] --> rax
779    addq    %r9, %r9                    // twice
780    adcq    %r10, %r10
781    adcq    %r11, %r11
782    adcq    %r12, %r12
783    adcq    %r13, %r13
784    adcq    %rbx, %rbx
785    adcq    $0, %rcx                    // doubled cross products: rcx rbx r13 r12 r11 r10 r9
786
787    mulq    %rax                        // rdx:rax = a[0] * a[0]
788    movq    %rax, %r8
789    movq    %r14, %rax
790    movq    %rdx, %rsi                  // rsi carries the high half into the next limb
791
792    mulq    %rax                        // rdx:rax = a[1] * a[1]
793    addq    %rsi, %r9
794    adcq    %rax, %r10
795    movq    %rbp, %rax
796    adcq    $0, %rdx
797    movq    %rdx, %rsi
798
799    mulq    %rax                        // rdx:rax = a[2] * a[2]
800    addq    %rsi, %r11
801    adcq    %rax, %r12
802    movq    %r15, %rax
803    adcq    $0, %rdx
804    movq    %rdx, %rsi
805
806    mulq    %rax                        // rdx:rax = a[3] * a[3]
807    addq    %rsi, %r13
808    adcq    %rax, %rbx
809    movq    %r8, %rax
810    adcq    %rdx, %rcx                  // rcx not overflow; full square: rcx rbx r13 r12 r11 r10 r9 r8
811
812    movq	.Lpoly+8(%rip), %r14        // P[1] (a[1] is no longer needed)
813    movq	.Lpoly+24(%rip), %r15       // P[3] (a[3] is no longer needed)
814    movq    %r8, %rbp                   // First reduction
815    shlq    $32, %r8                    // l32[r8 << 96]
816    mulq    %r15
817    shrq    $32, %rbp                   // h32[r8 << 96]
818    addq    %r8, %r9
819    adcq    %rbp, %r10
820    adcq    %rax, %r11
821    adcq    $0, %rdx
822    movq    %r9, %rax
823    movq    %r9, %rbp
824    movq    %rdx, %r8                   // r8 r11 r10 r9 0
825
826    shlq    $32, %r9                    // Second reduction
827    mulq    %r15
828    shrq    $32, %rbp
829    addq    %r9, %r10
830    adcq    %rbp, %r11
831    adcq    %rax, %r8
832    adcq    $0, %rdx
833    movq    %r10, %rax
834    movq    %r10, %rbp
835    movq    %rdx, %r9                   // r9 r8 r11 r10 0
836
837    shlq    $32, %r10                   // Third reduction
838    mulq    %r15
839    shrq    $32, %rbp
840    addq    %r10, %r11
841    adcq    %rbp, %r8
842    adcq    %rax, %r9
843    adcq    $0, %rdx
844    movq    %r11, %rax
845    movq    %r11, %rbp
846    movq    %rdx, %r10                  // r10 r9 r8 r11 0
847
848    shlq    $32, %r11                   // Last reduction
849    mulq    %r15
850    shrq    $32, %rbp
851    addq    %r11, %r8
852    adcq    %rbp, %r9
853    adcq    %rax, %r10
854    adcq    $0, %rdx                    // rdx r10 r9 r8 0
855
856    xorq    %rsi, %rsi                  // Add the reduction result
857    addq    %r8, %r12                   // high half of the square (rcx rbx r13 r12) += rdx r10 r9 r8
858    adcq    %r9, %r13
859    movq    %r12, %r8                   // working copy of limb 0 for the trial subtraction
860    adcq    %r10, %rbx
861    adcq    %rdx, %rcx
862    movq    %r13, %r9                   // working copy of limb 1
863    adcq    $0, %rsi                    // Reserve carry value
864
865    subq    $-1, %r8                    // subtract P, starting with P[0] (all ones, i.e. -1 as an immediate)
866    movq    %rbx, %r10                  // working copy of limb 2
867    sbbq    %r14, %r9                   // P[1]
868    sbbq    $0, %r10                    // P[2] = 0
869    movq    %rcx, %r11                  // working copy of limb 3
870    sbbq    %r15, %r11                  // P[3]
871    sbbq    $0, %rsi                    // CF set when the sum was already below P
872
873    cmovcq  %r12, %r8                   // keep the unreduced sum when it is below P, otherwise sum - P
874    cmovcq  %r13, %r9
875    movq    %r8, (%rdi)
876    cmovcq  %rbx, %r10
877    movq    %r9, 8(%rdi)
878    cmovcq  %rcx, %r11
879    movq    %r10, 16(%rdi)
880    movq    %r11, 24(%rdi)
881
882    ret
883.cfi_endproc
884.size   ECP256_SqrCore_q, .-ECP256_SqrCore_q
885
886/**
887 *  Function description: Multiplication of the ECP256 field: res = a*b*2^-256 mod Order(P)
888 *  Function prototype: void ECP256_OrdSqr(Coord *r, const Coord *a, int32_t repeat);
889 *  Input register:
890 *        rdi: Pointer to the output Coord structure
891 *        rsi: Address pointing to input data a
892 *        rdx:Repeat
893 *  Change register: rax, rbx, rcx, rdx, rsi, rbp, r8, r9, r10, r11, r12, r13, r14, r15
894 *  Output register: None
895 *  Function/Macro Call:
896 */
897.globl  ECP256_OrdSqr
898.type   ECP256_OrdSqr,@function
899.align 32
900ECP256_OrdSqr:
901.cfi_startproc
902    pushq   %rbx
903    pushq   %rbp
904    pushq   %r12
905    pushq   %r13
906    pushq   %r14
907    pushq   %r15
908
909    movq    (%rsi), %r8
910    movq    8(%rsi), %rax
911    movq    16(%rsi), %r14
912    movq    24(%rsi), %r15
913    leaq    .Lord(%rip), %rbp           // ptr(N) --> rbp
914    movq    %rdx, %rbx
915.align 32
916.Lord_sqr_loop:
917    movq    %rax, %rsi
918    mulq    %r8                         // rdx:rax = acc[0] * acc[1]
919    movq    %rax, %r9                   // r9 = rax
920    vmovq   %rsi, %xmm1                 // save acc[1] -> xmm1
921    movq    %r14, %rax                  // acc[2] --> rax
922    movq    %rdx, %r10                  // r10:r9 = acc[0] * acc[1]
923
924    mulq    %r8                         // rdx:rax = acc[0] * acc[2]
925    addq    %rax, %r10                  // r10 += rax
926    vmovq   %r14, %xmm2                 // save acc[2] -> xmm2
927    adcq    $0, %rdx                    // acc[0] * (acc[2] * 2^64 + acc[1]) < 2^196, no overflow
928    movq    %r15, %rax                  // acc[3] --> rax
929    movq    %rdx, %r11
930
931    mulq    %r8                         // rdx:rax = a[0] * a[3]
932    addq    %rax, %r11                  // r11 += rax
933    vmovq   %r15, %xmm3                 // Save acc[3] -> xmm3
934    adcq    $0, %rdx                    // acc[0] * (acc[3] * 2^128 + acc[2] * 2^64 + acc[1]) < 2^256, no overflow
935    movq    %r15, %rax                  // acc[1] --> rax
936    movq    %rdx, %r12
937
938    mulq    %r14
939    movq    %rax, %r13
940    movq    %r14, %rax
941    movq    %rdx, %r14
942
943    mulq    %rsi
944    addq    %rax, %r11
945    movq    %r15, %rax
946    adcq    $0, %rdx
947    movq    %rdx, %r15
948
949    mulq    %rsi
950    addq    %rax, %r12
951    adcq    $0, %rdx
952    addq    %r15, %r12
953    adcq    %rdx, %r13
954    movq    %r8, %rax                   // acc[0] --> rax
955    adcq    $0, %r14                    // r14 r13 r12 r11 r10 r9
956
957    xorq    %r15, %r15                  // 0 --> r15
958    addq    %r9, %r9                    // twice
959    adcq    %r10, %r10
960    adcq    %r11, %r11
961    adcq    %r12, %r12
962    adcq    %r13, %r13
963    adcq    %r14, %r14
964    adcq    $0, %r15                    // result: r15 r14 r13 r12 r11 r10 r9
965
966    mulq    %rax                        // rdx:rax = acc[0] * acc[0]
967    movq    %rax, %r8                   // rax --> r8
968    vmovq   %xmm1, %rax                 // acc[1] --> rax
969    movq    %rdx, %rcx                  // save rdx to rcx
970
971    mulq    %rax                        // rdx:rax = acc[1] * acc[1]
972    addq    %rcx, %r9                   // r9 += rcx
973    adcq    %rax, %r10                  // r10 += rax
974    adcq    $0, %rdx                    // no overflow
975    vmovq   %xmm2, %rax                 // acc[2] --> rax
976    movq    %rdx, %rcx                  // save rdx to rcx
977
978    mulq    %rax                        // rdx:rax = a[2] * a[2]
979    addq    %rcx, %r11                  // r11 += rcx
980    adcq    %rax, %r12                  // r12 += rax
981    movq    %r8, %rsi                   // acc[0] --> rsi
982    adcq    $0, %rdx                    // no overflow
983    vmovq   %xmm3, %rax                 // acc[3] --> rax
984    movq    %rdx, %rcx                  // save rcx to rdx
985
986    imulq   32(%rbp), %r8               // m = acc[0] * LordK (mod 2^64) --> r8
987
988    mulq    %rax                        // rdx:rax = a[3] * a[3]
989    addq    %rcx, %r13                  // r13 += rcx
990    adcq    %rax, %r14                  // r14 += r
991    movq    (%rbp), %rax                // N[0] --> rax
992    adcq    %rdx, %r15                  // r15 not overflow
993
994    /* Result acc[8:0] = r15 r14 r13 r12 r11 r10 r9 r8;   */
995    /* The first reduction */
996    mulq    %r8                         // rdx:rax = m * N[0]
997    addq    %rax, %rsi                  // rsi = 0
998    movq    %r8, %rcx                   // m --> rcx
999    adcq    %rdx, %rsi                  // rsi = rdx + carry  --> acc[1]
1000
1001    subq    %r8, %r10                   // acc[2] - m
1002    sbbq    $0, %rcx                    // m - borrwo, to acc[3]
1003
1004    movq    8(%rbp), %rax               // N[1] --> rax
1005    mulq    %r8                         // rdx:rax = m * N[1]
1006    addq    %rsi, %r9                   // acc[1] += high[m * N[0]]
1007    adcq    $0, %rdx                    // save carry
1008    addq    %rax, %r9                   // acc[1] += low[m * N[1]]
1009    movq    %r8, %rax                   // m --> rax
1010    adcq    %rdx, %r10                  // acc[2] += high[m * N[1]]
1011    movq    %r9, %rsi                   // acc[1] --> rsi
1012    movq    %r8, %rdx                   // m --> rdx
1013    adcq    $0, %rcx                    // m - borrwoto acc[3]
1014
1015    imulq   32(%rbp), %r9               // m = acc[1] * LordK --> r9
1016
1017    shlq    $32, %rax                   // low(m) --> rax   low(m * 2^228)
1018    shrq    $32, %rdx                   // high(m) --> rdx   high(m * 2^228)
1019    subq    %rax, %r11                  // acc[3] - low(m * 2^228)
1020    movq    (%rbp), %rax                // N[0] --> rax
1021    sbbq    %rdx, %r8                   // m - high(m * 2^228)  to acc[4]
1022
1023    addq    %rcx, %r11                  // acc[3] += m + carry - borrow
1024    adcq    $0, %r8                     // to acc[4]
1025
1026    /* Second reduction */
1027    mulq    %r9                         // rdx:rax = m * N[0]
1028    addq    %rax, %rsi                  // acc[1] += rax  -->  0
1029    movq    %r9, %rcx                   // m --> rcx
1030    adcq    %rdx, %rsi                  // rsi = high[m * N[0]] + carry  --> acc[2]
1031
1032    movq    8(%rbp), %rax               // N[1] --> rax
1033    subq    %r9, %r11                   // acc[3] -= m
1034    sbbq    $0, %rcx                    // m - borrow --> rcx
1035
1036    mulq    %r9                         // rdx:rax = m * N[1]
1037    addq    %rsi, %r10                  // acc[2] += high[m * N[1]] + carry
1038    adcq    $0, %rdx                    // rdx += carry, no overflow
1039    addq    %rax, %r10                  // acc[2] += rax
1040    movq    %r9, %rax                   // m --> rax
1041    adcq    %rdx, %r11                  // acc[3] += rdx
1042    movq    %r10, %rsi                  // acc[2] --> rsi
1043    movq    %r9, %rdx                   // m --> rdx
1044    adcq    $0, %rcx                    // m - borrow + carry --> rcx
1045
1046    imulq   32(%rbp), %r10              // m = acc[2] * LordK --> r10
1047
1048    shlq    $32, %rax                   // low(m * 2^228)
1049    shrq    $32, %rdx                   // high(m * 2^228)
1050    subq    %rax, %r8                   // t0 acc[4] - low(m * 2^228)
1051    movq    (%rbp), %rax                // N[0] --> rax
1052    sbbq    %rdx, %r9                   // m - high(m * 2^228)  to acc[5]
1053
1054    addq    %rcx, %r8                   // to acc[4]
1055    adcq    $0, %r9                     // to acc[5]
1056
1057    /* Third reduction */
1058    mulq    %r10                        // rdx:rax = m * N[0]
1059    movq    %r10, %rcx                  // m --> rcx
1060    addq    %rax, %rsi                  // acc[2] += rax --> 0
1061    adcq    %rdx, %rsi                  // rsi = high[m * N[0]] + carry  --> acc[3]
1062    movq    8(%rbp), %rax               // N[1] --> rax
1063    subq    %r10, %r8                   // to acc[4] -= m
1064    sbbq    $0, %rcx                    // m - borrow --> rcx
1065
1066    mulq    %r10                        // rdx:rax = m * N[1]
1067    addq    %rsi, %r11                  // acc[3] += high[m * N[1]] + carry
1068    adcq    $0, %rdx                    // rdx += carry, no overflow
1069    addq    %rax, %r11                  // acc[3] += rax
1070    movq    %r10, %rax                  // m --> rax
1071    adcq    %rdx, %r8                   // to acc[4] += rdx
1072    movq    %r11, %rsi                  // acc[3] --> rsi
1073    movq    %r10, %rdx                  // m --> rdx
1074    adcq    $0, %rcx                    // m - borrow + carry --> rcx
1075
1076    imulq   32(%rbp), %r11              // m = acc[3] * LordK --> r11
1077
1078    shlq    $32, %rax                   // low(m * 2^228)
1079    shrq    $32, %rdx                   // high(m * 2^228)
1080    subq    %rax, %r9                   // to acc[5]: - low(m * 2^228)
1081    sbbq    %rdx, %r10                  // to acc[6]: m - high(m * 2^228)
1082    movq    (%rbp), %rax                // N[0] --> rax
1083    addq    %rcx, %r9                   // to acc[5]
1084    adcq    $0, %r10                    // to acc[6]
1085
1086    /* Last reduction */
1087    mulq    %r11                        // rdx:rax = m * N[0]
1088    addq    %rax, %rsi                  // acc[3] += rax --> 0
1089    movq    %r11, %rcx                  // m --> rcx
1090    adcq    %rdx, %rsi                  // rsi = high[m * N[0]] + carry  --> acc[4]
1091    movq    8(%rbp), %rax               // N[1] --> rax
1092    subq    %r11, %r9                   // to acc[5] : -= m
1093    sbbq    $0, %rcx                    // to acc[6] : m - borrow
1094
1095    mulq    %r11                        // rdx:rax = m * N[1]
1096    addq    %rsi, %r8                   // to acc[4]: += high[m * N[1]] + carry
1097    adcq    $0, %rdx                    // rdx += carry, no overflow
1098    addq    %rax, %r8                   // to acc[4]: += rax
1099    movq    %r11, %rax                  // m --> rax
1100    adcq    %rdx, %r9                   // to acc[5]: += rdx
1101    movq    %r11, %rdx                  // m --> rdx
1102    adcq    $0, %rcx                    // m - borrow + carry --> rcx
1103
1104    shlq    $32, %rax                   // low(m * 2^228)
1105    shrq    $32, %rdx                   // high(m * 2^228)
1106
1107    subq    %rax, %r10                  // to acc[6]: - low(m * 2^228)
1108    sbbq    %rdx, %r11                  // to acc[7]: m - high(m * 2^228)
1109
1110    addq    %rcx, %r10                  // to acc[6]
1111    adcq    $0, %r11                    // to acc[7]
1112
1113    /* r15 r14 r13 r12 + r11 r10 r9 r8 */
1114    xorq    %rdx, %rdx
1115    addq    %r12, %r8
1116    adcq    %r13, %r9
1117    movq    %r8, %r12
1118    adcq    %r14, %r10
1119    adcq    %r15, %r11
1120    movq    %r9, %rax
1121    adcq    $0, %rdx
1122
1123    subq    (%rbp), %r8
1124    movq    %r10, %r14
1125    sbbq    8(%rbp), %r9
1126    sbbq    16(%rbp), %r10
1127    movq    %r11, %r15
1128    sbbq    24(%rbp), %r11
1129    sbbq    $0, %rdx
1130
1131    cmovcq  %r12, %r8
1132    cmovncq  %r9, %rax
1133    cmovncq  %r10, %r14
1134    cmovncq  %r11, %r15                     // r8 rax r14 r15
1135
1136    decq    %rbx
1137    jnz     .Lord_sqr_loop
1138
1139    movq    %r8, (%rdi)
1140    movq    %rax, 8(%rdi)
1141    vpxor   %xmm2, %xmm2, %xmm2
1142    movq    %r14, 16(%rdi)
1143    vpxor   %xmm3, %xmm3, %xmm3
1144    movq    %r15, 24(%rdi)
1145    vpxor   %xmm1, %xmm1, %xmm1
1146
1147    movq    (%rsp), %r15
1148    movq    8(%rsp), %r14
1149    movq    16(%rsp), %r13
1150    movq    24(%rsp), %r12
1151    movq    32(%rsp), %rbp
1152    movq    40(%rsp), %rbx
1153    leaq    48(%rsp), %rsp
1154
1155    ret
1156.cfi_endproc
1157.size   ECP256_OrdSqr, .-ECP256_OrdSqr
1158
1159/**
1160 *  Function description: half calculation of the ECP256 field: res = a/2 mod P
 *  Method: a + P is computed with its carry-out kept in r13. When a is even the original a (and a zero
 *          carry) is selected instead. The selected 257-bit value is then shifted right by one bit.
1161 *  Input register:
1162 *        rdi: Pointer to the output Coord structure
1163 *        r8: a[0]
1164 *        r9: a[1]
1165 *        r10:a[2]
1166 *        r11:a[3]
1167 *        r14:P[1]
1168 *        r15:P[3]
1169 *  Change register: rax, rcx, rdx, rsi, r8, r9, r10, r11, r12, r13
1170 *  Output register: r8, r9, r10, r11 (the same four words are also stored to (%rdi))
1171 */
1172.type   ECP256_DivBy2Core,@function
1173ECP256_DivBy2Core:
1174.cfi_startproc
1175    xorq    %r13, %r13                  // r13 will receive the carry out of a + P
1176    movq    %r8, %rax                   // keep a[0] (also used for the parity test below)
1177    movq    %r9, %rcx                   // keep a[1]
1178
1179    addq    $-1, %r8                    // a[0] + P[0], P[0] = 0xffffffffffffffff
1180    movq    %r10, %rdx                  // keep a[2]
1181    adcq    %r14, %r9                   // a[1] + P[1] + carry
1182    movq    %r11, %r12                  // keep a[3]
1183    adcq    $0, %r10                    // a[2] + P[2] + carry, P[2] = 0
1184    adcq    %r15, %r11                  // a[3] + P[3] + carry
1185    adcq    $0, %r13                    // r13 = bit 256 of a + P
1186    xorq    %rsi, %rsi                  // rsi = 0, source for clearing r13 below
1187
1188    testq   $1, %rax                    // ZF = 1 when a is even
1189    cmovzq  %rax, %r8                   // a even: take the original a instead of a + P
1190    cmovzq  %rcx, %r9
1191    cmovzq  %rdx, %r10
1192    cmovzq  %r12, %r11
1193    movq    %r9, %rcx                   // copy of selected word 1 (movq leaves the flags intact)
1194    cmovzq  %rsi, %r13                  // a even: no bit 256
1195    movq    %r10, %rdx                  // copy of selected word 2
1196    movq    %r11, %r12                  // copy of selected word 3
1197
1198    shrq    $1, %r8                     // shift every word right by one ...
1199    shlq    $63, %rcx                   // ... and move bit 0 of the next word to bit 63
1200    shrq    $1, %r9
1201    shlq    $63, %rdx
1202    shrq    $1, %r10
1203    orq     %rcx, %r8                   // word 0 |= bit 0 of word 1
1204    shlq    $63, %r12
1205
1206    shrq    $1, %r11
1207    shlq    $63, %r13                   // bit 256 becomes bit 63 of word 3
1208    orq     %rdx, %r9                   // word 1 |= bit 0 of word 2
1209    orq     %r12, %r10                  // word 2 |= bit 0 of word 3
1210    orq     %r13, %r11                  // word 3 |= bit 256
1211
1212    movq    %r8, (%rdi)                 // store the result to the output buffer
1213    movq    %r9, 8(%rdi)
1214    movq    %r10, 16(%rdi)
1215    movq    %r11, 24(%rdi)
1216    ret
1217.cfi_endproc
1218.size   ECP256_DivBy2Core, .-ECP256_DivBy2Core
1219
1220/**
1221 *  Function description: Half calculation of the ECP256 field: res = a/2 mod P
1222 *  Function prototype: void ECP256_DivBy2(Coord *r, const Coord *a);
1223 *  Input register:
1224 *        rdi: Pointer to the output Coord structure
1225 *        rsi: Address pointing to input data a
1226 *  Change register: rax, rcx, rdx, rsi, r8, r9, r10, r11 (r12, r13, r14, r15 are used, but they are saved on entry and restored before return)
1227 *  Output register: None
1228 *  Function/macro call: Call ECP256_DivBy2Core to implement half calculation.
1229 */
1230.globl  ECP256_DivBy2
1231.type   ECP256_DivBy2, @function
1232ECP256_DivBy2:
1233.cfi_startproc
1234    pushq   %r12                        // save the registers restored at the end of this function
1235    pushq   %r13
1236    pushq   %r14
1237    pushq   %r15
1238
1239    movq    (%rsi), %r8                 // a[0..3] --> r8, r9, r10, r11
1240    movq    8(%rsi), %r9
1241    movq    16(%rsi), %r10
1242    movq    24(%rsi), %r11
1243    movq    8+.Lpoly(%rip), %r14        // P[1]
1244    movq    24+.Lpoly(%rip), %r15       // P[3]
1245    call    ECP256_DivBy2Core           // halves the value and stores it to (%rdi)
1246
1247    movq    (%rsp), %r15                // restore in reverse push order: r15 was pushed last
1248    movq    8(%rsp), %r14
1249    movq    16(%rsp), %r13
1250    movq    24(%rsp), %r12
1251    leaq    32(%rsp),  %rsp             // release the four 8-byte save slots
1252    ret
1253.cfi_endproc
1254.size   ECP256_DivBy2, .-ECP256_DivBy2
1255
1256/**
 *  Function description: doubling in the ECP256 field: res = 2 * a mod P
 *  Method: a is doubled with the carry-out kept in r13, P is subtracted from the 257-bit sum, and the
 *          unreduced sum is taken back when that subtraction borrows.
 *  Input register:
 *        rdi: Pointer to the output buffer (four 64-bit words)
 *        r8, r9, r10, r11: a[0], a[1], a[2], a[3]
 *        r14 = .Lpoly[1], r15 = .Lpoly[3]
 *  Change register: rax, rcx, rdx, r8, r9, r10, r11, r12, r13
 *  Output register: r8, r9, r10, r11 (the same four words are also stored to (%rdi))
 */
1257.type   ECP256_MulBy2Core,@function
1258.align 32
1259ECP256_MulBy2Core:
1260.cfi_startproc
1261    xorq    %r13, %r13                  // r13 will receive the carry out of 2 * a
1262
1263    addq    %r8, %r8                    // 2 * a, word by word with carry
1264    adcq    %r9, %r9
1265    movq    %r8, %rax                   // keep the unreduced word 0
1266    adcq    %r10, %r10
1267    adcq    %r11, %r11
1268    movq    %r9, %rcx                   // keep the unreduced word 1
1269    adcq    $0, %r13                    // r13 = bit 256 of 2 * a
1270
1271    subq    $-1, %r8                    // word 0 - P[0], P[0] = 0xffffffffffffffff
1272    movq    %r10, %rdx                  // keep the unreduced word 2
1273    sbbq    %r14, %r9                   // word 1 - P[1] - borrow
1274    movq    %r11, %r12                  // keep the unreduced word 3
1275    sbbq    $0, %r10                    // word 2 - P[2] - borrow, P[2] = 0
1276    sbbq    %r15, %r11                  // word 3 - P[3] - borrow
1277    sbbq    $0, %r13                    // CF = 1 when 2 * a < P
1278
1279    cmovcq  %rax, %r8           // Obtain mod P result
1280    cmovcq  %rcx, %r9
1281    movq    %r8, (%rdi)
1282    cmovcq  %rdx, %r10
1283    movq    %r9, 8(%rdi)
1284    cmovcq  %r12, %r11
1285    movq    %r10, 16(%rdi)
1286    movq    %r11, 24(%rdi)
1287    ret
1288.cfi_endproc
1289.size   ECP256_MulBy2Core, .-ECP256_MulBy2Core
1290
/**
 *  Function description: doubling in the ECP256 field: res = 2 * a mod P
 *  Function prototype: presumably void ECP256_MulBy2(Coord *r, const Coord *a); -- TODO confirm against the C header
 *  Input register:
 *        rdi: Pointer to the output buffer (four 64-bit words are stored)
 *        rsi: Address pointing to input data a (four 64-bit words are loaded)
 *  Change register: rax, rcx, rdx, rsi, r8, r9, r10, r11 (r12, r13 are used, but they are saved on entry
 *                   and restored before return)
 *  Output register: None
 *  Function/macro call: None. The same steps as ECP256_MulBy2Core are performed inline, with P[1] and P[3]
 *                       read from memory instead of r14 and r15.
 */
1291.globl  ECP256_MulBy2
1292.type   ECP256_MulBy2,@function
1293.align 32
1294ECP256_MulBy2:
1295.cfi_startproc
1296    pushq   %r12                        // save the registers restored at the end of this function
1297    pushq   %r13
1298
1299    movq    (%rsi), %r8                 // a[0..3] --> r8, r9, r10, r11
1300    movq    8(%rsi), %r9
1301    movq    16(%rsi), %r10
1302    movq    24(%rsi), %r11
1303    xorq    %r13, %r13                  // r13 will receive the carry out of 2 * a
1304
1305    leaq    .Lpoly(%rip), %rsi          // a is fully loaded, rsi now points to P
1306
1307    addq    %r8, %r8                    // 2 * a, word by word with carry
1308    adcq    %r9, %r9
1309    movq    %r8, %rax                   // keep the unreduced word 0
1310    adcq    %r10, %r10
1311    adcq    %r11, %r11
1312    movq    %r9, %rcx                   // keep the unreduced word 1
1313    adcq    $0, %r13                    // r13 = bit 256 of 2 * a
1314
1315    subq    $-1, %r8                    // word 0 - P[0], P[0] = 0xffffffffffffffff
1316    movq    %r10, %rdx                  // keep the unreduced word 2
1317    sbbq    8(%rsi), %r9                // word 1 - P[1] - borrow
1318    movq    %r11, %r12                  // keep the unreduced word 3
1319    sbbq    $0, %r10                    // word 2 - P[2] - borrow, P[2] = 0
1320    sbbq    24(%rsi), %r11              // word 3 - P[3] - borrow
1321    sbbq    $0, %r13                    // CF = 1 when 2 * a < P
1322
1323    cmovcq  %rax, %r8           // Obtain mod P result
1324    cmovcq  %rcx, %r9
1325    movq    %r8, (%rdi)
1326    cmovcq  %rdx, %r10
1327    movq    %r9, 8(%rdi)
1328    cmovcq  %r12, %r11
1329    movq    %r10, 16(%rdi)
1330    movq    %r11, 24(%rdi)
1331
1332    movq    0(%rsp), %r13               // restore in reverse push order: r13 was pushed last
1333    movq    8(%rsp), %r12
1334    leaq    16(%rsp), %rsp              // release the two 8-byte save slots
1335    ret
1336.cfi_endproc
1337.size   ECP256_MulBy2, .-ECP256_MulBy2
1338
1339/**
 *  Function description: tripling in the ECP256 field: res = 3 * a mod P
 *  Method: step 1 doubles the register value and conditionally subtracts P; step 2 adds the four words
 *          read from (%rsi) and conditionally subtracts P again. The result is 3 * a only when (%rsi)
 *          holds the same a as r8..r11, which is how ECP256_MulBy3 calls it.
 *  Input register:
 *        rdi: Pointer to the output buffer (four 64-bit words)
 *        rsi: Address of the four 64-bit words added in step 2
 *        r8, r9, r10, r11: a[0], a[1], a[2], a[3]
 *        r14 = .Lpoly[1], r15 = .Lpoly[3]
 *  Change register: rax, rcx, rdx, r8, r9, r10, r11, r12, r13
 *  Output register: r8, r9, r10, r11 (the same four words are also stored to (%rdi))
 *  NOTE(review): each step applies a single conditional subtraction, so the inputs are presumably
 *                expected to be already reduced below P -- confirm with the callers.
 */
1340.type   ECP256_MulBy3Core,@function
1341.align 32
1342ECP256_MulBy3Core:
1343.cfi_startproc
1344    xorq    %r13, %r13                  // step 1: r13 will receive the carry out of 2 * a
1345    addq    %r8, %r8
1346    adcq    %r9, %r9
1347    movq    %r8, %rax                   // keep the unreduced words in rax, rcx, rdx, r12
1348    adcq    %r10, %r10
1349    adcq    %r11, %r11
1350    movq    %r9, %rcx
1351    adcq    $0, %r13
1352    subq    $-1, %r8                    // subtract P: P[0] = 0xffffffffffffffff, P[2] = 0
1353    movq    %r10, %rdx
1354    sbbq    %r14, %r9
1355    movq    %r11, %r12
1356    sbbq    $0, %r10
1357    sbbq    %r15, %r11
1358    sbbq    $0, %r13                    // CF = 1 when the sum was smaller than P
1359
1360    cmovcq  %rax, %r8           // Obtain mod P result
1361    cmovcq  %rcx, %r9
1362    cmovcq  %rdx, %r10
1363    cmovcq  %r12, %r11
1364
1365    xorq    %r13, %r13                  // step 2: add the value at (%rsi), carry-out in r13
1366    addq    (%rsi), %r8
1367    adcq    8(%rsi), %r9
1368    movq    %r8, %rax                   // keep the unreduced words in rax, rcx, rdx, r12
1369    adcq    16(%rsi), %r10
1370    adcq    24(%rsi), %r11
1371    movq    %r9, %rcx
1372    adcq    $0, %r13
1373    subq    $-1, %r8                    // subtract P as in step 1
1374    movq    %r10, %rdx
1375    sbbq    %r14, %r9
1376    sbbq    $0, %r10
1377    movq    %r11, %r12
1378    sbbq    %r15, %r11
1379    sbbq    $0, %r13                    // CF = 1 when the sum was smaller than P
1380
1381    cmovcq  %rax, %r8           // Obtain mod P result
1382    cmovcq  %rcx, %r9
1383    movq    %r8, (%rdi)
1384    cmovcq  %rdx, %r10
1385    movq    %r9, 8(%rdi)
1386    cmovcq  %r12, %r11
1387    movq    %r10, 16(%rdi)
1388    movq    %r11, 24(%rdi)
1389    ret
1390.cfi_endproc
1391.size   ECP256_MulBy3Core, .-ECP256_MulBy3Core
1392
/**
 *  Function description: tripling in the ECP256 field: res = 3 * a mod P
 *  Function prototype: presumably void ECP256_MulBy3(Coord *r, const Coord *a); -- TODO confirm against the C header
 *  Input register:
 *        rdi: Pointer to the output buffer (four 64-bit words are stored by the core routine)
 *        rsi: Address pointing to input data a
 *  Change register: rax, rcx, rdx, r8, r9, r10, r11 (r12, r13, r14, r15 are used, but they are saved on
 *                   entry and restored before return)
 *  Output register: None
 *  Function/macro call: Call ECP256_MulBy3Core to implement the calculation.
 */
1393.globl  ECP256_MulBy3
1394.type   ECP256_MulBy3,@function
1395.align 32
1396ECP256_MulBy3:
1397.cfi_startproc
1398    pushq   %r12                        // save the registers restored at the end of this function
1399    pushq   %r13
1400    pushq   %r14
1401    pushq   %r15
1402
1403    movq    (%rsi), %r8                 // a[0..3] --> r8, r9, r10, r11
1404    movq    8(%rsi), %r9
1405    movq    16(%rsi), %r10
1406    movq    24(%rsi), %r11
1407    movq    8+.Lpoly(%rip), %r14        // P[1]
1408    movq    24+.Lpoly(%rip), %r15       // P[3]
1409    call    ECP256_MulBy3Core           // rsi still points to a, which the core adds in its second step
1410
1411    movq    (%rsp), %r15                // restore in reverse push order: r15 was pushed last
1412    movq    8(%rsp), %r14
1413    movq    16(%rsp), %r13
1414    movq    24(%rsp), %r12
1415    leaq    32(%rsp), %rsp              // release the four 8-byte save slots
1416    ret
1417.cfi_endproc
1418.size   ECP256_MulBy3, .-ECP256_MulBy3
1419
1420/**
1421 *  Function description: This function is used to calculate the Montgomery multiplication of the ord(P256) field:
1422 *  res = a * b * 2^(-256) mod ord(P)
 *  Function prototype: presumably void ECP256_OrdMul(Coord *r, const Coord *a, const Coord *b); -- TODO confirm against the C header
 *  Method: four rounds, one per word of b. Each round adds a[0..3] * b[i] to the accumulator and then
 *          removes the lowest accumulator word by adding q * n, where q = lowest word * LordK.
 *          A final conditional subtraction of n follows.
1423 *  Input register:
1424 *        rdi: Pointer to the output Coord structure
1425 *        rsi: Address pointing to input data a
1426 *        rdx: Address pointing to input data b
1427 *  Change register: rax, rcx, rdx, r8, r9, r10, r11 (rbx, rbp, r12, r13, r14, r15 are used, but they are saved on entry and restored before return)
1428 *  Function/Macro invoking: None. The multiplication and the reduction are both performed inline.
1429 */
1430.globl  ECP256_OrdMul
1431.type   ECP256_OrdMul,@function
1432.align 32
1433ECP256_OrdMul:
1434.cfi_startproc
1435    pushq   %rbx                            // save the registers restored at the end of this function
1436    pushq   %rbp
1437    pushq   %r12
1438    pushq   %r13
1439    pushq   %r14
1440    pushq   %r15
1441
1442    movq    %rdx, %rcx                      // mulq writes the high half of its product to rdx,
1443                                            // so the address of b is kept in rcx.
1444    leaq	.Lord(%rip), %r15               // r15 = address of the order n
1445    movq	.LordK(%rip), %r14              // r14 = LordK
1446    movq    (%rdx), %rax                    // b[0]
1447
1448    //a[0-3] * b[0]
1449    movq    %rax, %rbp                      // save b[0]
1450    mulq    (%rsi)                          // a[0] * b[0]
1451    movq    %rax, %r8
1452    movq    %rbp, %rax                      // b[0]
1453    movq    %rdx, %r9
1454
1455    mulq    8(%rsi)                         // a[1] * b[0]
1456    addq    %rax, %r9
1457    movq    %rbp, %rax
1458    adcq    $0, %rdx                        // a[1:0] * b[0] The result must be less than 2 ^ 192,
1459                                            // So no carry is required.
1460    movq    %rdx, %r10
1461
1462    mulq    16(%rsi)                        // a[2] * b[0]
1463    addq    %rax, %r10
1464    movq    %rbp, %rax
1465    adcq    $0, %rdx
1466    movq    %rdx, %r11
1467
1468    mulq    24(%rsi)                        // a[3] * b[0]
1469    addq    %rax, %r11
1470    adcq    $0, %rdx
1471    movq    %r8, %r13
1472    movq    %rdx, %r12                      // First round multiplication results r12 r11 r10 r9 r8
1473
1474    // First round of reduction
1475    // n[0] = 0xf3b9cac2fc632551
1476    // n[1] = 0xbce6faada7179e84
1477    // n[2] = 0xffffffffffffffff, lo(q*n[2]) = -q     , hi(q*n[2]) = q
1478    // n[3] = 0xffffffff00000000, lo(q*n[3]) = -lq<<32, hi(q*n[3]) = q-hq
1479    imulq   %r14, %r8                        // r8 = r8 * ordK = q
1480
1481    movq    %r8, %rax
1482    mulq    (%r15)                           // n[0] * q
1483    addq    %rax, %r13                       // %r13 must be 0.
1484    adcq    $0, %rdx                         // hi(n[0]*q) <= 2^64 - 2, so adding the carry cannot overflow.
1485    movq    %r8, %rax
1486    movq    %rdx, %rbp                       // %rbp = hi(n[0]*q)
1487
1488    mulq    8(%r15)                          // n[1] * q
1489    addq    %rbp, %rax                       // %rax = lo(n[1]*q)+hi(n[0]*q)
1490    adcq    $0, %rdx                         // %rdx = hi(n[1]*q),   be the same as the above hi(n[1]*q) + 1 < 2^64
1491                                             // No further carry is required.
1492
1493    movq    %r8, %rbx
1494    subq    %r8, %r10                        // r10 = r[2] - q
1495    sbbq    $0, %rbx                         // When q>0, rbx - 1 = q - 1 >= 0, when q=0 (r[2]-q) does not borrow,
1496                                             // rbx = rbx - 0 >= 0, so the following formula does not borrow
1497
1498    addq    %rax, %r9                        // r9 = r[1] + lo(n[1]*q) + hi(n[0]*q)
1499    adcq    %rdx, %r10                       // r10 = r[2] - q + hi(n[1]*q)
1500    movq    %r8, %rax
1501    adcq    $0, %rbx                         // Overflowing is not possible.
1502
1503    movq    %r8, %rdx
1504    shrq    $32, %rax                        // rax = hq
1505    shlq    $32, %rdx                        // rdx = lq<<32
1506
1507    subq    %rdx, %rbx                       // q - lq<<32
1508    sbbq    %rax, %r8                        // r8 = q - hq = hq * 2^32 + lq - hq >= lq,
1509                                             // When lq!=0, The following formula does not borrow.
1510                                             // When lq==0, the upper formula does not borrow.
1511
1512    adcq    %rbx, %r11
1513    movq    8(%rcx), %rax                    // b[1]
1514    adcq    %r8, %r12
1515    movq    %rax, %rbp                       // save b[1]
1516    adcq    $0, %r13
1517
1518    // a[0-3] * b[1]
1519    mulq    (%rsi)
1520    addq    %rax, %r9
1521    adcq    $0, %rdx
1522    movq    %rbp, %rax
1523    movq    %rdx, %rbx
1524
1525    mulq    8(%rsi)
1526    addq    %rbx, %r10
1527    adcq    $0, %rdx
1528    addq    %rax, %r10
1529    adcq    $0, %rdx
1530    movq    %rbp, %rax
1531    movq    %rdx, %rbx
1532
1533    mulq    16(%rsi)
1534    addq    %rbx, %r11
1535    adcq    $0, %rdx
1536    addq    %rax, %r11
1537    adcq    $0, %rdx
1538    movq    %rbp, %rax
1539    movq    %rdx, %rbx
1540
1541    xorq    %r8, %r8                         // r8 = 0
1542    mulq    24(%rsi)
1543    addq    %rbx, %r12
1544    adcq    $0, %rdx
1545    addq    %rax, %r12
1546    adcq    %rdx, %r13
1547    movq    %r9, %rbx
1548    adcq    $0, %r8                          // Second round of multiplication results r9,r10,r11,r12,r13,r8
1549
1550    // Second round of reduction
1551    imulq   %r14, %r9                        // r9 = r9 * ordK = q
1552    movq    %r9, %rax
1553
1554    mulq    (%r15)                           // n[0] * q
1555    addq    %rax, %rbx                       // %rbx must be 0
1556    adcq    $0, %rdx                         // hi(n[0]*q) <= 2^64 - 2, so adding the carry cannot overflow.
1557    movq    %r9, %rax
1558    movq    %rdx, %rbp                       // %rbp = hi(n[0]*q)
1559
1560    mulq    8(%r15)                          // n[1] * q
1561    addq    %rbp, %rax                       // %rax = lo(n[1]*q)+hi(n[0]*q)
1562    adcq    $0, %rdx                         // %rdx = hi(n[1]*q),  be the same as the above hi(n[1]*q) + 1 < 2^64
1563                                             // No further carry is required
1564
1565    movq    %r9, %rbx
1566    subq    %r9, %r11                        // r11 = r[2] - q
1567    sbbq    $0, %rbx                         // when q>0 rbx - 1 = q - 1 >= 0,
1568                                             // When q=0 (r[2]-q) does not borrow, rbx = rbx - 0 >= 0,
1569                                             // So the following equation does not borrow
1570
1571    addq    %rax, %r10                       // r10 = r[1] + lo(n[1]*q) + hi(n[0]*q)
1572    adcq    %rdx, %r11                       // r11 = r[2] - q + hi(n[1]*q)
1573    movq    %r9, %rax
1574    adcq    $0, %rbx                         // Overflowing is not possible.
1575
1576    movq    %r9, %rdx
1577    shrq    $32, %rax                        // rax = hq
1578    shlq    $32, %rdx                        // rdx = lq<<32
1579
1580    subq    %rdx, %rbx                       // q - lq<<32
1581    sbbq    %rax, %r9                        // r9 = q - hq = hq * 2^32 + lq - hq >= lq,
1582                                             // When lq!=0,  The following formula does not borrow.
1583                                             // When lq==0, the preceding formula does not borrow.
1584
1585    movq    16(%rcx), %rax                   // b[2]
1586    adcq    %rbx, %r12
1587    adcq    %r9, %r13
1588    adcq    $0, %r8
1589
1590    //  a[0-3] * b[2]
1591    movq    %rax, %rbp
1592    mulq    (%rsi)                           // a[0] * b[2]
1593    addq    %rax, %r10
1594    adcq    $0, %rdx
1595    movq    %rbp, %rax
1596    movq    %rdx, %rbx
1597
1598    mulq    8(%rsi)
1599    addq    %rbx, %r11
1600    adcq    $0, %rdx
1601    addq    %rax, %r11
1602    adcq    $0, %rdx
1603    movq    %rbp, %rax
1604    movq    %rdx, %rbx
1605
1606    mulq    16(%rsi)
1607    addq    %rbx, %r12
1608    adcq    $0, %rdx
1609    addq    %rax, %r12
1610    adcq    $0, %rdx
1611    movq    %rbp, %rax
1612    movq    %rdx, %rbx
1613
1614    xorq    %r9, %r9
1615    mulq    24(%rsi)
1616    addq    %rbx, %r13
1617    adcq    $0, %rdx
1618    addq    %rax, %r13
1619    adcq    %rdx, %r8
1620    movq    %r10, %rbx
1621    adcq    $0, %r9
1622
1623    // Third round reduction
1624    imulq   %r14, %r10                       // r10 = r10 * ordK = q
1625
1626    movq    %r10, %rax
1627    mulq    (%r15)                           // n[0] * q
1628    addq    %rax, %rbx                       // %rbx must be 0
1629    adcq    $0, %rdx                         // hi(n[0]*q) <= 2^64 - 2, so adding the carry cannot overflow.
1630    movq    %r10, %rax
1631    movq    %rdx, %rbp                       // %rbp = hi(n[0]*q)
1632
1633    mulq    8(%r15)                          // n[1] * q
1634    addq    %rbp, %rax                       // %rax = lo(n[1]*q)+hi(n[0]*q)
1635    adcq    $0, %rdx                         // %rdx = hi(n[1]*q),  same as above hi(n[1]*q) + 1 < 2^64 no further carry is required.
1636
1637    movq    %r10, %rbx
1638    subq    %r10, %r12                       // r12 = r[2] - q
1639    sbbq    $0, %rbx                         // if q>0, rbx - 1 = q - 1 >= 0.
1640                                             // if q=0, (r[2]-q) Won't borrow ,  rbx = rbx - 0 >= 0,
1641                                             // the following formula will not borrow.
1642
1643    addq    %rax, %r11                       // r11 = r[1] + lo(n[1]*q) + hi(n[0]*q)
1644    adcq    %rdx, %r12                       // r12 = r[2] - q + hi(n[1]*q)
1645    movq    %r10, %rax
1646    adcq    $0, %rbx                         // Overflowing is not possible.
1647
1648    movq    %r10, %rdx
1649    shrq    $32, %rax                        // rax = hq
1650    shlq    $32, %rdx                        // rdx = lq<<32
1651
1652    subq    %rdx, %rbx                       // q - lq<<32
1653    sbbq    %rax, %r10                       // r10 = q - hq = hq * 2^32 + lq - hq >= lq,
1654                                             // if lq!=0,  the following formula does not borrow,
1655                                             // if lq==0,  The above formula does not borrow.
1656
1657    movq    24(%rcx), %rax                   // b[3]
1658    adcq    %rbx, %r13
1659    adcq    %r10, %r8
1660    adcq    $0, %r9
1661
1662    // a[0-3] * b[3]
1663    movq    %rax, %rbp
1664    mulq    (%rsi)                           // a[0] * b[3]
1665    addq    %rax, %r11
1666    adcq    $0, %rdx
1667    movq    %rbp, %rax
1668    movq    %rdx, %rbx
1669
1670    mulq    8(%rsi)
1671    addq    %rbx, %r12
1672    adcq    $0, %rdx
1673    addq    %rax, %r12
1674    adcq    $0, %rdx
1675    movq    %rbp, %rax
1676    movq    %rdx, %rbx
1677
1678    mulq    16(%rsi)
1679    addq    %rbx, %r13
1680    adcq    $0, %rdx
1681    addq    %rax, %r13
1682    adcq    $0, %rdx
1683    movq    %rbp, %rax
1684    movq    %rdx, %rbx
1685
1686    xorq    %r10, %r10
1687    mulq    24(%rsi)
1688    addq    %rbx, %r8
1689    adcq    $0, %rdx
1690    addq    %rax, %r8
1691    adcq    %rdx, %r9
1692    movq    %r11, %rbx
1693    adcq    $0, %r10
1694
1695    // last round reduction.
1696    imulq   %r14, %r11                       // r11 = r11 * ordK = q
1697
1698    movq    %r11, %rax
1699    mulq    (%r15)                           // n[0] * q
1700    addq    %rax, %rbx                       // %rbx must be 0
1701    adcq    $0, %rdx                         // hi(n[0]*q) <= 2^64 - 2, so adding the carry cannot overflow.
1702    movq    %r11, %rax
1703    movq    %rdx, %rbp                       // %rbp = hi(n[0]*q)
1704
1705    mulq    8(%r15)                          // n[1] * q
1706    addq    %rbp, %rax                       // %rax = lo(n[1]*q)+hi(n[0]*q)
1707    adcq    $0, %rdx                         // %rdx = hi(n[1]*q), same as above,
1708                                             // hi(n[1]*q) + 1 < 2^64 no further carry is required.
1709
1710    movq    %r11, %rbx
1711    subq    %r11, %r13                       // r13 = r[2] - q
1712    sbbq    $0, %rbx                         // When q>0 rbx - 1 = q - 1 >= 0,
1713                                             // When q=0,(r[2]-q)No borrowing, rbx = rbx - 0 >= 0,
1714                                             // so the following formula does not borrow.
1715
1716    addq    %rax, %r12                       // r12 = r[1] + lo(n[1]*q) + hi(n[0]*q)
1717    adcq    %rdx, %r13                       // r13 = r[2] - q + hi(n[1]*q)
1718    movq    %r11, %rax
1719    adcq    $0, %rbx                         // Overflowing is not possible.
1720
1721    movq    %r11, %rdx
1722    shrq    $32, %rax                        // rax = hq
1723    shlq    $32, %rdx                        // rdx = lq<<32
1724
1725    subq    %rdx, %rbx                       // q - lq<<32
1726    sbbq    %rax, %r11                       // r11 = q - hq = hq * 2^32 + lq - hq >= lq,
1727                                             // when lq!=0, the following formula does not borrow.
1728                                             // When lq==0, the preceding formula does not borrow.
1729
1730    adcq    %rbx, %r8
1731    adcq    %r11, %r9
1732    adcq    $0, %r10
1733
1734    // mod n
1735    movq    %r12, %rbx                       // keep the unreduced result r9 r8 r13 r12 in rdx rax rbp rbx
1736    movq    %r13, %rbp
1737    movq    %r8, %rax
1738    movq    %r9, %rdx
1739
1740    subq    (%r15), %r12                     // result - n, r10 is the word above the result
1741    sbbq    8(%r15), %r13
1742    sbbq    16(%r15), %r8
1743    sbbq    24(%r15), %r9
1744    sbbq    $0, %r10                         // CF = 1 when the result was smaller than n
1745
1746    cmovcq  %rbx, %r12                       // on borrow take the unreduced result back
1747    cmovcq  %rbp, %r13
1748    cmovcq  %rax, %r8
1749    cmovcq  %rdx, %r9
1750
1751    movq    %r12, (%rdi)                     // store res[0..3]
1752    movq    %r13, 8(%rdi)
1753    movq    %r8, 16(%rdi)
1754    movq    %r9, 24(%rdi)
1755
1756    movq    (%rsp), %r15                     // restore in reverse push order: r15 was pushed last
1757    movq    8(%rsp), %r14
1758    movq    16(%rsp), %r13
1759    movq    24(%rsp), %r12
1760    movq    32(%rsp), %rbp
1761    movq    40(%rsp), %rbx
1762    leaq    48(%rsp), %rsp                   // release the six 8-byte save slots
1763    ret
1764.cfi_endproc
1765.size   ECP256_OrdMul, .-ECP256_OrdMul
1766
1767/**
1768 *  Function description: Calculate the point doubling of an elliptic curve: res = 2*a
1769 *  Function prototype: void ECP256_PointDouble(P256_Point *r, const P256_Point *a);
1770 *  Input register:
1771 *        rdi: pointer to the output P256_POINT structure
1772 *        rsi: address pointing to input data a
1773 *  Change register: rax, rbx, rcx, rdx, rbp, rsi, r8, r9, r10, r11, r12, r13, r14, r15
1774 *  Function/Macro Call: ECP256_MulBy2Core, ECP256_SqrCore_q, ECP256_AddCore, ECP256_MulCore_q, ECP256_SubCore
1775 * ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1776 * Deal process:
1777 *     delta = Z1^2
1778 *     gamma = Y1^2
1779 *     beta = X1*gamma
1780 *     alpha = 3*(X1-delta)*(X1+delta)
1781 *     X3 = alpha^2-8*beta
1782 *     Z3 = (Y1+Z1)^2-gamma-delta
1783 *     Y3 = alpha*(4*beta-X3)-8*gamma^2
1784 */
1785.globl ECP256_PointDouble
1786.type  ECP256_PointDouble,@function
1787.align 32
1788ECP256_PointDouble:
1789.cfi_startproc
1790    pushq   %rbx
1791    pushq   %rbp
1792    pushq   %r12
1793    pushq   %r13
1794    pushq   %r14
1795    pushq   %r15
1796    subq    $168, %rsp                  // Create 32 x 5 + 8 stack space.
1797.Lpoint_double_core:
1798    vmovdqu  (%rsi), %xmm0               // Save x to stack
1799    vmovdqu  16(%rsi), %xmm1
1800    vmovdqa  %xmm0, (%rsp)
1801    vmovdqa  %xmm1, 16(%rsp)
1802
1803    vmovq    %rsi, %xmm3                 // Backup a
1804    vmovq    %rdi, %xmm0                 // Backup &r->x, &r->y, &r->z
1805    leaq    32(%rdi), %r12
1806    leaq    64(%rdi), %r13
1807    vmovq    %r12, %xmm1
1808    vmovq    %r13, %xmm2
1809
1810    movq    32(%rsi), %r8               // Read a->y
1811    movq    40(%rsi), %r9
1812    movq    48(%rsi), %r10
1813    movq    56(%rsi), %r11
1814
1815    movq    8+.Lpoly(%rip), %r14        // Read P[1], P[3]
1816    movq    24+.Lpoly(%rip), %r15
1817
1818    leaq    32(%rsp), %rdi
1819    call    ECP256_MulBy2Core           // ECP256_MulBy2(S, &a->y), Not overwritten rsi
1820
1821    movq    64(%rsi), %rax
1822    movq    72(%rsi), %r14
1823    movq    80(%rsi), %rbp
1824    movq    88(%rsi), %r15
1825    leaq    64(%rsi), %rsi              // Setting Input Parameters
1826    leaq    64(%rsp), %rdi              // Z2 = rsp + 64
1827    call    ECP256_SqrCore_q            // ECP256_Sqr(Z2, &a->z)
1828
1829    leaq    (%rsp), %rdx
1830    leaq    96(%rsp), %rdi              // M = rsp + 96
1831    call    ECP256_AddCore              // ECP256_Add(M, a->x, Z2)
1832
1833    movq    32(%rsp), %rax
1834    movq    40(%rsp), %r14
1835    movq    48(%rsp), %rbp
1836    movq    56(%rsp), %r15
1837    leaq    32(%rsp), %rdi              // S = rsp + 32
1838    call    ECP256_SqrCore_q            // ECP256_Sqr(S, S)
1839
1840    vmovq    %xmm3, %rcx
1841    leaq    32(%rcx), %rsi
1842    leaq    64(%rcx), %rcx
1843    vmovq    %xmm2, %rdi
1844    call    ECP256_MulCore_q            // ECP256_Mul(r->z, a->y, a->z)
1845    call    ECP256_MulBy2Core           // ECP256_MulBy2(r->z, r->z)
1846
1847    movq    (%rsp), %r8
1848    movq    8(%rsp), %r9
1849    movq    16(%rsp), %r10
1850    movq    24(%rsp), %r11
1851    leaq    64(%rsp), %rdx
1852    call    ECP256_SubCore              // ECP256_SubCore(Z2,a->x,Z2)
1853    movq    %r8, 64(%rsp)
1854    movq    %r9, 72(%rsp)
1855    movq    %r10, 80(%rsp)
1856    movq    %r11, 88(%rsp)
1857
1858    movq    32(%rsp), %rax
1859    movq    40(%rsp), %r14
1860    movq    48(%rsp), %rbp
1861    movq    56(%rsp), %r15
1862    vmovq    %xmm1, %rdi
1863    call    ECP256_SqrCore_q            // ECP256_Sqr(r->y,S)
1864
1865    call    ECP256_DivBy2Core           // ECP256_Div(r->y,r->y)
1866
1867    leaq    96(%rsp), %rdi
1868    leaq    96(%rsp), %rsi
1869    leaq    64(%rsp), %rcx
1870    call    ECP256_MulCore_q            // ECP256_MulCore_q(M,M,Z2)
1871    call    ECP256_MulBy3Core           // ECP256_MulBy3Core(M,M)
1872
1873    leaq    (%rsp), %rcx
1874    leaq    32(%rsp), %rsi
1875    leaq    32(%rsp), %rdi
1876    call    ECP256_MulCore_q            // ECP256_MulCore_q(S, S, a->x)
1877
1878    leaq    128(%rsp), %rdi             // T = 128 + rsp
1879    call    ECP256_MulBy2Core           // ECP256_MulBy2Core(T,  S)
1880
1881    movq    96(%rsp), %rax
1882    movq    104(%rsp), %r14
1883    movq    112(%rsp), %rbp
1884    movq    120(%rsp), %r15
1885    vmovq    %xmm0, %rdi
1886    call    ECP256_SqrCore_q            // ECP256_Sqr(r->x,  M)
1887
1888    leaq    128(%rsp), %rdx
1889    call    ECP256_SubCore              // ECP256_SubCore(r->x,  r->x,  T)
1890    movq    %r8, (%rdi)
1891    movq    %r9, 8(%rdi)
1892    movq    %r10, 16(%rdi)
1893    movq    %r11, 24(%rdi)
1894
1895    xorq    %rsi, %rsi                  // ECP256_SubCore(S,  S,  r->x), output %r12, %r13, %r8, %r9
1896    movq    32(%rsp), %rax
1897    movq    40(%rsp), %rbx
1898    movq    48(%rsp), %rcx
1899    subq    %r8, %rax
1900    sbbq    %r9, %rbx
1901    movq    56(%rsp), %rdx
1902    movq    %rax, %r12
1903    sbbq    %r10, %rcx
1904    sbbq    %r11, %rdx
1905    movq    %rbx, %r13
1906    movq    %rcx, %r8
1907    sbbq    $0, %rsi
1908    addq    $-1, %rax
1909    movq    %rdx, %r9
1910    adcq    %r14, %rbx
1911    adcq    $0, %rcx
1912    adcq    %r15, %rdx
1913    testq   %rsi, %rsi
1914    cmovnzq  %rax, %r12
1915    cmovnzq  %rbx, %r13
1916    cmovnzq  %rcx, %r8
1917    cmovnzq  %rdx, %r9
1918    movq    %r12, 32(%rsp)
1919    movq    %r13, 40(%rsp)
1920    movq    %r8, 48(%rsp)
1921    movq    %r9, 56(%rsp)
1922
1923    leaq    32(%rsp), %rdi
1924    leaq    32(%rsp), %rsi
1925    leaq    96(%rsp), %rcx
1926    call    ECP256_MulCore_q            // ECP256_MulCore_q(S,  S,  M)
1927
1928    vmovq    %xmm1, %rdx
1929    vmovq    %xmm1, %rdi
1930    call    ECP256_SubCore              // ECP256_SubCore(r->y,  S,  r->y)
1931    movq    %r8, (%rdi)
1932    movq    %r9, 8(%rdi)
1933    leaq    168+48(%rsp), %rsi
1934    movq    %r10, 16(%rdi)
1935    movq    %r11, 24(%rdi)
1936
1937    movq    -48(%rsi), %r15
1938    movq    -40(%rsi), %r14
1939    movq    -32(%rsi), %r13
1940    movq    -24(%rsi), %r12
1941    movq    -16(%rsi), %rbp
1942    movq    -8(%rsi), %rbx
1943    leaq    (%rsi), %rsp
1944    ret
1945.cfi_endproc
1946.size   ECP256_PointDouble, .-ECP256_PointDouble
1947
1948/**
1949 *  Function description: Elliptic curve point addition in Jacobian coordinates: r = a + b
1950 *  Function prototype: void ECP256_PointAdd(P256_Point *r, const P256_Point *a, const P256_Point *b);
1951 *  Input register:
1952 *        rdi: Pointer to the output P256_Point structure r
1953 *        rsi: Address pointing to input data a
1954 *        rdx: Address pointing to input data b
1955 *  Change register: rax, rbx, rcx, rdx, rbp, rsi, r8, r9, r10, r11, r12, r13, r14, r15, xmm0-xmm5 (rbx, rbp and r12-r15 are saved on entry and restored on exit)
1956 *  Function/Macro Call: ECP256_PointDouble (entered by jumping to .Lpoint_double_core), ECP256_SqrCore_q, ECP256_MulCore_q, ECP256_SubCore, ECP256_MulBy2Core
1957 * ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo
1958 * Deal process (if H == 0 and r == 0 and neither input has Z == 0, control jumps to the doubling code instead):
1959 *     U1 = X1*Z2^2
1960 *     U2 = X2*Z1^2
1961 *     S1 = Y1*Z2^3
1962 *     S2 = Y2*Z1^3
1963 *     H = U2-U1
1964 *     r = S2-S1
1965 *     X3 = r^2-H^3-2*U1*H^2
1966 *     Y3 = r*(U1*H^2-X3)-S1*H^3
1967 *     Z3 = Z1*Z2*H
1968 */
1969.globl ECP256_PointAdd
1970.type  ECP256_PointAdd,@function
1971.align 32
1972ECP256_PointAdd:
1973.cfi_startproc
1974    pushq   %rbx                        // Save the callee-saved registers used below
1975    pushq   %rbp
1976    pushq   %r12
1977    pushq   %r13
1978    pushq   %r14
1979    pushq   %r15
1980    subq    $640+8, %rsp                // Reserve 32 x 20 + 8 = 648 bytes of stack space.
1981
1982    vmovdqu  (%rsi), %xmm0              // Load a (x, y, z: 96 bytes) into xmm0-xmm5
1983    vmovdqu  16(%rsi), %xmm1
1984    vmovdqu  32(%rsi), %xmm2
1985    vmovdqu  48(%rsi), %xmm3
1986    vmovdqu  64(%rsi), %xmm4
1987    vmovdqu  80(%rsi), %xmm5
1988    movq    %rsi, %rcx                  // rcx = a (pointer), backed up in xmm2 below
1989    movq    %rdx, %rsi                  // rsi = b
1990    vmovdqu  %xmm0, (%rsp)              // Save a on the stack, a_cpy: 0~96(%rsp)
1991    vmovdqu  %xmm1, 16(%rsp)
1992    vmovdqu  %xmm2, 32(%rsp)
1993    vmovdqu  %xmm3, 48(%rsp)
1994    vmovdqu  %xmm4, 64(%rsp)
1995    vmovdqu  %xmm5, 80(%rsp)
1996    vpor     %xmm4, %xmm5, %xmm5        // xmm5 = (Za[3]|Za[1], Za[2]|Za[0])
1997
1998    vmovdqu  (%rsi), %xmm0              // Start loading b (x, y) while Za is being OR-folded
1999    vpshufd  $0xb1, %xmm5, %xmm3        // xmm3 = ((lo(Za[3]|Za[1])<<32) | hi(Za[3]|Za[1]), (lo(Za[2]|Za[0])<<32) | hi(Za[2]|Za[0]))
2000    vmovdqu  16(%rsi), %xmm1
2001    vmovdqu  32(%rsi), %xmm2
2002    vpor     %xmm3, %xmm5, %xmm5        // xmm5 = ((lo(Za[3]|Za[1])|hi(Za[3]|Za[1]))##~, (lo(Za[2]|Za[0])|hi(Za[2]|Za[0]))##~)
2003    vmovdqu  48(%rsi), %xmm3
2004
2005    movq    64(%rsi), %rax              // Read b.z into rax, r14, rbp, r15 as the input of the squaring below: (b.z)^2
2006    movq    72(%rsi), %r14
2007    movq    80(%rsi), %rbp
2008    movq    88(%rsi), %r15
2009
2010    vmovdqu  %xmm0, 96(%rsp)            // Save b on the stack. b_cpy: 96–192(%rsp)
2011    vpshufd  $0x1e, %xmm5, %xmm4        // xmm4 = ((lo(Za[2]|Za[0])|hi(Za[2]|Za[0]))##~, (lo(Za[3]|Za[1])|hi(Za[3]|Za[1]))##~)
2012    vmovdqu  %xmm1, 112(%rsp)
2013    vmovdqu  64(%rsi), %xmm0
2014    vmovdqu  80(%rsi), %xmm1
2015    vmovdqu  %xmm2, 128(%rsp)
2016    vmovdqu  %xmm3, 144(%rsp)
2017    vpor     %xmm4, %xmm5, %xmm5        // xmm5 = ((lo(Za[0]|Za[1]|Za[2]|Za[3])|hi(Za[0]|Za[1]|Za[2]|Za[3]))##~##~##~)
2018    vpxor    %xmm4, %xmm4, %xmm4        // xmm4 = 0, compared against xmm5 after the squaring call
2019    vpor     %xmm0, %xmm1, %xmm1        // xmm1 = (Zb[3]|Zb[1], Zb[2]|Zb[0])
2020    vmovq    %rdi, %xmm0                // Backup rdi
2021    movq    %rax, 160(%rsp)             // Save Zb on the stack: 160~192(%rsp)
2022    movq    %r14, 168(%rsp)
2023    leaq    192(%rsp), %rdi             // Zb^2: 192~224(%rsp)
2024    movq    %rbp, 176(%rsp)
2025    movq    %r15, 184(%rsp)
2026
2027    vmovq    %rcx, %xmm2                // Backup a
2028    call    ECP256_SqrCore_q            // sqr(Zb^2, Zb)
2029
2030    vpcmpeqd %xmm4, %xmm5, %xmm5        // a_infty, Whether a is an infinity point (Za == 0): every dword is all ones when Za == 0
2031    vpshufd  $0xb1, %xmm1, %xmm4        // OR-fold the dwords of Zb the same way as was done for Za
2032    vpor     %xmm1, %xmm4, %xmm4
2033    vpshufd  $0x1e, %xmm4, %xmm3
2034    vpor     %xmm3, %xmm4, %xmm4        // Every dword of xmm4 = OR of all dwords of Zb
2035    vpxor    %xmm3, %xmm3, %xmm3
2036    vpcmpeqd %xmm3, %xmm4, %xmm4        // b_infty, Whether b is an infinity point (Zb == 0)
2037
2038    movq    64(%rsp), %rax              // Read Za from a_cpy as the input of the squaring
2039    movq    72(%rsp), %r14
2040    leaq    224(%rsp), %rdi             // Za^2: 224~256(%rsp)
2041    movq    80(%rsp), %rbp
2042    movq    88(%rsp), %r15
2043    call    ECP256_SqrCore_q            // sqr(Za^2, Za)
2044
2045    leaq    160(%rsp), %rsi             // Zb
2046    leaq    192(%rsp), %rcx             // Zb^2
2047    leaq    256(%rsp), %rdi             // S1: 256~288(%rsp)
2048    call    ECP256_MulCore_q            // mul(S1, Zb, Zb^2), S1 = Zb^3
2049
2050    leaq    64(%rsp), %rsi              // Za
2051    leaq    224(%rsp), %rcx             // Za^2
2052    leaq    288(%rsp), %rdi             // S2: 288~320(%rsp)
2053    call    ECP256_MulCore_q            // mul(S2, Za, Za^2), S2 = Za^3
2054
2055    leaq    32(%rsp), %rsi              // Ya
2056    leaq    256(%rsp), %rcx             // S1
2057    leaq    256(%rsp), %rdi             // S1
2058    call    ECP256_MulCore_q            // mul(S1,S1,Ya), S1 = Ya * Zb^3
2059
2060    leaq    128(%rsp), %rsi             // Yb
2061    leaq    288(%rsp), %rcx             // S2
2062    leaq    288(%rsp), %rdi             // S2
2063    call    ECP256_MulCore_q            // mul(S2,S2,Yb), S2 = Yb * Za^3
2064
2065    leaq    256(%rsp), %rdx             // S1
2066    call    ECP256_SubCore              // sub(R,S2,S1), the result is taken from r8-r11
2067    movq    %r8, 320(%rsp)              // R: 320~352(%rsp)
2068    movq    %r9, 328(%rsp)
2069    movq    %r10, 336(%rsp)
2070    movq    %r11, 344(%rsp)
2071
2072    orq     %r9, %r8                    // r8 = OR of all limbs of R (zero only when R == 0)
2073    vmovdqa  %xmm4, %xmm1               // xmm1 = b_infty
2074    orq     %r10, %r8
2075    orq     %r11, %r8
2076    vpor     %xmm5, %xmm1, %xmm1        // a_infty | b_infty
2077    vmovq    %r8, %xmm3                 // xmm3 = (R != 0) indicator, read back after H is computed
2078
2079    leaq    (%rsp), %rsi                // Xa
2080    leaq    192(%rsp), %rcx             // Zb^2
2081    leaq    352(%rsp), %rdi             // U1: 352~384(%rsp)
2082    call    ECP256_MulCore_q            // Mul(U1, Xa, Zb^2)
2083
2084    leaq    96(%rsp), %rsi              // Xb
2085    leaq    224(%rsp), %rcx             // Za^2
2086    leaq    384(%rsp), %rdi             // U2: 384~416(%rsp)
2087    call    ECP256_MulCore_q            // Mul(U2, Xb, Za^2)
2088
2089    leaq    352(%rsp), %rdx             // U1
2090    leaq    416(%rsp), %rdi             // H: 416~448(%rsp)
2091    call    ECP256_SubCore              // sub(H,U2,U1)
2092    movq    %r8, 416(%rsp)
2093    movq    %r9, 424(%rsp)
2094    movq    %r10, 432(%rsp)
2095    movq    %r11, 440(%rsp)
2096
2097    orq     %r9, %r8                    // r8 = OR of all limbs of H (zero only when H == 0)
2098    vmovq    %xmm1, %r12                // r12 = low 64 bits of (a_infty | b_infty)
2099    orq     %r10, %r8
2100    vmovq    %xmm3, %r13                // r13 = (R != 0) indicator
2101    orq     %r11, %r8
2102
2103    orq     %r12, %r8                   // r8 = (H != 0) | a_infty | b_infty | (R != 0)
2104    orq     %r13, %r8
2105
2106    jnz     .Lpoint_add                 // Generic addition unless H == 0, R == 0 and neither input is at infinity
2107
2108.Lequal_point:
2109    vmovq    %xmm0, %rdi                // Restore r
2110    vmovq    %xmm2, %rsi                // Restore a
2111    addq    $640-32*5, %rsp             // Release 480 bytes of the frame, 648 - 480 = 168 bytes stay allocated
2112    jmp     .Lpoint_double_core         // a == b: continue in the doubling code (label is defined outside this function)
2113
2114.align 32
2115.Lpoint_add:
2116    movq    320(%rsp), %rax             // R
2117    movq    328(%rsp), %r14
2118    leaq    448(%rsp), %rdi             // R^2: 448~480(%rsp)
2119    movq    336(%rsp), %rbp
2120    movq    344(%rsp), %r15
2121    call    ECP256_SqrCore_q            // sqr(R^2,R)
2122
2123    leaq    64(%rsp), %rsi              // Za
2124    leaq    416(%rsp), %rcx             // H
2125    leaq    480(%rsp), %rdi             // Zr:480~512(%rsp)
2126    call    ECP256_MulCore_q            // Mul(Zr,H,Za)
2127
2128    movq    416(%rsp), %rax             // H
2129    movq    424(%rsp), %r14
2130    leaq    512(%rsp), %rdi             // H^2:512~544(%rsp)
2131    movq    432(%rsp), %rbp
2132    movq    440(%rsp), %r15
2133    call    ECP256_SqrCore_q            // sqr(H^2, H)
2134
2135    leaq    480(%rsp), %rdi             // Zr
2136    leaq    480(%rsp), %rsi             // Zr
2137    leaq    160(%rsp), %rcx             // Zb
2138    call    ECP256_MulCore_q            // Mul(Zr, Zr, Zb), Zr = Za * Zb * H
2139
2140    leaq    544(%rsp), %rdi             // H3:544~576(%rsp)
2141    leaq    512(%rsp), %rsi             // H2
2142    leaq    416(%rsp), %rcx             // H
2143    call    ECP256_MulCore_q            // mul(H^3,H,H^2)
2144
2145    leaq    384(%rsp), %rdi             // U2
2146    leaq    352(%rsp), %rsi             // U1
2147    leaq    512(%rsp), %rcx             // H2
2148    call    ECP256_MulCore_q            // mul(U2,U1,H^2), the U2 slot now holds U1 * H^2
2149
2150    leaq    512(%rsp), %rdi             // H^2
2151    call    ECP256_MulBy2Core           // mulby2(H^2,U2), 2 * U1 * H^2 is consumed from r8-r11 below
2152
2153    movq    448(%rsp), %rax             // sub(Xr,R^2,H^2)
2154    movq    456(%rsp), %rbx
2155    xorq    %rsi, %rsi                  // rsi = 0, receives the final borrow
2156    movq    464(%rsp), %rcx
2157    movq    472(%rsp), %rdx
2158    subq    %r8, %rax                   // rax, rbx, rcx, rdx = R^2 - 2 * U1 * H^2
2159    sbbq    %r9, %rbx
2160    movq    %rax, %r8
2161    sbbq    %r10, %rcx
2162    sbbq    %r11, %rdx
2163    movq    %rbx, %r9
2164    sbbq    $0, %rsi                    // rsi != 0 when the subtraction borrowed
2165
2166    addq    $-1, %r8                    // r8-r11 = difference + P (P[0] = -1, P[2] = 0, r14 and r15 are expected to hold P[1] and P[3] here)
2167    movq    %rcx, %r10
2168    adcq    %r14, %r9
2169    adcq    $0, %r10
2170    movq    %rdx, %r11
2171    adcq    %r15, %r11
2172    testq   %rsi, %rsi
2173    cmovzq  %rax, %r8                   // No borrow: keep the uncorrected difference
2174    cmovzq  %rbx, %r9
2175    cmovzq  %rcx, %r10
2176    cmovzq  %rdx, %r11
2177
2178    leaq    576(%rsp), %rdi             // Xr: 576~608(%rsp)
2179    leaq    544(%rsp), %rdx             // H^3
2180    call    ECP256_SubCore              // sub(Xr, Xr, H^3)
2181    movq    %r8, 576(%rsp)
2182    movq    %r9, 584(%rsp)
2183    movq    %r10, 592(%rsp)
2184    movq    %r11, 600(%rsp)
2185
2186    movq    384(%rsp), %rax             // sub(Yr, U2, Xr), the U2 slot holds U1 * H^2 and Xr is still in r8-r11
2187    movq    392(%rsp), %rbx
2188    xorq    %rsi, %rsi                  // rsi = 0, receives the final borrow
2189    movq    400(%rsp), %rcx
2190    movq    408(%rsp), %rdx
2191    subq    %r8, %rax
2192    sbbq    %r9, %rbx
2193    movq    %rax, %r8
2194    sbbq    %r10, %rcx
2195    sbbq    %r11, %rdx
2196    movq    %rbx, %r9
2197    sbbq    $0, %rsi                    // rsi != 0 when the subtraction borrowed
2198
2199    addq    $-1, %r8                    // r8-r11 = difference + P, same correction as above
2200    movq    %rcx, %r10
2201    adcq    %r14, %r9
2202    adcq    $0, %r10
2203    movq    %rdx, %r11
2204    adcq    %r15, %r11
2205    testq   %rsi, %rsi
2206    cmovzq  %rax, %r8                   // No borrow: keep the uncorrected difference
2207    cmovzq  %rbx, %r9
2208    cmovzq  %rcx, %r10
2209    cmovzq  %rdx, %r11
2210    movq    %r8, 608(%rsp)              // Yr: 608~640(%rsp)
2211    movq    %r9, 616(%rsp)
2212    movq    %r10, 624(%rsp)
2213    movq    %r11, 632(%rsp)
2214
2215    leaq    288(%rsp), %rdi             // S2
2216    leaq    256(%rsp), %rsi             // S1
2217    leaq    544(%rsp), %rcx             // H^3
2218    call    ECP256_MulCore_q            // Mul(S2, S1, H^3)
2219
2220    leaq    608(%rsp), %rdi             // Yr
2221    leaq    608(%rsp), %rsi
2222    leaq    320(%rsp), %rcx             // r
2223    call    ECP256_MulCore_q            // Mul(Yr, Yr, R)
2224
2225    leaq    608(%rsp), %rdi             // Yr
2226    leaq    288(%rsp), %rdx             // S2
2227    call    ECP256_SubCore              // sub(Yr,Yr,S2)
2228    movq    %r8, 608(%rsp)
2229    movq    %r9, 616(%rsp)
2230    movq    %r10, 624(%rsp)
2231    movq    %r11, 632(%rsp)
2232
2233    vmovq    %xmm0, %rdi                // Restore r (output pointer)
2234
2235    vmovdqa  %xmm5, %xmm0               // a_infty
2236    vmovdqa  %xmm5, %xmm1
2237    vpandn   576(%rsp), %xmm0, %xmm0    // !a_infty & Xr
2238    vpandn   592(%rsp), %xmm1, %xmm1
2239    vmovdqa  %xmm5, %xmm2
2240    vmovdqa  %xmm5, %xmm3
2241    vpand    96(%rsp), %xmm2, %xmm2     // a_infty & Xb
2242    vpand    112(%rsp), %xmm3, %xmm3
2243    vpor     %xmm0, %xmm2, %xmm2        // a_infty ? Xb : Xr
2244    vpor     %xmm1, %xmm3, %xmm3
2245
2246    vmovdqa  %xmm4, %xmm0               // b_infty
2247    vmovdqa  %xmm4, %xmm1
2248    vpandn   %xmm2, %xmm0, %xmm0        // !b_infty & (a_infty ? Xb : Xr)
2249    vpandn   %xmm3, %xmm1, %xmm1
2250    vmovdqa  %xmm4, %xmm2
2251    vmovdqa  %xmm4, %xmm3
2252    vpand    (%rsp), %xmm2, %xmm2       // b_infty & Xa
2253    vpand    16(%rsp), %xmm3, %xmm3
2254    vpor     %xmm0, %xmm2, %xmm2        // b_infty ? Xa : (a_infty ? Xb : Xr)
2255    vpor     %xmm1, %xmm3, %xmm3
2256    vmovdqu  %xmm2, (%rdi)              // Store r->x
2257    vmovdqu  %xmm3, 16(%rdi)
2258
2259    vmovdqa  %xmm5, %xmm0               // a_infty
2260    vmovdqa  %xmm5, %xmm1
2261    vpandn   608(%rsp), %xmm0, %xmm0    // !a_infty & Yr
2262    vpandn   624(%rsp), %xmm1, %xmm1
2263    vmovdqa  %xmm5, %xmm2
2264    vmovdqa  %xmm5, %xmm3
2265    vpand    128(%rsp), %xmm2, %xmm2    // a_infty & Yb
2266    vpand    144(%rsp), %xmm3, %xmm3
2267    vpor     %xmm0, %xmm2, %xmm2        // a_infty ? Yb : Yr
2268    vpor     %xmm1, %xmm3, %xmm3
2269
2270    vmovdqa  %xmm4, %xmm0               // b_infty
2271    vmovdqa  %xmm4, %xmm1
2272    vpandn   %xmm2, %xmm0, %xmm0        // !b_infty & (a_infty ? Yb : Yr)
2273    vpandn   %xmm3, %xmm1, %xmm1
2274    vmovdqa  %xmm4, %xmm2
2275    vmovdqa  %xmm4, %xmm3
2276    vpand    32(%rsp), %xmm2, %xmm2     // b_infty & Ya
2277    vpand    48(%rsp), %xmm3, %xmm3
2278    vpor     %xmm0, %xmm2, %xmm2        // b_infty ? Ya : (a_infty ? Yb : Yr)
2279    vpor     %xmm1, %xmm3, %xmm3
2280    vmovdqu  %xmm2, 32(%rdi)            // Store r->y
2281    vmovdqu  %xmm3, 48(%rdi)
2282
2283    vmovdqa  %xmm5, %xmm0               // a_infty
2284    vmovdqa  %xmm5, %xmm1
2285    vpandn   480(%rsp), %xmm0, %xmm0    // !a_infty & Zr
2286    vpandn   496(%rsp), %xmm1, %xmm1
2287    vmovdqa  %xmm5, %xmm2
2288    vmovdqa  %xmm5, %xmm3
2289    vpand    160(%rsp), %xmm2, %xmm2    // a_infty & Zb
2290    vpand    176(%rsp), %xmm3, %xmm3
2291    vpor     %xmm0, %xmm2, %xmm2        // a_infty ? Zb : Zr
2292    vpor     %xmm1, %xmm3, %xmm3
2293
2294    vmovdqa  %xmm4, %xmm0               // b_infty
2295    vmovdqa  %xmm4, %xmm1
2296    vpandn   %xmm2, %xmm0, %xmm0        // !b_infty & (a_infty ? Zb : Zr)
2297    vpandn   %xmm3, %xmm1, %xmm1
2298    vmovdqa  %xmm4, %xmm2
2299    vmovdqa  %xmm4, %xmm3
2300    vpand    64(%rsp), %xmm2, %xmm2     // b_infty & Za
2301    vpand    80(%rsp), %xmm3, %xmm3
2302    vpor     %xmm0, %xmm2, %xmm2        // b_infty ? Za : (a_infty ? Zb : Zr)
2303    vpor     %xmm1, %xmm3, %xmm3
2304    vmovdqu  %xmm2, 64(%rdi)            // Store r->z
2305    vmovdqu  %xmm3, 80(%rdi)
2306
2307    leaq    640+56(%rsp), %rsi          // rsi = rsp + 648 bytes of locals + 48 bytes of saved registers (return address slot)
2308    movq    -48(%rsi), %r15             // Restore the callee-saved registers pushed on entry
2309    movq    -40(%rsi), %r14
2310    movq    -32(%rsi), %r13
2311    movq    -24(%rsi), %r12
2312    movq    -16(%rsi), %rbp
2313    movq    -8(%rsi), %rbx
2314    leaq    (%rsi), %rsp                // Release the whole frame, rsp points at the return address
2315    ret
2316.cfi_endproc
2317.size   ECP256_PointAdd, .-ECP256_PointAdd
2318
2319/**
2320 *  Function description: Adds a point in Jacobian (X, Y, Z) coordinates and a point in affine (X, Y) coordinates: r = a + b
2321 *  Function prototype: void ECP256_AddAffine(P256_Point *r, const P256_Point *a, const P256_AffinePoint *b);
2322 *  Input register:
2323 *        rdi: Points to the returned P256_Point.
2324 *        rsi: Points to the input P256_Point.
2325 *        rdx: P256_AffinePoint that points to the input
2326 *  Change register: rax, rbx, rcx, rdx, rsi, rdi, rbp, r8, r9, r10, r11, r12, r13, r14, r15, xmm0-xmm5 (rbx, rbp and r12-r15 are saved on entry and restored on exit)
2327 *  Output register: None
2328 *  Function/Macro Call: ECP256_SqrCore_q, ECP256_MulCore_q, ECP256_SubCore
2329 *  ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-madd-2007-bl
2330 *  Deal process (the code below works with the scaled values Z3/2 = H*Z1, X3/4 and Y3/8, as noted at each step):
2331 *     Z1Z1 = Z1^2
2332 *     U2 = X2*Z1Z1
2333 *     S2 = Y2*Z1*Z1Z1
2334 *     H = U2-X1
2335 *     HH = H^2
2336 *     I = 4*HH
2337 *     J = H*I
2338 *     r = 2*(S2-Y1)
2339 *     V = X1*I
2340 *     X3 = r^2-J-2*V
2341 *     Y3 = r*(V-X3)-2*Y1*J
2342 *     Z3 = (Z1+H)^2-Z1Z1-HH
2343 */
2344.globl  ECP256_AddAffine
2345.type   ECP256_AddAffine,@function
2346.align 32
2347ECP256_AddAffine:
2348.cfi_startproc
2349    pushq       %rbp                            // save the callee-saved registers used below
2350    pushq	    %rbx
2351    pushq	    %r12
2352    pushq	    %r13
2353    pushq	    %r14
2354    pushq	    %r15
2355
2356    subq	    $488, %rsp                      // open up stack space 32 * 15 + 8 = 488
2357
2358    vmovdqu	    0(%rsi), %xmm0                  // X1[1]X1[0] --> xmm0
2359	vmovdqu	    16(%rsi), %xmm1                 // X1[3]X1[2]
2360    vmovdqu	    32(%rsi), %xmm2                 // Y1[1]Y1[0]
2361	vmovdqu	    48(%rsi), %xmm3                 // Y1[3]Y1[2]
2362    movq        72(%rsi), %r14                  // Z1[1] 64 + 8
2363    movq        64(%rsi), %rax                  // Z1[0] 64 + 0
2364	vmovdqu	    64(%rsi), %xmm4                 // Z1[1]Z1[0]
2365    movq        88(%rsi), %r15                  // Z1[3] 64 + 24
2366    movq        80(%rsi), %rbp                  // Z1[2] 64 + 16
2367    vmovdqu	    80(%rsi), %xmm5                 // Z1[3]Z1[2]
2368
2369    vmovdqa     %xmm0, 320(%rsp)                // save X1[1]X1[0] to stack
2370    vmovdqa     %xmm1, 336(%rsp)                // save X1[3]X1[2] to stack
2371    vmovdqa     %xmm2, 352(%rsp)                // save Y1[1]Y1[0] to stack
2372    vmovdqa     %xmm3, 368(%rsp)                // save Y1[3]Y1[2] to stack
2373    vmovdqa     %xmm4, 384(%rsp)                // save Z1[1]Z1[0] to stack
2374    vmovdqa     %xmm5, 400(%rsp)                // save Z1[3]Z1[2] to stack
2375    vpor        %xmm4, %xmm5, %xmm5             // Z1[1]Z1[0] | Z1[3]Z1[2]
2376
2377    vmovdqu     (%rdx), %xmm0                   // X2[1]X2[0] --> xmm0
2378    vpshufd     $0xb1, %xmm5, %xmm3             // Order(10 11 00 01) --> [2 3 0 1] with 32bit
2379    vmovdqu     16(%rdx), %xmm1                 // X2[3]X2[2] --> xmm1
2380    vmovdqu     32(%rdx), %xmm2                 // Y2[1]Y2[0] --> xmm2
2381    vpor        %xmm3, %xmm5, %xmm5             // [2 3 0 1] | [3 2 1 0]
2382    vmovdqu     48(%rdx), %xmm3                 // Y2[3]Y2[2] --> xmm3
2383    vmovdqa     %xmm0, 416(%rsp)                // save X2[1]X2[0] to stack
2384    vpshufd     $0x1e, %xmm5, %xmm4             // Order(00 01 11 10) --> [0 1 3 2]
2385    vmovdqa     %xmm1, 432(%rsp)                // save X2[3]X2[2] to stack
2386    vpor        %xmm0, %xmm1, %xmm1             // X2[1]X2[0] | X2[3]X2[2]
2387
2388    vmovq       %rdi, %xmm0                     // save rdi to xmm0
2389    vmovdqa     %xmm2, 448(%rsp)                // save Y2[1]Y2[0] to stack
2390    vmovdqa     %xmm3, 464(%rsp)                // save Y2[3]Y2[2] to stack
2391    vpor        %xmm2, %xmm3, %xmm3             // Y2[1]Y2[0] | Y2[3]Y2[2]
2392    vpor        %xmm4, %xmm5, %xmm5             // every dword of xmm5 = OR of all dwords of Z1
2393    vpxor       %xmm4, %xmm4, %xmm4             // 0
2394    vpor        %xmm1, %xmm3, %xmm3             // X2[1]X2[0] | X2[3]X2[2] | Y2[1]Y2[0] | Y2[3]Y2[2]
2395
2396.balign 32
2397    /* Z1Z1 = Z1 ^ 2 */
2398    leaq        64(%rsi), %rsi                  // addr(z)
2399    leaq        32(%rsp), %rdi                  // save Z1Z1 to stack
2400    call        ECP256_SqrCore_q                // Input: rax, r14, rbp, r15 = Z1[0..3] Output: r8 - r11  P[1] --> r14 P[3] --> r15
2401
2402    vpcmpeqd    %xmm4, %xmm5, %xmm5             // dwords of xmm5 = all ones when Z1 == 0
2403    vpshufd     $0xb1, %xmm3, %xmm4             // Order(10 11 00 01)
2404    vpor        %xmm3, %xmm4, %xmm4
2405    vpshufd     $0, %xmm5, %xmm5                // Order(00 00 00 00), xmm5 = in1infty mask (Z1 == 0)
2406    vpshufd     $0x1e, %xmm4, %xmm3             // Order(00 01 11 10)
2407    vpor        %xmm3, %xmm4, %xmm4             // every dword of xmm4 = OR of all dwords of X2 and Y2
2408    vpxor       %xmm3, %xmm3, %xmm3
2409    vpcmpeqd    %xmm3, %xmm4, %xmm4             // dwords of xmm4 = all ones when X2 == 0 and Y2 == 0
2410    vpshufd     $0, %xmm4, %xmm4                // xmm4 = in2infty mask (b is all zero)
2411
2412    /* U2 = X2 * Z1Z1 */
2413    leaq        416(%rsp), %rcx                 // addr of X2 in stack
2414    leaq        32(%rsp), %rsi                  // read Z1Z1 from stack
2415    leaq        (%rsp), %rdi                    // save U2 to stack
2416    call        ECP256_MulCore_q                // output: r8 - r11 P[1] --> r14 P[3] --> r15
2417
2418    /* H = U2 - X1 */
2419    leaq        320(%rsp), %rdx                 // read X1 from stack
2420    leaq        64(%rsp), %rdi                  // save H to stack
2421    call        ECP256_SubCore                  // input: rdx, r8 - r11 P[1] --> r14 P[3] --> r15
2422    movq        %r8, (%rdi)                     // store H from r8-r11
2423    movq        %r9, 8(%rdi)
2424    movq        %r10, 16(%rdi)
2425    movq        %r11, 24(%rdi)
2426
2427    /* Z1Z1Z1 = Z1Z1 * Z1 */
2428    leaq        384(%rsp), %rcx                 // read Z1 from stack
2429    leaq        32(%rsp), %rsi                  // read Z1Z1 from stack
2430    leaq        32(%rsp), %rdi                  // save Z1Z1Z1 to stack
2431    call        ECP256_MulCore_q                // output: r8-r11 P[1] --> r14 P[3] --> r15
2432    movq        %r8, (%rdi)                     // store Z1Z1Z1 from r8-r11
2433    movq        %r9, 8(%rdi)
2434    movq        %r10, 16(%rdi)
2435    movq        %r11, 24(%rdi)
2436
2437    /* Z3/2 = H * Z1 */
2438    leaq        384(%rsp), %rcx                 // read Z1 from stack
2439    leaq        64(%rsp), %rsi                  // read H from stack
2440    leaq        288(%rsp), %rdi                 // save Z3/2 to stack
2441    call        ECP256_MulCore_q                // P[1] --> r14 P[3] --> r15
2442
2443    /* S2 = Y2 * Z1Z1Z1 */
2444    leaq        448(%rsp), %rcx                 // read Y2 from stack
2445    leaq        32(%rsp), %rsi                  // read Z1Z1Z1 from stack
2446    leaq        32(%rsp), %rdi                  // save S2 to stack
2447    call        ECP256_MulCore_q                // output: r8-r11 P[1] --> r14 P[3] --> r15
2448
2449    /* r/2 = (S2 - Y1) */
2450    leaq        352(%rsp), %rdx                 // read Y1 from stack
2451    leaq        96(%rsp), %rdi                  // save r/2 to stack
2452    call        ECP256_SubCore                  // output: r8-r11 P[1] --> r14 P[3] --> r15
2453    movq        %r8, (%rdi)                     // save r/2 to stack
2454    movq        %r9, 8(%rdi)
2455    movq        %r10, 16(%rdi)
2456    movq        %r11, 24(%rdi)
2457
2458    /* I/4 = H ^ 2 */
2459    leaq        64(%rsp), %rsi                  // read H from stack
2460    movq        (%rsi), %rax                    // a[0]
2461    movq        8(%rsi), %r14                   // a[1]
2462    movq        16(%rsi), %rbp                  // a[2]
2463    movq        24(%rsi), %r15                  // a[3]
2464    leaq        128(%rsp), %rdi                 // save I/4 to stack
2465    call        ECP256_SqrCore_q                // output: r8-r11 P[1] --> r14 P[3] --> r15
2466
2467    /* (r/2)^2 = (r^2)/4 */
2468    leaq        96(%rsp), %rsi                  // read r/2 from stack
2469    movq        (%rsi), %rax                    // a[0]
2470    movq        8(%rsi), %r14                   // a[1]
2471    movq        16(%rsi), %rbp                  // a[2]
2472    movq        24(%rsi), %r15                  // a[3]
2473    leaq        192(%rsp), %rdi                 // save (r^2)/4 to stack
2474    call        ECP256_SqrCore_q                // output: r8-r11 P[1] --> r14 P[3] --> r15
2475
2476    /* J/4 = I/4 * H */
2477    leaq        128(%rsp), %rcx                 // read I/4 from stack
2478    leaq        64(%rsp), %rsi                  // read H from stack
2479    leaq        160(%rsp), %rdi                 // save J/4 to stack
2480    call        ECP256_MulCore_q                // output: r8-r11 P[1] --> r14 P[3] --> r15
2481
2482    /* V/4 = X1 * I/4, followed by the inline doubling V/2 = 2 * (V/4) */
2483    leaq        320(%rsp), %rcx                 // read X1 from stack
2484    leaq        128(%rsp), %rsi                 // read I/4 from stack
2485    leaq        (%rsp), %rdi                    // save V/4 to stack
2486    call        ECP256_MulCore_q                // output: r8-r11 P[1] --> r14 P[3] --> r15
2487
2488    xorq        %r12, %r12                      // r12 = 0, receives the carry out of the doubling
2489    addq        %r8, %r8                        // r8-r11 = 2 * (V/4)
2490    adcq	    %r9, %r9
2491	movq	    %r8, %rax                       // rax, rbp, rcx, r13 keep the unreduced sum
2492    adcq	    %r10, %r10
2493	adcq	    %r11, %r11
2494	movq	    %r9, %rbp
2495    adcq	    $0, %r12
2496
2497	subq	    $-1, %r8                        // r8-r11 = sum - P (P[0] = -1, P[1] = r14, P[2] = 0, P[3] = r15)
2498	movq	    %r10, %rcx
2499    sbbq	    %r14, %r9
2500	sbbq	    $0, %r10
2501	movq	    %r11, %r13
2502    sbbq	    %r15, %r11
2503	sbbq	    $0, %r12                        // carry flag set when sum < P
2504	leaq        192(%rsp), %rsi                 // read (r^2)/4 from stack (leaq leaves the flags untouched)
2505    cmovcq	    %rax,%r8                        // b[0]  V/2 --> r8-r11, the unreduced sum is kept when the subtraction borrowed
2506    cmovcq	    %rbp,%r9                        // b[1]
2507    cmovcq	    %rcx,%r10                       // b[2]
2508    cmovcq	    %r13,%r11                       // b[3]
2509
2510    /* (r^2 - 2 * V)/4 = (r^2)/4 - V/2 */
2511    xorq        %r13, %r13                      // r13 = 0, receives the final borrow
2512    movq        (%rsi), %rax                    // a[0]
2513    movq        8(%rsi), %rcx                   // a[1]
2514    movq        16(%rsi), %rdx                  // a[2]
2515    movq        24(%rsi), %r12                  // a[3]
2516
2517    subq        %r8, %rax                       // rax, rcx, rdx, r12 = a - b
2518    sbbq        %r9, %rcx
2519    movq        %rax, %r8
2520    sbbq        %r10, %rdx
2521    sbbq        %r11, %r12
2522    movq        %rcx, %r9
2523    sbbq        $0, %r13                        // r13 != 0 when the subtraction borrowed
2524
2525    addq        $-1, %r8                        // a - b + P
2526    movq        %rdx, %r10
2527    adcq        %r14, %r9
2528    adcq        $0, %r10
2529    movq        %r12, %r11
2530    adcq        %r15, %r11
2531    testq       %r13, %r13
2532
2533    cmovzq      %rax, %r8                       // no borrow: keep a - b without the added P
2534    cmovzq      %rcx, %r9
2535    cmovzq      %rdx, %r10
2536    cmovzq      %r12, %r11                      // output: r8-r11 P[1] --> r14 P[3] --> r15
2537
2538    /* X3/4 = (r^2 - 2 * V - J)/4 = (r^2 - 2 * V)/4 - J/4 */
2539    leaq        160(%rsp), %rdx                 // read J/4 from stack
2540    leaq        224(%rsp), %rdi                 // save (r^2 - 2 * V - J)/4 to stack
2541    call        ECP256_SubCore                  // output: r8-r11 P[1] --> r14 P[3] --> r15
2542    movq        %r8, (%rdi)                     // b[0]
2543    movq        %r9, 8(%rdi)                    // b[1]
2544    movq        %r10, 16(%rdi)                  // b[2]
2545    movq        %r11, 24(%rdi)                  // b[3]
2546
2547    /* (V - X3)/4 = V/4 - X3/4 */
2548    leaq        (%rsp), %rsi                    // read V/4 from stack
2549    xorq        %r13, %r13                      // r13 = 0, receives the final borrow
2550    movq        (%rsi), %rax                    // a[0]
2551    movq        8(%rsi), %rcx                   // a[1]
2552    movq        16(%rsi), %rdx                  // a[2]
2553    movq        24(%rsi), %r12                  // a[3]
2554
2555    subq        %r8, %rax                       // rax, rcx, rdx, r12 = V/4 - X3/4 (X3/4 is still in r8-r11)
2556    sbbq        %r9, %rcx
2557    movq        %rax, %r8
2558    sbbq        %r10, %rdx
2559    sbbq        %r11, %r12
2560    movq        %rcx, %r9
2561    sbbq        $0, %r13                        // r13 != 0 when the subtraction borrowed
2562    movq        %rdx, %r10
2563    movq        %r12, %r11
2564
2565    addq        $-1, %r8                        // a - b + P
2566    adcq        %r14, %r9
2567    adcq        $0, %r10
2568    adcq        %r15, %r11
2569    testq       %r13, %r13
2570
2571    cmovzq      %rax, %r8                       // no borrow: keep a - b without the added P
2572    cmovzq      %rcx, %r9
2573    cmovzq      %rdx, %r10
2574    cmovzq      %r12, %r11
2575
2576    leaq        64(%rsp), %rdi                  // save (V - X3)/4 to stack (reuses the slot that held H)
2577    movq        %r8, (%rdi)
2578    movq        %r9, 8(%rdi)
2579    movq        %r10, 16(%rdi)
2580    movq        %r11, 24(%rdi)                  // output: r8-r11 P[1] --> r14 P[3] --> r15
2581
2582    /* (J * Y1)/4 = Y1 * J/4  */
2583    leaq        352(%rsp), %rcx                 // read Y1 from stack
2584    leaq        160(%rsp), %rsi                 // read J/4 from stack
2585    leaq        32(%rsp), %rdi                  // save (J * Y1)/4 to stack
2586    call        ECP256_MulCore_q                // output: r8-r11 P[1] --> r14 P[3] --> r15
2587
2588    /* (r * (V - X3)/8) = (V - X3)/4 * r/2 */
2589    leaq        96(%rsp), %rcx                  // read r/2 from stack
2590    leaq        64(%rsp), %rsi                  // read (V - X3)/4 from stack
2591    leaq        64(%rsp), %rdi                  // save (r * (V - X3)/8) to stack
2592    call        ECP256_MulCore_q                // output: r8-r11 P[1] --> r14 P[3] --> r15
2593
2594    /* Y3/8 =  (r * (V - X3)/8) - (J * Y1)/4 */
2595    leaq        32(%rsp), %rdx                  // read (J * Y1)/4 from stack
2596    leaq        256(%rsp), %rdi                 // save Y3/8
2597    call        ECP256_SubCore                  // output: r8-r11
2598    movq        %r8, (%rdi)
2599    movq        %r9, 8(%rdi)
2600    movq        %r10, 16(%rdi)
2601    movq        %r11, 24(%rdi)
2602
2603    vmovq       %xmm0, %rdi                     // restore r (output pointer)
2604
2605    vmovdqa     %xmm5, %xmm0                    // in1infty mask (Z1 == 0)
2606    vmovdqa     %xmm5, %xmm1
2607    vpandn      288(%rsp), %xmm0, %xmm0         // !in1infty & (H * Z1)
2608    vmovdqa     %xmm5, %xmm2
2609    vpandn      304(%rsp), %xmm1, %xmm1
2610    vmovdqa     %xmm5, %xmm3
2611    vpand       .Lone_mont(%rip), %xmm2, %xmm2  // in1infty & one_mont (R mod P)
2612    vpand       .Lone_mont+16(%rip), %xmm3, %xmm3
2613    vpor        %xmm0, %xmm2, %xmm2             // in1infty ? one_mont : H * Z1
2614    vpor        %xmm1, %xmm3, %xmm3
2615
2616    vmovdqa     %xmm4, %xmm0                    // in2infty mask (X2 == 0 and Y2 == 0)
2617    vmovdqa     %xmm4, %xmm1
2618    vpandn      %xmm2, %xmm0, %xmm0             // !in2infty & (in1infty ? one_mont : H * Z1)
2619    vmovdqa     %xmm4, %xmm2
2620    vpandn      %xmm3, %xmm1, %xmm1
2621    vmovdqa     %xmm4, %xmm3
2622    vpand       384(%rsp), %xmm2, %xmm2         // in2infty & Z1
2623    vpand       400(%rsp), %xmm3, %xmm3
2624    vpor        %xmm0, %xmm2, %xmm2             // in2infty ? Z1 : (in1infty ? one_mont : H * Z1)
2625    vpor        %xmm1, %xmm3, %xmm3
2626    vmovdqu     %xmm2, 64(%rdi)                 // store r->z
2627    vmovdqu     %xmm3, 80(%rdi)
2628
2629    vmovdqa	    %xmm5, %xmm0                    // in1infty mask
2630	vmovdqa	    %xmm5, %xmm1
2631	vpandn	    224(%rsp), %xmm0, %xmm0         // !in1infty & X3/4
2632	vmovdqa	    %xmm5, %xmm2
2633	vpandn	    224+16(%rsp), %xmm1, %xmm1
2634	vmovdqa	    %xmm5, %xmm3
2635	vpand	    416(%rsp), %xmm2, %xmm2         // in1infty & X2
2636	vpand	    416+16(%rsp), %xmm3, %xmm3
2637	vpor	    %xmm0, %xmm2, %xmm2             // in1infty ? X2 : X3/4
2638	vpor	    %xmm1, %xmm3, %xmm3
2639
2640	vmovdqa	    %xmm4, %xmm0                    // in2infty mask
2641	vmovdqa	    %xmm4, %xmm1
2642	vpandn	    %xmm2, %xmm0, %xmm0             // !in2infty & (in1infty ? X2 : X3/4)
2643	vmovdqa	    %xmm4, %xmm2
2644	vpandn	    %xmm3, %xmm1, %xmm1
2645	vmovdqa	    %xmm4, %xmm3
2646	vpand	    320(%rsp), %xmm2, %xmm2         // in2infty & X1
2647	vpand	    336(%rsp), %xmm3, %xmm3
2648	vpor	    %xmm0, %xmm2, %xmm2             // in2infty ? X1 : (in1infty ? X2 : X3/4)
2649	vpor	    %xmm1, %xmm3, %xmm3
2650	vmovdqu	    %xmm2, 0(%rdi)                  // store r->x
2651	vmovdqu	    %xmm3, 16(%rdi)
2652
2653	vmovdqa	    %xmm5, %xmm0                    // in1infty mask
2654	vmovdqa	    %xmm5, %xmm1
2655	vpandn	    256(%rsp), %xmm0, %xmm0         // !in1infty & Y3/8
2656	vmovdqa	    %xmm5, %xmm2
2657	vpandn	    272(%rsp), %xmm1, %xmm1
2658	vmovdqa	    %xmm5, %xmm3
2659	vpand	    448(%rsp), %xmm2, %xmm2         // in1infty & Y2
2660	vpand	    464(%rsp), %xmm3, %xmm3
2661	vpor	    %xmm0, %xmm2, %xmm2             // in1infty ? Y2 : Y3/8
2662	vpor	    %xmm1, %xmm3, %xmm3
2663
2664	vmovdqa	    %xmm4, %xmm0                    // in2infty mask
2665	vmovdqa	    %xmm4, %xmm1
2666	vpandn	    %xmm2, %xmm0, %xmm0             // !in2infty & (in1infty ? Y2 : Y3/8)
2667	vmovdqa	    %xmm4, %xmm2
2668	vpandn	    %xmm3, %xmm1, %xmm1
2669	vmovdqa	    %xmm4, %xmm3
2670	vpand	    352(%rsp), %xmm2, %xmm2         // in2infty & Y1
2671	vpand	    368(%rsp), %xmm3, %xmm3
2672	vpor	    %xmm0, %xmm2, %xmm2             // in2infty ? Y1 : (in1infty ? Y2 : Y3/8)
2673	vpor	    %xmm1, %xmm3, %xmm3
2674	vmovdqu	    %xmm2, 32(%rdi)                 // store r->y
2675	vmovdqu	    %xmm3, 48(%rdi)
2676
2677    addq        $488, %rsp                      // release the local stack space
2678    popq        %r15                            // restore the callee-saved registers in reverse push order
2679    popq        %r14
2680    popq        %r13
2681    popq        %r12
2682    popq        %rbx
2683    popq        %rbp
2684    ret
2685.cfi_endproc
2686.size   ECP256_AddAffine, .-ECP256_AddAffine
2687
/**
 *  Function description: Stores one P256_Point into the scattered (limb-interleaved) w5 pre-computation table.
 *        The point is handled as 12 64-bit limbs: x[0..3], y[0..3], z[0..3].
 *        Limb k (k = 0..11) is written to table + k * 128 + (index - 1) * 8, i.e. every 128-byte row
 *        holds the same limb of all table entries and "index - 1" selects the 8-byte column.
 *  Function prototype: void ECP256_Scatterw5(P256_Point *table, const P256_Point *point, uint32_t index);
 *  Input register:
 *        rdi: Points to the base address of the pre-computation table.
 *        rsi: Points to P256_Point.
 *        edx: Index value. The value ranges from 1 to 16. Only the low 32 bits are used.
 *  Change register: rdx, rsi, rdi, r8, r9, r10, r11
 *  Output register: None
 *  Function/Macro Call: None
 */
.globl  ECP256_Scatterw5
.type   ECP256_Scatterw5,@function
.align 32
ECP256_Scatterw5:
.cfi_startproc
    // index is a uint32_t: the calling convention does not define bits 63..32 of rdx,
    // so zero-extend it before it takes part in 64-bit address arithmetic.
    movl    %edx, %edx
    subq    $1, %rdx                // index - 1
    leaq    (%rdi, %rdx, 8), %rdi   // column = table + (index - 1) * 8

    /* x coordinate --> rows 0..3 */
    movq    (%rsi), %r8             // x[0]
    movq    8(%rsi), %r9            // x[1]
    movq    16(%rsi), %r10          // x[2]
    movq    24(%rsi), %r11          // x[3]

    movq    %r8, (%rdi)             // x[0] --> column + 0 * 128
    movq    %r9, 128(%rdi)          // x[1] --> column + 1 * 128
    movq    %r10, 256(%rdi)         // x[2] --> column + 2 * 128
    movq    %r11, 384(%rdi)         // x[3] --> column + 3 * 128
    leaq    512(%rdi), %rdi         // advance by four rows (4 * 128)

    leaq    32(%rsi), %rsi          // rsi = addr(y)

    /* y coordinate --> rows 4..7 */
    movq    (%rsi), %r8             // y[0]
    movq    8(%rsi), %r9            // y[1]
    movq    16(%rsi), %r10          // y[2]
    movq    24(%rsi), %r11          // y[3]

    movq    %r8, (%rdi)             // y[0] --> column + 4 * 128
    movq    %r9, 128(%rdi)          // y[1] --> column + 5 * 128
    movq    %r10, 256(%rdi)         // y[2] --> column + 6 * 128
    movq    %r11, 384(%rdi)         // y[3] --> column + 7 * 128
    leaq    512(%rdi), %rdi         // advance by four rows (4 * 128)

    leaq    32(%rsi), %rsi          // rsi = addr(z)

    /* z coordinate --> rows 8..11 */
    movq    (%rsi), %r8             // z[0]
    movq    8(%rsi), %r9            // z[1]
    movq    16(%rsi), %r10          // z[2]
    movq    24(%rsi), %r11          // z[3]

    movq    %r8, (%rdi)             // z[0] --> column + 8 * 128
    movq    %r9, 128(%rdi)          // z[1] --> column + 9 * 128
    movq    %r10, 256(%rdi)         // z[2] --> column + 10 * 128
    movq    %r11, 384(%rdi)         // z[3] --> column + 11 * 128

    ret
.cfi_endproc
.size   ECP256_Scatterw5, .-ECP256_Scatterw5
2745
/**
 *  Function description: Reads one P256_Point back from the scattered (limb-interleaved) w5 pre-computation table.
 *        Limb k (k = 0..11, i.e. x[0..3], y[0..3], z[0..3]) is loaded from table + k * 128 + (index - 1) * 8.
 *        When index is 0, column 0 is loaded but every limb is masked to zero, so the output point is all zero.
 *        The index == 0 case is handled with a mask (cmov/and), not with a branch.
 *  Function prototype: void ECP256_Gatherw5(P256_Point *point, const P256_Point *table, uint32_t index);
 *  Input register:
 *        rdi: points to P256_Point.
 *        rsi: points to the base address of the pre-computation table.
 *        edx: index value. Only the low 32 bits are used.
 *  Change register: rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11
 *  Output register: None
 *  Function/Macro Call: None
 */
.globl  ECP256_Gatherw5
.type   ECP256_Gatherw5,@function
.align 32
ECP256_Gatherw5:
.cfi_startproc
    // index is a uint32_t: the calling convention does not define bits 63..32 of rdx,
    // so zero-extend it before the 64-bit zero test and the address arithmetic.
    movl    %edx, %edx
    xorl    %ecx, %ecx              // rcx = 0
    movq    $-1, %rax
    testq   %rdx, %rdx
    cmovzq  %rcx, %rax              // rax = (index == 0) ? 0 : -1, used as the output mask
    addq    %rax, %rdx              // rdx = (index == 0) ? 0 : (index - 1)

    leaq    (%rsi, %rdx, 8), %rsi   // column = table + rdx * 8

    /* x coordinate <-- rows 0..3 */
    movq    (%rsi), %r8             // x[0]
    movq    128(%rsi), %r9          // x[1]
    movq    256(%rsi), %r10         // x[2]
    movq    384(%rsi), %r11         // x[3]
    leaq    512(%rsi), %rsi         // advance by four rows (4 * 128)

    andq    %rax, %r8               // clear the limbs when index == 0
    andq    %rax, %r9
    andq    %rax, %r10
    andq    %rax, %r11

    movq    %r8, (%rdi)             // store point->x
    movq    %r9, 8(%rdi)
    movq    %r10, 16(%rdi)
    movq    %r11, 24(%rdi)

    leaq    32(%rdi), %rdi          // rdi = addr(point->y)

    /* y coordinate <-- rows 4..7 */
    movq    (%rsi), %r8             // y[0]
    movq    128(%rsi), %r9          // y[1]
    movq    256(%rsi), %r10         // y[2]
    movq    384(%rsi), %r11         // y[3]
    leaq    512(%rsi), %rsi         // advance by four rows (4 * 128)

    andq    %rax, %r8               // clear the limbs when index == 0
    andq    %rax, %r9
    andq    %rax, %r10
    andq    %rax, %r11

    movq    %r8, (%rdi)             // store point->y
    movq    %r9, 8(%rdi)
    movq    %r10, 16(%rdi)
    movq    %r11, 24(%rdi)

    leaq    32(%rdi), %rdi          // rdi = addr(point->z)

    /* z coordinate <-- rows 8..11 */
    movq    (%rsi), %r8             // z[0]
    movq    128(%rsi), %r9          // z[1]
    movq    256(%rsi), %r10         // z[2]
    movq    384(%rsi), %r11         // z[3]

    andq    %rax, %r8               // clear the limbs when index == 0
    andq    %rax, %r9
    andq    %rax, %r10
    andq    %rax, %r11

    movq    %r8, (%rdi)             // store point->z
    movq    %r9, 8(%rdi)
    movq    %r10, 16(%rdi)
    movq    %r11, 24(%rdi)

    ret
.cfi_endproc
.size   ECP256_Gatherw5, .-ECP256_Gatherw5
2824
/**
 *  Function description: Discretely obtains affine points in the precomputation table.
 *        The table is byte-interleaved: byte j (j = 0..63) of the selected point is read from
 *        table + j * 64 + (64 - index). The 64 output bytes are assembled into 8 little-endian
 *        64-bit words and written to point.
 *        When index is 0, the bytes at table + j * 64 + 63 are read but every word is masked to zero,
 *        so the output is all zero. The index == 0 case is handled with a mask (cmov/and), not with a branch.
 *  Function prototype: void ECP256_Gatherw7(P256_AffinePoint *point, const P256_AffinePoint *table, uint32_t index);
 *  Input register:
 *        rdi: points to the returned P256_AffinePoint.
 *        rsi: points to the base address of the pre-computation table.
 *        edx: index value. Only the low 32 bits are used.
 *             (The 64-byte row stride implies a valid range of 0 to 64; the caller must guarantee it.)
 *  Change register: rax, rcx, rdx, rsi, rdi, r8, r9, r10
 *  Output register: None
 *  Function/Macro Call: None
 */
.globl  ECP256_Gatherw7
.type   ECP256_Gatherw7,@function
.align 32
ECP256_Gatherw7:
.cfi_startproc
    // index is a uint32_t: the calling convention does not define bits 63..32 of rdx,
    // so zero-extend it before the 64-bit zero test and the address arithmetic.
    movl    %edx, %edx
    xorl    %ecx, %ecx          // rcx = 0
    movq    $-1, %rax
    testq   %rdx, %rdx
    cmovzq  %rcx, %rax          // rax = (index == 0) ? 0 : -1, used as the output mask
    addq    %rax, %rdx          // rdx = (index == 0) ? 0 : (index - 1)
    subq    $63, %rdx           // rdx = (index == 0) ? -63 : (index - 64)
    subq    %rdx, %rsi          // rsi = (index == 0) ? (table + 63) : (table + 64 - index)
    movq    $8, %r10            // Loop value: 8 output words of 8 bytes each

.Lgather_w7_loop:
    xorq    %r8, %r8            // Empty reg for the low 4 bytes of the word
    xorq    %r9, %r9            // Empty reg for the high 4 bytes of the word
    movb    192(%rsi), %r8b     // r8 = [0 0 0 byte(3)]
    movb    448(%rsi), %r9b     // r9 = [0 0 0 byte(7)]
    shlq    $8, %r8             // r8 = [0 0 byte(3) 0]
    shlq    $8, %r9             // r9 = [0 0 byte(7) 0]
    movb    128(%rsi), %r8b     // r8 = [0 0 byte(3) byte(2)]
    movb    384(%rsi), %r9b     // r9 = [0 0 byte(7) byte(6)]
    shlq    $8, %r8             // r8 = [0 byte(3) byte(2) 0]
    shlq    $8, %r9             // r9 = [0 byte(7) byte(6) 0]
    movb    64(%rsi), %r8b      // r8 = [0 byte(3) byte(2) byte(1)]
    movb    320(%rsi), %r9b     // r9 = [0 byte(7) byte(6) byte(5)]
    shlq    $8, %r8             // r8 = [byte(3) byte(2) byte(1) 0]
    shlq    $8, %r9             // r9 = [byte(7) byte(6) byte(5) 0]
    movb    (%rsi), %r8b        // r8 = [byte(3) byte(2) byte(1) byte(0)]
    movb    256(%rsi), %r9b     // r9 = [byte(7) byte(6) byte(5) byte(4)]
    leaq    512(%rsi), %rsi     // base += 64 * 8, skip the 8 rows just consumed
    shlq    $32, %r9            // r9 = [byte(7) byte(6) byte(5) byte(4) 0 0 0 0]
    orq     %r9, %r8            // r8 = [byte(7) byte(6) byte(5) byte(4) byte(3) byte(2) byte(1) byte(0)]

    andq    %rax, %r8           // clear the word when index == 0
    movq    %r8, (%rdi)         // store the assembled word
    leaq    8(%rdi), %rdi       // next output word

    subq    $1, %r10
    jnz     .Lgather_w7_loop

    ret
.cfi_endproc
.size   ECP256_Gatherw7, .-ECP256_Gatherw7
2881
2882#endif
2883