/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#if defined(HITLS_CRYPTO_CURVE_NISTP256) && defined(HITLS_CRYPTO_NIST_USE_ACCEL)

#include "crypt_arm.h"
#include "ecp256_pre_comp_table.s"

.arch   armv8-a+crypto
.file   "ecp256_armv8.S"

.section .rodata
.align  4
.Lpoly:         // P
.quad   0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.Lone_mont:     // R mod P, R = 2^256, so R mod P = 2^256 - P
.quad   0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lord:          // Order, n
.quad   0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK:         // ordK = -n^(-1) mod 2^64, i.e. (2^64 - ord[0]) * ordK == 1 (mod 2^64).
                // Hence the low 64 bits of LordK * Lord are all Fs: by construction, not coincidence.
.quad   0xccd1c8aaee00bc4f
.Lone:
.quad   1, 0, 0, 0
/*
Initial z is not set to 1
0000000300000000, 00000001FFFFFFFE, FFFFFFFD00000002, FFFFFFFE00000003
*/
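
/*
 * Reference sketch (comment only, not assembled): sanity checks for the
 * constants above in portable C, assuming a compiler with unsigned __int128.
 * Since R = 2^256, one_mont = R mod P is simply the two's-complement negation
 * of P over 256 bits, and .LordK is the Montgomery constant -n^(-1) mod 2^64.
 * Later reference sketches in this file reuse u128 and P256 from here.
 *
 *   #include <assert.h>
 *   #include <stdint.h>
 *
 *   typedef unsigned __int128 u128;
 *
 *   static const uint64_t P256[4] = {          // little-endian limbs of .Lpoly
 *       0xffffffffffffffffULL, 0x00000000ffffffffULL,
 *       0x0000000000000000ULL, 0xffffffff00000001ULL
 *   };
 *
 *   static void check_consts(void)
 *   {
 *       static const uint64_t oneMont[4] = {   // .Lone_mont
 *           0x0000000000000001ULL, 0xffffffff00000000ULL,
 *           0xffffffffffffffffULL, 0x00000000fffffffeULL
 *       };
 *       uint64_t carry = 1;                    // 2^256 - P == (~P) + 1
 *       for (int i = 0; i < 4; i++) {
 *           uint64_t v = ~P256[i] + carry;
 *           carry = (v < carry);
 *           assert(v == oneMont[i]);
 *       }
 *       // .LordK * ord[0] == -1 (mod 2^64): each OrdMul reduction round
 *       // therefore clears the lowest accumulator limb.
 *       assert(0xccd1c8aaee00bc4fULL * 0xf3b9cac2fc632551ULL
 *              == 0xffffffffffffffffULL);
 *   }
 */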

.text
.globl  ECP256_GetPreCompTable
.type   ECP256_GetPreCompTable,%function
.align  4
ECP256_GetPreCompTable:
AARCH64_PACIASP
    adrp    x0, g_preCompTable
    add     x0, x0, :lo12:g_preCompTable
AARCH64_AUTIASP
    ret
.size   ECP256_GetPreCompTable,.-ECP256_GetPreCompTable

.globl  ECP256_FromMont
.type   ECP256_FromMont,%function
.align  4
ECP256_FromMont:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-32]!
    add     x29, sp , #0
    stp     x19, x20, [sp, #16]

    ldp     x4 , x5 , [x1]
    ldp     x6 , x7 , [x1, #16]
    adrp    x14, .Lpoly+8
    add     x14, x14, :lo12:.Lpoly+8
    ldr     x12, [x14]              // p[1]
    ldr     x13, [x14, #16]         // p[3]
    adrp    x2, .Lone               // &bp[0]
    add     x2, x2, :lo12:.Lone

    bl      ECP256_MulCore

    ldp     x19, x20, [sp , #16]
    ldp     x29, x30, [sp], #32
AARCH64_AUTIASP
    ret
.size   ECP256_FromMont,.-ECP256_FromMont

.type   ECP256_AddCore,%function
.align  4
ECP256_AddCore:
AARCH64_PACIASP
    adds    x14, x14, x8            // ret = a+b
    adcs    x15, x15, x9
    adcs    x16, x16, x10
    adcs    x17, x17, x11
    adc     x1 , xzr, xzr

    // r -= p, p = 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
    adds    x8 , x14, #1            // x8  = x14 - 0xffffffffffffffff = x14 + 1
    sbcs    x9 , x15, x12           // x9  = x15 - 0x00000000ffffffff
    sbcs    x10, x16, xzr           // x10 = x16 - 0
    sbcs    x11, x17, x13           // x11 = x17 - 0xffffffff00000001
    sbcs    xzr, x1 , xzr           // check whether r-p borrows, i.e. whether r < p

    // r = r < p ? r : r-p
    // x8~x11 holds r-p, x14~x17 holds r
    csel    x14, x14, x8 , lo
    csel    x15, x15, x9 , lo
    csel    x16, x16, x10, lo
    csel    x17, x17, x11, lo

    stp     x14, x15, [x0]
    stp     x16, x17, [x0, #16]
AARCH64_AUTIASP
    ret
.size   ECP256_AddCore,.-ECP256_AddCore
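
/*
 * Reference sketch (u128/P256 as in the constants sketch above, not assembled):
 * full 257-bit add, trial subtraction of P, then select. The ?: stands in for
 * csel and would need masking to be constant time as plain C.
 *
 *   static void p256_add(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
 *   {
 *       uint64_t t[4], d[4], carry, borrow = 0;
 *       u128 c = 0;
 *       for (int i = 0; i < 4; i++) {          // t = a + b, carry out in c
 *           c += (u128)a[i] + b[i];
 *           t[i] = (uint64_t)c;
 *           c >>= 64;
 *       }
 *       carry = (uint64_t)c;
 *       for (int i = 0; i < 4; i++) {          // d = t - P, track the borrow
 *           u128 s = (u128)t[i] - P256[i] - borrow;
 *           d[i] = (uint64_t)s;
 *           borrow = (uint64_t)(s >> 64) & 1;
 *       }
 *       borrow = (carry < borrow);             // borrow out of the 257-bit value
 *       for (int i = 0; i < 4; i++)
 *           r[i] = borrow ? t[i] : d[i];       // r = (a+b < P) ? a+b : a+b-P
 *   }
 */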


///////////////////////////////////////
// x14~x17 = a[0~3], x8~x11 = b[0~3] //
///////////////////////////////////////
.type   ECP256_SubCore1,%function
.align  4
ECP256_SubCore1:
AARCH64_PACIASP
    subs    x14, x14, x8
    sbcs    x15, x15, x9
    sbcs    x16, x16, x10
    sbcs    x17, x17, x11
    sbc     x1 , xzr, xzr           // x1 = 0 - borrow (all-ones if a-b wrapped)

    // r += p
    subs    x8 , x14, #1            // x8  = x14 + 0xffffffffffffffff = x14 - 1
    adcs    x9 , x15, x12           // x9  = x15 + 0x00000000ffffffff
    adcs    x10, x16, xzr           // x10 = x16 + 0
    adc     x11, x17, x13           // x11 = x17 + 0xffffffff00000001
    cmp     x1 , xzr

    // r = borrow==0 ? r : r+p
    csel    x14, x14, x8 , eq
    csel    x15, x15, x9 , eq
    csel    x16, x16, x10, eq
    csel    x17, x17, x11, eq
    stp     x14, x15, [x0]
    stp     x16, x17, [x0, #16]
AARCH64_AUTIASP
    ret
.size   ECP256_SubCore1,.-ECP256_SubCore1
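
/*
 * Reference sketch (u128/P256 as above, not assembled): subtract, then add P
 * back only if the subtraction wrapped, matching the csel-by-borrow logic
 * above. ECP256_SubCore2 is the same computation with the operand roles
 * swapped (x8~x11 minus x14~x17).
 *
 *   static void p256_sub(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
 *   {
 *       uint64_t t[4], borrow = 0;
 *       for (int i = 0; i < 4; i++) {          // t = a - b
 *           u128 s = (u128)a[i] - b[i] - borrow;
 *           t[i] = (uint64_t)s;
 *           borrow = (uint64_t)(s >> 64) & 1;
 *       }
 *       uint64_t mask = 0 - borrow;            // all-ones iff a < b
 *       u128 c = 0;
 *       for (int i = 0; i < 4; i++) {          // r = t + (P & mask)
 *           c += (u128)t[i] + (P256[i] & mask);
 *           r[i] = (uint64_t)c;
 *           c >>= 64;
 *       }
 *   }
 */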

.type   ECP256_SubCore2,%function
.align  4
ECP256_SubCore2:
AARCH64_PACIASP
    subs    x14, x8 , x14
    sbcs    x15, x9 , x15
    sbcs    x16, x10, x16
    sbcs    x17, x11, x17
    sbc     x1 , xzr, xzr           // x1 = 0 - borrow (all-ones if b-a wrapped)

    // r += p
    subs    x8 , x14, #1            // x8  = x14 + 0xffffffffffffffff = x14 - 1
    adcs    x9 , x15, x12           // x9  = x15 + 0x00000000ffffffff
    adcs    x10, x16, xzr           // x10 = x16 + 0
    adc     x11, x17, x13           // x11 = x17 + 0xffffffff00000001
    cmp     x1 , xzr

    // r = borrow==0 ? r : r+p
    csel    x14, x14, x8 , eq
    csel    x15, x15, x9 , eq
    csel    x16, x16, x10, eq
    csel    x17, x17, x11, eq
    stp     x14, x15, [x0]
    stp     x16, x17, [x0, #16]
AARCH64_AUTIASP
    ret
.size   ECP256_SubCore2,.-ECP256_SubCore2


.type   ECP256_DivBy2Core,%function
.align  4
ECP256_DivBy2Core:
AARCH64_PACIASP
    // a+p
    subs    x8 , x14, #1            // x8  = x14 + 0xffffffffffffffff = x14 - 1
    adcs    x9 , x15, x12           // x9  = x15 + 0x00000000ffffffff
    adcs    x10, x16, xzr           // x10 = x16 + 0
    adcs    x11, x17, x13           // x11 = x17 + 0xffffffff00000001
    adc     x1 , xzr, xzr           // x1 = CF
    tst     x14, #1                 // test the low bit: eq means a is even

    // a = (a is even) ? a : a+p
    csel    x14, x14, x8 , eq
    csel    x15, x15, x9 , eq
    csel    x16, x16, x10, eq
    csel    x17, x17, x11, eq
    csel    x1 , xzr, x1 , eq       // if a was kept (even case), clear the carry in x1

    lsr     x14, x14, #1            // r >>= 1: shift each limb right by one,
    orr     x14, x14, x15, lsl#63   // pulling the least significant bit of the
                                    // limb above into the vacated top bit,
    lsr     x15, x15, #1            // so the four limbs plus the carry bit in x1
                                    // together form a single 257-bit right shift
    orr     x15, x15, x16, lsl#63
    lsr     x16, x16, #1
    orr     x16, x16, x17, lsl#63
    lsr     x17, x17, #1
    orr     x17, x17, x1 , lsl#63   // bit 256 (the carry in x1) becomes the top bit
    stp     x14, x15, [x0]
    stp     x16, x17, [x0, #16]
AARCH64_AUTIASP
    ret
.size   ECP256_DivBy2Core,.-ECP256_DivBy2Core
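
/*
 * Reference sketch (u128/P256 as above, not assembled): halving mod P. If a
 * is odd, add P first (the sum may be 257 bits wide), then shift the 257-bit
 * value right by one, exactly like the lsr/orr chain above.
 *
 *   static void p256_div_by_2(uint64_t r[4], const uint64_t a[4])
 *   {
 *       uint64_t t[4], mask = 0 - (a[0] & 1);  // all-ones iff a is odd
 *       u128 c = 0;
 *       for (int i = 0; i < 4; i++) {          // t = a + (P & mask)
 *           c += (u128)a[i] + (P256[i] & mask);
 *           t[i] = (uint64_t)c;
 *           c >>= 64;
 *       }
 *       uint64_t bit256 = (uint64_t)c;         // carry = bit 256 of the sum
 *       for (int i = 0; i < 3; i++)
 *           r[i] = (t[i] >> 1) | (t[i + 1] << 63);
 *       r[3] = (t[3] >> 1) | (bit256 << 63);
 *   }
 */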

.globl  ECP256_Neg
.type   ECP256_Neg,%function
.align  4
ECP256_Neg:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-16]!
    add     x29, sp , #0

    ldp     x14, x15, [x1]
    ldp     x16, x17, [x1, #16]

    mov     x8 , xzr
    mov     x9 , xzr
    mov     x10, xzr
    mov     x11, xzr

    adrp    x7, .Lpoly+8
    add     x7, x7, :lo12:.Lpoly+8
    ldr     x12, [x7]               // p[1]
    ldr     x13, [x7, #16]          // p[3]
    // -a = 0 - a
    bl      ECP256_SubCore2

    ldp     x29, x30, [sp], #16
AARCH64_AUTIASP
    ret
.size   ECP256_Neg,.-ECP256_Neg


.type   ECP256_MulCore,%function
.align  4
ECP256_MulCore:
AARCH64_PACIASP
    ldr     x3 , [x2]               // b[0]

    mul     x14, x4 , x3
    umulh   x8 , x4 , x3
    mul     x15, x5 , x3
    umulh   x9 , x5 , x3
    mul     x16, x6 , x3
    umulh   x10, x6 , x3
    mul     x17, x7 , x3
    umulh   x11, x7 , x3

    adds    x15, x15, x8
    adcs    x16, x16, x9
    adcs    x17, x17, x10
    adc     x19, xzr, x11           // x19 = x11 + CF
    mov     x20, xzr

    // Reduction round: q = x14 is the lowest accumulator limb (the Montgomery
    // constant is 1 because p[0] = 2^64 - 1). Adding q*p and dropping the low
    // limb amounts to adding lq<<32, hq, q - lq<<32, q - hq (borrow chained)
    // to the four limbs above it, where lq/hq are the low/high 32 bits of q.
    lsl     x8 , x14, #32           // x8  = lq<<32
    lsr     x9 , x14, #32           // x9  = hq
    subs    x10, x14, x8            // x10 = q - lq<<32
    sbc     x11, x14, x9            // x11 = q - hq (minus the borrow)

    adds    x14, x15, x8
    adcs    x15, x16, x9
    adcs    x16, x17, x10
    adcs    x17, x19, x11
    adc     x19, x20, xzr

    ldr     x3 , [x2, #8]           // b[1]

    // lo(a[0~3] * b[i])
    mul     x8 , x4 , x3
    mul     x9 , x5 , x3
    mul     x10, x6 , x3
    mul     x11, x7 , x3

    adds    x14, x14, x8
    adcs    x15, x15, x9
    adcs    x16, x16, x10
    adcs    x17, x17, x11
    adc     x19, x19, xzr

    // hi(a[0~3] * b[i])
    umulh   x8 , x4 , x3
    umulh   x9 , x5 , x3
    umulh   x10, x6 , x3
    umulh   x11, x7 , x3

    adds    x15, x15, x8
    adcs    x16, x16, x9
    adcs    x17, x17, x10
    adcs    x19, x19, x11
    adc     x20, xzr, xzr

    // Reduction round, as above: q = x14
    lsl     x8 , x14, #32           // x8  = lq<<32
    lsr     x9 , x14, #32           // x9  = hq
    subs    x10, x14, x8            // x10 = q - lq<<32
    sbc     x11, x14, x9            // x11 = q - hq

    adds    x14, x15, x8
    adcs    x15, x16, x9
    adcs    x16, x17, x10
    adcs    x17, x19, x11
    adc     x19, x20, xzr

    ldr     x3 , [x2, #8*2]         // b[2]

    // lo(a[0~3] * b[i])
    mul     x8 , x4 , x3
    mul     x9 , x5 , x3
    mul     x10, x6 , x3
    mul     x11, x7 , x3

    adds    x14, x14, x8
    adcs    x15, x15, x9
    adcs    x16, x16, x10
    adcs    x17, x17, x11
    adc     x19, x19, xzr

    // hi(a[0~3] * b[i])
    umulh   x8 , x4 , x3
    umulh   x9 , x5 , x3
    umulh   x10, x6 , x3
    umulh   x11, x7 , x3

    adds    x15, x15, x8
    adcs    x16, x16, x9
    adcs    x17, x17, x10
    adcs    x19, x19, x11
    adc     x20, xzr, xzr

    // Reduction round, as above: q = x14
    lsl     x8 , x14, #32           // x8  = lq<<32
    lsr     x9 , x14, #32           // x9  = hq
    subs    x10, x14, x8            // x10 = q - lq<<32
    sbc     x11, x14, x9            // x11 = q - hq

    adds    x14, x15, x8
    adcs    x15, x16, x9
    adcs    x16, x17, x10
    adcs    x17, x19, x11
    adc     x19, x20, xzr

    ldr     x3 , [x2, #8*3]         // b[3]

    // lo(a[0~3] * b[i])
    mul     x8 , x4 , x3
    mul     x9 , x5 , x3
    mul     x10, x6 , x3
    mul     x11, x7 , x3

    adds    x14, x14, x8
    adcs    x15, x15, x9
    adcs    x16, x16, x10
    adcs    x17, x17, x11
    adc     x19, x19, xzr

    // hi(a[0~3] * b[i])
    umulh   x8 , x4 , x3
    umulh   x9 , x5 , x3
    umulh   x10, x6 , x3
    umulh   x11, x7 , x3

    adds    x15, x15, x8
    adcs    x16, x16, x9
    adcs    x17, x17, x10
    adcs    x19, x19, x11
    adc     x20, xzr, xzr

    // Last reduction round, as above: q = x14
    lsl     x8 , x14, #32           // x8  = lq<<32
    lsr     x9 , x14, #32           // x9  = hq
    subs    x10, x14, x8            // x10 = q - lq<<32
    sbc     x11, x14, x9            // x11 = q - hq

    adds    x14, x15, x8
    adcs    x15, x16, x9
    adcs    x16, x17, x10
    adcs    x17, x19, x11
    adc     x19, x20, xzr

    // x8~x11 = r - p (r may be greater than p)
    adds    x8 , x14, #1            // x8  = x14 - 0xffffffffffffffff = x14 + 1
    sbcs    x9 , x15, x12           // x9  = x15 - 0x00000000ffffffff
    sbcs    x10, x16, xzr           // x10 = x16 - 0
    sbcs    x11, x17, x13           // x11 = x17 - 0xffffffff00000001
    sbcs    xzr, x19, xzr           // check whether r-p borrows

    csel    x14, x14, x8 , lo
    csel    x15, x15, x9 , lo
    csel    x16, x16, x10, lo
    csel    x17, x17, x11, lo

    stp     x14, x15, [x0]
    stp     x16, x17, [x0, #8*2]
AARCH64_AUTIASP
    ret
.size   ECP256_MulCore, .-ECP256_MulCore
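
/*
 * Reference sketch (u128/P256 as above, not assembled): the same word-wise
 * Montgomery multiplication in portable C. Because P256[0] == 2^64 - 1, the
 * Montgomery constant -P^(-1) mod 2^64 is 1, so the quotient of each
 * reduction round is simply the lowest accumulator limb; the assembly
 * additionally replaces the generic q*P products with the lsl/lsr shift trick.
 *
 *   static void p256_mont_mul(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
 *   {
 *       uint64_t t[6] = {0};                   // accumulator
 *       for (int i = 0; i < 4; i++) {
 *           u128 c = 0;
 *           for (int j = 0; j < 4; j++) {      // t += a * b[i]
 *               c += (u128)a[j] * b[i] + t[j];
 *               t[j] = (uint64_t)c;
 *               c >>= 64;
 *           }
 *           c += t[4];
 *           t[4] = (uint64_t)c;
 *           t[5] = (uint64_t)(c >> 64);
 *
 *           uint64_t q = t[0];                 // q = t[0] * 1
 *           c = (u128)q * P256[0] + t[0];      // low limb becomes zero
 *           c >>= 64;
 *           for (int j = 1; j < 4; j++) {      // t = (t + q*P) >> 64
 *               c += (u128)q * P256[j] + t[j];
 *               t[j - 1] = (uint64_t)c;
 *               c >>= 64;
 *           }
 *           c += t[4];
 *           t[3] = (uint64_t)c;
 *           t[4] = t[5] + (uint64_t)(c >> 64);
 *       }
 *       uint64_t d[4], borrow = 0;             // final conditional subtraction
 *       for (int i = 0; i < 4; i++) {
 *           u128 s = (u128)t[i] - P256[i] - borrow;
 *           d[i] = (uint64_t)s;
 *           borrow = (uint64_t)(s >> 64) & 1;
 *       }
 *       borrow = (t[4] < borrow);
 *       for (int i = 0; i < 4; i++)
 *           r[i] = borrow ? t[i] : d[i];
 *   }
 */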


.type   ECP256_SqrCore,%function
.align  4
ECP256_SqrCore:
AARCH64_PACIASP
/******************************************
    x7   x1   x20  x19  x17  x16  x15  x14
                             h0*1 l0*1
                        h0*2 l0*2
                   h0*3 l0*3
                   h1*2 l1*2
              h1*3 l1*3
         h2*3 l2*3
    h3*3 l3*3 h2*2 l2*2 h1*1 l1*1 h0*0 l0*0
*******************************************/

    // a[1~3] * a[0]
    mul     x15, x5 , x4            // lo(a[1] * a[0])
    umulh   x8 , x5 , x4            // hi(a[1] * a[0])
    mul     x16, x6 , x4            // lo(a[2] * a[0])
    umulh   x9 , x6 , x4            // hi(a[2] * a[0])
    mul     x17, x7 , x4            // lo(a[3] * a[0])
    umulh   x19, x7 , x4            // hi(a[3] * a[0])

    adds    x16, x16, x8
    adcs    x17, x17, x9
    adc     x19, x19, xzr           // There will be no more carry.

    // a[2~3] * a[1]
    mul     x8 , x6 , x5            // lo(a[2] * a[1])
    umulh   x9 , x6 , x5            // hi(a[2] * a[1])
    mul     x10, x7 , x5            // lo(a[3] * a[1])
    umulh   x11, x7 , x5            // hi(a[3] * a[1])

    // a[3] * a[2]
    mul     x20, x7 , x6
    umulh   x1 , x7 , x6

    // Accumulate this round's products into the acc registers of the diagram above.
    adds    x9 , x10, x9
    adc     x10, x11, xzr

    adds    x17, x17, x8
    adcs    x19, x19, x9
    adcs    x20, x20, x10
    adc     x1 , x1 , xzr           // a[0~3] are still held in x4~x7 for the squaring below

    // a[0~3] ^ 2
    mul     x14, x4 , x4
    umulh   x4 , x4 , x4
    mul     x8 , x5 , x5
    umulh   x5 , x5 , x5
    mul     x9 , x6 , x6
    umulh   x6 , x6 , x6
    mul     x10, x7 , x7
    umulh   x7 , x7 , x7

    // acc[1~6] << 1
    adds    x15, x15, x15
    adcs    x16, x16, x16
    adcs    x17, x17, x17
    adcs    x19, x19, x19
    adcs    x20, x20, x20
    adcs    x1 , x1 , x1
    adc     x2 , xzr, xzr

    // acc[] += a[] ^ 2
    adds    x15, x15, x4
    adcs    x16, x16, x8
    adcs    x17, x17, x5
    adcs    x19, x19, x9
    adcs    x20, x20, x6
    adcs    x1 , x1 , x10
    adc     x7 , x7 , x2

    // Four reduction rounds (same q = x14 scheme as in ECP256_MulCore).
    lsl     x8 , x14, #32
    lsr     x9 , x14, #32
    subs    x10, x14, x8
    sbc     x11, x14, x9
    adds    x14, x15, x8
    adcs    x15, x16, x9
    adcs    x16, x17, x10
    adc     x17, x11, xzr

    lsl     x8 , x14, #32
    lsr     x9 , x14, #32
    subs    x10, x14, x8
    sbc     x11, x14, x9
    adds    x14, x15, x8
    adcs    x15, x16, x9
    adcs    x16, x17, x10
    adc     x17, x11, xzr

    lsl     x8 , x14, #32
    lsr     x9 , x14, #32
    subs    x10, x14, x8
    sbc     x11, x14, x9
    adds    x14, x15, x8
    adcs    x15, x16, x9
    adcs    x16, x17, x10
    adc     x17, x11, xzr

    lsl     x8 , x14, #32
    lsr     x9 , x14, #32
    subs    x10, x14, x8
    sbc     x11, x14, x9
    adds    x14, x15, x8
    adcs    x15, x16, x9
    adcs    x16, x17, x10
    adc     x17, x11, xzr

    // Add acc[4~7]; x14~x17 plus the carry in x19 now hold the result r.
    adds    x14, x14, x19
    adcs    x15, x15, x20
    adcs    x16, x16, x1
    adcs    x17, x17, x7
    adc     x19, xzr, xzr

    // r -= p
    adds    x8 , x14, #1            // x8 = x14 - 0xffffffffffffffff = x14 + 1
    sbcs    x9 , x15, x12
    sbcs    x10, x16, xzr
    sbcs    x11, x17, x13
    sbcs    xzr, x19, xzr           // CF now tells whether r < p

    // r = r-p < 0 ? r : r-p
    csel    x14, x14, x8 , lo
    csel    x15, x15, x9 , lo
    csel    x16, x16, x10, lo
    csel    x17, x17, x11, lo

    stp     x14, x15, [x0]
    stp     x16, x17, [x0, #16]
AARCH64_AUTIASP
    ret
.size   ECP256_SqrCore,.-ECP256_SqrCore
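
/*
 * Reference sketch of the widening part of the square above (u128 as before,
 * not assembled): compute each cross product a[i]*a[j] (i < j) once, double
 * the whole band, then add the squares a[i]^2 on the diagonal. The 512-bit
 * result then goes through the same four reduction rounds and final
 * conditional subtraction as the multiplication.
 *
 *   static void u256_sqr_wide(uint64_t r[8], const uint64_t a[4])
 *   {
 *       for (int i = 0; i < 8; i++) r[i] = 0;
 *       for (int i = 0; i < 3; i++) {          // cross products, i < j
 *           u128 c = 0;
 *           for (int j = i + 1; j < 4; j++) {
 *               c += (u128)a[i] * a[j] + r[i + j];
 *               r[i + j] = (uint64_t)c;
 *               c >>= 64;
 *           }
 *           r[i + 4] = (uint64_t)c;
 *       }
 *       uint64_t carry = 0;                    // acc[1~6] <<= 1
 *       for (int i = 1; i < 8; i++) {
 *           uint64_t hi = r[i] >> 63;
 *           r[i] = (r[i] << 1) | carry;
 *           carry = hi;
 *       }
 *       u128 c = 0;                            // add the diagonal squares
 *       for (int i = 0; i < 4; i++) {
 *           u128 sq = (u128)a[i] * a[i];
 *           c += (u128)r[2 * i] + (uint64_t)sq;
 *           r[2 * i] = (uint64_t)c;
 *           c >>= 64;
 *           c += (u128)r[2 * i + 1] + (uint64_t)(sq >> 64);
 *           r[2 * i + 1] = (uint64_t)c;
 *           c >>= 64;
 *       }
 *   }
 */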

.globl  ECP256_Mul
.type   ECP256_Mul,%function
.align  4
ECP256_Mul:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-32]!
    add     x29, sp , #0
    stp     x19, x20, [sp, #16]

    ldp     x4 , x5 , [x1]
    ldp     x6 , x7 , [x1, #16]

    adrp    x3, .Lpoly+8
    add     x3, x3, :lo12:.Lpoly+8
    ldr     x12, [x3]               // p[1]
    ldr     x13, [x3, #16]          // p[3]

    bl      ECP256_MulCore

    ldp     x19, x20, [sp , #16]
    ldp     x29, x30, [sp], #32
AARCH64_AUTIASP
    ret
.size   ECP256_Mul,.-ECP256_Mul

.globl  ECP256_Sqr
.type   ECP256_Sqr,%function
.align  4
ECP256_Sqr:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-32]!
    add     x29, sp , #0
    stp     x19, x20, [sp, #16]

    ldp     x4 , x5 , [x1]
    ldp     x6 , x7 , [x1, #16]
    adrp    x3, .Lpoly+8
    add     x3, x3, :lo12:.Lpoly+8
    ldr     x12, [x3]               // p[1]
    ldr     x13, [x3, #16]          // p[3]

    bl      ECP256_SqrCore

    ldp     x19, x20, [sp , #16]
    ldp     x29, x30, [sp], #32
AARCH64_AUTIASP
    ret
.size   ECP256_Sqr,.-ECP256_Sqr

# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-madd-2007-bl
# Deal process:
#     Z1Z1 = Z1^2
#     U2 = X2*Z1Z1
#     S2 = Y2*Z1*Z1Z1
#     H = U2-X1
#     HH = H^2
#     I = 4*HH
#     J = H*I
#     r = 2*(S2-Y1)
#     V = X1*I
#     X3 = r^2-J-2*V
#     Y3 = r*(V-X3)-2*Y1*J
#     Z3 = (Z1+H)^2-Z1Z1-HH
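
/*
 * The actual call sequence below, as a C-like sketch (fp_* are hypothetical
 * wrappers around the *Core routines above; all values in Montgomery form).
 * It is the unscaled form of the madd-2007-bl chain: H and r are not
 * pre-multiplied, so X3 = r^2 - H^3 - 2*V with V = X1*H^2, and Z3 = Z1*H.
 *
 *   fp_sqr(zsqr, z1);              // zsqr  = Z1^2                 (Z1Z1)
 *   fp_mul(u2, zsqr, x2);          // u2    = X2*Z1Z1              (U2)
 *   fp_sub(h, u2, x1);             // h     = U2 - X1              (H)
 *   fp_mul(s2, zsqr, z1);          // s2    = Z1^3
 *   fp_mul(res_z, h, z1);          // Z3    = H*Z1
 *   fp_mul(s2, s2, y2);            // s2    = Y2*Z1^3              (S2)
 *   fp_sub(rr, s2, y1);            // rr    = S2 - Y1              (r)
 *   fp_sqr(rsqr, rr);              // rsqr  = r^2
 *   fp_sqr(hsqr, h);               // hsqr  = H^2
 *   fp_mul(hcub, hsqr, h);         // hcub  = H^3
 *   fp_mul(u2, hsqr, x1);          // u2    = X1*H^2               (V)
 *   fp_add(hsqr, u2, u2);          // hsqr  = 2*V
 *   fp_sub(res_x, rsqr, hsqr);     // res_x = r^2 - 2*V
 *   fp_sub(res_x, res_x, hcub);    // X3    = r^2 - H^3 - 2*V
 *   fp_sub(res_y, u2, res_x);      // res_y = V - X3
 *   fp_mul(s2, y1, hcub);          // s2    = Y1*H^3
 *   fp_mul(res_y, res_y, rr);      // res_y = r*(V - X3)
 *   fp_sub(res_y, res_y, s2);      // Y3    = r*(V - X3) - Y1*H^3
 *
 * followed by the csel-based fixups for the points at infinity.
 */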
.globl  ECP256_AddAffine
.type   ECP256_AddAffine,%function
.align  5
ECP256_AddAffine:
AARCH64_PACIASP

    stp     x29, x30, [sp, #-80]!
    add     x29, sp , #0
    stp     x19, x20, [sp, #16]
    stp     x21, x22, [sp, #32]
    stp     x23, x24, [sp, #48]
    stp     x25, x26, [sp, #64]
    sub     sp , sp , #32*10            // Reserve stack for 10 field elements of 32 bytes each.
                                        // As in the C implementation, the lifetimes of z1sqr
                                        // and s2 do not overlap, so they share one slot and
                                        // save a field element of stack.

    // x0~x2 are clobbered when the core routines are called, so back up the
    // input pointers in x21~x23. Structure members are addressed by offset;
    // each member is 256 bits, that is, 32 bytes.
    mov     x21, x0                     // CRYPT_P256_Point *r
    mov     x22, x1                     // CRYPT_P256_Point *a
    mov     x23, x2                     // CRYPT_P256_AffinePoint *b


    adrp    x14, .Lpoly+8
    add     x14, x14, :lo12:.Lpoly+8
    ldr     x12, [x14]                  // p[1]
    ldr     x13, [x14, #16]             // p[3]


    ldp     x4, x5, [x1, #64]           // x1+64    = &(a->z[0]), a->z[0] is written z1[0]
    ldp     x6, x7, [x1, #64+16]        // x1+64+16 = &(a->z[2])

    orr     x24, x4 , x5                // x24 = z1[0] | z1[1]
    orr     x24, x24, x6                // x24 |= z1[2]
    orr     x24, x24, x7                // x24 |= z1[3]
    cmp     x24, #0
    csetm   x24, ne                     // x24 = x24!=0 ? -1 : 0, i.e. ~iszero(z1):
                                        // is point a of the form (x, y, 0)?

    ldp     x8 , x9 , [x2]              // x2       = &(b->x[0]), b->x[0] is written x2[0]
    ldp     x10, x11, [x2, #16]         // x2+16    = &(b->x[2])
    ldp     x14, x15, [x2, #32]         // x2+32    = &(b->y[0])
    ldp     x16, x17, [x2, #32+16]      // x2+32+16 = &(b->y[2])

    // x25 = x2[0] | x2[1] | x2[2] | x2[3] | y2[0] | y2[1] | y2[2] | y2[3]
    orr     x25, x8 , x9
    orr     x25, x25, x10
    orr     x25, x25, x11
    orr     x25, x25, x14
    orr     x25, x25, x15
    orr     x25, x25, x16
    orr     x25, x25, x17
    cmp     x25, #0
    csetm   x25, ne                     // x25 = x25!=0 ? -1 : 0, i.e. ~iszero(x2|y2):
                                        // is point b (0, 0)?

    add     x0 , sp , #32*3             // zsqr = sp[32*3]
    // x4~x7 = z1[0~3] (already loaded)
    bl      ECP256_SqrCore              // zsqr = z1^2
    // x14~x17 also hold the result

    // The next call needs the zsqr result as input. Copy it from x14~x17
    // into x4~x7 now, before a later call overwrites x14~x17, so it does not
    // have to be read back from memory.
    mov     x4 , x14
    mov     x5 , x15
    mov     x6 , x16
    mov     x7 , x17

    add     x0 , sp , #32*4             // u2 = sp[32*4]
    add     x2 , x23, #0                // x2 points to b->x
    // x4~x7 * [x2]
    bl      ECP256_MulCore              // u2 = z1^2 * x2
    // x14~x17 also hold the result

    ldp     x8 , x9 , [x22]             // a->x
    ldp     x10, x11, [x22, #16]
    add     x0 , sp , #32*5             // h = sp[32*5]
    // x14~x17 - x8~x11
    bl      ECP256_SubCore1             // h = u2 - x1

    ldp     x4 , x5 , [sp, #32*3]       // zsqr = sp[32*3]
    ldp     x6 , x7 , [sp, #32*3+16]
    add     x2 , x22, #64               // x2 points to a->z
    add     x0 , sp , #32*3             // s2 = sp[32*3]: zsqr is dead from here on,
                                        // so its slot is reused for s2
    bl      ECP256_MulCore              // s2 = zsqr * z1 = z1^3

    ldp     x4 , x5 , [sp, #32*5]       // x4~x7 = h = sp[32*5]
    ldp     x6 , x7 , [sp, #32*5+16]
    add     x2 , x22, #64
    add     x0 , sp , #32*2             // x0 points to res_z
    bl      ECP256_MulCore              // res_z = h * z1

    ldp     x4 , x5 , [sp, #32*3]       // x4~x7 = s2 = sp[32*3]
    ldp     x6 , x7 , [sp, #32*3+16]
    add     x2 , x23, #32               // x2 points to b->y
    add     x0 , sp , #32*3             // s2 = sp[32*3]
    bl      ECP256_MulCore              // s2 = s2 * y2 = y2 * z1^3

    ldp     x8 , x9 , [x22, #32]
    ldp     x10, x11, [x22, #32+16]
    add     x0 , sp , #32*6             // R = sp[32*6]
    // x14~x17 - x8~x11
    bl      ECP256_SubCore1             // R = s2 - y1

    // Rsqr = R^2 and hsqr = h^2 are independent, so their order can be swapped.
    // Computing Rsqr first saves one memory read (pipeline bubbles remain to
    // be optimized), and the freshly computed hsqr then saves another read in
    // hcub = hsqr * h.
    mov     x4 , x14
    mov     x5 , x15
    mov     x6 , x16
    mov     x7 , x17
    add     x0 , sp , #32*7             // Rsqr = sp[32*7]
    bl      ECP256_SqrCore              // Rsqr = R^2

    ldp     x4 , x5 , [sp, #32*5]       // h = sp[32*5]
    ldp     x6 , x7 , [sp, #32*5+16]
    add     x0 , sp , #32*8             // hsqr = sp[32*8]
    bl      ECP256_SqrCore              // hsqr = h^2

    // x14~x17 hold the hsqr result, so it need not be read back from memory
    // (pipeline bubbles remain to be optimized).
    mov     x4 , x14
    mov     x5 , x15
    mov     x6 , x16
    mov     x7 , x17
    add     x2 , sp , #32*5             // h = sp[32*5]
    add     x0 , sp , #32*9             // hcub = sp[32*9]
    bl      ECP256_MulCore              // hcub = hsqr * h = h^3

    // The multiplication uses too many registers to keep hsqr live across it.
    // Using registers beyond x19 would mean pushing and popping them around
    // the function, which costs more than it saves, so hsqr is reloaded from
    // memory instead.
    ldp     x4 , x5 , [sp, #32*8]       // hsqr = sp[32*8]
    ldp     x6 , x7 , [sp, #32*8+16]
    add     x2 , x22, #0                // x2 points to a->x
    add     x0 , sp , #32*4             // u2 = sp[32*4]
    bl      ECP256_MulCore              // u2 = hsqr * x1

    mov     x8 , x14
    mov     x9 , x15
    mov     x10, x16
    mov     x11, x17
    add     x0 , sp , #32*8             // hsqr = sp[32*8]
    // x14~x17 + x8~x11
    bl      ECP256_AddCore              // hsqr = 2 * u2

    // x14~x17 hold hsqr, so ECP256_SubCore2 is called
    ldp     x8 , x9 , [sp, #32*7]       // Rsqr = sp[32*7]
    ldp     x10, x11, [sp, #32*7+16]
    // x8~x11 - x14~x17
    add     x0 , sp , #0                // x0 points to res_x
    bl      ECP256_SubCore2             // res_x = Rsqr - hsqr

    // x14~x17 hold res_x, so ECP256_SubCore1 is called
    ldp     x8 , x9 , [sp, #32*9]       // hcub = sp[32*9]
    ldp     x10, x11, [sp, #32*9+16]
    // x14~x17 - x8~x11
    bl      ECP256_SubCore1             // res_x = res_x - hcub

    // x14~x17 hold res_x, so ECP256_SubCore2 is called
    ldp     x8 , x9 , [sp, #32*4]       // u2 = sp[32*4]
    ldp     x10, x11, [sp, #32*4+16]
    add     x0 , sp , #32*1
    bl      ECP256_SubCore2             // res_y = u2 - res_x

    ldp     x4 , x5 , [sp, #32*9]       // hcub = sp[32*9]
    ldp     x6 , x7 , [sp, #32*9+16]
    add     x2 , x22, #32               // y1 = a->y
    add     x0 , sp , #32*3             // s2 = sp[32*3]
    bl      ECP256_MulCore              // s2 = y1 * hcub

    add     x2 , sp , #32*6             // R = sp[32*6]
    add     x0 , sp , #32*1             // res_y = sp[32*1]
    ldp     x4 , x5 , [x0]
    ldp     x6 , x7 , [x0, #16]
    bl      ECP256_MulCore              // res_y = res_y * R

    // x14~x17 hold res_y, and x0 still points to res_y, so neither needs
    // to be set again
    ldp     x8 , x9 , [sp, #32*3]       // s2 = sp[32*3]
    ldp     x10, x11, [sp, #32*3+16]
    // x14~x17 - x8~x11
    bl      ECP256_SubCore1             // res_y = res_y - s2


    // res_y is already in x14~x17, saving four loads (pipeline bubbles remain
    // to be optimized), so res_y is selected and stored directly.
    // copy y
    ldp     x4 , x5 , [x23, #32]        // in2_y, b->y
    ldp     x6 , x7 , [x23, #32+16]
    ldp     x8 , x9 , [x22, #32]        // in1_y, a->y
    ldp     x10, x11, [x22, #32+16]

    // res_y = in1infty == 0 ? res_y : in2_y
    cmp     x24, #0                     // x24 = ~in1infty
    csel    x14, x14, x4 , ne
    csel    x15, x15, x5 , ne
    csel    x16, x16, x6 , ne
    csel    x17, x17, x7 , ne
    // res_y = in2infty == 0 ? res_y : in1_y
    cmp     x25, #0                     // x25 = ~in2infty
    csel    x14, x14, x8 , ne
    csel    x15, x15, x9 , ne
    csel    x16, x16, x10, ne
    csel    x17, x17, x11, ne

    stp     x14, x15, [x21, #32]
    stp     x16, x17, [x21, #32+16]

    // copy x
    ldp     x4 , x5 , [x23]             // in2_x, b->x
    ldp     x6 , x7 , [x23, #16]
    ldp     x8 , x9 , [x22]             // in1_x, a->x
    ldp     x10, x11, [x22, #16]
    ldp     x14, x15, [sp]              // res_x
    ldp     x16, x17, [sp , #16]

    // res_x = in1infty == 0 ? res_x : in2_x
    cmp     x24, #0                     // x24 = ~in1infty
    csel    x14, x14, x4 , ne
    csel    x15, x15, x5 , ne
    csel    x16, x16, x6 , ne
    csel    x17, x17, x7 , ne
    // res_x = in2infty == 0 ? res_x : in1_x
    cmp     x25, #0                     // x25 = ~in2infty
    csel    x14, x14, x8 , ne
    csel    x15, x15, x9 , ne
    csel    x16, x16, x10, ne
    csel    x17, x17, x11, ne

    stp     x14, x15, [x21]
    stp     x16, x17, [x21, #16]

    // copy z
    adrp    x23, .Lone_mont
    add     x23, x23, :lo12:.Lone_mont
    ldp     x4 , x5 , [x23]             // one_mont, 1 * RR * R' = R (mod p)
    ldp     x6 , x7 , [x23, #16]
    ldp     x8 , x9 , [x22, #64]        // in1_z, a->z
    ldp     x10, x11, [x22, #64+16]
    ldp     x14, x15, [sp , #64]        // res_z
    ldp     x16, x17, [sp , #64+16]

    // res_z = in1infty == 0 ? res_z : ONE
    cmp     x24, #0                     // x24 = ~in1infty
    csel    x14, x14, x4 , ne
    csel    x15, x15, x5 , ne
    csel    x16, x16, x6 , ne
    csel    x17, x17, x7 , ne
    // res_z = in2infty == 0 ? res_z : in1_z
    cmp     x25, #0                     // x25 = ~in2infty
    csel    x14, x14, x8 , ne
    csel    x15, x15, x9 , ne
    csel    x16, x16, x10, ne
    csel    x17, x17, x11, ne

    stp     x14, x15, [x21, #64]
    stp     x16, x17, [x21, #64+16]

    add     sp , x29, #0
    ldp     x19, x20, [x29, #16]
    ldp     x21, x22, [x29, #32]
    ldp     x23, x24, [x29, #48]
    ldp     x25, x26, [x29, #64]
    ldp     x29, x30, [sp], #80
AARCH64_AUTIASP
    ret
.size   ECP256_AddAffine,.-ECP256_AddAffine

# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo
# Deal process:
#     U1 = X1*Z2^2
#     U2 = X2*Z1^2
#     S1 = Y1*Z2^3
#     S2 = Y2*Z1^3
#     H = U2-U1
#     r = S2-S1
#     X3 = r^2-H^3-2*U1*H^2
#     Y3 = r*(U1*H^2-X3)-S1*H^3
#     Z3 = Z1*Z2*H
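
/*
 * Like ECP256_AddAffine above, the routine below collapses 256-bit values to
 * an all-ones/all-zero mask with orr/cmp/csetm (for the infinity and equality
 * tests). A minimal C model of that pattern (a sketch, not the project's API):
 *
 *   static uint64_t nonzero_mask(const uint64_t a[4])
 *   {
 *       uint64_t t = a[0] | a[1] | a[2] | a[3];
 *       return (uint64_t)0 - (uint64_t)(t != 0);   // -1 if a != 0, else 0
 *   }
 *
 * ECP256_PointAdd takes its doubling path exactly when
 *   nonzero_mask(z1) & nonzero_mask(z2) & ~nonzero_mask(h) & ~nonzero_mask(rr)
 * is all-ones, i.e. both inputs are finite and U1 == U2, S1 == S2.
 */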
.globl  ECP256_PointAdd
.type   ECP256_PointAdd,%function
.align  5
ECP256_PointAdd:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-96]!
    add     x29, sp , #0
    stp     x19, x20, [sp, #16]
    stp     x21, x22, [sp, #32]
    stp     x23, x24, [sp, #48]
    stp     x25, x26, [sp, #64]
    stp     x27, x28, [sp, #80]
    sub     sp , sp , #32*12            // Each coordinate is 256 bits, that is, 32 bytes;
                                        // reserve stack for 12 field elements.

    adrp    x14, .Lpoly+8
    add     x14, x14, :lo12:.Lpoly+8
    ldr     x12, [x14]                  // p[1]
    ldr     x13, [x14, #16]             // p[3]

    // Back up the input pointers first; x0~x2 are clobbered when the core
    // routines are called.
    mov     x21, x0
    mov     x22, x1
    mov     x23, x2

    ldp     x4 , x5 , [x22, #64]        // z1
    ldp     x6 , x7 , [x22, #64+16]
    // x24 = z1[0] | z1[1] | z1[2] | z1[3]
    orr     x24, x4 , x5
    orr     x24, x24, x6
    orr     x24, x24, x7
    cmp     x24, #0
    csetm   x24, ne                     // x24 = (x24 != 0) ? -1 : 0, i.e. ~iszero(z1):
                                        // is point a of the form (x, y, 0)?

    // x4~x7 already hold z1, so z1^2 can be computed right away
    add     x0 , sp , #0                // z1sqr = sp[32*0]
    bl      ECP256_SqrCore              // z1sqr = z1^2

    // x14~x17 hold the z1sqr result (pipeline bubbles remain to be optimized)
    mov     x4 , x14
    mov     x5 , x15
    mov     x6 , x16
    mov     x7 , x17
    add     x2 , x22, #64               // x2 points to z1, that is a->z
    add     x0 , sp , #32*1             // s2 = sp[32*1]
    bl      ECP256_MulCore              // s2 = z1^3

    // x14~x17 hold the s2 result (pipeline bubbles remain to be optimized)
    mov     x4 , x14
    mov     x5 , x15
    mov     x6 , x16
    mov     x7 , x17
    add     x2 , x23, #32               // x2 points to y2, that is b->y
    // add     x0 , sp , #32*1          // s2 = sp[32*1]: x0 already points to s2
                                        // and does not need to be set again
    bl      ECP256_MulCore              // s2 = y2 * z1^3

    ldp     x4 , x5 , [x23, #64]        // z2
    ldp     x6 , x7 , [x23, #64+16]
    // x25 = z2[0] | z2[1] | z2[2] | z2[3]
    orr     x25, x4 , x5
    orr     x25, x25, x6
    orr     x25, x25, x7
    cmp     x25, #0
    csetm   x25, ne                     // x25 = (x25 != 0) ? -1 : 0, i.e. ~iszero(z2):
                                        // is point b of the form (x, y, 0)?

    // x4~x7 already hold z2, so z2^2 can be computed right away
    add     x0 , sp , #32*2             // z2sqr = sp[32*2]
    bl      ECP256_SqrCore              // z2sqr = z2^2

    // x14~x17 hold the z2sqr result (pipeline bubbles remain to be optimized)
    mov     x4 , x14
    mov     x5 , x15
    mov     x6 , x16
    mov     x7 , x17
    add     x2 , x23, #64               // x2 points to z2, that is b->z
    add     x0 , sp , #32*3             // s1 = sp[32*3]
    bl      ECP256_MulCore              // s1 = z2^3

    // x14~x17 hold the s1 result (pipeline bubbles remain to be optimized)
    mov     x4 , x14
    mov     x5 , x15
    mov     x6 , x16
    mov     x7 , x17
    add     x2 , x22, #32               // x2 points to y1, that is a->y
    // add     x0 , sp , #32*3          // s1 = sp[32*3]: x0 already points to s1
                                        // and does not need to be set again
    bl      ECP256_MulCore              // s1 = y1 * z2^3


    // x14~x17 hold the s1 result, so ECP256_SubCore2 is chosen
    ldp     x8 , x9 , [sp, #32*1]       // s2 = sp[32*1]
    ldp     x10, x11, [sp, #32*1+16]
    // x8~x11 - x14~x17
    add     x0 , sp , #32*4             // R = sp[32*4]
    bl      ECP256_SubCore2             // R = s2 - s1

    // x14~x17 hold the result of R; (R==0) <==> (s2 == s1)
    orr     x26, x14, x15
    orr     x26, x26, x16
    orr     x26, x26, x17               // x26 = ~(s1 == s2), computed from registers,
                                        // so R need not be reloaded from memory

    ldp     x4 , x5 , [x22]             // load x1, that is a->x
    ldp     x6 , x7 , [x22, #16]
    add     x2 , sp , #32*2             // z2sqr = sp[32*2]
    add     x0 , sp , #32*5             // u1 = sp[32*5]
    bl      ECP256_MulCore              // u1 = x1 * z2sqr

    ldp     x4 , x5 , [x23]             // load x2, that is b->x
    ldp     x6 , x7 , [x23, #16]
    add     x2 , sp , #0                // z1sqr = sp[32*0]
    add     x0 , sp , #32*6             // u2 = sp[32*6]
    bl      ECP256_MulCore              // u2 = x2 * z1sqr

    // x14~x17 hold the u2 result, so ECP256_SubCore1 is chosen
    ldp     x8 , x9 , [sp, #32*5]       // u1 = sp[32*5]
    ldp     x10, x11, [sp, #32*5+16]
    // x14~x17 - x8~x11
    add     x0 , sp , #32*7             // h = sp[32*7]
    bl      ECP256_SubCore1             // h = u2 - u1

    // x14~x17 hold the result of h; (h==0) <==> (u2 == u1)
    orr     x14, x14, x15
    orr     x14, x14, x16
    orr     x14, x14, x17               // x14 = ~(u1 == u2), used for the test below;
                                        // h need not be reloaded from memory

    // x24 = ~in1infty
    // x25 = ~in2infty
    // x26 = ~(s1 == s2) = ~is_equal(S1, S2)
    // x14 = ~(u1 == u2) = ~is_equal(U1, U2)
    // The doubling path is taken when
    //     is_equal(U1, U2) & ~in1infty & ~in2infty & is_equal(S1, S2)
    // (denote this mask as flag), which is equivalent to
    //     (~is_equal(U1, U2) | in1infty | in2infty | ~is_equal(S1, S2)) == 0,
    // i.e. ~flag == 0.

    mvn     x27, x24                    // x27 = in1infty
    mvn     x28, x25                    // x28 = in2infty

    orr     x14, x14, x26
    orr     x14, x14, x27
    orr     x14, x14, x28               // x14 = ~flag
    cbnz    x14, .Lpoint_add            // ~flag != 0: ordinary point addition

.Ladd_double:
    mov     x0 , x21                    // x0 = r
    mov     x1 , x22                    // x1 = a
    ldp     x23, x24, [x29, #48]        // The doubling code uses only x19~x22 and pops
                                        // only x19~x22 at the end, so restore x23~x28
                                        // here in advance.
    ldp     x25, x26, [x29, #64]
    ldp     x27, x28, [x29, #80]
    add     sp , sp , #32*(12-4)        // The two paths use different amounts of stack:
                                        // the doubling code only releases 32*4 bytes, so
                                        // release the rest of this frame in advance.
    b       .Lpoint_double

.align  4
.Lpoint_add:
    ldp     x4 , x5 , [sp, #32*4]       // R = sp[32*4]
    ldp     x6 , x7 , [sp, #32*4+16]
    add     x0 , sp , #0                // Rsqr = sp[0]: z1sqr is dead from here on,
                                        // so its slot is reused for Rsqr
    bl      ECP256_SqrCore              // Rsqr = R^2

    ldp     x4 , x5 , [sp, #32*7]       // h = sp[32*7]
    ldp     x6 , x7 , [sp, #32*7+16]
    add     x2 , x22, #64               // x2 points to z1
    add     x0 , sp , #32*11            // res_z = sp[32*11]
    bl      ECP256_MulCore              // res_z = h * z1

    // x14~x17 hold the res_z result (pipeline bubbles remain to be optimized)
    mov     x4 , x14
    mov     x5 , x15
    mov     x6 , x16
    mov     x7 , x17
    add     x2 , x23, #64               // x2 points to z2
    // x0 still points to res_z and does not need to be set again
    // add     x0 , sp , #32*11         // res_z = sp[32*11]
    bl      ECP256_MulCore              // res_z = res_z * z2

    ldp     x4 , x5 , [sp, #32*7]       // h = sp[32*7]
    ldp     x6 , x7 , [sp, #32*7+16]
    add     x0 , sp , #32*2             // hsqr = sp[32*2]: z2sqr is dead from here on,
                                        // so its slot is reused for hsqr
    bl      ECP256_SqrCore

    // x14~x17 hold the hsqr result (pipeline bubbles remain to be optimized)
    mov     x4 , x14
    mov     x5 , x15
    mov     x6 , x16
    mov     x7 , x17
    add     x2 , sp , #32*7             // h = sp[32*7]
    add     x0 , sp , #32*8             // hcub = sp[32*8]
    bl      ECP256_MulCore              // hcub = hsqr * h

    ldp     x4 , x5 , [sp, #32*5]       // u1 = sp[32*5]
    ldp     x6 , x7 , [sp, #32*5+16]
    add     x2 , sp , #32*2             // hsqr = sp[32*2]
    add     x0 , sp , #32*6             // u2 = sp[32*6]
    bl      ECP256_MulCore              // u2 = u1 * hsqr

    // x14~x17 hold the u2 result
    mov     x8 , x14
    mov     x9 , x15
    mov     x10, x16
    mov     x11, x17
    // add     x0 , sp , #32*2          // hsqr = sp[32*2]
    // x2 still holds the address of hsqr from the previous call
    mov     x0 , x2
    // x14~x17 + x8~x11
    bl      ECP256_AddCore              // hsqr = 2 * u2

    // x14~x17 hold the hsqr result, so ECP256_SubCore2 is called
    ldp     x8 , x9 , [sp]              // Rsqr = sp[0]
    ldp     x10, x11, [sp, #16]
    // x8~x11 - x14~x17
    add     x0 , sp , #32*9             // res_x = sp[32*9]
    bl      ECP256_SubCore2             // res_x = Rsqr - hsqr

    // x14~x17 hold the res_x result, so ECP256_SubCore1 is called
    ldp     x8 , x9 , [sp, #32*8]       // hcub = sp[32*8]
    ldp     x10, x11, [sp, #32*8+16]
    // x14~x17 - x8~x11
    // x0 still holds the address of res_x from the previous call
    bl      ECP256_SubCore1             // res_x = res_x - hcub

    // x14~x17 hold the res_x result, so ECP256_SubCore2 is called
    ldp     x8 , x9 , [sp, #32*6]       // u2 = sp[32*6]
    ldp     x10, x11, [sp, #32*6+16]
    // x8~x11 - x14~x17
    add     x0 , sp , #32*10
    bl      ECP256_SubCore2             // res_y = u2 - res_x

    ldp     x4 , x5 , [sp, #32*3]       // s1 = sp[32*3]
    ldp     x6 , x7 , [sp, #32*3+16]
    add     x2 , sp , #32*8             // hcub = sp[32*8]
    add     x0 , sp , #32*1             // s2 = sp[32*1]
    bl      ECP256_MulCore              // s2 = s1 * hcub

    add     x2 , sp , #32*4             // R = sp[32*4]
    add     x0 , sp , #32*10            // res_y = sp[32*10]
    ldp     x4 , x5 , [x0]              // reuse x0 as the base address for res_y
    ldp     x6 , x7 , [x0, #16]
    bl      ECP256_MulCore              // res_y = res_y * R

    // x14~x17 hold the res_y result, so ECP256_SubCore1 is called
    ldp     x8 , x9 , [sp, #32]         // s2 = sp[32*1]
    ldp     x10, x11, [sp, #32+16]
    // x14~x17 - x8~x11
    // x0 still holds the address of res_y from the previous call
    bl      ECP256_SubCore1             // res_y = res_y - s2

    // res_y is already in x14~x17, saving four loads (pipeline bubbles remain
    // to be optimized), so res_y is selected and stored directly.
    // copy y
    ldp     x4 , x5 , [x23, #32]        // in2_y, b->y
    ldp     x6 , x7 , [x23, #32+16]
    ldp     x8 , x9 , [x22, #32]        // in1_y, a->y
    ldp     x10, x11, [x22, #32+16]

    // res_y = in1infty == 0 ? res_y : in2_y
    cmp     x24, #0                     // x24 = ~in1infty
    csel    x14, x14, x4 , ne
    csel    x15, x15, x5 , ne
    csel    x16, x16, x6 , ne
    csel    x17, x17, x7 , ne
    // res_y = in2infty == 0 ? res_y : in1_y
    cmp     x25, #0                     // x25 = ~in2infty
    csel    x14, x14, x8 , ne
    csel    x15, x15, x9 , ne
    csel    x16, x16, x10, ne
    csel    x17, x17, x11, ne

    stp     x14, x15, [x21, #32]
    stp     x16, x17, [x21, #32+16]

    // copy x
    ldp     x4 , x5 , [x23]             // in2_x, b->x
    ldp     x6 , x7 , [x23, #16]
    ldp     x8 , x9 , [x22]             // in1_x, a->x
    ldp     x10, x11, [x22, #16]
    ldp     x14, x15, [sp , #32*9]      // res_x
    ldp     x16, x17, [sp , #32*9+16]

    // res_x = in1infty == 0 ? res_x : in2_x
    cmp     x24, #0                     // x24 = ~in1infty
    csel    x14, x14, x4 , ne
    csel    x15, x15, x5 , ne
    csel    x16, x16, x6 , ne
    csel    x17, x17, x7 , ne
    // res_x = in2infty == 0 ? res_x : in1_x
    cmp     x25, #0                     // x25 = ~in2infty
    csel    x14, x14, x8 , ne
    csel    x15, x15, x9 , ne
    csel    x16, x16, x10, ne
    csel    x17, x17, x11, ne

    stp     x14, x15, [x21]
    stp     x16, x17, [x21, #16]

    // copy z
    ldp     x4 , x5 , [x23, #64]        // in2_z, b->z
    ldp     x6 , x7 , [x23, #64+16]
    ldp     x8 , x9 , [x22, #64]        // in1_z, a->z
    ldp     x10, x11, [x22, #64+16]
    ldp     x14, x15, [sp , #32*11]     // res_z
    ldp     x16, x17, [sp , #32*11+16]

    // res_z = in1infty == 0 ? res_z : in2_z
    cmp     x24, #0                     // x24 = ~in1infty
    csel    x14, x14, x4 , ne
    csel    x15, x15, x5 , ne
    csel    x16, x16, x6 , ne
    csel    x17, x17, x7 , ne
    // res_z = in2infty == 0 ? res_z : in1_z
    cmp     x25, #0                     // x25 = ~in2infty
    csel    x14, x14, x8 , ne
    csel    x15, x15, x9 , ne
    csel    x16, x16, x10, ne
    csel    x17, x17, x11, ne

    stp     x14, x15, [x21, #64]
    stp     x16, x17, [x21, #64+16]

    add     sp , x29, #0                // free the temporary stack area
    ldp     x19, x20, [x29, #16]
    ldp     x21, x22, [x29, #32]
    ldp     x23, x24, [x29, #48]
    ldp     x25, x26, [x29, #64]
    ldp     x27, x28, [x29, #80]
    ldp     x29, x30, [sp], #96
AARCH64_AUTIASP
    ret
.size   ECP256_PointAdd,.-ECP256_PointAdd

# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
# Deal process:
#     delta = Z1^2
#     gamma = Y1^2
#     beta = X1*gamma
#     alpha = 3*(X1-delta)*(X1+delta)
#     X3 = alpha^2-8*beta
#     Z3 = (Y1+Z1)^2-gamma-delta
#     Y3 = alpha*(4*beta-X3)-8*gamma^2
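
/*
 * The chain implemented below, as a C-like sketch (fp_* are the same
 * hypothetical wrappers as in the earlier sketches; in the assembly the
 * "m = 3*m" step is done with register copies rather than a tmp slot).
 * Note Z3 = 2*Y1*Z1 equals the (Y1+Z1)^2 - gamma - delta of the reference.
 *
 *   fp_add(s, y1, y1);             // s     = 2*Y1
 *   fp_sqr(zsqr, z1);              // zsqr  = Z1^2                 (delta)
 *   fp_sqr(s, s);                  // s     = 4*Y1^2               (4*gamma)
 *   fp_add(m, x1, zsqr);           // m     = X1 + delta
 *   fp_sub(zsqr, x1, zsqr);        // zsqr  = X1 - delta
 *   fp_mul(res_z, z1, y1);
 *   fp_add(res_z, res_z, res_z);   // Z3    = 2*Y1*Z1
 *   fp_sqr(res_y, s);              // res_y = 16*Y1^4
 *   fp_div_by_2(res_y, res_y);     // res_y = 8*gamma^2
 *   fp_mul(m, m, zsqr);
 *   fp_add(tmp, m, m);
 *   fp_add(m, tmp, m);             // m     = 3*(X1+delta)*(X1-delta) (alpha)
 *   fp_mul(s, s, x1);              // s     = 4*X1*Y1^2            (4*beta)
 *   fp_add(tmp, s, s);             // tmp   = 8*beta
 *   fp_sqr(res_x, m);              // res_x = alpha^2
 *   fp_sub(res_x, res_x, tmp);     // X3    = alpha^2 - 8*beta
 *   fp_sub(s, s, res_x);           // s     = 4*beta - X3
 *   fp_mul(s, s, m);               // s     = alpha*(4*beta - X3)
 *   fp_sub(res_y, s, res_y);       // Y3    = alpha*(4*beta - X3) - 8*gamma^2
 */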
.globl  ECP256_PointDouble
.type   ECP256_PointDouble,%function
.align  5
ECP256_PointDouble:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-96]!       // A 96-byte frame even though only x19~x22 are saved
    add     x29, sp , #0                // here: ECP256_PointAdd branches to .Lpoint_double
                                        // after building a 96-byte frame, and the shared
                                        // epilogue below pops that same frame size.
    stp     x19, x20, [sp, #16]
    stp     x21, x22, [sp, #32]
    sub     sp , sp , #32*4

.Lpoint_double:
    ldp     x14, x15, [x1 , #32]        // a->y
    ldp     x16, x17, [x1 , #32+16]
    // Back up the input pointers r and a.
    mov     x21, x0
    mov     x22, x1
    adrp    x3, .Lpoly+8
    add     x3, x3, :lo12:.Lpoly+8
    ldr     x12, [x3]                   // p[1]
    ldr     x13, [x3, #16]              // p[3]

    mov     x8 , x14
    mov     x9 , x15
    ldp     x4 , x5 , [x22, #64]        // a->z, preloaded for the squaring below
    mov     x10, x16
    mov     x11, x17
    ldp     x6 , x7 , [x22, #64+16]
    add     x0 , sp , #0                // s = sp[0]
    bl      ECP256_AddCore              // s = 2 * a->y

    add     x0 , sp , #32               // zsqr = sp[32*1]
    bl      ECP256_SqrCore              // zsqr = (a->z)^2

    // x14~x17 now hold zsqr; reload s from the stack and square it
    ldp     x4 , x5 , [sp]
    add     x0 , sp , #0                // s = sp[0]
    ldp     x6 , x7 , [sp, #16]
    bl      ECP256_SqrCore              // s = s^2

    ldp     x14, x15, [sp, #32]         // zsqr
    ldp     x8 , x9 , [x22]             // a->x
    ldp     x16, x17, [sp, #32+16]
    add     x0 , sp , #64               // m = sp[32*2]
    ldp     x10, x11, [x22, #16]
    bl      ECP256_AddCore              // m = a->x + zsqr

    ldp     x14, x15, [sp, #32]         // zsqr
    ldp     x8 , x9 , [x22]             // a->x
    ldp     x16, x17, [sp, #32+16]
    add     x0 , sp , #32
    ldp     x10, x11, [x22, #16]
    bl      ECP256_SubCore2             // zsqr = a->x - zsqr

    ldp     x4 , x5 , [x22, #64]        // a->z
    ldp     x6 , x7 , [x22, #64+16]
    add     x0 , x21, #64               // res_z
    add     x2 , x22, #32               // a->y
    bl      ECP256_MulCore              // res_z = a->z * a->y

    // x14~x17 hold the res_z result
    mov     x8 , x14
    mov     x9 , x15
    ldp     x4 , x5 , [sp]              // s = sp[0], preloaded for the squaring below
    mov     x10, x16
    mov     x11, x17
    ldp     x6 , x7 , [sp, #16]
    add     x0 , x21, #64               // res_z
    bl      ECP256_AddCore              // res_z = 2 * res_z

    add     x0 , x21, #32               // x0 points to res_y, that is r->y
    bl      ECP256_SqrCore              // res_y = s^2

    add     x0 , x21, #32
    bl      ECP256_DivBy2Core           // res_y = res_y / 2

    ldp     x4 , x5 , [sp, #64]         // m = sp[64]
    ldp     x6 , x7 , [sp, #64+16]
    add     x2 , sp , #32               // zsqr = sp[32]
    add     x0 , sp , #64
    bl      ECP256_MulCore              // m = m * zsqr

    // x14~x17 hold the result of m; keep an extra copy in x4~x7
    mov     x8 , x14
    mov     x4 , x14
    mov     x9 , x15
    mov     x5 , x15
    mov     x10, x16
    mov     x6 , x16
    mov     x11, x17
    mov     x7 , x17
    add     x0 , sp , #64
    bl      ECP256_AddCore              // m = 2 * m
    mov     x8 , x4
    mov     x9 , x5
    mov     x10, x6
    mov     x11, x7
    bl      ECP256_AddCore              // m = 3 * m

    ldp     x4 , x5 , [sp]              // s = sp[0]
    add     x2 , x22, #0                // a->x
    ldp     x6 , x7 , [sp, #16]
    add     x0 , sp , #0                // s = sp[0]
    bl      ECP256_MulCore              // s = s * a->x

    // x14~x17 hold the s result; copy s to x8~x11 and preload m = sp[64]
    // into x4~x7 for the squaring below
    mov     x8 , x14
    mov     x9 , x15
    ldp     x4 , x5 , [sp, #64]
    mov     x10, x16
    mov     x11, x17
    ldp     x6 , x7 , [sp, #64+16]
    add     x0 , sp , #96               // tmp = sp[96]
    bl      ECP256_AddCore              // tmp = 2 * s

    // x4~x7 already hold m (preloaded above)
    add     x0 , x21, #0                // res_x = r->x
    bl      ECP256_SqrCore              // res_x = m^2

    // x14~x17 hold the res_x result
    ldp     x8 , x9 , [sp, #96]         // tmp = sp[96]
    ldp     x10, x11, [sp, #96+16]
    bl      ECP256_SubCore1             // res_x = res_x - tmp

    // x14~x17 hold the res_x result, so ECP256_SubCore2 is called
    ldp     x8 , x9 , [sp]              // s = sp[0]
    ldp     x10, x11, [sp, #16]
    add     x0 , sp , #0
    bl      ECP256_SubCore2             // s = s - res_x

    // x14~x17 hold the s result, and x0 still points to s
    mov     x4 , x14
    mov     x5 , x15
    mov     x6 , x16
    mov     x7 , x17
    add     x2 , sp , #64               // m = sp[64]
    bl      ECP256_MulCore              // s = s * m

    // x14~x17 hold the s result, so ECP256_SubCore1 is called
    add     x0 , x21, #32               // x0 points to res_y
    ldp     x8 , x9 , [x0]              // res_y
    ldp     x10, x11, [x0, #16]
    bl      ECP256_SubCore1             // res_y = s - res_y

    add     sp , x29, #0                // free the temporary stack area
    ldp     x19, x20, [x29, #16]
    ldp     x21, x22, [x29, #32]
    ldp     x29, x30, [sp], #96
AARCH64_AUTIASP
    ret
.size   ECP256_PointDouble,.-ECP256_PointDouble

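/*
 * The reduction inside ECP256_OrdMul below is ordinary word-wise Montgomery
 * reduction with n0' = .LordK; only the q*n products for n[2] and n[3] are
 * replaced by shift/subtract tricks. One round as a portable C sketch (u128
 * as in the earlier sketches, not assembled; t[] is the 6-limb accumulator):
 *
 *   static void ord_reduce_round(uint64_t t[6])
 *   {
 *       static const uint64_t N[4] = {         // .Lord
 *           0xf3b9cac2fc632551ULL, 0xbce6faada7179e84ULL,
 *           0xffffffffffffffffULL, 0xffffffff00000000ULL
 *       };
 *       uint64_t q = t[0] * 0xccd1c8aaee00bc4fULL;   // q = t[0] * ordK
 *       u128 c = (u128)q * N[0] + t[0];        // low limb becomes zero
 *       c >>= 64;
 *       for (int i = 1; i < 4; i++) {          // t = (t + q*n) >> 64
 *           c += (u128)q * N[i] + t[i];
 *           t[i - 1] = (uint64_t)c;
 *           c >>= 64;
 *       }
 *       c += t[4];
 *       t[3] = (uint64_t)c;
 *       t[4] = t[5] + (uint64_t)(c >> 64);
 *       t[5] = 0;
 *   }
 */
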
.globl  ECP256_OrdMul
.type   ECP256_OrdMul,%function
.align  4
ECP256_OrdMul:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-64]!
    add     x29, sp , #0
    stp     x19, x20, [sp, #16]
    stp     x21, x22, [sp, #32]
    stp     x23, x24, [sp, #48]

    adrp    x23, .Lord
    add     x23, x23, :lo12:.Lord       // x23 = &n
    ldp     x12, x13, [x23]             // n[0~1]
    ldp     x21, x22, [x23, #16]        // n[2~3]
    ldr     x23, [x23, #32]             // x23 = LordK

    ldp     x4 , x5 , [x1]              // a[0~3]
    ldp     x6 , x7 , [x1, #16]

    ldr     x3 , [x2]                   // x3 = b[0]

    mul     x14, x4 , x3                // a[0~3] * b[0]
    umulh   x8 , x4 , x3
    mul     x15, x5 , x3
    umulh   x9 , x5 , x3
    mul     x16, x6 , x3
    umulh   x10, x6 , x3
    mul     x17, x7 , x3
    umulh   x11, x7 , x3

    // a[0~3] * b[0] now sits in x14~x17, x19 (x20 = 0)
    adds    x15, x15, x8
    adcs    x16, x16, x9
    adcs    x17, x17, x10
    adc     x19, xzr, x11               // x19 = x11 + CF
    mov     x20, xzr

    // reduce
    // n[0] = 0xf3b9cac2fc632551
    // n[1] = 0xbce6faada7179e84
    // n[2] = 0xffffffffffffffff, lo(q*n[2]) = -q     , hi(q*n[2]) = q
    // n[3] = 0xffffffff00000000, lo(q*n[3]) = -lq<<32, hi(q*n[3]) = q-hq
    mul     x24, x14, x23               // x24 = lo(t[0]*ordK), the quotient q of
                                        // Montgomery modular multiplication

    lsl     x8 , x24, #32               // lq << 32
    lsr     x9 , x24, #32               // hq
    subs    x16, x16, x24               // x16 = x16 + lo(q*n[2]) = x16 - q
    sbcs    x17, x17, x8                // x17 = x17 + lo(q*n[3]) = x17 - lq<<32
    sbcs    x19, x19, x9                // x19 = x19 - hq; the limb should gain q-hq,
                                        // so subtract hq here and add q below
    sbc     x20, x20, xzr               // x20 = x20 - CF

    umulh   x9 , x12, x24               // hi(q*n[0])
    mul     x10, x13, x24               // lo(q*n[1])
    umulh   x11, x13, x24               // hi(q*n[1])

    // First combine the terms destined for the same accumulator limb.
    // t[0] + lo(q*n[0]) == 0 mod 2^64, and its carry out is (t[0] != 0).
    subs    xzr, x14, #1                // C = (x14 != 0)
    adcs    x10, x10, x9                // x10 = lo(q*n[1]) + hi(q*n[0]) + C
    adc     x11, x11, xzr               // x11 = hi(q*n[1]) + CF

    // S /= 2^64: fold the reduction terms while shifting down one limb
    adds    x14, x15, x10               // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs    x15, x16, x11               // x15 = x16 + hi(q*n[1])
    adcs    x16, x17, x24               // x16 = x17 + hi(q*n[2])
    adcs    x17, x19, x24               // x17 = x19 + q; hq was already subtracted
                                        // from x19 above, so x17 ends up + hi(q*n[3])
    adc     x19, x20, xzr               // x19 = x20 + CF

    ldr     x3 , [x2, #8]               // b[1]

    mul     x8 , x4 , x3
    mul     x9 , x5 , x3
    mul     x10, x6 , x3
    mul     x11, x7 , x3

    // add lo(a[0~3] * b[i])
    adds    x14, x14, x8
    adcs    x15, x15, x9
    adcs    x16, x16, x10
    adcs    x17, x17, x11
    adc     x19, x19, xzr

    umulh   x8 , x4 , x3
    umulh   x9 , x5 , x3
    umulh   x10, x6 , x3
    umulh   x11, x7 , x3
    // add hi(a[0~3] * b[i])
    adds    x15, x15, x8
    adcs    x16, x16, x9
    adcs    x17, x17, x10
    adcs    x19, x19, x11
    adc     x20, xzr, xzr

    // reduce
    mul     x24, x14, x23               // x24 = lo(t[0]*ordK), the quotient q of
                                        // Montgomery modular multiplication

    lsl     x8 , x24, #32
    lsr     x9 , x24, #32
    subs    x16, x16, x24
    sbcs    x17, x17, x8
    sbcs    x19, x19, x9
    sbc     x20, x20, xzr

    umulh   x9 , x12, x24               // hi(q*n[0])
    mul     x10, x13, x24               // lo(q*n[1])
    umulh   x11, x13, x24               // hi(q*n[1])

    subs    xzr, x14, #1                // C = (x14 != 0)
    adcs    x10, x10, x9
    adc     x11, x11, xzr

    // S /= 2^64: fold the reduction terms while shifting down one limb
    adds    x14, x15, x10               // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs    x15, x16, x11               // x15 = x16 + hi(q*n[1])
    adcs    x16, x17, x24               // x16 = x17 + hi(q*n[2])
    adcs    x17, x19, x24               // x17 = x19 + q; hq was already subtracted
                                        // from x19 above, so x17 ends up + hi(q*n[3])
    adc     x19, x20, xzr               // x19 = x20 + CF

    ldr     x3 , [x2, #16]              // b[2]

    mul     x8 , x4 , x3
    mul     x9 , x5 , x3
    mul     x10, x6 , x3
    mul     x11, x7 , x3

    // add lo(a[0~3] * b[i])
    adds    x14, x14, x8
    adcs    x15, x15, x9
    adcs    x16, x16, x10
    adcs    x17, x17, x11
    adc     x19, x19, xzr

    umulh   x8 , x4 , x3
    umulh   x9 , x5 , x3
    umulh   x10, x6 , x3
    umulh   x11, x7 , x3
    // add hi(a[0~3] * b[i])
    adds    x15, x15, x8
    adcs    x16, x16, x9
    adcs    x17, x17, x10
    adcs    x19, x19, x11
    adc     x20, xzr, xzr

    // reduce
    mul     x24, x14, x23               // x24 = lo(acc[0]*ordk), equivalent to q in Montgomery
                                        // modular multiplication

    lsl     x8 , x24, #32
    lsr     x9 , x24, #32
    subs    x16, x16, x24
    sbcs    x17, x17, x8
    sbcs    x19, x19, x9
    sbc     x20, x20, xzr

    umulh   x9 , x12, x24               // hi(q*n[0])
    mul     x10, x13, x24               // lo(q*n[1])
    umulh   x11, x13, x24               // hi(q*n[1])

    subs    xzr, x14, #1                // CF = (x14 != 0)
    adcs    x10, x10, x9
    adc     x11, x11, xzr

    // S /= r, accumulation of staggered blocks
    adds    x14, x15, x10               // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs    x15, x16, x11               // x15 = x16 + hi(q*n[1])
    adcs    x16, x17, x24               // x16 = x17 + hi(q*n[2])
    adcs    x17, x19, x24               // x17 = x19 + q; hq was subtracted above, so this
                                        // completes hi(q*n[3]) = q - hq
    adc     x19, x20, xzr               // x19 = x20 + CF

    ldr     x3 , [x2, #24]              // b[3]

    mul     x8 , x4 , x3
    mul     x9 , x5 , x3
    mul     x10, x6 , x3
    mul     x11, x7 , x3

    // add lo(a[0~3] * b[i])
    adds    x14, x14, x8
    adcs    x15, x15, x9
    adcs    x16, x16, x10
    adcs    x17, x17, x11
    adc     x19, x19, xzr

    umulh   x8 , x4 , x3
    umulh   x9 , x5 , x3
    umulh   x10, x6 , x3
    umulh   x11, x7 , x3
    // add hi(a[0~3] * b[i])
    adds    x15, x15, x8
    adcs    x16, x16, x9
    adcs    x17, x17, x10
    adcs    x19, x19, x11
    adc     x20, xzr, xzr

    // reduce
    mul     x24, x14, x23               // x24 = lo(acc[0]*ordk), equivalent to q in Montgomery
                                        // modular multiplication

    lsl     x8 , x24, #32
    lsr     x9 , x24, #32
    subs    x16, x16, x24
    sbcs    x17, x17, x8
    sbcs    x19, x19, x9
    sbc     x20, x20, xzr

    umulh   x9 , x12, x24               // hi(q*n[0])
    mul     x10, x13, x24               // lo(q*n[1])
    umulh   x11, x13, x24               // hi(q*n[1])

    subs    xzr, x14, #1                // CF = (x14 != 0)
    adcs    x10, x10, x9
    adc     x11, x11, xzr

    // S /= r, accumulation of staggered blocks
    adds    x14, x15, x10               // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs    x15, x16, x11               // x15 = x16 + hi(q*n[1])
    adcs    x16, x17, x24               // x16 = x17 + hi(q*n[2])
    adcs    x17, x19, x24               // x17 = x19 + q; hq was subtracted above, completing hi(q*n[3]) = q - hq
    adc     x19, x20, xzr               // x19 = x20 + CF

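    // Final conditional subtraction: the reduced result fits in 256 bits
    // plus the carry held in x19, so subtracting n at most once (selected
    // in constant time by csel below) brings it into [0, n).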
    // At this point x14~x17 and x19 hold the accumulated result.
    // x8~x11 = r - n
    subs    x8 , x14, x12
    sbcs    x9 , x15, x13
    sbcs    x10, x16, x21
    sbcs    x11, x17, x22
    sbcs    xzr, x19, xzr               // C indicates whether r - n borrows

    // r = r-n < 0 ? r : r-n
    csel    x14, x14, x8 , lo
    csel    x15, x15, x9 , lo
    csel    x16, x16, x10, lo
    csel    x17, x17, x11, lo

    stp     x14, x15, [x0]              // Write the result to r
    stp     x16, x17, [x0, #16]

    ldp     x19, x20, [sp, #16]
    ldp     x21, x22, [sp, #32]
    ldp     x23, x24, [sp, #48]
    ldr     x29, [sp], #64
AARCH64_AUTIASP
    ret
.size   ECP256_OrdMul,.-ECP256_OrdMul

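// Register usage inferred from the code below: x0 = r[4] (output),
// x1 = a[4] (input, Montgomery domain), x2 = rep, the number of
// consecutive squarings; computes r = a^(2^rep) mod n.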
.globl  ECP256_OrdSqr
.type   ECP256_OrdSqr,%function
.align  4
ECP256_OrdSqr:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-64]!
    add     x29, sp , #0
    stp     x19, x20, [sp, #16]
    stp     x21, x22, [sp, #32]
    stp     x23, x24, [sp, #48]

    adrp    x23, .Lord
    add     x23, x23, :lo12:.Lord       // x23 = &n
    ldp     x12, x13, [x23]             // n[0~3]
    ldp     x21, x22, [x23, #16]
    ldr     x23, [x23, #32]             // x23 = LordK

    ldp     x4 , x5 , [x1]              // x4~x7 = a[0~3]
    ldp     x6 , x7 , [x1, #16]

.align  4
.Lord_sqr:
    sub     x2 , x2 , #1
/******************************************
    x7   x1   x20  x19  x17  x16  x15  x14
                             h0*1 l0*1
                        h0*2 l0*2
                   h0*3 l0*3
                   h1*2 l1*2
              h1*3 l1*3
         h2*3 l2*3
    h3*3 l3*3 h2*2 l2*2 h1*1 l1*1 h0*0 l0*0
*******************************************/
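/*
 * Schoolbook squaring, as sketched in the diagram above:
 * a^2 = sum(a[i]*a[j]*2^(64*(i+j))), and every cross term with i != j
 * appears twice. So the code computes the six cross products a[i]*a[j]
 * (i < j) once, doubles the whole partial sum (the "acc[1~6] << 1" step
 * below), and then adds the four diagonal squares a[i]^2 * 2^(128*i).
 */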

    // a[1~3] * a[0]
    mul     x15, x5 , x4            // lo(a[1] * a[0])
    umulh   x8 , x5 , x4            // hi(a[1] * a[0])
    mul     x16, x6 , x4            // lo(a[2] * a[0])
    umulh   x9 , x6 , x4            // hi(a[2] * a[0])
    mul     x17, x7 , x4            // lo(a[3] * a[0])
    umulh   x19, x7 , x4            // hi(a[3] * a[0])

    adds    x16, x16, x8
    adcs    x17, x17, x9
    adc     x19, x19, xzr           // No further carry is possible

    // a[2~3] * a[1]
    mul     x8 , x6 , x5            // lo(a[2] * a[1])
    umulh   x9 , x6 , x5            // hi(a[2] * a[1])
    mul     x10, x7 , x5            // lo(a[3] * a[1])
    umulh   x11, x7 , x5            // hi(a[3] * a[1])

    // a[3] * a[2]
    mul     x20, x7 , x6
    umulh   x1 , x7 , x6

    // Combine the partial products of this round, then fold them into the
    // accumulator registers shown in the diagram above.
    adds    x9 , x10, x9
    adc     x10, x11, xzr

    adds    x17, x17, x8
    adcs    x19, x19, x9
    adcs    x20, x20, x10
    adc     x1 , x1 , xzr           // a[0~3] are still held in x4~x7

    // a[0~3] ^ 2
    mul     x14, x4 , x4
    umulh   x4 , x4 , x4
    mul     x8 , x5 , x5
    umulh   x5 , x5 , x5
    mul     x9 , x6 , x6
    umulh   x6 , x6 , x6
    mul     x10, x7 , x7
    umulh   x7 , x7 , x7

    // acc[1~6] << 1
    adds    x15, x15, x15
    adcs    x16, x16, x16
    adcs    x17, x17, x17
    adcs    x19, x19, x19
    adcs    x20, x20, x20
    adcs    x1 , x1 , x1
    adc     x3 , xzr, xzr

    // acc[] += a[] ^ 2
    adds    x15, x15, x4
    adcs    x16, x16, x8
    adcs    x17, x17, x5
    adcs    x19, x19, x9
    adcs    x20, x20, x6
    adcs    x1 , x1 , x10
    adc     x7 , x7 , x3

    // Four rounds of reduce
    mul     x24, x14, x23               // x24 = lo(acc[0]*ordk), equivalent to q in Montgomery
                                        // modular multiplication

    umulh   x9 , x12, x24               // hi(q*n[0])
    mul     x10, x13, x24               // lo(q*n[1])
    umulh   x11, x13, x24               // hi(q*n[1])

    // First combine the terms that are added to the same accumulator word.
    subs    xzr, x14, #1                // CF = (x14 != 0), the carry out of x14 + lo(q*n[0])
    adcs    x10, x10, x9                // x10 = hi(q*n[0]) + lo(q*n[1])
    adc     x11, x11, xzr               // x11 = hi(q*n[1]) + CF

    // S /= r, accumulation of staggered blocks
    adds    x14, x15, x10               // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs    x15, x16, x11               // x15 = x16 + hi(q*n[1])
    adcs    x16, x17, x24               // x16 = x17 + hi(q*n[2])
    adc     x17, xzr, x24               // x17 = q + CF; hi(q*n[3]) = q - hq, hq is subtracted below

    lsl     x8 , x24, #32               // lq << 32
    lsr     x9 , x24, #32               // hq

    subs    x15, x15, x24               // x15 = x15 + lo(q*n[2]) = x15 - q
    sbcs    x16, x16, x8                // x16 = x16 + lo(q*n[3]) = x16 - lq<<32
    sbc     x17, x17, x9                // x17 = x17 - hq (remaining part of hi(q*n[3]))

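    // Rounds 2~4 repeat the identical reduction; after four rounds the
    // accumulator has been divided by 2^256, which is the R^-1 factor of
    // the Montgomery representation.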
    // Round 2 reduce
    mul     x24, x14, x23               // x24 = lo(acc[0]*ordk), equivalent to q in Montgomery
                                        // modular multiplication

    umulh   x9 , x12, x24               // hi(q*n[0])
    mul     x10, x13, x24               // lo(q*n[1])
    umulh   x11, x13, x24               // hi(q*n[1])

    // First combine the terms that are added to the same accumulator word.
    subs    xzr, x14, #1                // CF = (x14 != 0)
    adcs    x10, x10, x9                // x10 = hi(q*n[0]) + lo(q*n[1])
    adc     x11, x11, xzr               // x11 = hi(q*n[1]) + CF

    // S /= r, accumulation of staggered blocks
    adds    x14, x15, x10               // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs    x15, x16, x11               // x15 = x16 + hi(q*n[1])
    adcs    x16, x17, x24               // x16 = x17 + hi(q*n[2])
    adc     x17, xzr, x24               // x17 = q + CF; hi(q*n[3]) = q - hq, hq is subtracted below

    lsl     x8 , x24, #32               // lq << 32
    lsr     x9 , x24, #32               // hq

    subs    x15, x15, x24               // x15 = x15 + lo(q*n[2]) = x15 - q
    sbcs    x16, x16, x8                // x16 = x16 + lo(q*n[3]) = x16 - lq<<32
    sbc     x17, x17, x9                // x17 = x17 - hq (remaining part of hi(q*n[3]))

    // Round 3 reduce
    mul     x24, x14, x23               // x24 = lo(acc[0]*ordk), equivalent to q in Montgomery
                                        // modular multiplication

    umulh   x9 , x12, x24               // hi(q*n[0])
    mul     x10, x13, x24               // lo(q*n[1])
    umulh   x11, x13, x24               // hi(q*n[1])

    // First combine the terms that are added to the same accumulator word.
    subs    xzr, x14, #1                // CF = (x14 != 0)
    adcs    x10, x10, x9                // x10 = hi(q*n[0]) + lo(q*n[1])
    adc     x11, x11, xzr               // x11 = hi(q*n[1]) + CF

    // S /= r, accumulation of staggered blocks
    adds    x14, x15, x10               // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs    x15, x16, x11               // x15 = x16 + hi(q*n[1])
    adcs    x16, x17, x24               // x16 = x17 + hi(q*n[2])
    adc     x17, xzr, x24               // x17 = q + CF; hi(q*n[3]) = q - hq, hq is subtracted below

    lsl     x8 , x24, #32               // lq << 32
    lsr     x9 , x24, #32               // hq

    subs    x15, x15, x24               // x15 = x15 + lo(q*n[2]) = x15 - q
    sbcs    x16, x16, x8                // x16 = x16 + lo(q*n[3]) = x16 - lq<<32
    sbc     x17, x17, x9                // x17 = x17 - hq (remaining part of hi(q*n[3]))

    // Round 4 reduce
    mul     x24, x14, x23               // x24 = lo(acc[0]*ordk), equivalent to q in Montgomery
                                        // modular multiplication

    umulh   x9 , x12, x24               // hi(q*n[0])
    mul     x10, x13, x24               // lo(q*n[1])
    umulh   x11, x13, x24               // hi(q*n[1])

    // First combine the terms that are added to the same accumulator word.
    subs    xzr, x14, #1                // CF = (x14 != 0)
    adcs    x10, x10, x9                // x10 = hi(q*n[0]) + lo(q*n[1])
    adc     x11, x11, xzr               // x11 = hi(q*n[1]) + CF

    // S /= r, accumulation of staggered blocks
    adds    x14, x15, x10               // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs    x15, x16, x11               // x15 = x16 + hi(q*n[1])
    adcs    x16, x17, x24               // x16 = x17 + hi(q*n[2])
    adc     x17, xzr, x24               // x17 = q + CF; hi(q*n[3]) = q - hq, hq is subtracted below

    lsl     x8 , x24, #32               // lq << 32
    lsr     x9 , x24, #32               // hq

    subs    x15, x15, x24               // x15 = x15 + lo(q*n[2]) = x15 - q
    sbcs    x16, x16, x8                // x16 = x16 + lo(q*n[3]) = x16 - lq<<32
    sbc     x17, x17, x9                // x17 = x17 - hq (remaining part of hi(q*n[3]))

    // add acc[4~7]; x14~x17 plus x19 now hold the current result r
    adds    x14, x14, x19
    adcs    x15, x15, x20
    adcs    x16, x16, x1
    adcs    x17, x17, x7
    adc     x19, xzr, xzr

    // r -= n
    subs    x8 , x14, x12
    sbcs    x9 , x15, x13
    sbcs    x10, x16, x21
    sbcs    x11, x17, x22
    sbcs    xzr, x19, xzr           // C indicates whether r - n borrows

    // r = r-n < 0 ? r : r-n
    // Keep the result in x4~x7 so the squaring can run again in the loop.
    csel    x4 , x14, x8 , lo
    csel    x5 , x15, x9 , lo
    csel    x6 , x16, x10, lo
    csel    x7 , x17, x11, lo

    cbnz    x2 , .Lord_sqr          // Loop x2 times in total, i.e. compute a^(2^rep)

    stp     x4 , x5 , [x0]
    stp     x6 , x7 , [x0, #16]

    ldp     x19, x20, [sp, #16]
    ldp     x21, x22, [sp, #32]
    ldp     x23, x24, [sp, #48]
    ldr     x29, [sp], #64
AARCH64_AUTIASP
    ret
.size   ECP256_OrdSqr,.-ECP256_OrdSqr

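// Register usage inferred from the code below: x0 = table base, x1 = source
// point (x, y, z as three 4-limb values), x2 = 1-based table index. Each
// limb is stored at a 128-byte stride (16 entries * 8 bytes), so one point
// is spread column-wise across the whole table.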
.globl  ECP256_Scatterw5
.type   ECP256_Scatterw5,%function
.align  4
ECP256_Scatterw5:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-16]!
    add     x29, sp , #0

    add     x0 , x0 , x2 , lsl#3    // Needs x0 = x0 + (x2-1)*8 = x0 + x2*8 - 8;
                                    // the -8 is folded into the str offsets.

    ldp     x4 , x5 , [x1]          // x
    ldp     x6 , x7 , [x1, #16]

    str     x4 , [x0, #128*0-8]     // 8*16*0
    str     x5 , [x0, #128*1-8]     // 8*16*1
    str     x6 , [x0, #128*2-8]     // 8*16*2
    str     x7 , [x0, #128*3-8]     // 8*16*3

    add     x0 , x0 , #64*8

    ldp     x4 , x5 , [x1, #32]     // y
    ldp     x6 , x7 , [x1, #32+16]

    str     x4 , [x0, #128*0-8]     // 8*16*0
    str     x5 , [x0, #128*1-8]     // 8*16*1
    str     x6 , [x0, #128*2-8]     // 8*16*2
    str     x7 , [x0, #128*3-8]     // 8*16*3

    add     x0 , x0 , #64*8

    ldp     x4 , x5 , [x1, #64]     // z
    ldp     x6 , x7 , [x1, #64+16]

    str     x4 , [x0, #128*0-8]     // 8*16*0
    str     x5 , [x0, #128*1-8]     // 8*16*1
    str     x6 , [x0, #128*2-8]     // 8*16*2
    str     x7 , [x0, #128*3-8]     // 8*16*3

    ldr     x29, [sp], #16
AARCH64_AUTIASP
    ret
.size   ECP256_Scatterw5,.-ECP256_Scatterw5

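// Register usage inferred from the code below: x0 = destination point
// (x, y, z), x1 = table base, x2 = 1-based index; x2 == 0 yields the
// all-zero point (the loads still execute and csel discards them).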
.globl  ECP256_Gatherw5
.type   ECP256_Gatherw5,%function
.align  4
ECP256_Gatherw5:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-16]!
    add     x29, sp , #0

    cmp     x2 , xzr
    csetm   x3 , ne                 // x3 = (x2 == 0) ? 0 : -1
    add     x2 , x2 , x3            // x2 = (x2 == 0) ? 0 : x2 - 1

    add     x1 , x1 , x2 , lsl#3    // x1 += x2*8, offset

    ldr     x4 , [x1, #128*0]
    ldr     x5 , [x1, #128*1]
    ldr     x6 , [x1, #128*2]
    ldr     x7 , [x1, #128*3]

    csel    x4 , x4 , xzr, ne       // If x2 == 0, return 0; otherwise return the loaded coordinate
    csel    x5 , x5 , xzr, ne
    csel    x6 , x6 , xzr, ne
    csel    x7 , x7 , xzr, ne

    stp     x4 , x5 , [x0]          // r->x
    stp     x6 , x7 , [x0, #16]

    add     x1 , x1 , #64*8

    ldr     x4 , [x1, #128*0]
    ldr     x5 , [x1, #128*1]
    ldr     x6 , [x1, #128*2]
    ldr     x7 , [x1, #128*3]

    csel    x4 , x4 , xzr, ne       // If x2 == 0, return 0; otherwise return the loaded coordinate
    csel    x5 , x5 , xzr, ne
    csel    x6 , x6 , xzr, ne
    csel    x7 , x7 , xzr, ne

    stp     x4 , x5 , [x0, #32]     // r->y
    stp     x6 , x7 , [x0, #48]

    add     x1 , x1 , #64*8

    ldr     x4 , [x1, #128*0]
    ldr     x5 , [x1, #128*1]
    ldr     x6 , [x1, #128*2]
    ldr     x7 , [x1, #128*3]

    csel    x4 , x4 , xzr, ne       // If x2 == 0, return 0; otherwise return the loaded coordinate
    csel    x5 , x5 , xzr, ne
    csel    x6 , x6 , xzr, ne
    csel    x7 , x7 , xzr, ne

    stp     x4 , x5 , [x0, #64]     // r->z
    stp     x6 , x7 , [x0, #80]

    ldr     x29, [sp], #16
AARCH64_AUTIASP
    ret
.size   ECP256_Gatherw5,.-ECP256_Gatherw5

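// Register usage inferred from the code below: x0 = table base, x1 = source
// point (x and y, 64 bytes), x2 = index; the point is stored one byte per
// 64-byte column (see the layout note inside the loop setup).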
.globl  ECP256_Scatterw7
.type   ECP256_Scatterw7,%function
.align  4
ECP256_Scatterw7:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-16]!
    add     x29, sp , #0

    // add     x0 , x0 , x2
    sub     x2 , x2 , #63           // x2 = x2 - 63
    sub     x0 , x0 , x2            // x0 = x0 - (x2(original) - 63) = x0 + (63 - x2(original))
    mov     x2 , #8                 // Loop count
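    // Byte-sliced layout: byte j of the 64-byte point is written to
    // base + column + j*64, where "column" (0~63) is derived from the
    // index. Every table entry therefore shares the same cache lines,
    // so a matching gather's memory footprint does not depend on the index.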
.Lscatter_w7:
    ldr     x3 , [x1], #8
    subs    x2 , x2 , #1

    strb    w3 , [x0, #64*0]
    lsr     x3 , x3 , #8
    strb    w3 , [x0, #64*1]
    lsr     x3 , x3 , #8
    strb    w3 , [x0, #64*2]
    lsr     x3 , x3 , #8
    strb    w3 , [x0, #64*3]
    lsr     x3 , x3 , #8
    strb    w3 , [x0, #64*4]
    lsr     x3 , x3 , #8
    strb    w3 , [x0, #64*5]
    lsr     x3 , x3 , #8
    strb    w3 , [x0, #64*6]
    lsr     x3 , x3 , #8
    strb    w3 , [x0, #64*7]
    add     x0 , x0 , #64*8
    b.ne    .Lscatter_w7

    ldr     x29, [sp], #16
AARCH64_AUTIASP
    ret
.size   ECP256_Scatterw7,.-ECP256_Scatterw7

// Cache-timing hardened gather; intended to be used together with ECP256_Scatterw7.
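// Register usage inferred from the code below: x0 = destination point
// (x and y, 64 bytes), x1 = table base, x2 = index; x2 == 0 yields an
// all-zero point (the loads still run, x3 masks the result to zero).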
.globl  ECP256_Gatherw7
.type   ECP256_Gatherw7,%function
.align  4
ECP256_Gatherw7:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-16]!
    add     x29, sp , #0

    cmp     x2 , xzr
    csetm   x3 , ne               // x3 = (x2 == 0) ? 0 : -1
    add     x2 , x2 , x3          // x2 = (x2 == 0) ? 0 : (x2 - 1) (offset)
    sub     x2 , x2 , #63         // x2 = x2 - 63
    sub     x1 , x1 , x2          // x1 = x1 - (x2(original) - 63) = x1 + (63 - x2(original))
    mov     x2 , #8               // Loop count: x and y contain 64 bytes in total and
                                  // 8 bytes are assembled per iteration, so 8 iterations.
    nop
.Lgather_w7:

    ldrb    w4 , [x1, #64*0]
    ldrb    w5 , [x1, #64*1]
    ldrb    w6 , [x1, #64*2]
    ldrb    w7 , [x1, #64*3]
    ldrb    w8 , [x1, #64*4]
    ldrb    w9 , [x1, #64*5]
    ldrb    w10, [x1, #64*6]
    ldrb    w11, [x1, #64*7]

    // x4 = x4 | x5 << 8
    // x6 = x6 | x7 << 8
    // x4 = x4 | x6 << 16
    // x4 = [x7][x6][x5][x4]
    orr     x4 , x4 , x5 , lsl#8
    orr     x6 , x6 , x7 , lsl#8
    orr     x4 , x4 , x6 , lsl#16

    // x8  = x8  | x9 << 8
    // x10 = x10 | x11 << 8
    // x8  = x8  | x10 << 16
    // x8  = [x11][x10][x9][x8]
    orr     x8 , x8 , x9 , lsl#8
    orr     x10, x10, x11, lsl#8
    orr     x8 , x8 , x10, lsl#16

    // x4 = x4 | x8 << 32
    // x4 = [x11][x10][x9][x8][x7][x6][x5][x4]
    orr     x4 , x4 , x8 , lsl#32

    and     x4 , x4 , x3          // Mask: zero the limb if x2 was 0
    str     x4 , [x0], #8

    add     x1 , x1 , #64*8
    subs    x2 , x2 , #1
    b.ne    .Lgather_w7

    ldr     x29, [sp], #16
AARCH64_AUTIASP
    ret
.size   ECP256_Gatherw7,.-ECP256_Gatherw7

#endif
