• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
15
/*
 * Description: Big number Montgomery modular multiplication, armv8 implementation, MontMul_Asm
 * Ref: Montgomery Multiplication
 * Process: to calculate A * B mod n, we convert to Montgomery form and calculate A*B*R^(-1).
 * Detail:
 * input: A = (An-1,...,A1,A0)b, B = (Bn-1,...,B1,B0)b, n, n'
 * output: A*B*R^(-1)
 * tmp = (tn,tn-1,...,t1,t0)b, initialized to 0
 * for i: 0 -> (n-1)
 *     ui = (t0 + Ai*B0)n' mod b
 *     t = (t + Ai*B + ui * n) / b
 * if t >= n
 *     t -= n
 * return t;
 *
 * Dispatch process:
 *   i.    size % 8 == 0 && a == b --> Sqr8x --> complete multiplication
 *                                                --> size == 8, goto single reduce step
 *                                                --> size >= 8, goto loop reduce process
 *   ii.   size % 4 == 0 --> Mul4x
 *                           --> size == 4, goto single step
 *                           --> size >= 4, goto loop process
 *   iii.  Ordinary --> Mul1x
 *                       --> size == 2, goto single step
 *                       --> size >= 2, goto loop process
 *
 */
43
#include "hitls_build.h"
#ifdef HITLS_CRYPTO_BN

#include "crypt_arm.h"

.arch   armv8-a+crypto
.file   "bn_mont_armv8.S"
.text
52
.global MontMul_Asm
.type   MontMul_Asm, %function
.align  5
/*
 * MontMul_Asm: Montgomery modular multiplication, r = a * b * R^(-1) mod n.
 *
 * In (consistent with usage below — confirm against the C caller):
 *   x0 = r     result buffer, 'size' 64-bit words
 *   x1 = a     first operand, 'size' words
 *   x2 = b     second operand, 'size' words
 *   x3 = n     modulus, 'size' words
 *   x4 = k0    Montgomery constant n' (so that lo(t + n*lo(t*k0)) == 0 mod 2^64)
 *   x5 = size  word count
 *
 * Dispatch: size % 8 == 0 -> MontSqr8 (which itself falls back when a != b);
 *           size % 4 == 0 -> MontMul4; otherwise the generic Mul1x path below.
 */
MontMul_Asm:
AARCH64_PACIASP
    tst     x5, #7                  // size % 8 == 0 ?
    b.eq    MontSqr8                // 8x squaring path (re-checks a == b itself)
    tst     x5, #3                  // size % 4 == 0 ?
    b.eq    MontMul4

    // Generic (Mul1x) path: build a frame, save callee-saved x19-x24.
    stp     x29, x30, [sp, #-64]!
    mov     x29, sp
    stp     x23, x24, [sp, #16]
    stp     x21, x22, [sp, #32]
    stp     x19, x20, [sp, #48]

    sub     x21, x5 , #2            // j = size - 2
    cbnz    x21,.LMul1xBegin        // size > 2: use the looped version

    // size == 2: fully unrolled single step, no stack scratch needed.
    ldp     x15, x16, [x1]          // a[0], a[1]
    ldp     x19, x20, [x2]          // b[0], b[1]
    ldp     x9, x10, [x3]           // n[0], n[1]

    mul     x23 , x15 , x19         // x23 = lo(a[0] * b[0])
    umulh   x24 , x15 , x19         // x24 = hi(a[0] * b[0])
    mul     x6, x16 , x19           // x6  = lo(a[1] * b[0])
    umulh   x7, x16 , x19           // x7  = hi(a[1] * b[0])

    mul     x11, x23 , x4           // x11 = u0 = lo(t[0] * k0)
    umulh   x19, x9, x11            // x19 = hi(n[0] * u0)
    mul     x12, x10, x11           // x12 = lo(n[1] * u0)
    umulh   x13, x10, x11           // x13 = hi(n[1] * u0)

    // By construction of k0: lo(a[0]*b[0]) + lo(n[0]*u0) == 0 (mod 2^64).
    // So if lo(a[0]*b[0]) > 0 that (never materialized) addition carries out,
    // and if lo(a[0]*b[0]) == 0 then lo(n[0]*u0) == 0 and it does not.
    // 'cmp x23, #1' sets C exactly when x23 != 0, i.e. exactly when it carries.
    cmp     x23, #1
    adc     x19, x19, xzr

    adds    x23 , x6, x24           // x23 = lo(a[1]*b[0]) + hi(a[0]*b[0])
    adc     x24 , x7, xzr           // x24 = hi(a[1]*b[0]) + CF

    adds    x8, x12, x19            // x8  = lo(n[1]*u0) + hi(n[0]*u0)
    adc     x19, x13, xzr           // x19 = hi(n[1]*u0) + CF

    adds    x21, x8, x23            // t[0] after the implicit divide-by-b word shift
    adcs    x22, x19, x24           // t[1]
    adc     x23, xzr, xzr           // x23 = CF (running top carry word)

    // Second (last) outer iteration: accumulate a * b[1], then reduce once more.
    mul     x14 , x15 , x20         // a[0] * b[1]
    umulh   x15 , x15 , x20
    mul     x6, x16 , x20           // a[1] * b[1]
    umulh   x7, x16 , x20
    adds    x14 , x14 , x21         // t[0] += lo(a[0]*b[1])
    adc     x15 , x15 , xzr

    mul     x11, x14 , x4           // u1 = lo(t[0] * k0)

    umulh   x20, x9, x11            // hi(n[0] * u1)
    mul     x12, x10, x11           // lo(n[1] * u1)
    umulh   x13, x10, x11           // hi(n[1] * u1)
    cmp     x14 , #1                // same trick: carry out of t[0] + lo(n[0]*u1) == 0 mod 2^64
    adc     x20, x20, xzr
    adds    x14 , x6, x15           // lo(a[1]*b[1]) + hi(a[0]*b[1])
    adc     x15 , x7, xzr

    adds    x21, x12, x20
    adcs    x20, x13, x23           // fold in previous top carry word
    adc     x23, xzr, xzr

    adds    x14 , x14 , x22         // fold in previous t[1]
    adc     x15 , x15 , xzr

    adds    x21, x21, x14           // final t[0]
    adcs    x20, x20, x15           // final t[1]
    adc     x23, x23, xzr           // x23 += CF: final top carry word

    // Conditional subtraction: r = (t >= n) ? t - n : t, branch-free.
    subs    x12 , x21, x9
    sbcs    x13, x20, x10
    sbcs    x23, x23, xzr           // borrow out (lo) <=> t < n

    csel    x10, x21, x12, lo       // lo (borrow): keep t, else keep t - n
    csel    x11, x20, x13, lo
    stp     x10, x11, [x0]
    b       .LMul1xEnd

.LMul1xBegin:
    mov     x24, x5                 // outer-loop counter i = size (one pass per b word)
    lsl     x5 , x5 , #3            // x5 = size in bytes from here on
    sub     x22, sp , x5            // reserve 'size' words of scratch t[] on the stack
    and     x22, x22, #-16          // keep sp 16-byte aligned (AAPCS64 requirement)
    mov     sp , x22                // commit the allocation
    mov     x6, x5
    mov     x23, xzr                // x23 = running top carry word t[size]

.LMul1xInitstack:                   // zero the t[] scratch area
    sub     x6, x6, #8
    str     xzr, [x22], #8
    cbnz    x6, .LMul1xInitstack
    mov     x22, sp

.LMul1xLoopProces:                  // one outer pass: t = (t + a*b[i] + ui*n) / b
    sub     x24, x24, #1
    sub     x21, x5 , #16           // inner count: (size - 2) words (x5 is bytes)

    // Head of the pass: words 0 and 1.
    ldr     x17, [x2], #8           // b[i]
    ldp     x15, x16, [x1], #16     // a[0], a[1]
    ldp     x9, x10, [x3], #16      // n[0], n[1]
    ldr     x19, [x22]              // t[0] (zero on the first pass)

    mul     x14 , x15 , x17         // a[0] * b[i]
    umulh   x15 , x15 , x17
    mul     x6, x16 , x17           // a[1] * b[i]
    umulh   x7, x16 , x17
    adds    x14 , x14 , x19         // t[0] += lo(a[0]*b[i])
    adc     x15 , x15 , xzr

    mul     x11, x14 , x4           // ui = lo(t[0] * k0)
    umulh   x9, x9, x11             // hi(n[0] * ui)
    mul     x12, x10, x11           // lo(n[1] * ui)
    cmp     x14, #1                 // C = carry of t[0] + lo(n[0]*ui) (sum == 0 mod 2^64)
    umulh   x13, x10, x11           // hi(n[1] * ui)

.LMul1xPrepare:                     // inner loop over words 2 .. size-1
    sub     x21, x21, #8            // index -= 1 word
    ldr     x16, [x1], #8           // a[j]
    ldr     x10, [x3], #8           // n[j]
    ldr     x19, [x22, #8]          // t[j]

    adc     x9, x9, xzr             // absorb pending carry into the n-product column
    adds    x14 , x6, x15
    adc     x15 , x7, xzr

    adds    x8, x12, x9
    adc     x9, x13, xzr

    mul     x6, x16, x17            // a[j] * b[i] (consumed next iteration)
    adds    x14, x14 , x19          // += t[j]
    umulh   x7, x16 , x17
    adc     x15, x15 , xzr

    mul     x12, x10, x11           // n[j] * ui (consumed next iteration)
    adds    x8, x8, x14
    umulh   x13, x10, x11
    str     x8, [x22], #8           // store at t[j-1]: the implicit /b word shift
    cbnz    x21, .LMul1xPrepare     // cbnz preserves flags for the adc above

.LMul1xReduce:                      // tail: fold the last two words and the top carry
    ldr     x19, [x22, #8]
    adc     x9, x9, xzr
    adds    x14 , x6, x15
    adc     x15 , x7, xzr

    adds    x8, x12, x9
    adcs    x9, x13, x23            // add previous top carry word
    adc     x23, xzr, xzr

    adds    x14 , x14 , x19
    adc     x15 , x15 , xzr

    adds    x8, x8, x14
    adcs    x9, x9, x15
    adc     x23, x23, xzr           // x23 = new top carry word t[size]
    stp     x8, x9, [x22], #8

    mov     x22, sp
    sub     x1 , x1 , x5            // rewind a to &a[0]
    subs    x3 , x3 , x5            // rewind n to &n[0]; as x3 >= x5 this also sets C=1,
                                    // priming the first sbcs in .LMul1xSubMod (no borrow in)
    cbnz    x24, .LMul1xLoopProces

    mov     x1, x0
    mov     x21, x5                 // byte count for the subtraction loop

.LMul1xSubMod:                      // r = t - n (C primed by the subs above)
    ldr     x19, [x22], #8
    ldr     x10, [x3], #8
    sub     x21, x21, #8            // j--
    sbcs    x16, x19, x10           // t[j] - n[j] - borrow
    str     x16, [x1], #8           // r[j] = t[j] - n[j]
    cbnz    x21,.LMul1xSubMod

    sbcs    x23, x23, xzr           // subtract final borrow from top carry word
    mov     x22, sp
.LMul1xCopy:                        // select t or t-n per word; every op in this loop
                                    // (ldr/str/sub/csel/cbnz) preserves the borrow flag
    ldr     x19, [x22], #8
    ldr     x16, [x0]               // r[j] already holds t[j] - n[j]
    sub     x5, x5, #8              // size--
    csel    x10, x19, x16, lo       // lo (borrow, t < n): keep t, else keep t - n
    str     x10, [x0], #8
    cbnz    x5 , .LMul1xCopy

.LMul1xEnd:
    ldp     x23, x24, [x29, #16]
    mov     sp , x29                // release the t[] scratch in one step
    ldp     x21, x22, [x29, #32]
    ldp     x19, x20, [x29, #48]
    ldr     x29, [sp], #64          // leaf function: x30 was never clobbered, no reload
AARCH64_AUTIASP
    ret
.size   MontMul_Asm, .-MontMul_Asm
256
257.type   MontSqr8, %function
258MontSqr8:
259AARCH64_PACIASP
260    cmp     x1, x2
261    b.ne    MontMul4
262
263    stp     x29, x30, [sp, #-128]!  //  sp = sp - 128(Modify the SP and then save the SP.), [sp] = x29, [sp + 8] = x30,
264                                    //  !Indicates modification sp
265    mov     x29, sp                 //  x29  = sp, The sp here has been reduced by 128.
266
267    stp     x27, x28, [sp, #16]
268    stp     x25, x26, [sp, #32]
269    stp     x23, x24, [sp, #48]
270    stp     x21, x22, [sp, #64]
271    stp     x19, x20, [sp, #80]
272    stp     x0 , x3 , [sp, #96]     // offload r and n, Push the pointers of r and n into the stack.
273    str     x4 , [sp, #112]         // store n0
274
275    lsl     x5, x5, #3              // x5 = x5 * 8, Converts size to bytes.
276    sub     x2, sp, x5, lsl#1       // x2 = sp - 2*x5*8, x5 = size, x2 points to the start address of a 2*size memory. *8 is to convert to bytes
277    mov     sp, x2                  // Alloca, Apply for Space.
278    mov     x19, x5                 // The lowest eight data blocks do not need to be cleared.
279    eor     v0.16b,v0.16b,v0.16b
280    eor     v1.16b,v1.16b,v1.16b
281
282.LSqr8xStackInit:
283    sub     x19, x19, #8*8          // Offset 64, cyclic increment.
284    st1     {v0.2d, v1.2d}, [x2], #32
285    st1     {v0.2d, v1.2d}, [x2], #32
286    st1     {v0.2d, v1.2d}, [x2], #32
287    st1     {v0.2d, v1.2d}, [x2], #32
288    cbnz    x19, .LSqr8xStackInit   // When x19 = 0, the loop exits.
289
290    mov     x2 , sp                 // After clear to zero, assign sp back to x2.
291
292    ldp     x27, x28, [x2]
293    ldp     x25, x26, [x2]
294    ldp     x23, x24, [x2]
295    ldp     x21, x22, [x2]
296
297    add     x3 , x1 , x5            // x3 = x1 + bytes(size * 8)
298
299    ldp     x14 , x15 , [x1], #16    // x14  = a[0], x15  = a[1]
300    ldp     x16 , x17 , [x1], #16    // x16  = a[2], x17  = a[3]
301    ldp     x6, x7, [x1], #16    // x6 = a[4], x7 = a[5]
302    ldp     x8, x9, [x1], #16    // x8 = a[6], x9 = a[7]
303
304.LSqr8xLoopMul:
305    mul     x10, x14, x15             // a[0] * a[1~4]
306    mul     x11, x14, x16             // keep cache hit ratio of x6
307    mul     x12, x14, x17
308    mul     x13, x14, x6
309    adds    x28, x28, x10           // x27~x22 = t[0~7], x28 = t[1] = lo(a[0]*a[1]), adds is used to set CF to 0.
310    adcs    x25, x25, x11           // x10~x17 Used to save subsequent calculation results
311
312    mul     x10, x14 , x7           // lo(a[0] * a[5~7]), keep cache hit ratio of x14, the same below
313    mul     x11, x14 , x8
314    adcs    x26, x26, x12
315    adcs    x23, x23, x13           // t[4] = lo(a[0] * a[4])
316    adcs    x24, x24, x10           // x24~x22 = t[5~7]
317    mul     x12, x14 , x9           // lo(a[0] * a[7])
318
319    stp     x27, x28, [x2], #8*2        // t[0] = a[0]^2, Because the square term is not calculated temporarily,
320                                    // so t[0] = 0, t[1] = a[0] * a[1] + carry
321
322    adcs    x21, x21, x11
323    adcs    x22, x22, x12           // t[7] += lo(a[0] * a[7]), Carrying has to be given t[8]
324    adc     x27, xzr, xzr               // x27 = CF ( Set by t[7] += lo(a[0] * a[7]) ),
325    umulh   x13, x14 , x15          // hi(a[0] * a[1~4]), Use x17 to keep the cache hit
326    umulh   x10, x14 , x16
327    umulh   x11, x14 , x17
328    umulh   x12, x14 , x6
329
330    // In the new round, the first calculation does not need to be carried, but the CF bit needs to be modified.
331    adds    x25, x25, x13           // t[2] += hi(a[0] * a[1])
332    adcs    x26, x26, x10
333    adcs    x23, x23, x11
334    adcs    x24, x24, x12           // t[5] += hi(a[0] * a[4])
335    umulh   x13, x14 , x7           // hi(a[0] * a[5~7])
336    umulh   x10, x14 , x8
337    umulh   x11, x14 , x9
338
339    //----- lo(a[1] * a[2~4]) ------
340    adcs    x21, x21, x13           // t[6] += hi(a[0] * a[5])
341    adcs    x22, x22, x10           // t[7] += hi(a[0] * a[6])
342    adc     x27, x27, x11           // t[8] += hi(a[0] * a[7])
343    mul     x12, x15, x16           // lo(a[1] * a[2])
344    mul     x13, x15, x17
345    mul     x10, x15, x6
346
347    //----- lo(a[1] * a[5~7]) ------
348    adds    x26, x26, x12           // t[3] += lo(a[1] * a[2]), The first calculation of this round
349                                    // does not take into account the previous carry, and the CF is not modified in line 118.
350    adcs    x23, x23, x13           // t[4] += lo(a[1] * a[3])
351    adcs    x24, x24, x10           // t[5] += lo(a[1] * a[4])
352    mul     x11, x15 , x7
353    mul     x12, x15 , x8
354    mul     x13, x15 , x9
355
356    //----- hi(a[1] * a[2~5]) ------
357    adcs    x21, x21, x11           // t[6] += lo(a[1] * a[5])
358    adcs    x22, x22, x12           // t[7] += lo(a[1] * a[6])
359    adcs    x27, x27, x13           // t[8] += lo(a[1] * a[7])
360    umulh   x10, x15, x16           // hi(a[1] * a[2])
361    umulh   x11, x15, x17
362    umulh   x12, x15, x6
363    umulh   x13, x15, x7
364
365    stp     x25, x26, [x2], #8*2        // t[2] and t[3] are calculated and stored in the memory.
366                                        // x25 and x22 are used to store t[10] and t[11].
367    adc     x28, xzr, xzr               // t[9] = CF ( Set by t[8] += lo(a[1] * a[7]) )
368    //In the new round, the first calculation does not need to be carried, but the CF bit needs to be modified.
369    //----- hi(a[1] * a[6~7]) ------
370    adds    x23, x23, x10           // t[4] += hi(a[1] * a[2])
371    adcs    x24, x24, x11           // t[5] += hi(a[1] * a[3])
372    adcs    x21, x21, x12           // t[6] += hi(a[1] * a[4])
373    umulh   x10, x15 , x8           // hi(a[1] * a[6])
374    umulh   x11, x15 , x9           // hi(a[1] * a[7])
375
376    //----- lo(a[2] * a[3~7]) ------
377    adcs    x22, x22, x13           // t[7] += hi(a[1] * a[5])
378    adcs    x27, x27, x10           // t[8] += hi(a[1] * a[6])
379    adc     x28, x28, x11           // t[9] += hi(a[1] * a[7]), Here, only the carry of the previous round
380    mul     x12, x16, x17           // lo(a[2] * a[3])
381    mul     x13, x16, x6
382    mul     x10, x16, x7
383                                    // of calculation is retained before x20 calculation. Add x15 to the carry.
384    mul     x11, x16 , x8
385    adds    x24, x24, x12           // t[5] += lo(a[2] * a[3]), For the first calculation of this round,
386                                    // the previous carry is not considered.
387    mul     x12, x16 , x9
388    adcs    x21, x21, x13           // t[6] += lo(a[2] * a[4])
389
390    //----- hi(a[2] * a[3~7]) ------
391    adcs    x22, x22, x10           // t[7] += lo(a[2] * a[5])
392    umulh   x13, x16, x17           // hi(a[2] * a[3])
393    umulh   x10, x16, x6
394
395    adcs    x27, x27, x11           // t[8] += lo(a[2] * a[6])
396    adcs    x28, x28, x12           // t[9] += lo(a[2] * a[7])
397    umulh   x11, x16, x7
398    umulh   x12, x16, x8
399
400    stp     x23, x24, [x2], #8*2    // After t[4] and t[5] are calculated, they are stored in the memory.
401                                    // x23 and x24 are used to store t[12] and t[13].
402    adc     x25, xzr, xzr           // t[10] = CF ( Set by t[9] += lo(a[2] * a[7]) )
403    // In the new round, the first calculation does not need to be carried, but the CF bit needs to be modified.
404    adds    x21, x21, x13           // t[6] += hi(a[2] * a[3])
405    adcs    x22, x22, x10           // t[7] += hi(a[2] * a[4])
406    umulh   x13, x16, x9
407
408    //----- lo(a[3] * a[4~7]) ------
409    adcs    x27, x27, x11           // t[8] += hi(a[2] * a[5])
410    adcs    x28, x28, x12           // t[9] += hi(a[2] * a[6])
411    adc     x25, x25, x13           // t[10] += hi(a[2] * a[7])
412    mul     x10, x17, x6
413    mul     x11, x17, x7
414    mul     x12, x17, x8
415    mul     x13, x17, x9
416
417    //----- hi(a[3] * a[4~7]) ------
418    adds    x22, x22, x10           // t[7] += lo(a[3] * a[4])
419    adcs    x27, x27, x11           // t[8] += lo(a[3] * a[5])
420    adcs    x28, x28, x12           // t[9] += lo(a[3] * a[6])
421    adcs    x25, x25, x13           // t[10] += lo(a[3] * a[7])
422    umulh   x10, x17, x6
423    umulh   x11, x17, x7
424    umulh   x12, x17, x8
425    umulh   x13, x17, x9
426
427    stp     x21, x22, [x2], #8*2    // t[6] and t[7] are calculated and stored in the memory.
428                                    // x21 and x26 are used to store t[14] and t[15].
429    adc     x26, xzr, xzr           // t[11] = CF ( Set by t[10] += lo(a[3] * a[7]) )
430    // In the new round, the first calculation does not need to be carried, but the CF bit needs to be modified.
431    adds    x27, x27, x10           // t[8] += hi(a[3] * a[4])
432
433    //----- lo(a[4] * a[5~7]) ------
434    adcs    x28, x28, x11           // t[9] += hi(a[3] * a[5])
435    adcs    x25, x25, x12           // t[10] += hi(a[3] * a[6])
436    adc     x26, x26, x13           // t[11] += hi(a[3] * a[7])
437    mul     x10, x6, x7
438    mul     x11, x6, x8
439    mul     x12, x6, x9
440
441    //----- hi(a[4] * a[5~7]) ------
442    adds    x28, x28, x10           // t[9] += lo(a[4] * a[5])
443    adcs    x25, x25, x11           // t[10] += lo(a[4] * a[6])
444    adcs    x26, x26, x12           // t[11] += lo(a[4] * a[7])
445    umulh   x13, x6, x7
446    umulh   x10, x6, x8
447    umulh   x11, x6, x9
448
449    //----- lo(a[5] * a[6~7]) ------
450    mul     x12, x7, x8
451    // This is actually a new round, but only t[0-7] can be calculated in each cycle,
452    // and t[8-15] retains the intermediate calculation result.
453    adc     x23, xzr, xzr           // t[12] = CF( Set by t[11] += lo(a[4] * a[7]) )
454    adds    x25, x25, x13           // t[10] += hi(a[4] * a[5])
455    adcs    x26, x26, x10           // t[11] += hi(a[4] * a[6])
456    mul     x13, x7, x9
457
458    //----- hi(a[5] * a[6~7]) ------
459    adc     x23, x23, x11           // t[12] += hi(a[4] * a[7])
460    umulh   x10, x7, x8
461    umulh   x11, x7, x9
462
463    adds    x26, x26, x12           // t[11] += lo(a[5] * a[6])
464    //----- lo(a[6] * a[7]) ------
465    adcs    x23, x23, x13           // t[12] += lo(a[5] * a[7])
466    mul     x12, x8, x9
467
468    //----- hi(a[6] * a[7])  ------
469    adc     x24, xzr, xzr           // t[13] = CF ( Set by t[12] += lo(a[5] * a[7]) ),
470    umulh   x13, x8, x9
471                                    // This operation is required when a new umulh is added.
472    adds    x23, x23, x10           // t[12] += hi(a[5] * a[6])
473    adc     x24, x24, x11           // t[13] += hi(a[5] * a[7])
474    sub     x19, x3, x1             // x3 = &a[size], x1 = &a[8], x19 = (size - 8) * 8
475
476    adds    x24, x24, x12           // t[13] += lo(a[6] * a[7])
477    adc     x21, xzr, xzr           // t[14] = CF ( set by t[13] += lo(a[6] * a[7]) )
478    add     x21, x21, x13           // t[14] += hi(a[6] * a[7]), There must be no carry in the last step.
479
480    cbz     x19, .LSqr8xLoopMulEnd
481
482    mov     x0, x1                  // x0 = &a[8]
483    mov     x22, xzr
484
485//########################################
486//#          a[0~7] * a[8~15]            #
487//########################################
488.LSqr8xHighMulBegian:
489    mov     x19, #-8*8                // Loop range. x0 can retrieve a[0–7] based on this offset.
490    ldp     x14 , x15 , [x2, #8*0]    // x14  = t[8] , x15  = t[9]
491    adds    x27, x27, x14             // t[8](t[8] reserved in the previous round of calculation) + = t[8]
492                                      // (t[8] taken from memory, initially 0)
493    adcs    x28, x28, x15             // t[9] += t[9], be the same as the above
494    ldp     x16 , x17 , [x2, #8*2]    // x16 = t[10], x17 = t[11]
495    ldp     x14 , x15 , [x1], #16   // x14 = a[8], x15 = a[9]
496    adcs    x25, x25, x16
497    adcs    x26, x26, x17
498
499    ldp     x6, x7, [x2, #8*4]    // x6 = t[12], x7 = t[13]
500    ldp     x16 , x17 , [x1], #16    // x16 = a[10], x17 = a[11]
501    adcs    x23, x23, x6
502    adcs    x24, x24, x7
503
504    ldp     x8, x9, [x2, #8*6]    // x8 = t[14], x9 = t[15]
505    ldp     x6, x7, [x1], #16    // x6 = a[12], x7 = a[13]
506    adcs    x21, x21, x8
507    adcs    x22, x22, x9           // t[15] = t[15] + CF, Because a[7]*a[7] is not calculated previously, t[15]=0
508    ldp     x8, x9, [x1], #16    // x8 = a[14], x9 = a[15]
509
510.LSqr8xHighMulProces:
511    ldr     x4 , [x0, x19]          // x4 = [x0 + x19] = [x0 - 56] = [&a[8] - 56] = a[8 - 7] = a[1]
512    //-----lo(a[0] * a[8~11])-----
513    adc     x20, xzr, xzr           // x20 += CF, Save the carry of t[15]. The same operation is performed below.
514    add     x19, x19, #8            // x19 += 8, Loop step size
515    mul     x10, x4 , x14            // x4 = a[0], x14 = a[8], x10 = lo(a[0] * a[8])
516    mul     x11, x4 , x15            // x11 = lo(a[0] * a[9])
517    mul     x12, x4 , x16            // x12 = lo(a[0] * a[10])
518    mul     x13, x4 , x17            // x13 = lo(a[0] * a[11])
519
520    //-----lo(a[0] * a[12~15])-----
521    adds    x27, x27, x10           // CF does not need to be added for the first calculation,
522                                    // t[8] += lo(a[0] * a[8])
523    adcs    x28, x28, x11           // t[9] += lo(a[0] * a[9])
524    adcs    x25, x25, x12           // t[10] += lo(a[0] * a[10])
525    adcs    x26, x26, x13           // t[11] += lo(a[0] * a[11])
526    mul     x10, x4 , x6
527    mul     x11, x4 , x7
528    mul     x12, x4 , x8
529    mul     x13, x4 , x9
530
531    //-----hi(a[0] * a[8~11])-----
532    adcs    x23, x23, x10           // t[12] += lo(a[0] * a[12])
533    adcs    x24, x24, x11           // t[13] += lo(a[0] * a[13])
534    adcs    x21, x21, x12           // t[14] += lo(a[0] * a[14])
535    adcs    x22, x22, x13           // t[15] += lo(a[0] * a[15])
536    umulh   x10, x4 , x14
537    umulh   x11, x4 , x15
538    umulh   x12, x4 , x16
539    umulh   x13, x4 , x17
540
541    adc     x20, x20, xzr           // x20 += CF, Save the carry of t[15]
542    str     x27, [x2], #8           // [x2] = t[8], x2 += 8, x27~x22 = t[9~16],
543                                    // Update the mapping relationship to facilitate cycling.
544                                    // x27~x26 always correspond to t[m~m+7], and x19 is always the LSB of the window
545
546    //-----hi(a[0] * a[12~15])-----
547    adds    x27, x28, x10           // t[9] += hi(a[0] * a[8]), The last calculation was to calculate t[15],
548                                    // so carry cannot be added to t[9], so adds is used
549    adcs    x28, x25, x11           // t[10] += hi(a[0] * a[9])
550    adcs    x25, x26, x12           // t[11] += hi(a[0] * a[10])
551    adcs    x26, x23, x13           // t[12] += hi(a[0] * a[11])
552    umulh   x10, x4 , x6
553    umulh   x11, x4 , x7
554    umulh   x12, x4 , x8
555    umulh   x13, x4 , x9           // x13 = hi(a[0] * a[15])
556
557    adcs    x23, x24, x10           // t[13] += hi(a[0] * a[12])
558    adcs    x24, x21, x11           // t[14] += hi(a[0] * a[13])
559    adcs    x21, x22, x12           // t[15] += hi(a[0] * a[14])
560    adcs    x22, x20, x13           // t[16] = hi(a[0] * a[15]) + CF
561
562    cbnz    x19, .LSqr8xHighMulProces        // When exiting the loop, x0 = &a[8], x2 = &t[16]
563
564    sub     x16, x1, x3            // x3 = x1 + x5 * 8(Converted to bytes), When x1 = x3, the loop ends.
565    cbnz    x16, .LSqr8xHighMulBegian      // x0 is the outer loop, x1 is the inner loop, and the inner loop ends.
566                                   // In this case, x2 = &a[size], out-of-bounds position.
567
568    mov     x1, x0                   // Outer Loop Increment, x1 = &a[16]
569    ldp     x14 , x15 , [x1], #16    // x14  = a[8] , x15  = a[9]
570    ldp     x16 , x17 , [x1], #16    // x16  = a[10], x17  = a[11]
571    ldp     x6, x7, [x1], #16    // x6 = a[12], x7 = a[13]
572    ldp     x8, x9, [x1], #16    // x8 = a[14], x9 = a[15]
573
574    sub     x10, x3 , x1            // Check whether the outer loop ends, x3 = &a[size], x10 = (size - 16)*8
575    cbz     x10, .LSqr8xLoopMul
576
577    sub     x11, x2 , x10           // x2 = &t[24], x11 = &t[16]
578    stp     x27, x28, [x2 , #8*0]   // t[24] = x27, t[25] = x28
579    ldp     x27, x28, [x11, #8*0]   // x27 = t[16], x28 = t[17]
580
581    stp     x25, x26, [x2 , #8*2]   // t[26] = x25, t[27] = x26
582    ldp     x25, x26, [x11, #8*2]   // x25 = t[18], x26 = t[19]
583
584    stp     x23, x24, [x2 , #8*4]   // t[28] = x23, t[29] = x24
585    ldp     x23, x24, [x11, #8*4]   // x23 = t[20], x24 = t[21]
586
587    stp     x21, x22, [x2 , #8*6]   // t[30] = x21, t[31] = x22
588    ldp     x21, x22, [x11, #8*6]   // x21 = t[22], x22 = t[23]
589
590    mov     x2 , x11                // x2 = &t[16]
591    b       .LSqr8xLoopMul
592
593.align  4
594.LSqr8xLoopMulEnd:
595    //===== Calculate the squared term =====
596    //----- sp = &t[0] , x2 = &t[24]-----
597    sub     x10, x3, x5             // x10 = a[0]
598    stp     x27, x28, [x2, #8*0]    // t[24] = x27, t[25] = x28
599    stp     x25, x26, [x2, #8*2]    // When this step is performed, the calculation results reserved for x27–x26
600                                    // are not pushed to the stack.
601    stp     x23, x24, [x2, #8*4]
602    stp     x21, x22, [x2, #8*6]
603    ldp     x11, x12, [sp, #8*1]    // x11 = t[1], x12 = t[2]
604    ldp     x15, x17, [x10], #16    // x15 = a[0], x17 = a[1]
605    ldp     x7, x9, [x10], #16      // x7 = a[2], x9 = a[3]
606    mov     x1, x10
607    ldp     x13, x10, [sp, #8*3]    // x13 = t[3], x10 = t[4]
608
609    mul     x27, x15, x15           // x27 = lo(a[0] * a[0])
610    umulh   x15, x15, x15           // x15 = hi(a[0] * a[0])
611    mov     x2 , sp                 // x2 = sp = &t[0]
612    mul     x16, x17, x17           // x16  = lo(a[1] * a[1])
613    adds    x28, x15, x11, lsl#1    // x28 = x15 + (x11 * 2) = hi(a[0] * a[0]) + 2 * t[1]
614    umulh   x17, x17, x17           // x17  = hi(a[1] * a[1])
615
616    extr    x11, x12, x11, #63      // Lower 63 bits of x11 = x16 | most significant bit of x15
617                                    // Cyclic right shift by 63 bits to obtain the lower bit,
618                                    // which is equivalent to cyclic left shift by 1 bit to obtain the upper bit.
619                                    // The purpose is to *2.
620                                    // x11 = 2*t[2](Ignore the overflowed part) + carry of (2*t[1])
621    mov     x19, x5                 // x19 = size*8
622
623.LSqr8xDealSquare:
624    adcs    x25, x16 , x11           // x25 = lo(a[1] * a[1]) + 2*t[2]
625    extr    x12, x13, x12, #63       // x12 = 2*t[3](Ignore the overflowed part) + carry of (2*t[2])
626    adcs    x26, x17 , x12           // x26 = hi(a[1] * a[1]) + 2*t[3]
627
628    sub     x19, x19, #8*4          // x19 = (size - 8)*8
629    stp     x27, x28, [x2], #16    // t[0~3]Re-push stack
630    stp     x25, x26, [x2], #16
631
632    mul     x6, x7, x7           // x6 = lo(a[2] * a[2])
633    umulh   x7, x7, x7           // x7 = hi(a[2] * a[2])
634    mul     x8, x9, x9           // x6 = lo(a[3] * a[3])
635    umulh   x9, x9, x9           // x7 = hi(a[3] * a[3])
636    ldp     x11, x12, [x2, #8]    // x11 = t[5], x12 = t[6]
637
638    extr    x13, x10, x13, #63      // x13 = 2*t[4](Ignore the overflowed part) + carry of(2*t[3])
639    extr    x10, x11, x10, #63      // x10 = 2*t[5](Ignore the overflowed part) + carry of(2*t[4])
640    adcs    x23, x6, x13           // x23 = lo(a[2] * a[2]) + 2*t[4]
641    adcs    x24, x7, x10           // x24 = hi(a[2] * a[2]) + 2*t[5]
642
643    cbz     x19, .LSqr8xReduceStart
644
645    ldp     x13, x10, [x2, #24]     // x13 = t[7], x10 = t[8]
646    extr    x11, x12, x11, #63      // x11 = 2*t[6](Ignore the overflowed part) + carry of(2*t[5])
647    extr    x12, x13, x12, #63      // x12 = 2*t[7](Ignore the overflowed part) + carry of(2*t[6])
648    adcs    x21, x8, x11            // x21 = lo(a[3] * a[3]) + 2*t[6]
649    adcs    x22, x9, x12            // x22 = hi(a[3] * a[3]) + 2*t[7]
650    stp     x23, x24, [x2], #16     // t[4~7]re-push stack
651    stp     x21, x22, [x2], #16
652    ldp     x15, x17, [x1], #8*2    // x15  = a[4], x17  = a[5], x1 += 16 = &a[6]
653    ldp     x11, x12, [x2, #8]      // x11 = t[9], x12 = t[10]
654
655    mul     x14 , x15 , x15            // x14  = lo(a[4] * a[4])
656    umulh   x15 , x15 , x15            // x15  = hi(a[4] * a[4])
657    mul     x16 , x17 , x17            // x16  = lo(a[5] * a[5])
658    umulh   x17 , x17 , x17            // x17  = hi(a[5] * a[5])
659
660    extr    x13, x10, x13, #63      // x13 = 2*t[8](Ignore the overflowed part) + carry of(2*t[7])
661    adcs    x27, x14 , x13           // x27 = lo(a[4] * a[4]) + 2*t[8]
662    extr    x10, x11, x10, #63      // x10 = 2*t[9](Ignore the overflowed part) + carry of(2*t[8])
663    adcs    x28, x15 , x10           // x28 = hi(a[4] * a[4]) + 2*t[9]
664
665    extr    x11, x12, x11, #63      // x11 = 2*t[10](Ignore the overflowed part) + carry of(2*t[9])
666
667    ldp     x13, x10, [x2, #8*3]    // Line 438 has obtained t[9] and t[10], x13 = &t[11], x10 = &t[12]
668    ldp     x7, x9, [x1], #8*2    // x7 = a[6], x9 = a[7], x1 += 16 = &a[8]
669
670    b       .LSqr8xDealSquare
671
672.LSqr8xReduceStart:
673    extr    x11, x12, x11, #63      // x11 = 2*t[2*size-2](Ignore the overflowed part) + carry of (2*t[2*size-3])
674    adcs    x21, x8, x11            // x21 = lo(a[size-1] * a[size-1]) + 2*t[2*size-2]
675    extr    x12, xzr, x12, #63      // x12 = 2*t[2*size-1](Ignore the overflowed part) + carry of (2*t[2*size-2])
676    adc     x22, x9, x12            // x22 = hi(a[size-1] * a[size-1]) + 2*t[2*size-1]
677
678    ldp     x1, x4, [x29, #104]     // Pop n and k0 out of the stack, x1 = &n[0], x4 = k0
679    stp     x23, x24, [x2]          // t[2*size-4 ~ 2*size-1]re-push stack
680    stp     x21, x22, [x2,#8*2]
681
682    cmp     x5, #64      // if size == 8, we can goto Single step reduce
683    b.ne    .LSqr8xReduceLoop
684
685    ldp     x14 , x15 , [x1], #16    // x14~x9 = n[0~7]
686    ldp     x16 , x17 , [x1], #16
687    ldp     x6, x7, [x1], #16
688    ldp     x8, x9, [x1], #16
689
690    ldp     x27, x28, [sp]          // x14~x9 = t[0~7]
691    ldp     x25, x26, [sp,#8*2]
692    ldp     x23, x24, [sp,#8*4]
693    ldp     x21, x22, [sp,#8*6]
694    mov     x19, #8
695    mov     x2 , sp
696
697    // if size == 8, goto single reduce step
698.LSqr8xSingleReduce:
699    mul     x20, x4, x27
700    sub     x19, x19, #1
701    //----- lo(n[1~7] * lo(t[0]*k0)) -----
702    mul     x11, x15 , x20
703    mul     x12, x16 , x20
704    mul     x13, x17 , x20
705    mul     x10, x6, x20
706
707    cmp     x27, #1
708    adcs    x27, x28, x11
709    adcs    x28, x25, x12
710    adcs    x25, x26, x13
711    adcs    x26, x23, x10
712    mul     x11, x7, x20
713    mul     x12, x8, x20
714    mul     x13, x9, x20
715
716    //----- hi(n[0~7] * lo(t[0]*k0)) -----
717    adcs    x23, x24, x11
718    adcs    x24, x21, x12
719    adcs    x21, x22, x13
720    adc     x22, xzr, xzr           // x22 += CF
721    umulh   x10, x14 , x20
722    umulh   x11, x15 , x20
723    umulh   x12, x16 , x20
724    umulh   x13, x17 , x20
725
726    adds    x27, x27, x10
727    adcs    x28, x28, x11
728    adcs    x25, x25, x12
729    adcs    x26, x26, x13
730    umulh   x10, x6, x20
731    umulh   x11, x7, x20
732    umulh   x12, x8, x20
733    umulh   x13, x9, x20
734
735    adcs    x23, x23, x10
736    adcs    x24, x24, x11
737    adcs    x21, x21, x12
738    adc     x22, x22, x13
739    cbnz    x19, .LSqr8xSingleReduce   // Need cycle 8 times
740
741    ldp     x10, x11, [x2, #64]    // x10 = t[8], x11 = t[9]
742    ldp     x12, x13, [x2, #80]
743    adds    x27, x27, x10
744    adcs    x28, x28, x11
745    ldp     x10, x11, [x2, #96]
746    adcs    x25, x25, x12
747    adcs    x26, x26, x13
748    adcs    x23, x23, x10
749    ldp     x12, x13, [x2, #112]
750
751    adcs    x24, x24, x11
752    adcs    x21, x21, x12
753    adcs    x22, x22, x13
754    adc     x20, xzr, xzr
755    ldr     x0, [x29, #96]          // r Pop-Stack
756
757    // t - mod
758    subs    x14, x27, x14
759    sbcs    x15, x28, x15
760    sbcs    x16, x25, x16
761    sbcs    x17, x26, x17
762    sbcs    x6, x23, x6
763    sbcs    x7, x24, x7
764    sbcs    x8, x21, x8
765    sbcs    x9, x22, x9
766    sbcs    x20, x20, xzr           // determine whether there is a borrowing
767
768    // according to CF choose our result
769    csel    x14 , x27, x14 , lo
770    csel    x15 , x28, x15 , lo
771    csel    x16 , x25, x16 , lo
772    csel    x17 , x26, x17 , lo
773    stp     x14 , x15 , [x0, #8*0]
774    csel    x6, x23, x6, lo
775    csel    x7, x24, x7, lo
776    stp     x16 , x17 , [x0, #8*2]
777    csel    x8, x21, x8, lo
778    csel    x9, x22, x9, lo
779    stp     x6, x7, [x0, #8*4]
780    stp     x8, x9, [x0, #8*6]
781    b       .LMontSqr8xEnd
782
783.LSqr8xReduceLoop:
784    add     x3, x1, x5             // x3 = &n[size]
785    mov     x30, xzr
786    ldp     x14 , x15 , [x1], #16    // x14~x9 = n[0~7]
787    ldp     x16 , x17 , [x1], #16
788    ldp     x6, x7, [x1], #16
789    ldp     x8, x9, [x1], #16
790
791    ldp     x27, x28, [sp]          // x27 = t[0], x28 = t[1]
792    ldp     x25, x26, [sp,#8*2]     // x25~x22 = t[2~7]
793    ldp     x23, x24, [sp,#8*4]
794    ldp     x21, x22, [sp,#8*6]
795    mov     x19, #8
796    mov     x2 , sp
797
798.LSqr8xReduceProcess:
799    mul     x20, x4, x27           // x20 = lo(k0 * t[0])
800    sub     x19, x19, #1
801    //----- lo(n[1~7] * lo(t[0]*k0)) -----
802    mul     x11, x15, x20           // x11 = n[1] * lo(t[0]*k0)
803    mul     x12, x16, x20           // x12 = n[2] * lo(t[0]*k0)
804    mul     x13, x17, x20           // x13 = n[3] * lo(t[0]*k0)
805    mul     x10, x6, x20            // x10 = n[4] * lo(t[0]*k0)
806
807    str     x20, [x2], #8           // Push lo(t[0]*k0) on the stack., x2 += 8
808    cmp     x27, #1                 // Check whether the low location is carried.
809
810    adcs    x27, x28, x11           // x27 = t[1] + lo(n[1] * lo(t[0]*k0))
811    adcs    x28, x25, x12           // x28 = t[2] + lo(n[2] * lo(t[0]*k0))
812    adcs    x25, x26, x13           // x25 = t[3] + lo(n[3] * lo(t[0]*k0))
813    adcs    x26, x23, x10           // x26 = t[4] + lo(n[4] * lo(t[0]*k0))
814    mul     x11, x7, x20
815    mul     x12, x8, x20
816    mul     x13, x9, x20
817
818    //----- hi(n[0~7] * lo(t[0]*k0)) -----
819    adcs    x23, x24, x11           // x23 = t[5] + lo(n[5] * lo(t[0]*k0))
820    adcs    x24, x21, x12           // x24 = t[6] + lo(n[6] * lo(t[0]*k0))
821    adcs    x21, x22, x13           // x21 = t[7] + lo(n[7] * lo(t[0]*k0))
822    adc     x22, xzr, xzr           // x22 += CF
823    umulh   x10, x14 , x20
824    umulh   x11, x15 , x20
825    umulh   x12, x16 , x20
826    umulh   x13, x17 , x20
827
828    adds    x27, x27, x10           // x27 += hi(n[0] * lo(t[0]*k0))
829    adcs    x28, x28, x11           // x28 += hi(n[1] * lo(t[0]*k0))
830    adcs    x25, x25, x12           // x25 += hi(n[2] * lo(t[0]*k0))
831    adcs    x26, x26, x13           // x26 += hi(n[3] * lo(t[0]*k0))
832    umulh   x10, x6, x20
833    umulh   x11, x7, x20
834    umulh   x12, x8, x20
835    umulh   x13, x9, x20
836
837    adcs    x23, x23, x10           // x23 += hi(n[4] * lo(t[0]*k0))
838    adcs    x24, x24, x11           // x24 += hi(n[5] * lo(t[0]*k0))
839    adcs    x21, x21, x12           // x21 += hi(n[6] * lo(t[0]*k0))
840    adc     x22, x22, x13           // x22 += hi(n[7] * lo(t[0]*k0))
841    cbnz    x19, .LSqr8xReduceProcess   // Cycle 8 times, and at the end of the cycle, x2 += 8*8
842
843    ldp     x10, x11, [x2, #8*0]    // x10 = t[8], x11 = t[9]
844    ldp     x12, x13, [x2, #8*2]
845    mov     x0, x2
846
847    adds    x27, x27, x10
848    adcs    x28, x28, x11
849    ldp     x10, x11, [x2,#8*4]
850    adcs    x25, x25, x12
851    adcs    x26, x26, x13
852    ldp     x12, x13, [x2,#8*6]
853    adcs    x23, x23, x10
854    adcs    x24, x24, x11
855    adcs    x21, x21, x12
856    adcs    x22, x22, x13
857
858    ldr     x4 , [x2, #-8*8]        // x4 = t[0]
859    ldp     x14 , x15 , [x1], #16    // x14~x9 = &n[8]~&n[15]
860    ldp     x16 , x17 , [x1], #16
861    ldp     x6, x7, [x1], #16
862    ldp     x8, x9, [x1], #16
863    mov     x19, #-8*8
864
865.LSqr8xReduce:
866    adc     x20, xzr, xzr           // x20 = CF
867    add     x19, x19, #8
868
869    mul     x10, x14 , x4
870    mul     x11, x15 , x4
871    mul     x12, x16 , x4
872    mul     x13, x17 , x4
873
874    adds    x27, x27, x10
875    adcs    x28, x28, x11
876    adcs    x25, x25, x12
877    adcs    x26, x26, x13
878    mul     x10, x6, x4
879    mul     x11, x7, x4
880    mul     x12, x8, x4
881    mul     x13, x9, x4
882
883    adcs    x23, x23, x10
884    adcs    x24, x24, x11
885    adcs    x21, x21, x12
886    adcs    x22, x22, x13
887    umulh   x10, x14 , x4
888    umulh   x11, x15 , x4
889    umulh   x12, x16 , x4
890    umulh   x13, x17 , x4
891
892    adc     x20, x20, xzr
893
894    str     x27, [x2], #8           // x27 = t[8], x2 += 8
895
896    adds    x27, x28, x10           // x27 = t[1] + lo(n[1] * lo(t[0]*k0))
897    adcs    x28, x25, x11           // x28 = t[2] + lo(n[2] * lo(t[0]*k0))
898    adcs    x25, x26, x12           // x25 = t[3] + lo(n[3] * lo(t[0]*k0))
899    adcs    x26, x23, x13           // x26 = t[4] + lo(n[4] * lo(t[0]*k0))
900    umulh   x10, x6, x4
901    umulh   x11, x7, x4
902    umulh   x12, x8, x4
903    umulh   x13, x9, x4
904
905    // x0 = &t[8]
906    ldr     x4 , [x0, x19]
907
908    adcs    x23, x24, x10
909    adcs    x24, x21, x11
910    adcs    x21, x22, x12
911    adcs    x22, x20, x13
912
913    cbnz    x19, .LSqr8xReduce
914
915    ldp     x14 , x15 , [x2, #8*0]
916    ldp     x16 , x17 , [x2, #8*2]
917    sub     x19, x3, x1            // x19 = (size-16)*8
918
919    ldp     x6, x7, [x2, #8*4]
920    ldp     x8, x9, [x2, #8*6]
921    cbz     x19, .LSqr8xReduceBreak
922
923    ldr     x4 , [x0, #-8*8]
924
925    adds    x27, x27, x14
926    adcs    x28, x28, x15
927    adcs    x25, x25, x16
928    adcs    x26, x26, x17
929    adcs    x23, x23, x6
930    adcs    x24, x24, x7
931    adcs    x21, x21, x8
932    adcs    x22, x22, x9
933
934    ldp     x14 , x15 , [x1], #16
935    ldp     x16 , x17 , [x1], #16
936    ldp     x6, x7, [x1], #16
937    ldp     x8, x9, [x1], #16
938
939    mov     x19, #-8*8
940    b       .LSqr8xReduce
941
942.align  4
943.LSqr8xReduceBreak:
944    sub     x12, x3, x5             // x12 = n, reassign to n
945    ldr     x4 , [x29, #112]        // k0 pop-stack
946
947    cmp     x30, #1                 // Check whether the low location is carried.
948
949    adcs    x10, x27, x14
950    adcs    x11, x28, x15
951    stp     x10, x11, [x2] , #16
952
953    ldp     x27 ,x28, [x0 , #8*0]
954    ldp     x14 , x15, [x12], #16  // x12 = &n[0] (Line 638 assigns a value)
955
956    adcs    x25, x25, x16
957    adcs    x26, x26, x17
958    adcs    x23, x23, x6
959    adcs    x24, x24, x7
960    adcs    x21, x21, x8
961    adcs    x22, x22, x9
962    adc     x30, xzr, xzr
963
964    ldp     x16, x17, [x12], #16
965    ldp     x6, x7, [x12], #16
966    ldp     x8, x9, [x12], #16
967
968    stp     x25, x26, [x2], #16
969    ldp     x25, x26, [x0, #8*2]
970
971    stp     x23, x24, [x2], #16
972    ldp     x23, x24, [x0, #8*4]
973
974    stp     x21, x22, [x2], #16
975    ldp     x21, x22, [x0, #8*6]
976
977    sub     x20, x2, x29                // Check whether the loop ends
978    mov     x1, x12
979    mov     x2, x0                      // sliding window
980    mov     x19, #8
981    cbnz    x20, .LSqr8xReduceProcess
982
983    // Final step
984    ldr     x0 , [x29, #96]         // r Pop-Stack
985    add     x2 , x2 , #8*8
986    subs    x10, x27, x14
987    sbcs    x11, x28, x15
988    sub     x19, x5 , #8*8
989    mov     x3 , x0                 // backup x0
990
991.LSqr8xSubMod:
992    ldp     x14 , x15, [x1], #16
993    sbcs    x12, x25, x16
994    sbcs    x13, x26, x17
995    ldp     x16 , x17 , [x1], #16
996    stp     x10, x11, [x0], #16
997    stp     x12, x13, [x0], #16
998
999    sbcs    x10, x23, x6
1000    sbcs    x11, x24, x7
1001    ldp     x6, x7, [x1], #16
1002
1003    sbcs    x12, x21, x8
1004    sbcs    x13, x22, x9
1005    ldp     x8, x9, [x1], #16
1006    stp     x10, x11, [x0], #16
1007    stp     x12, x13, [x0], #16
1008
1009    ldp     x27, x28, [x2], #16
1010    ldp     x25, x26, [x2], #16
1011    ldp     x23, x24, [x2], #16
1012    ldp     x21, x22, [x2], #16
1013
1014    sub     x19, x19, #8*8
1015    sbcs    x10, x27, x14
1016    sbcs    x11, x28, x15
1017
1018    cbnz    x19, .LSqr8xSubMod
1019
1020    mov     x2 , sp
1021    add     x1 , sp , x5
1022
1023    sbcs    x12, x25, x16
1024    sbcs    x13, x26, x17
1025    stp     x12, x13, [x0, #8*2]
1026    stp     x10, x11, [x0, #8*0]
1027    sbcs    x10, x23, x6
1028    sbcs    x11, x24, x7
1029    stp     x10, x11, [x0, #8*4]
1030    sbcs    x12, x21, x8
1031    sbcs    x13, x22, x9
1032    stp     x12, x13, [x0, #8*6]
1033    sbcs    xzr, x30, xzr           // Determine whether there is a borrowing
1034
1035.LSqr8xCopy:
1036    ldp     x14, x15, [x3, #8*0]
1037    ldp     x16, x17, [x3, #8*2]
1038    ldp     x6, x7, [x3, #8*4]
1039    ldp     x8, x9, [x3, #8*6]
1040
1041    ldp     x27, x28, [x1], #16
1042    ldp     x25, x26, [x1], #16
1043    ldp     x23, x24, [x1], #16
1044    ldp     x21, x22, [x1], #16
1045
1046    sub     x5, x5, #8*8
1047
1048    csel    x10, x27, x14, lo        // Condition selection instruction, lo = less than,
1049                                      // equivalent to x14 = (conf==lo) ? x27 : x14
1050    csel    x11, x28, x15, lo
1051    csel    x12, x25, x16 , lo
1052    csel    x13, x26, x17, lo
1053    csel    x14, x23, x6, lo
1054    csel    x15, x24, x7, lo
1055    csel    x16, x21, x8, lo
1056    csel    x17, x22, x9, lo
1057
1058    stp     x10, x11, [x3], #16
1059    stp     x12, x13, [x3], #16
1060    stp     x14, x15, [x3], #16
1061    stp     x16, x17, [x3], #16
1062    cbnz    x5, .LSqr8xCopy
1063
1064.LMontSqr8xEnd:
1065    ldr     x30, [x29, #8]          // Pop-Stack
1066    ldp     x27, x28, [x29, #16]
1067    mov     sp ,  x29
1068    ldp     x25, x26, [x29, #32]
1069    mov     x0 ,  #1
1070    ldp     x23, x24, [x29, #48]
1071    ldp     x21, x22, [x29, #64]
1072    ldp     x19, x20, [x29, #80]    // x19 = [x29 + 80], x20 = [x29 + 80 + 8],
1073                                    // ldp reads two 8-byte memory blocks at a time.
1074    ldr     x29, [sp], #128         // x29 = [sp], sp = sp + 128,ldr reads only an 8-byte block of memory
1075AARCH64_AUTIASP
1076    ret
1077.size   MontSqr8, .-MontSqr8
1078
1079
.type   MontMul4, %function
//------------------------------------------------------------------------------
// MontMul4: Montgomery modular multiplication for operand sizes that are a
// multiple of four 64-bit words: computes r = a * b * R^(-1) mod n, using the
// word-by-word reduction described in the file header.
// In (register roles inferred from this body and the dispatch in MontMul_Asm
// -- confirm against the caller):
//   x0 = &r[0], x1 = &a[0], x2 = &b[0], x3 = &n[0], x4 = k0 = -n^(-1) mod 2^64,
//   x5 = size in 64-bit words (multiple of 4)
// Clobbers: x6-x17, x19-x28, x30, v0, v1, flags.
// Stack: 128-byte callee-save frame at x29, plus (size + 4) * 8 bytes of
//        scratch below it for the running value t[] and the saved t[0]*k0 words.
//------------------------------------------------------------------------------
MontMul4:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-128]!  // allocate callee-save frame, save FP/LR
    mov     x29, sp
    stp     x27, x28, [sp, #16]     // save callee-saved x19~x28
    stp     x25, x26, [sp, #32]
    stp     x23, x24, [sp, #48]
    stp     x21, x22, [sp, #64]
    stp     x19, x20, [sp, #80]
    mov     x27, xzr                // x27, x28, x25, x26 = accumulator t[0~3] = 0
    mov     x28, xzr
    mov     x25, xzr
    mov     x26, xzr
    mov     x30, xzr                // x30 = carry into the top word, initially 0

    lsl     x5 , x5 , #3            // x5 = size * 8 (operand length in bytes)
    sub     x22, sp , x5
    sub     sp , x22, #8*4          // reserve (size + 4) words of stack scratch
    mov     x22, sp

    sub     x6, x5, #32
    cbnz    x6, .LMul4xProcesStart  // size != 4 words -> general looped path

    ldp     x14 , x15 , [x1, #8*0]
    ldp     x16 , x17 , [x1, #8*2]    // x14~x17 = a[0~3]
    ldp     x10, x11, [x3]          // x10~x13 = n[0~3]
    ldp     x12, x13, [x3, #8*2]

    mov     x1 , xzr                // x1 = carry word t[4]
    mov     x20, #4                 // x20 = round counter (one round per b word)

    // if size == 4, goto single step
.LMul4xSingleStep:
    sub     x20, x20, #0x1
    ldr     x24, [x2], #8           // x24 = b[i], advance b pointer

    //----- lo(a[0~3] * b[i]) -----
    mul     x6, x14 , x24
    mul     x7, x15 , x24
    mul     x8, x16 , x24
    mul     x9, x17 , x24

    //----- hi(a[0~3] * b[i]) -----
    adds    x27, x27, x6            // t[0] += lo(a[0] * b[i])
    umulh   x6, x14 , x24
    adcs    x28, x28, x7            // t[1] += lo(a[1] * b[i])
    umulh   x7, x15 , x24
    adcs    x25, x25, x8            // t[2] += lo(a[2] * b[i])
    umulh   x8, x16 , x24
    adcs    x26, x26, x9            // t[3] += lo(a[3] * b[i])
    umulh   x9, x17 , x24

    mul     x21, x27, x4            // x21 = lo(t[0] * k0), the reduction factor
    adc     x23, xzr, xzr           // x23 = carry out of t[3]
    //----- lo(n[0~3] * t[0]*k0) -----
    adds    x28, x28, x6            // t[1] += hi(a[0] * b[i])
    adcs    x25, x25, x7            // t[2] += hi(a[1] * b[i])
    mul     x7, x11, x21
    adcs    x26, x26, x8            // t[3] += hi(a[2] * b[i])
    mul     x8, x12, x21
    adc     x23, x23, x9            // t[4] += hi(a[3] * b[i])
    mul     x9, x13, x21
    cmp     x27, #1                 // CF = (x27 != 0): t[0] + lo(n[0]*x21) == 0 mod 2^64
                                    // by construction, and its carry out is 1 iff t[0] != 0
    adcs    x27, x28, x7            // shift down one word: t[0] = t[1] + lo(n[1]*x21) + CF

    //----- hi(n[0~3] *t[0]*k0) -----
    umulh   x6, x10, x21
    umulh   x7, x11, x21
    adcs    x28, x25, x8            // t[1] = t[2] + lo(n[2] * x21)
    umulh   x8, x12, x21
    adcs    x25, x26, x9            // t[2] = t[3] + lo(n[3] * x21)
    umulh   x9, x13, x21
    adcs    x26, x23, x1            // t[3] = t[4] + previous top carry
    adc     x1 , xzr, xzr           // x1 = new top carry

    adds    x27, x27, x6            // fold in hi(n[0~3] * x21)
    adcs    x28, x28, x7
    adcs    x25, x25, x8
    adcs    x26, x26, x9
    adc     x1 , x1 , xzr

    cbnz    x20, .LMul4xSingleStep

    subs    x14 , x27, x10          // t - n
    sbcs    x15 , x28, x11
    sbcs    x16 , x25, x12
    sbcs    x17 , x26, x13
    sbcs    xzr, x1 , xzr           // update CF, determine whether the subtraction borrowed

    csel    x14 , x27, x14 , lo     // lo = borrow occurred (t < n): keep t, else keep t - n
    csel    x15 , x28, x15 , lo
    csel    x16 , x25, x16 , lo
    csel    x17 , x26, x17 , lo
    stp     x14 , x15 , [x0, #8*0]
    stp     x16 , x17 , [x0, #8*2]
    b       .LMontMul4xEnd

.LMul4xProcesStart:
    add     x6, x5, #32             // (size + 4) * 8 bytes of scratch to clear
    eor     v0.16b,v0.16b,v0.16b    // v0 = v1 = 0
    eor     v1.16b,v1.16b,v1.16b
.LMul4xInitstack:
    sub     x6,  x6, #32
    st1     {v0.2d, v1.2d}, [x22], #32  // zero 32 bytes of scratch per pass
    cbnz    x6, .LMul4xInitstack
    mov     x22, sp

    add     x6, x2 , x5            // x6 = &b[size]
    adds    x19, x1 , x5           // x19 = &a[size]
    stp     x0 , x6, [x29, #96]    // push r and &b[size] on the stack frame
    mov     x0 , xzr               // x0 = inter-column carry
    mov     x20, #0                // x20 = byte offset cycling 8, 16, 24, 0

.LMul4xLoopProces:
    ldr     x24, [x2]                // x24 = b word for this round
    ldp     x14 , x15 , [x1], #16
    ldp     x16 , x17 , [x1], #16    // x14~x17 = a[0~3]

    ldp     x10 , x11 , [x3], #16
    ldp     x12 , x13 , [x3], #16    // x10~x13 = n[0~3]

.LMul4xPrepare:
    //----- lo(a[0~3] * b[i]) -----
    adc     x0 , x0 , xzr           // x0 += CF
    add     x20, x20, #8
    and     x20, x20, #31           // x20 &= 0x1f: offset steps 8, 16, 24 and wraps
                                    // to 0 when x20 reaches 32 (end of 4-round pass)
    mul     x6, x14 , x24
    mul     x7, x15 , x24
    mul     x8, x16 , x24
    mul     x9, x17 , x24

    //----- hi(a[0~3] * b[i]) -----
    adds    x27, x27, x6           // t[0] += lo(a[0] * b[i])
    adcs    x28, x28, x7           // t[1] += lo(a[1] * b[i])
    adcs    x25, x25, x8           // t[2] += lo(a[2] * b[i])
    adcs    x26, x26, x9           // t[3] += lo(a[3] * b[i])
    umulh   x6, x14 , x24           // x6 = hi(a[0] * b[i])
    umulh   x7, x15 , x24
    umulh   x8, x16 , x24
    umulh   x9, x17 , x24

    mul     x21, x27, x4            // x21 = lo(t[0] * k0); recomputed every round
                                    // because t[0] changes every round
    adc     x23, xzr, xzr           // t[4] += CF (set by t[3] += lo(a[3] * b[i]))

    ldr     x24, [x2, x20]          // x24 = next b word (offset wraps to reload b[0])

    str     x21, [x22], #8          // push t[0] * k0 on the scratch stack, x22 += 8

    //----- lo(n[0~3] * t[0]*k0) -----
    adds    x28, x28, x6           // t[1] += hi(a[0] * b[i])
    adcs    x25, x25, x7           // t[2] += hi(a[1] * b[i])
    adcs    x26, x26, x8           // t[3] += hi(a[2] * b[i])
    adc     x23, x23, x9           // t[4] += hi(a[3] * b[i])
    mul     x7, x11, x21           // x7 = lo(n[1] * t[0]*k0)
    mul     x8, x12, x21           // x8 = lo(n[2] * t[0]*k0)
    mul     x9, x13, x21           // x9 = lo(n[3] * t[0]*k0)
    cmp     x27, #1                 // CF = (x27 != 0), the implicit carry out of
                                    // t[0] + lo(n[0]*x21) which is 0 mod 2^64
    adcs    x27, x28, x7           // new t[0] = t[1] + lo(n[1] * t[0]*k0) + CF;
                                   // this is the divide-by-b step, r = 2^64

    //----- hi(n[0~3] *t[0]*k0) -----
    adcs    x28, x25, x8           // x28 = t[2] + lo(n[2] * t[0]*k0)
    adcs    x25, x26, x9           // x25 = t[3] + lo(n[3] * t[0]*k0)
    adcs    x26, x23, x0            // x26 = t[4] + previous inter-column carry + CF
    adc     x0 , xzr, xzr           // x0 = CF
    umulh   x6, x10, x21           // x6 = hi(n[0] * t[0]*k0)
    umulh   x7, x11, x21           // x7 = hi(n[1] * t[0]*k0)
    umulh   x8, x12, x21           // x8 = hi(n[2] * t[0]*k0)
    umulh   x9, x13, x21           // x9 = hi(n[3] * t[0]*k0)

    adds    x27, x27, x6           // x27 += hi(n[0] * t[0]*k0)
    adcs    x28, x28, x7           // x28 += hi(n[1] * t[0]*k0)
    adcs    x25, x25, x8           // x25 += hi(n[2] * t[0]*k0)
    adcs    x26, x26, x9           // x26 += hi(n[3] * t[0]*k0)

    cbnz    x20, .LMul4xPrepare

    // Four t[0] * k0 words are stacked in each pass of the loop above.
    adc     x0 , x0 , xzr
    ldp     x6, x7, [x22, #8*4]   // t[4~7] computed in an earlier pass
    ldp     x8, x9, [x22, #8*6]

    adds    x27, x27, x6
    adcs    x28, x28, x7
    adcs    x25, x25, x8
    adcs    x26, x26, x9
    ldr     x21, [sp]               // x21 = saved t[0] * k0

.LMul4xReduceBegin:
    ldp     x14 , x15 , [x1], #16
    ldp     x16 , x17 , [x1], #16    // x14~x17 = next a[0~3] window
    ldp     x10 , x11 , [x3], #16
    ldp     x12 , x13 , [x3], #16    // next n[0~3] window

.LMul4xReduceProces:
    adc     x0 , x0 , xzr
    add     x20, x20, #8
    and     x20, x20, #31           // offset cycles 8, 16, 24, 0 as above
    //----- lo(a[4~7] * b[i]) -----
    mul     x6, x14 , x24
    mul     x7, x15 , x24
    mul     x8, x16 , x24
    mul     x9, x17 , x24

    //----- hi(a[4~7] * b[i]) -----
    adds    x27, x27, x6           // x27~x26 += lo(a[4~7] * b[i])
    adcs    x28, x28, x7
    adcs    x25, x25, x8
    adcs    x26, x26, x9
    adc     x23, xzr, xzr
    umulh   x6, x14 , x24
    umulh   x7, x15 , x24
    umulh   x8, x16 , x24
    umulh   x9, x17 , x24

    ldr     x24, [x2, x20]          // x24 = next b word

    //----- lo(n[4~7] * t[0]*k0) -----
    adds    x28, x28, x6           // fold in hi(a[4~7] * b[i])
    adcs    x25, x25, x7
    adcs    x26, x26, x8
    adc     x23, x23, x9
    mul     x6, x10, x21
    mul     x7, x11, x21
    mul     x8, x12, x21
    mul     x9, x13, x21

    //----- hi(n[4~7] * t[0]*k0) -----
    adds    x27, x27, x6           // fold in lo(n[4~7] * t[0]*k0)
    adcs    x28, x28, x7
    adcs    x25, x25, x8
    adcs    x26, x26, x9
    adcs    x23, x23, x0
    umulh   x6, x10, x21
    umulh   x7, x11, x21
    umulh   x8, x12, x21
    umulh   x9, x13, x21

    ldr     x21, [sp, x20]          // x21 = next saved t[0]*k0

    adc     x0 , xzr, xzr           // x0 = CF, record the carry
    str     x27, [x22], #8          // low word of this column is final: store, x22 += 8

    adds    x27, x28, x6            // shift window down and add hi(n[4~7] * t[0]*k0)
    adcs    x28, x25, x7
    adcs    x25, x26, x8
    adcs    x26, x23, x9

    cbnz    x20, .LMul4xReduceProces
    sub     x6, x19, x1            // x6 = &a[size] - &a[i]; x1 advances each pass,
                                   // so x6 == 0 marks the end of the inner loop
    adc     x0 , x0 , xzr
    cbz     x6, .LMul4xLoopExitCheck

    ldp     x6, x7, [x22, #8*4]   // t[4~7] from the previous outer pass
    ldp     x8, x9, [x22, #8*6]

    adds    x27, x27, x6
    adcs    x28, x28, x7
    adcs    x25, x25, x8
    adcs    x26, x26, x9

    b       .LMul4xReduceBegin

.LMul4xLoopExitCheck:
    ldr     x9, [x29, #104]         // pop &b[size] off the stack frame
    add     x2 , x2 , #8*4          // b advances by 4 words per outer pass: &b[4], &b[8], ...
    sub     x9 , x9, x2                // x9 == 0 means the outer loop is done

    adds    x27, x27, x30
    adcs    x28, x28, xzr
    stp     x27, x28, [x22, #8*0]   // store finished top words x27, x28
    ldp     x27, x28, [sp , #8*4]   // reload t[0], t[1] for the next pass

    adcs    x25, x25, xzr
    adcs    x26, x26, xzr
    stp     x25, x26, [x22, #8*2]   // store finished top words x25, x26
    ldp     x25, x26, [sp , #8*6]   // reload t[2], t[3]
    adc     x30, x0 , xzr
    sub     x3 , x3 , x5            // x3 = &n[0] (rewind modulus pointer)
    cbz     x9, .LMul4xEnd

    sub     x1 , x1 , x5            // x1 = &a[0] (rewind operand pointer)

    mov     x22, sp
    mov     x0 , xzr
    b       .LMul4xLoopProces

.LMul4xEnd:
    ldp     x10, x11, [x3], #16
    ldp     x12, x13, [x3], #16
    ldr     x0, [x29, #96]         // pop r off the stack frame
    mov     x19, x0                // backup, x19 = &r[0]
    subs    x6, x27, x10           // t[0] - n[0], sets CF
    sbcs    x7, x28, x11           // t[1] - n[1] - borrow
    add     x22, sp , #8*8         // x22 = &t[8]
    sub     x20, x5 , #8*4         // x20 = (size - 4)*8

    // t - n, x22 = &t[8], x3 = &n[0]
.LMul4xSubMod:
    sbcs    x8, x25, x12
    sbcs    x9, x26, x13

    ldp     x10, x11, [x3], #16
    ldp     x12, x13, [x3], #16
    ldp     x27, x28, [x22], #16
    ldp     x25, x26, [x22], #16

    sub     x20, x20, #8*4

    stp     x6, x7, [x0], #16
    stp     x8, x9, [x0], #16

    sbcs    x6, x27, x10
    sbcs    x7, x28, x11

    cbnz    x20, .LMul4xSubMod

    sbcs    x8, x25, x12
    sbcs    x9, x26, x13
    sbcs    xzr, x30, xzr           // CF = x30 - borrow; x30 holds the recorded top carry

    add     x1 , sp , #8*4          // scratch holds size + 4 words; x1 = sp + 8*4 = &t[0]
    stp     x6, x7, [x0]
    stp     x8, x9, [x0, #8*2]

.LMul4xCopy:
    ldp     x14 , x15 , [x19]         // x14~x17 = r[0~3], currently holding t - n
    ldp     x16 , x17 , [x19, #8*2]
    ldp     x27, x28, [x1], #16       // x27~x26 = corresponding unreduced t words
    ldp     x25, x26, [x1], #16
    sub     x5, x5, #8*4
    csel    x6, x27, x14, lo        // lo = borrow occurred (t < n): keep t, else keep t - n
    csel    x7, x28, x15, lo
    csel    x8, x25, x16, lo
    csel    x9, x26, x17, lo
    stp     x6, x7, [x19], #16
    stp     x8, x9, [x19], #16

    cbnz    x5, .LMul4xCopy

.LMontMul4xEnd:
    ldr     x30, [x29, #8]          // pop saved LR
    ldp     x27, x28, [x29, #16]    // restore callee-saved registers
    ldp     x25, x26, [x29, #32]
    ldp     x23, x24, [x29, #48]
    ldp     x21, x22, [x29, #64]
    ldp     x19, x20, [x29, #80]
    mov     sp , x29                // unwind the scratch area
    ldr     x29, [sp], #128         // restore FP and release the 128-byte frame
AARCH64_AUTIASP
    ret
.size   MontMul4, .-MontMul4
1436
1437#endif
1438