// (code-viewer navigation residue from the page this file was extracted from:
//  Home / Line# / Scopes# / Navigate# / Raw / Download — kept as comments)
1/*
2 * This file is part of the openHiTLS project.
3 *
4 * openHiTLS is licensed under the Mulan PSL v2.
5 * You can use this software according to the terms and conditions of the Mulan PSL v2.
6 * You may obtain a copy of Mulan PSL v2 at:
7 *
8 *     http://license.coscl.org.cn/MulanPSL2
9 *
10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
13 * See the Mulan PSL v2 for more details.
14 */
15
16#include "hitls_build.h"
17#ifdef HITLS_CRYPTO_BN
18
19.file   "bn_mont_x86_64.S"
20.text
21
// ADD_CARRY a, b: b += a, then fold the resulting carry into %rdx.
// Used right after a widening `mulq`, where %rdx holds the high 64 bits of
// the product: the carry out of (b += a) is accumulated into that high word.
// Clobbers: flags. %rdx must already hold the value the carry belongs to.
.macro  ADD_CARRY a b
    addq    \a,\b                       // b += a, sets CF.
    adcq    $0,%rdx                     // rdx += CF (propagate carry into high word).
.endm
26
// Push all SysV AMD64 callee-saved GPRs (r15..r12, rbp, rbx), 48 bytes total.
// Must be paired with RESTORE_REGISTERS, which pops in the reverse order.
.macro  SAVE_REGISTERS
    pushq   %r15                        // Save non-volatile registers.
    pushq   %r14
    pushq   %r13
    pushq   %r12
    pushq   %rbp
    pushq   %rbx
.endm
35
// Pop the callee-saved GPRs pushed by SAVE_REGISTERS (exact reverse order).
// The caller must have restored %rsp to the value it had just after
// SAVE_REGISTERS before invoking this macro.
.macro  RESTORE_REGISTERS
    popq    %rbx              // Restore non-volatile registers.
    popq    %rbp
    popq    %r12
    popq    %r13
    popq    %r14
    popq    %r15
.endm
44
/*
 * void MontMulx_Asm(uint64_t *r, const uint64_t *a, const uint64_t *b,
 *                     const uint64_t *n, const uint64_t k0, uint32_t size);
 *
 * Montgomery multiplication: r = a * b * R^(-1) mod n, where R = 2^(64*size)
 * and k0 is the Montgomery constant (k0 = -n^(-1) mod 2^64 by convention;
 * its use with `imulq` below matches that — confirm against the C caller).
 *
 * ABI (SysV AMD64): rdi = r, rsi = a, rdx = b, rcx = n, r8 = k0, r9d = size.
 *
 * This entry point only dispatches:
 *   - size not a multiple of 4, or size < 8  -> generic path (.LMontMul)
 *   - a != b                                 -> MontMul4x (MULX/ADX 4-way)
 *   - a == b and size multiple of 8          -> MontSqr8x (squaring)
 *   - otherwise                              -> MontMul4x
 */
.globl  MontMulx_Asm
.type   MontMulx_Asm,@function
.align  16
MontMulx_Asm:
.cfi_startproc
    testl   $3,%r9d
    jnz     .LMontMul                   // size % 4 != 0 -> generic path.
    cmpl    $8,%r9d
    jb      .LMontMul                   // size < 8 -> generic path.
    cmpq    %rsi,%rdx
    jne     MontMul4x                    // a != b -> 4-way multiply.
    testl   $7,%r9d
    jz      MontSqr8x                    // a == b, size % 8 == 0 -> squaring.
    jmp     MontMul4x

// Generic word-serial CIOS Montgomery multiply.
// Register roles in this path:
//   rdi=r  rsi=a  r13=b  rcx=n  r8=k0  r9=size
//   r11=i (outer index)  r10=j (inner index)  rbx=b[i]  rbp=m=t[0]*k0 mod 2^64
//   (%rsp)..(size*8)(%rsp) = t[0..size] working buffer;
//   original rsp saved at 8(%rsp,size,8).
.align  16
.LMontMul:
    SAVE_REGISTERS                          // Save non-volatile registers.
    movq    %rsp,%rax                       // rax = original rsp (saved below).

    movq    %r9, %r15
    negq    %r15                            // r15 = -size
    leaq    -16(%rsp, %r15, 8), %r15        // r15 = rsp - size * 8 - 16
    andq    $-1024, %r15                    // Align the new stack frame down to 1 KB.
    movq    %rsp, %r14                      // r14 = rsp

    subq    %r15,%r14                       // __chkstk-style probing: when growing the stack by
                                            // more than a page (4096 B), touch each page in turn.
    andq    $-4096,%r14                     // r14 = frame growth rounded down to 4 KB.
    leaq    (%r15,%r14),%rsp                // rsp = r15 + r14
    cmpq    %r15,%rsp                       // More than one page still to allocate?
    ja      .LoopPage
    jmp     .LMulBody

.align  16
.LoopPage:
    leaq    -4096(%rsp),%rsp            // Drop rsp one page at a time (probing) until rsp <= r15.
    cmpq    %r15,%rsp
    ja      .LoopPage

.LMulBody:
    movq    %rax,8(%rsp,%r9,8)          // Save the original rsp above t[size].
    movq    %rdx,%r13                   // r13 = b

    xorq    %r11,%r11                   // i = 0
    xorq    %r10,%r10                   // j = 0

    // ---- First outer iteration (i = 0): t = a * b[0] + m * n ----
    movq    (%r13),%rbx                 // rbx = b[0]
    movq    (%rsi),%rax                 // rax = a[0]
    mulq    %rbx                        // (rdx, rax) = a[0] * b[0]
    movq    %rax,%r15                   // r15 = t[0] = lo(a[0] * b[0])
    movq    %rdx,%r14                   // r14 = hi(a[0] * b[0])

    movq    %r8,%rbp                    // rbp = k0
    imulq   %r15,%rbp                   // rbp = m = t[0] * k0 mod 2^64
    movq    (%rcx),%rax                 // rax = n[0]
    mulq    %rbp                        // (rdx, rax) = m * n[0]
    ADD_CARRY    %rax,%r15              // t[0] + lo(m * n[0]); low word is 0 mod 2^64 by
                                        // construction, only the carry (in rdx) matters.

    leaq    1(%r10),%r10                // j = 1

.Loop1st:
    movq    (%rsi,%r10,8),%rax          // rax = a[j]
    movq    %rdx,%r12                   // r12 = hi(m * n[j-1]) (+ carry folded in).

    mulq    %rbx                        // (rdx, rax) = a[j] * b[0]
    ADD_CARRY    %rax,%r14              // r14 = hi-carry from column j-1 + lo(a[j] * b[0])
    movq    %rdx,%r15                   // r15 = hi(a[j] * b[0])

    movq    (%rcx,%r10,8),%rax          // rax = n[j]
    mulq    %rbp                        // (rdx, rax) = m * n[j]
    leaq    1(%r10),%r10                // j++
    cmpq    %r9,%r10                    // Exit the loop once j reaches size.
    je      .Loop1stSkip

    ADD_CARRY    %rax,%r12              // r12 += lo(m * n[j])
    ADD_CARRY    %r14,%r12              // r12 += a-side column sum.
    movq    %r12,-16(%rsp,%r10,8)       // t[j - 2] = r12 (result shifted down one word).
    movq    %r14,%r14                   // NOTE(review): kept as in source order below.
    movq    %r15,%r14                   // r14 = hi(a[j] * b[0]) carried to next column.
    jmp     .Loop1st

.Loop1stSkip:
    ADD_CARRY    %rax,%r12              // r12 += lo(m * n[size-1])
    ADD_CARRY    %r14,%r12              // r12 += a-side column sum.
    movq    %r12,-16(%rsp,%r10,8)       // t[size - 2] = r12
    movq    %r15,%r14                   // r14 = hi(a[size-1] * b[0])

    movq    %rdx,%r12                   // r12 = hi(m * n[size-1])
    xorq    %rdx,%rdx                   // rdx = 0 (also clears CF).
    ADD_CARRY    %r14,%r12              // r12 = both high words of the top column.
    movq    %r12,-8(%rsp,%r9,8)         // t[size - 1] = top column sum.
    movq    %rdx,(%rsp,%r9,8)           // t[size] = overflow bit.

    leaq    1(%r11),%r11                // i = 1

// ---- Remaining outer iterations: t = (t + a * b[i] + m * n) / 2^64 ----
.align  16
.LoopOuter:
    xorq    %r10,%r10                   // j = 0
    movq    (%rsi),%rax                 // rax = a[0]
    movq    (%r13,%r11,8),%rbx          // rbx = b[i]
    mulq    %rbx                        // (rdx, rax) = a[0] * b[i]
    movq    (%rsp),%r15                 // r15 = t[0]
    ADD_CARRY    %rax,%r15              // r15 = t[0] + lo(a[0] * b[i])
    movq    %rdx,%r14                   // r14 = hi(a[0] * b[i]) (+ carry).

    movq    %r8,%rbp                    // rbp = k0
    imulq   %r15,%rbp                   // rbp = m = t[0] * k0 mod 2^64
    movq    (%rcx),%rax                 // rax = n[0]
    mulq    %rbp                        // (rdx, rax) = m * n[0]
    ADD_CARRY    %rax,%r15              // Low word cancels; carry kept in rdx.

    leaq    1(%r10),%r10                // j = 1

.align  16
.LoopInner:
    movq    (%rsi,%r10,8),%rax          // rax = a[j]
    movq    %rdx,%r12                   // r12 = hi(m * n[j-1]) (+ carry).
    movq    (%rsp,%r10,8),%r15          // r15 = t[j]

    mulq    %rbx                        // (rdx, rax) = a[j] * b[i]
    ADD_CARRY    %rax,%r14              // r14 += lo(a[j] * b[i])
    movq    (%rcx,%r10,8),%rax          // rax = n[j]
    ADD_CARRY    %r14,%r15              // r15 = t[j] + column sum from a side.
    movq    %rdx,%r14                   // r14 = hi(a[j] * b[i]) (+ carry).
    leaq    1(%r10),%r10                // j++

    mulq    %rbp                        // (rdx, rax) = m * n[j]
    cmpq    %r9,%r10                    // Exit once j reaches size.
    je      .LoopInnerSkip

    ADD_CARRY    %rax,%r12              // r12 += lo(m * n[j])
    ADD_CARRY    %r15,%r12              // r12 += t[j] + a-side sum.
    movq    %r12,-16(%rsp,%r10,8)       // t[j - 2] = r12 (shifted down one word).
    jmp     .LoopInner

.LoopInnerSkip:
    ADD_CARRY    %rax,%r12              // r12 += lo(m * n[size-1])
    ADD_CARRY    %r15,%r12              // r12 += t[size-1] + a-side sum.
    movq    (%rsp,%r10,8),%r15          // r15 = t[size] (previous overflow word).
    movq    %r12,-16(%rsp,%r10,8)       // t[size - 2] = r12
    movq    %rdx,%r12                   // r12 = hi(m * n[size-1])

    xorq    %rdx,%rdx                   // rdx = 0 (also clears CF).
    ADD_CARRY    %r14,%r12              // r12 += hi(a[size-1] * b[i])
    ADD_CARRY    %r15,%r12              // r12 += old t[size].
    movq    %r12,-8(%rsp,%r9,8)         // t[size - 1] = top column sum.
    movq    %rdx,(%rsp,%r9,8)           // t[size] = new overflow bit.

    leaq    1(%r11),%r11                // i++
    cmpq    %r9,%r11                    // Loop while i != size.
    jne     .LoopOuter

    // ---- Final reduction: r = t - n, then constant-time select ----
    xorq    %r11,%r11                   // i = 0; also clears CF for the first sbbq.
    movq    (%rsp),%rax                 // rax = t[0]
    movq    %r9,%r10                    // r10 = size (loop counter).

.align  16
.LoopSub:
    sbbq    (%rcx,%r11,8),%rax          // r[i] = t[i] - n[i] - borrow
    movq    %rax,(%rdi,%r11,8)
    movq    8(%rsp,%r11,8),%rax         // rax = t[i + 1]

    leaq    1(%r11),%r11                // i++
    decq    %r10                        // j--
    jnz     .LoopSub                    // while j != 0

    sbbq    $0,%rax                     // rax = t[size] - final borrow: all-ones iff t < n.
    movq    $-1,%rbx
    xorq    %rax,%rbx                   // rbx = ~rax (complementary mask).
    xorq    %r11,%r11                   // i = 0
    movq    %r9,%r10                    // r10 = size

// Branch-free select: r[i] = (r[i] & ~mask) | (t[i] & mask), so the copy is
// independent of whether the subtraction borrowed (constant-time w.r.t. data).
.LoopCopy:
    movq    (%rdi,%r11,8),%rcx          // rcx = r[i] & ~mask
    andq    %rbx,%rcx
    movq    (%rsp,%r11,8),%rdx          // rdx = t[i] & mask
    andq    %rax,%rdx
    orq     %rcx,%rdx
    movq    %rdx,(%rdi,%r11,8)          // r[i] = selected word.
    movq    %r9,(%rsp,%r11,8)           // Scrub t[i] (overwrite with the public value size).
    leaq    1(%r11),%r11                // i++
    subq    $1,%r10                     // j--
    jnz     .LoopCopy                   // while j != 0

    movq    8(%rsp,%r9,8),%rsi          // rsi = saved original rsp.
    movq    $1,%rax                     // Return 1 (success).
    leaq    (%rsi),%rsp                 // Restore rsp.
    RESTORE_REGISTERS                   // Restore non-volatile registers.
    ret
.cfi_endproc
.size   MontMulx_Asm,.-MontMulx_Asm
240
/*
 * MontMul4x: Montgomery multiplication for size % 4 == 0, size >= 8, using
 * MULX plus the ADX dual-carry-chain instructions (ADCX drives CF, ADOX
 * drives OF), processing a and n four 64-bit words at a time per b-word.
 *
 * Entered by tail-jump from MontMulx_Asm with the same register arguments:
 *   rdi = r, rsi = a, rdx = b, rcx = n, r8 = k0, r9 = size.
 *
 * Stack frame layout (after .LoopMul4x):
 *    0(%rsp) = saved original rsp     8(%rsp) = r
 *   16(%rsp) = k0                    24(%rsp) = size/4 - 1 (inner trip count)
 *   32(%rsp) = size * 8 (bytes)      40(%rsp) = &b[size] (outer-loop sentinel)
 *   48(%rsp) = tmp[0] .. tmp[size]   (writes below go through rbp = 80(%rsp),
 *                                     offset -32 => first store is 48(%rsp))
 */
.type   MontMul4x,@function
.align  16
MontMul4x:
.cfi_startproc
    SAVE_REGISTERS
    movq    %rsp,%rax                   // rax = original rsp (saved below).

    movq    %r9,%r15
    negq    %r15
    leaq    -48(%rsp,%r15,8),%r15       // Frame: size * 8 + 48 bytes.
    andq    $-1024,%r15                 // Align frame base down to 1 KB.
    movq    %rsp,%r14

    subq    %r15,%r14                   // __chkstk-style probing when the frame
    andq    $-4096,%r14                 // exceeds one 4 KB page.
    leaq    (%r15,%r14),%rsp
    cmpq    %r15,%rsp                   // More than one page still to allocate?
    ja      .LoopPage4x
    jmp     .LoopMul4x

.LoopPage4x:
    leaq    -4096(%rsp),%rsp            // Drop one page at a time until rsp <= r15.
    cmpq    %r15,%rsp
    ja      .LoopPage4x

.LoopMul4x:
    movq    %rax, 0(%rsp)         // Save original stack pointer.
    movq    %rdi, 8(%rsp)         // Save r.
    movq    %r8, 16(%rsp)         // Save k0.
    movq    %r9, %r10
    shrq    $2, %r9
    decq    %r9
    movq    %r9, 24(%rsp)         // Save (size/4) - 1.
    shlq    $3, %r10
    movq    %rdx, %r12            // r12 = b
    movq    %r10, 32(%rsp)        // Save size * 8 (byte length).

    addq    %r10, %r12            // r12 = &b[size]
    leaq    80(%rsp),%rbp         // rbp: write cursor into the tmp buffer.

    movq    %rdx,%r13             // r13 = b
    movq    %r12, 40(%rsp)        // Save &b[size] (outer-loop end sentinel).
    movq    (%r13),%rdx           // rdx = b[0] (implicit MULX operand).

    // ---- i = 0 head: a[0..3] * b[0] ----
    mulx    (%rsi), %r12, %r14          // r12 = lo(a[0]*b[0]), r14 = hi(a[0]*b[0])
    mulx    8(%rsi), %rax, %r15         // (rax, r15) = a[1] * b[0]
    addq    %rax, %r14                  // r14 = hi(a[0]*b[0]) + lo(a[1]*b[0])
    mulx    16(%rsi), %rax, %r11        // (rax, r11) = a[2] * b[0]
    adcq    %rax, %r15                  // r15 = hi(a[1]*b[0]) + lo(a[2]*b[0])
    adcq    $0, %r11                    // r11 = hi(a[2]*b[0]) + CF

    imulq   %r12,%r8                    // r8 = m' = t[0] * k0 (clobbers flags!).
    xorq    %r10,%r10                   // r10 = 0; also clears CF and OF for ADX.

    mulx    24(%rsi), %rax, %rbx        // (rax, rbx) = a[3] * b[0]
    movq    %r8, %rdx                   // rdx = m'
    adcx    %rax, %r11                  // r11 += lo(a[3]*b[0])
    adcx    %r10, %rbx                  // rbx = hi(a[3]*b[0]) + CF

    // ---- n[0..3] * m', merged into the a*b column sums ----
    mulx    (%rcx), %rax, %rdi          // (rax, rdi) = n[0] * m'
    adcx    %rax, %r12                  // r12 + lo(n[0]*m') == 0 mod 2^64; carry only.
    adox    %r14, %rdi                  // rdi = hi(n[0]*m') + column 1 sum.

    mulx    8(%rcx), %rax, %r14         // (rax, r14) = n[1] * m'
    adcx    %rax, %rdi
    adox    %r15, %r14                  // r14 = hi(n[1]*m') + column 2 sum.
    movq    %rdi, -32(%rbp)             // tmp[0]

    mulx    16(%rcx), %rax, %r15        // (rax, r15) = n[2] * m'
    adcx    %rax, %r14
    adox    %r11, %r15                  // r15 = hi(n[2]*m') + column 3 sum.
    movq    %r14, -24(%rbp)             // tmp[1]

    mulx    24(%rcx), %rax, %r11        // (rax, r11) = n[3] * m'
    adcx    %rax, %r15
    adox    %r10, %r11                  // r11 = hi(n[3]*m') + OF
    movq    %r15, -16(%rbp)             // tmp[2]

    leaq    4*8(%rsi),%rsi              // Advance a by 4 words.
    leaq    4*8(%rcx),%rcx              // Advance n by 4 words.
    movq    (%r13),%rdx                 // rdx = b[0] again.

// ---- i = 0 body: remaining groups of 4 words ----
.align  16
.Loop1st4x:
    mulx    (%rsi), %r12, %r14          // (r12, r14) = a[4k] * b[0]
    adcx    %r10, %r11                  // Fold pending CF into r11.
    mulx    8(%rsi), %rax, %r15         // (rax, r15) = a[4k+1] * b[0]
    adcx    %rbx, %r12                  // r12 += hi carried from previous group.
    adcx    %rax, %r14                  // r14 = hi(a[4k]*b[0]) + lo(a[4k+1]*b[0])
    mulx    16(%rsi), %rax, %rdi        // (rax, rdi) = a[4k+2] * b[0]
    adcx    %rax, %r15                  // r15 = hi(a[4k+1]*b[0]) + lo(a[4k+2]*b[0])
    mulx    24(%rsi), %rax, %rbx        // (rax, rbx) = a[4k+3] * b[0]
    adcx    %rax, %rdi                  // rdi = hi(a[4k+2]*b[0]) + lo(a[4k+3]*b[0])
    adcx    %r10, %rbx                  // rbx = hi(a[4k+3]*b[0]) + CF

    movq    %r8, %rdx                   // rdx = m'
    adox    %r11,%r12                   // r12 += carry word from previous n-group.
    mulx    (%rcx), %rax, %r11          // (rax, r11) = n[4k] * m'
    leaq    4*8(%rsi), %rsi             // Advance a by 4 words.
    adcx    %rax,%r12                   // r12 += lo(n[4k] * m')
    adox    %r14, %r11                  // r11 = hi(n[4k]*m') + next column sum.


    mulx    8(%rcx), %rax, %r14         // (rax, r14) = n[4k+1] * m'
    leaq    4*8(%rbp), %rbp             // Advance tmp cursor by 4 words.
    adcx    %rax, %r11
    adox    %r15, %r14                  // r14 = hi(n[4k+1]*m') + next column sum.

    mulx    16(%rcx), %rax, %r15         // (rax, r15) = n[4k+2] * m'
    movq    %r12, -5*8(%rbp)
    adcx    %rax, %r14
    adox    %rdi, %r15                   // r15 = hi(n[4k+2]*m') + next column sum.
    movq    %r11, -4*8(%rbp)

    mulx    24(%rcx), %rax, %r11         // (rax, r11) = n[4k+3] * m'
    movq    %r14, -3*8(%rbp)
    adcx    %rax, %r15

    adox    %r10, %r11                   // r11 = hi(n[4k+3]*m') + OF
    movq    %r15, -2*8(%rbp)

    leaq    4*8(%rcx), %rcx             // Advance n by 4 words.
    movq    (%r13),%rdx                 // Reload rdx = b[0].
    dec     %r9
    jnz      .Loop1st4x

    movq    32(%rsp), %r15              // r15 = size * 8
    leaq    8(%r13), %r13               // Advance b by one word.

    adcx    %r10, %r11                  // Fold final CF; both chains now drained.
    addq    %r11, %rbx                  // Top word: hi(a-side) + hi(n-side).
    sbbq    %r11,%r11                   // r11 = -(carry out of top word) (0 or -1 mask).
    movq    %rbx, -1*8(%rbp)            // tmp[size - 1]

// ---- Outer loop over b[1 .. size-1] ----
.align  4
.LoopOuter4x:
    movq    (%r13),%rdx                 // rdx = b[i]
    mov     %r11, (%rbp)                // Park the top-carry mask at tmp[size].
    subq    %r15, %rsi                  // Rewind a to a[0].
    subq    %r15, %rcx                  // Rewind n to n[0].
    leaq    80(%rsp),%rbp               // Reset tmp write cursor.

    // Head: a[0..3] * b[i] + tmp[0..3] via the OF chain.
    mulx    (%rsi), %r12, %r14          // (r12, r14) = a[0] * b[i]
    xorq    %r10,%r10                   // r10 = 0; clears CF and OF.

    mulx    8(%rsi), %rax, %r15         // (rax, r15) = a[1] * b[i]
    adox    -4*8(%rbp), %r12            // r12 += tmp[0]
    adcx    %rax, %r14                  // r14 = hi(a[0]*b[i]) + lo(a[1]*b[i])

    mulx    16(%rsi), %rax, %r11        // (rax, r11) = a[2] * b[i]
    adox    -3*8(%rbp),%r14             // r14 += tmp[1]
    adcx    %rax, %r15                  // r15 = hi(a[1]*b[i]) + lo(a[2]*b[i])

    mulx    24(%rsi), %rax, %rbx        // (rax, rbx) = a[3] * b[i]
    adox    -2*8(%rbp),%r15             // r15 += tmp[2]
    adcx    %rax, %r11                  // r11 = hi(a[2]*b[i]) + lo(a[3]*b[i])
    adox    -1*8(%rbp),%r11             // r11 += tmp[3]
    adcx    %r10,%rbx                   // rbx = hi(a[3]*b[i]) + CF
    movq    %r12, %rdx
    adox    %r10,%rbx                   // rbx += OF

    imulq   16(%rsp),%rdx               // rdx = m' = t[0] * k0 (k0 at 16(%rsp); clobbers flags).
    mulx    (%rcx), %rax, %r8           // (rax, r8) = n[0] * m'
    xorq    %r10, %r10                  // Clear CF and OF again after imulq.

    adcx    %rax, %r12                  // r12 + lo(n[0]*m') == 0 mod 2^64; carry only.
    adox    %r14, %r8                   // r8 = hi(n[0]*m') + column 1 sum.

    mulx    8(%rcx), %rax, %rdi         // (rax, rdi) = n[1] * m'
    leaq    4*8(%rsi),%rsi              // Advance a by 4 words.
    adcx    %rax, %r8
    adox    %r15, %rdi                  // rdi = hi(n[1]*m') + column 2 sum.

    mulx    16(%rcx), %rax, %r15        // (rax, r15) = n[2] * m'
    movq    %r8, -32(%rbp)              // tmp[0]
    adcx    %rax, %rdi
    adox    %r11, %r15                  // r15 = hi(n[2]*m') + column 3 sum.

    mulx    24(%rcx), %rax, %r11        // (rax, r11) = n[3] * m'
    movq    %rdi, -24(%rbp)             // tmp[1]
    adcx    %rax, %r15
    adox    %r10, %r11                  // r11 = hi(n[3]*m') + OF
    movq    %r15, -16(%rbp)             // tmp[2]

    leaq    4*8(%rcx),%rcx              // Advance n by 4 words.

    movq    %rdx, %r8                   // r8 = m' (kept across the inner loop).
    movq    (%r13), %rdx                // rdx = b[i]
    movq    24(%rsp), %r9               // r9 = inner trip count = size/4 - 1.

.align  16
.Linner4x:
    mulx    (%rsi), %r12, %r14          // (r12, r14) = a[4k] * b[i]
    adcx    %r10, %r11                  // Fold pending CF from previous group.

    adox    %rbx, %r12                  // r12 += hi carried from previous group.

    mulx    8(%rsi), %rax, %r15         // (rax, r15) = a[4k+1] * b[i]
    adcx    (%rbp), %r12                // r12 += tmp[4k] (tmp cursor not yet advanced here).
    adox    %rax,  %r14                 // r14 = hi(a[4k]*b[i]) + lo(a[4k+1]*b[i])

    mulx    16(%rsi), %rax, %rdi        // (rax, rdi) = a[4k+2] * b[i]
    adcx    8(%rbp), %r14               // r14 += tmp[4k+1]
    adox    %rax, %r15                  // r15 = hi(a[4k+1]*b[i]) + lo(a[4k+2]*b[i])

    mulx    24(%rsi), %rax, %rbx        // (rax, rbx) = a[4k+3] * b[i]
    adcx    16(%rbp), %r15              // r15 += tmp[4k+2]
    adox    %rax, %rdi                  // rdi = hi(a[4k+2]*b[i]) + lo(a[4k+3]*b[i])

    adox    %r10, %rbx                  // rbx += OF
    adcx    24(%rbp), %rdi              // rdi += tmp[4k+3]
    adcx    %r10, %rbx                  // rbx += CF

    // Switch rdx to m' and fold in n[4k..4k+3] * m'.
    adox    %r11,%r12                   // r12 += carry word from previous n-group.
    movq    %r8, %rdx
    mulx    (%rcx), %rax, %r11          // (rax, r11) = n[4k] * m'
    leaq    4*8(%rbp), %rbp             // Advance tmp cursor by 4 words.
    adcx    %rax,%r12                   // r12 += lo(n[4k] * m')
    adox    %r14, %r11                  // r11 = hi(n[4k]*m') + next column sum.

    mulx    8(%rcx), %rax, %r14         // (rax, r14) = n[4k+1] * m'
    leaq    4*8(%rsi), %rsi             // Advance a by 4 words.
    adcx    %rax, %r11
    adox    %r15, %r14                  // r14 = hi(n[4k+1]*m') + next column sum.

    mulx    16(%rcx), %rax, %r15        // (rax, r15) = n[4k+2] * m'
    movq    %r12, -5*8(%rbp)
    adcx    %rax, %r14
    movq    %r11, -4*8(%rbp)
    adox    %rdi, %r15                  // r15 = hi(n[4k+2]*m') + next column sum.

    mulx    24(%rcx), %rax, %r11        // (rax, r11) = n[4k+3] * m'
    movq    %r14, -3*8(%rbp)
    adcx    %rax, %r15

    adox    %r10, %r11                  // r11 = hi(n[4k+3]*m') + OF
    movq    %r15, -2*8(%rbp)

    leaq    4*8(%rcx), %rcx             // Advance n by 4 words.
    movq    (%r13), %rdx                // Reload rdx = b[i].
    dec     %r9
    jnz     .Linner4x

    movq    32(%rsp), %r15              // r15 = size * 8
    leaq    8(%r13), %r13               // Advance b by one word.

    adcx    %r10, %r11                  // Drain CF into r11.
    subq    0*8(%rbp), %r10             // Sets CF iff the parked top-carry word at tmp[size] != 0.
    adcx    %r11, %rbx                  // rbx = top word + r11 + previous top carry.
    sbbq    %r11,%r11                   // r11 = -(new top carry) (0 or -1 mask).
    movq    %rbx, -1*8(%rbp)            // tmp[size - 1]
    cmp     40(%rsp), %r13              // Loop until b cursor reaches &b[size].
    jne    .LoopOuter4x

    // ---- Final reduction: r = tmp - n, then constant-time select ----
    leaq   48(%rsp),%rbp                // rbp = &tmp[0] (first stored word).
    subq    %r15, %rcx                  // Rewind n to n[0].
    negq    %r11                        // r11 = top carry (0 or 1).

    movq   24(%rsp), %rdx            // rdx = size/4 - 1 (loop count after first group).

    movq   8(%rsp), %rdi             // rdi = r

    // First group of 4: tmp[0..3] - n[0..3] (sub/sbb establishes the borrow chain).
    movq    0(%rbp), %rax            // rax = tmp[0]
    movq    8(%rbp), %rbx            // rbx = tmp[1]
    movq    16(%rbp), %r10           // r10 = tmp[2]
    movq    24(%rbp), %r12           // r12 = tmp[3]

    leaq    32(%rbp), %rbp           // tmp += 4

    subq    0(%rcx), %rax            // tmp[0] - n[0]
    sbbq    8(%rcx), %rbx            // tmp[1] - n[1] - borrow
    sbbq    16(%rcx), %r10           // tmp[2] - n[2] - borrow
    sbbq    24(%rcx), %r12           // tmp[3] - n[3] - borrow

    leaq    32(%rcx), %rcx           // n += 4

    movq    %rax, 0(%rdi)            // r gets the tentative difference.
    movq    %rbx, 8(%rdi)
    movq    %r10, 16(%rdi)
    movq    %r12, 24(%rdi)

    leaq    32(%rdi), %rdi           // r += 4

.LoopSub4x:
    movq    0(%rbp), %rax            // rax = tmp[4k]
    movq    8(%rbp), %rbx            // rbx = tmp[4k+1]
    movq    16(%rbp), %r10           // r10 = tmp[4k+2]
    movq    24(%rbp), %r12           // r12 = tmp[4k+3]

    leaq    32(%rbp), %rbp

    sbbq    0(%rcx), %rax            // tmp - n - borrow, 4 words at a time.
    sbbq    8(%rcx), %rbx
    sbbq    16(%rcx), %r10
    sbbq    24(%rcx), %r12

    leaq    32(%rcx), %rcx

    movq    %rax, 0(%rdi)
    movq    %rbx, 8(%rdi)
    movq    %r10, 16(%rdi)
    movq    %r12, 24(%rdi)

    leaq    32(%rdi), %rdi

    decq    %rdx                    // j--
    jnz     .LoopSub4x              // while j != 0

    sbbq    $0,%r11                 // r11 = top carry - final borrow: -1 iff tmp < n.
    subq    %r15, %rbp              // Rewind rbp to tmp[0].
    subq    %r15, %rdi              // Rewind rdi to r[0].

    movq    24(%rsp), %r10          // r10 = size/4 - 1 (copy-loop count after first group).

    // Build SSE select masks: xmm0 = broadcast(r11), xmm1 = ~xmm0.
    // r[i] = (tmp[i] & xmm0) | (r[i] & xmm1); tmp is scrubbed with zeros.
    pxor    %xmm2,%xmm2             // xmm2 = 0 (used to scrub tmp).
    movq    %r11, %xmm0
    pcmpeqd %xmm1,%xmm1             // xmm1 = all-ones.
    pshufd  $0,%xmm0,%xmm0          // Broadcast mask dword.
    pxor    %xmm0,%xmm1             // xmm1 = ~xmm0.
    xorq    %rax,%rax               // rax = byte offset = 0.

    // First unrolled select-and-scrub group (mirrors .LoopCopy4x below).
    movdqa  (%rbp,%rax),%xmm5
    movdqu  (%rdi,%rax),%xmm3
    pand    %xmm0,%xmm5
    pand    %xmm1,%xmm3
    movdqa  16(%rbp,%rax),%xmm4
    movdqu  %xmm2,(%rbp,%rax)
    por     %xmm3,%xmm5
    movdqu  16(%rdi,%rax),%xmm3
    movdqu  %xmm5,(%rdi,%rax)
    pand    %xmm0,%xmm4
    pand    %xmm1,%xmm3
    movdqa  %xmm2,16(%rbp,%rax)
    por     %xmm3,%xmm4
    movdqu  %xmm4,16(%rdi,%rax)
    leaq    32(%rax),%rax

.align  16
.LoopCopy4x:
    movdqa  (%rbp,%rax),%xmm5       // xmm5 = tmp words (frame is 16-aligned).
    movdqu  (%rdi,%rax),%xmm3       // xmm3 = r words (alignment unknown).
    pand    %xmm0,%xmm5             // Keep tmp iff mask set.
    pand    %xmm1,%xmm3             // Keep r iff mask clear.
    movdqa  16(%rbp,%rax),%xmm4
    movdqu  %xmm2,(%rbp,%rax)       // Scrub tmp with zeros.
    por     %xmm3,%xmm5
    movdqu  16(%rdi,%rax),%xmm3
    movdqu  %xmm5,(%rdi,%rax)       // Store selected result.
    pand    %xmm0,%xmm4
    pand    %xmm1,%xmm3
    movdqa  %xmm2,16(%rbp,%rax)     // Scrub second half.
    por     %xmm3,%xmm4
    movdqu  %xmm4,16(%rdi,%rax)
    leaq    32(%rax),%rax
    decq    %r10                        // j--
    jnz     .LoopCopy4x
    movq    0(%rsp),%rsi                // rsi = saved original rsp.
    movq    $1,%rax                     // Return 1 (success).
    leaq    (%rsi),%rsp                 // Restore rsp.
    RESTORE_REGISTERS
    ret
.cfi_endproc
.size   MontMul4x,.-MontMul4x
622
/*
 * MontSqr8x: Montgomery squaring for a == b and size % 8 == 0.
 * Entered by tail-jump from MontMulx_Asm: rdi = r, rsi = a, rcx = n,
 * r8 = k0, r9d = size. The heavy lifting is done by MontSqr8Inner (not fully
 * visible in this file view); this wrapper allocates the frame, then performs
 * the final subtract-and-select reduction on the double-width result t.
 */
.type   MontSqr8x,@function
.align  32
MontSqr8x:
.cfi_startproc
    SAVE_REGISTERS
    movq    %rsp,%rax               // rax = original rsp.

    movl    %r9d,%r15d
    shll    $3,%r9d                 // r9d = size * 8 bytes.
    shlq    $5,%r15                 // r15 = size * 32 (= size * 8 * 4).
    negq    %r9                     // r9 = -size * 8.

    // Frame sizing. The (frame - a) mod 4096 adjustment below appears to bias
    // the frame so t does not alias a within a 4 KB page (cache-bank/4K
    // aliasing avoidance) — NOTE(review): confirm intent against upstream.
    leaq    -64(%rsp,%r9,2),%r14    // r14 = rsp - 2*size*8 - 64 (frame + 2*size words).
    subq    %rsi,%r14
    andq    $4095,%r14              // r14 = (frame base - a) mod 4096.
    movq    %rsp,%rbp
    cmpq    %r14,%r15
    jae     .Loop8xCheckstk

    leaq    4032(,%r9,2),%r15    // r15 = 4096 - frame - 2 * size words.
    subq    %r15,%r14
    movq    $0,%r15
    cmovcq  %r15,%r14               // Clamp the bias at 0 on underflow.

.Loop8xCheckstk:
    subq    %r14,%rbp
    leaq    -96(%rbp,%r9,2),%rbp    // Allocate frame + 2 * size words.

    andq    $-64,%rbp               // 64-byte align; then __chkstk-style probing
                                    // when the frame exceeds one page.
    movq    %rsp,%r14
    subq    %rbp,%r14
    andq    $-4096,%r14
    leaq    (%r14,%rbp),%rsp
    cmpq    %rbp,%rsp
    jbe     .LoopMul8x

.align  16
.LoopPage8x:
    leaq    -4096(%rsp),%rsp        // Drop rsp one page at a time (probing)
    cmpq    %rbp,%rsp               // until rsp <= the target frame base.
    ja      .LoopPage8x

.LoopMul8x:
    movq    %r9,%r15                // r15 = -size * 8.
    negq    %r9                     // r9 = size * 8 again.
    movq    %r8,32(%rsp)            // Save k0.
    movq    %rax,40(%rsp)           // Save original rsp.


    // Stash pointers in XMM regs across the call (MontSqr8Inner's register
    // usage is not visible here — it is expected to preserve/derive these).
    movq    %rcx, %xmm1             // xmm1 = n.
    pxor    %xmm2,%xmm2             // xmm2 = 0.
    movq    %rdi, %xmm0             // xmm0 = r.
    movq    %r15, %xmm5             // xmm5 = -size * 8.
    call    MontSqr8Inner

    // NOTE(review): register state here (r9, rdi, rbp) is produced by
    // MontSqr8Inner; the comments below restate the original author's claims
    // and could not be fully verified from this file view.
    leaq    (%rdi,%r9),%rbx       // rbx = &t[size] (upper half of the product).
    movq    %r9,%rcx                // rcx = -size (in bytes / 8? see sarq below).
    movq    %r9,%rdx                // rdx kept for the scrub offsets in .LoopCopy8x.
    movq    %xmm0, %rdi             // rdi = r.
    sarq    $5,%rcx               // rcx = r9 >> 5 = -(groups of 4 words).

.align  32
/* T -= N, 4 words per iteration; rbp walks n (left there by MontSqr8Inner). */
.LoopSub8x:
    movq    (%rbx),%r13             // r13 = t[i]
    movq    8(%rbx),%r12            // r12 = t[i + 1]
    movq    16(%rbx),%r11           // r11 = t[i + 2]
    movq    24(%rbx),%r10           // r10 = t[i + 3]

    sbbq    (%rbp),%r13             // r13 = t[i]     - n[i]     - borrow
    sbbq    8(%rbp),%r12            // r12 = t[i + 1] - n[i + 1] - borrow
    sbbq    16(%rbp),%r11           // r11 = t[i + 2] - n[i + 2] - borrow
    sbbq    24(%rbp),%r10           // r10 = t[i + 3] - n[i + 3] - borrow

    movq    %r13,0(%rdi)            // Store tentative difference into r.
    movq    %r12,8(%rdi)
    movq    %r11,16(%rdi)
    movq    %r10,24(%rdi)

    leaq    32(%rbp),%rbp           // n += 4
    leaq    32(%rdi),%rdi           // r += 4
    leaq    32(%rbx),%rbx           // t += 4
    incq    %rcx
    jnz     .LoopSub8x

    sbbq    $0,%rax                 // rax -= final borrow (select mask source).
    leaq    (%rbx,%r9),%rbx         // Rewind t.
    leaq    (%rdi,%r9),%rdi         // Rewind r.

    movq    %rax,%xmm0
    pxor    %xmm2,%xmm2
    pshufd  $0,%xmm0,%xmm0          // Broadcast the borrow mask.
    movq    40(%rsp),%rsi           // rsi = saved original rsp.

// Constant-time select between t (no borrow… mask set) and the subtracted
// value already in r, while scrubbing both halves of the t buffer with zeros.
.align  32
.LoopCopy8x:
    movdqa  0(%rbx),%xmm1           // xmm1/xmm5 = t words.
    movdqa  16(%rbx),%xmm5
    leaq    32(%rbx),%rbx
    movdqu  0(%rdi),%xmm3           // xmm3/xmm4 = r words (tentative difference).
    movdqu  16(%rdi),%xmm4
    leaq    32(%rdi),%rdi
    movdqa  %xmm2,-32(%rbx)         // Scrub lower half of t.
    movdqa  %xmm2,-16(%rbx)
    movdqa  %xmm2,-32(%rbx,%rdx)    // Scrub upper half of t.
    movdqa  %xmm2,-16(%rbx,%rdx)
    pcmpeqd %xmm0,%xmm2             // xmm2 = ~mask (equal-to-mask trick; xmm2 was 0).
    pand    %xmm0,%xmm1             // Keep t iff mask set.
    pand    %xmm0,%xmm5
    pand    %xmm2,%xmm3             // Keep r iff mask clear.
    pand    %xmm2,%xmm4
    pxor    %xmm2,%xmm2             // Re-zero xmm2 for the next scrub.
    por     %xmm1,%xmm3
    por     %xmm5,%xmm4
    movdqu  %xmm3,-32(%rdi)
    movdqu  %xmm4,-16(%rdi)
    addq    $32,%r9                 // Walk the (negative) byte count up to 0.
    jnz     .LoopCopy8x

    movq    $1,%rax                 // Return 1 (success).
    leaq    (%rsi),%rsp             // Restore rsp.
    RESTORE_REGISTERS               // Restore non-volatile registers.
    ret
.cfi_endproc
.size   MontSqr8x,.-MontSqr8x
749
/*
 * MontSqr8Inner — inner worker for the 8-limb-unrolled Montgomery squaring
 * path (called from MontSqr8x). It first computes the off-diagonal cross
 * products of a*a eight limbs at a time, then doubles them and adds the
 * diagonal squares (shift-and-add pass), and finally performs the 8-limb
 * Montgomery reduction by n using k0.
 *
 * Register/stack contract as visible in this routine (prepared by caller):
 *   rsi      = a (input operand pointer)
 *   r9       = size * 8 (operand length in bytes)
 *   rcx      = n (modulus pointer)
 *   xmm1     = n (modulus pointer copy, reloaded during reduction)
 *   xmm2     = 0 (used to zero the tmp area)
 *   40(%rsp) = k0 (Montgomery constant; used to form m' = k0 * t[0])
 *   NOTE(review): xmm5 is reloaded into r9 inside the reduction loop and is
 *   assumed to be prepared by the caller — confirm against MontSqr8x.
 * Stack slots written here:
 *   8(%rsp)  = &n[size]      16(%rsp) = &t[size*2]   56(%rsp) = size*8
 *   64(%rsp) = &a[size]      72(%rsp) = inter-pass carry (-CF)
 *   80(%rsp) = scratch/carry 88(%rsp) = tmp buffer t[] (2*size qwords)
 * Clobbers: rax, rbx, rcx, rdx, rdi, rbp, r8-r15, flags.
 */
.type   MontSqr8Inner,@function
.align  32
MontSqr8Inner:
.cfi_startproc

    movq    %rsi, %r8
    addq    %r9, %r8
    movq    %r8, 64(%rsp)           // save &a[size]
    movq    %r9, 56(%rsp)           // save size * 8
    leaq    88(%rsp), %rbp          // rbp = start address of the tmp buffer t[]

    leaq    88(%rsp,%r9,2),%rbx
    movq    %rbx,16(%rsp)   // save &t[size * 2] (end of tmp buffer)
    leaq    (%rcx,%r9),%rax
    movq    %rax,8(%rsp)   // save &n[size] (end of modulus)
    jmp     .MontSqr8xBegin

    // Zero t[8 .. 2*size-1]; t[0..7] are written directly by the first
    // cross-product pass, so the first 64 bytes are skipped (entry jumps
    // straight to .MontSqr8xBegin).
.MontSqr8xInitStack:
    movdqa    %xmm2,0*8(%rbp)
    movdqa    %xmm2,2*8(%rbp)
    movdqa    %xmm2,4*8(%rbp)
    movdqa    %xmm2,6*8(%rbp)
.MontSqr8xBegin:
    movdqa    %xmm2,8*8(%rbp)
    movdqa    %xmm2,10*8(%rbp)
    movdqa    %xmm2,12*8(%rbp)
    movdqa    %xmm2,14*8(%rbp)
    lea       128(%rbp), %rbp
    subq      $64, %r9
    jnz       .MontSqr8xInitStack

    xorq    %rbx, %rbx                 // clear CF and OF, rbx = 0
    movq    $0, %r13
    movq    $0, %r12
    movq    $0, %r11
    movq    $0, %rdi
    movq    $0, %r15
    movq    $0, %rcx

    leaq    88(%rsp), %rbp             // rbp = &t[0]
    movq    0(%rsi), %rdx              // rdx = a[0]
    movq    $0, %r10

.LoopOuterSqr8x:

    // begin a[0] * a[1~7]
    mulx    8(%rsi), %rax, %r14        // rax = lo(a[1] * a[0]), r14 = hi(a[1] * a[0])
    adcx    %rbx, %rax

    movq    %rax, 8(%rbp)
    adox    %r13, %r14

    mulx    16(%rsi), %rax, %r13       // (rax, r13) = a[2] * a[0]
    adcx    %rax, %r14                 // r14 = hi(a[1] * a[0]) + lo(a[2] * a[0])
    adox    %r12, %r13

    mulx    24(%rsi), %rax, %r12       // (rax, r12) = a[3] * a[0]
    movq    %r14, 16(%rbp)
    adcx    %rax, %r13                 // r13 = hi(a[2] * a[0]) + lo(a[3] * a[0])
    adox    %r11, %r12

    mulx    32(%rsi), %rax, %r11       // (rax, r11) = a[4] * a[0]
    adcx    %rax, %r12                 // r12 = hi(a[3] * a[0]) + lo(a[4] * a[0])

    adox    %rdi, %r11
    mulx    40(%rsi), %rax, %rdi       // (rax, rdi) = a[5] * a[0]
    adcx    %rax, %r11                 // r11 = hi(a[4] * a[0]) + lo(a[5] * a[0])

    adox    %r15, %rdi
    mulx    48(%rsi), %rax, %r8        // (rax, r8) = a[6] * a[0]
    adcx    %rax, %rdi                 // rdi = hi(a[5] * a[0]) + lo(a[6] * a[0])
    adox    %rcx, %r8

    mulx    56(%rsi), %rax, %rbx       // (rax, rbx) = a[7] * a[0]
    adcx    %rax, %r8                  // r8 = hi(a[6] * a[0]) + lo(a[7] * a[0])
    adox    %r10, %rbx                 // rbx += OF (r10 = 0)
    adcq    64(%rbp), %rbx             // rbx += t[8] + CF

    sbbq    %r9, %r9                   // r9 = -CF (capture high carry)
    xorq    %r10, %r10                 // clear CF and OF

    // begin a[1] * a[2~7]
    movq    8(%rsi), %rdx              // rdx = a[1]
    mulx    16(%rsi), %rax, %rcx       // rax = lo(a[2] * a[1]), rcx = hi(a[2] * a[1])
    adcx    %rax, %r13                 // r13 = hi(a[2] * a[0]) + lo(a[3] * a[0]) + lo(a[2] * a[1])

    mulx    24(%rsi), %rax, %r14       // rax = lo(a[3] * a[1]), r14 = hi(a[3] * a[1])
    movq    %r13, 24(%rbp)

    adox    %rax, %rcx                 // rcx = lo(a[3] * a[1]) + hi(a[2] * a[1])

    mulx    32(%rsi), %rax, %r13       // (rax, r13) = a[4] * a[1]
    adcx    %r12, %rcx                 // rcx = hi(a[3] * a[0]) + lo(a[4] * a[0]) + lo(a[3] * a[1]) + hi(a[2] * a[1])
    adox    %rax, %r14                 // r14 = lo(a[4] * a[1]) + hi(a[3] * a[1])

    mulx    40(%rsi), %rax, %r12       // (rax, r12) = a[5] * a[1]
    movq    %rcx, 32(%rbp)
    adcx    %r11, %r14                 // r14 = lo(a[4] * a[1]) + hi(a[3] * a[1]) + hi(a[4] * a[0]) + lo(a[5] * a[0])
    adox    %rax, %r13                 // r13 = lo(a[5] * a[1]) + hi(a[4] * a[1])

    mulx    48(%rsi), %rax, %r11       // (rax, r11) = a[6] * a[1]
    adcx    %rdi, %r13                 // r13 = lo(a[5] * a[1]) + hi(a[4] * a[1]) + hi(a[5] * a[0]) + lo(a[6] * a[0])
    adox    %rax, %r12                 // r12 = hi(a[5] * a[1]) + lo(a[6] * a[1])

    mulx    56(%rsi), %rax, %rdi       // (rax, rdi) = a[7] * a[1]
    adcx    %r8, %r12                  // r12 = hi(a[5] * a[1]) + lo(a[6] * a[1]) + hi(a[6] * a[0]) + lo(a[7] * a[0])
    adox    %rax, %r11                 // r11 = hi(a[6] * a[1]) + lo(a[7] * a[1])
    adcx    %rbx, %r11                 // r11 = hi(a[6] * a[1]) + lo(a[7] * a[1]) + hi(a[7] * a[0])

    adcx    %r10, %rdi                 // rdi += CF (r10 = 0)
    adox    %r10, %rdi                 // rdi += OF (r10 = 0)

    movq    16(%rsi), %rdx             // rdx = a[2]

    // begin a[2] * a[3~7]
    mulx    24(%rsi), %rax, %rbx       // rax = lo(a[2] * a[3]), rbx = hi(a[2] * a[3])
    adcx    %rax, %r14                 // r14 = lo(a[4] * a[1]) + hi(a[3] * a[1]) + hi(a[4] * a[0]) + lo(a[5] * a[0])
                                       //     + lo(a[2] * a[3])

    mulx    32(%rsi), %rax, %rcx       // rax = lo(a[2] * a[4]), rcx = hi(a[2] * a[4])

    movq    %r14, 40(%rbp)
    adox    %rax, %rbx                 // rbx = lo(a[2] * a[4]) + hi(a[2] * a[3])

    mulx    40(%rsi), %rax, %r8        // rax = lo(a[2] * a[5]), r8 = hi(a[2] * a[5])
    adcx    %r13, %rbx                 // rbx = lo(a[2] * a[4]) + hi(a[2] * a[3])
                                       //     + lo(a[5] * a[1]) + hi(a[4] * a[1]) + hi(a[5] * a[0]) + lo(a[6] * a[0])

    adox    %rax, %rcx                 // rcx = hi(a[2] * a[4]) + lo(a[2] * a[5])
    movq    %rbx, 48(%rbp)

    mulx    48(%rsi), %rax, %r13       // rax = lo(a[2] * a[6]), r13 = hi(a[2] * a[6])
    adcx    %r12, %rcx                 // rcx = hi(a[5] * a[1]) + lo(a[6] * a[1]) + hi(a[6] * a[0])
                                       //     + lo(a[7] * a[0]) + hi(a[2] * a[4]) + lo(a[2] * a[5])

    adox    %rax, %r8                  // r8 = hi(a[2] * a[5]) + lo(a[2] * a[6])

    mulx    56(%rsi), %rax, %r12       // rax = lo(a[2] * a[7]), r12 = hi(a[2] * a[7])

    adcx    %r11, %r8                  // r8 = hi(a[2] * a[5]) + lo(a[2] * a[6])
                                       //     + hi(a[6] * a[1]) + lo(a[7] * a[1]) + hi(a[7] * a[0])

    adox    %rax, %r13                 // r13 = hi(a[2] * a[6]) + lo(a[2] * a[7])
    adcx    %rdi, %r13                 // r13 = hi(a[2] * a[6]) + lo(a[2] * a[7]) + hi(a[7] * a[1])

    adcx    %r10, %r12                 // r12 += CF (r10 = 0)
    adox    %r10, %r12                 // r12 += OF (r10 = 0)

    movq    24(%rsi), %rdx             // rdx = a[3]

    // begin a[3] * a[4~7]
    mulx    32(%rsi), %rax, %r14       // rax = lo(a[3] * a[4]), r14 = hi(a[3] * a[4])
    adcx    %rax, %rcx                 // rcx = hi(a[5] * a[1]) + lo(a[6] * a[1]) + hi(a[6] * a[0])
                                       //     + lo(a[7] * a[0]) + hi(a[2] * a[4]) + lo(a[2] * a[5]) + lo(a[3] * a[4])

    mulx    40(%rsi), %rax, %rbx       // rax = lo(a[3] * a[5]), rbx = hi(a[3] * a[5])
    adox    %rax, %r14                 // r14 = hi(a[3] * a[4]) + lo(a[3] * a[5])

    mulx    48(%rsi), %rax, %r11       // rax = lo(a[3] * a[6]), r11 = hi(a[3] * a[6])
    adcx    %r8, %r14                  // r14 = hi(a[3] * a[4]) + lo(a[3] * a[5])+ hi(a[2] * a[5]) + lo(a[2] * a[6])
                                       //     + hi(a[6] * a[1]) + lo(a[7] * a[1]) + hi(a[7] * a[0])
    adox    %rax, %rbx                 // rbx = hi(a[3] * a[5]) + lo(a[3] * a[6])

    mulx    56(%rsi), %rax, %rdi       // rax = lo(a[3] * a[7]), rdi = hi(a[3] * a[7])
    adcx    %r13, %rbx                 // rbx = hi(a[3] * a[5]) + lo(a[3] * a[6])
                                       //     + hi(a[2] * a[6]) + lo(a[2] * a[7]) + hi(a[7] * a[1])
    adox    %rax, %r11                 // r11 = hi(a[3] * a[6]) + lo(a[3] * a[7])
    adcx    %r12, %r11                 // r11 = hi(a[2] * a[7]) + hi(a[3] * a[6]) + lo(a[3] * a[7])

    adcx    %r10, %rdi                 // rdi += CF (r10 = 0)
    adox    %r10, %rdi                 // rdi += OF (r10 = 0)

    movq    %rcx, 56(%rbp)
    movq    %r14, 64(%rbp)

    movq    32(%rsi), %rdx             // rdx = a[4]

    // begin a[4] * a[5~7]
    mulx    40(%rsi), %rax, %r13       // rax = lo(a[4] * a[5]), r13 = hi(a[4] * a[5])
    adcx    %rax, %rbx                 // rbx = hi(a[3] * a[5]) + lo(a[3] * a[6])
                                       //     + hi(a[2] * a[6]) + lo(a[2] * a[7]) + hi(a[7] * a[1]) + lo(a[4] * a[5])

    mulx    48(%rsi), %rax, %r12       // rax = lo(a[4] * a[6]), r12 = hi(a[4] * a[6])
    adox    %rax, %r13                 // r13 = lo(a[4] * a[6]) + hi(a[4] * a[5])

    mulx    56(%rsi), %rax, %r14       // rax = lo(a[4] * a[7]), r14 = hi(a[4] * a[7])
    adcx    %r11, %r13                 // r13 = hi(a[4] * a[5]) + hi(a[2] * a[7]) + hi(a[3] * a[6])
                                       //     + lo(a[3] * a[7])

    adox    %rax, %r12                 // r12 = hi(a[4] * a[6]) + lo(a[4] * a[7])
    adcx    %rdi, %r12                 // r12 = hi(a[4] * a[6]) + lo(a[4] * a[7]) + hi(a[3] * a[7])

    adcx    %r10, %r14                // r14 += CF (r10 = 0)
    adox    %r10, %r14                // r14 += OF (r10 = 0)

    movq    40(%rsi), %rdx            // rdx = a[5]

    // begin a[5] * a[6~7]
    mulx    48(%rsi), %rax, %r11      // rax = lo(a[5] * a[6]), r11 = hi(a[5] * a[6])

    adcx    %rax, %r12                // r12 = hi(a[4] * a[6]) + lo(a[4] * a[7]) + hi(a[3] * a[7]) + lo(a[5] * a[6])

    mulx    56(%rsi), %rax, %rdi      // rax = lo(a[5] * a[7]), rdi = hi(a[5] * a[7])

    adox    %rax, %r11                // r11 = hi(a[5] * a[6]) + lo(a[5] * a[7])
    adcx    %r14, %r11                // r11 = hi(a[5] * a[6]) + lo(a[5] * a[7]) + hi(a[4] * a[7])
    adcx    %r10, %rdi                // rdi += CF (r10 = 0)
    adox    %r10, %rdi                // rdi += OF (r10 = 0)

    movq    48(%rsi), %rdx            // rdx = a[6]

    mulx    56(%rsi), %rax, %r15      // rax = lo(a[6] * a[7]), r15 = hi(a[6] * a[7])
    adcx    %rax, %rdi                // rdi += lo(a[6] * a[7])
    adcx    %r10, %r15                // r15 += CF (r10 = 0)

    leaq    64(%rsi), %rsi

    cmp     64(%rsp), %rsi            // compare with &a[size]
    je      .Lsqrx8xEnd

    neg     %r9
    movq    $0, %rcx
    movq    64(%rbp),%r14

    adcx    9*8(%rbp),%rbx
    adcx    10*8(%rbp),%r13
    adcx    11*8(%rbp),%r12
    adcx    12*8(%rbp),%r11
    adcx    13*8(%rbp),%rdi
    adcx    14*8(%rbp),%r15
    adcx    15*8(%rbp),%rcx

    leaq    (%rsi), %r10              // r10 = &a[8] (next limb group)
    leaq    128(%rbp), %rbp
    sbbq    %rax,%rax
    movq    %rax, 72(%rsp)            // save -CF for the next pass
    movq    %rbp, 80(%rsp)

    xor     %eax, %eax


    movq    -64(%rsi), %rdx

    movq    $-8, %r9

.align  32
.LoopSqr8x:
    movq    %r14,%r8

    // begin a[j] * a[8~15] (relative to r10)
    mulx    0(%r10), %rax, %r14        // rax = lo(a[8] * a[0]), r14 = hi(a[8] * a[0])
    adcx    %rax, %r8
    adox    %rbx, %r14

    mulx    8(%r10), %rax, %rbx        // rax = lo(a[9] * a[0]), rbx = hi(a[9] * a[0])
    adcx    %rax, %r14
    adox    %r13, %rbx

    movq    %r8,(%rbp,%r9,8)

    mulx    16(%r10), %rax, %r13        // rax = lo(a[10] * a[0]), r13 = hi(a[10] * a[0])
    adcx    %rax, %rbx
    adox    %r12, %r13

    mulx    24(%r10), %rax, %r12        // rax = lo(a[11] * a[0]), r12 = hi(a[11] * a[0])
    adcx    %rax, %r13
    adox    %r11, %r12

    movq    $0, %r8

    mulx    32(%r10), %rax, %r11        // rax = lo(a[12] * a[0]), r11 = hi(a[12] * a[0])
    adcx    %rax, %r12
    adox    %rdi, %r11

    mulx    40(%r10), %rax, %rdi        // rax = lo(a[13] * a[0]), rdi = hi(a[13] * a[0])
    adcx    %rax, %r11
    adox    %r15, %rdi

    mulx    48(%r10), %rax, %r15        // rax = lo(a[14] * a[0]), r15 = hi(a[14] * a[0])
    adcx    %rax, %rdi
    adox    %rcx, %r15

    mulx    56(%r10), %rax, %rcx        // rax = lo(a[15] * a[0]), rcx = hi(a[15] * a[0])
    adcx    %rax, %r15
    adcx    %r8, %rcx                   // here r8 = 0
    adox    %r8, %rcx

    movq    8(%rsi,%r9,8),%rdx

    inc     %r9
    jnz     .LoopSqr8x

    leaq    64(%r10), %r10
    movq    $-8, %r9

    cmp     64(%rsp), %r10             // compare with &a[size]
    je     .LoopSqr8xBreak

    subq     72(%rsp), %r8             // read the CF of the previous round.

    movq    -64(%rsi), %rdx

    adcx    0*8(%rbp),%r14
    adcx    1*8(%rbp),%rbx
    adcx    2*8(%rbp),%r13
    adcx    3*8(%rbp),%r12
    adcx    4*8(%rbp),%r11
    adcx    5*8(%rbp),%rdi
    adcx    6*8(%rbp),%r15
    adcx    7*8(%rbp),%rcx

    leaq    8*8(%rbp),%rbp

    sbbq     %rax, %rax
    xorq     %r8, %r8
    movq     %rax, 72(%rsp)            // save -CF for the next pass

    jmp    .LoopSqr8x

.align  32
.LoopSqr8xBreak:

    xorq    %r10, %r10
    subq    72(%rsp),%r8               // replay the carry saved from the previous round
    adcx    %r10, %r14
    movq    0(%rsi),%rdx
    movq    %r14,0(%rbp)
    movq    80(%rsp), %r8

    adcx    %r10,%rbx
    adcx    %r10,%r13
    adcx    %r10,%r12
    adcx    %r10,%r11
    adcx    %r10,%rdi
    adcx    %r10,%r15
    adcx    %r10,%rcx

    cmp     %r8, %rbp
    je      .LoopOuterSqr8x

    // if tmp does not go to the end. The current value needs to be stored in tmp and updated.
    movq    %rbx,1*8(%rbp)
    movq    1*8(%r8),%rbx
    movq    %r13,2*8(%rbp)
    movq    2*8(%r8),%r13
    movq    %r12,3*8(%rbp)
    movq    3*8(%r8),%r12
    movq    %r11,4*8(%rbp)
    movq    4*8(%r8),%r11
    movq    %rdi,5*8(%rbp)
    movq    5*8(%r8),%rdi
    movq    %r15,6*8(%rbp)
    movq    6*8(%r8),%r15
    movq    %rcx,7*8(%rbp)
    movq    7*8(%r8),%rcx
    movq    %r8,%rbp
    jmp    .LoopOuterSqr8x

.align    32
.Lsqrx8xEnd:
    mov    %rbx,9*8(%rbp)
    mov    %r13,10*8(%rbp)
    mov    %r12,11*8(%rbp)
    mov    %r11,12*8(%rbp)
    mov    %rdi,13*8(%rbp)
    mov    %r15,14*8(%rbp)

    leaq    88(%rsp), %rbp          // rbp = &t[0]

    movq    56(%rsp), %rcx          // rcx = size * 8
    sbbq    %rcx, %rsi              // rsi -= size*8 -> &a[0] (CF clear after the equality compare above)

    xorq    %r15, %r15              // clear CF OF, r15 = t[0] = 0
    movq    8(%rbp), %r14           // r14 = t[1]
    movq    16(%rbp), %r13          // r13 = t[2]
    movq    24(%rbp), %r12          // r12 = t[3]

    adox    %r14, %r14              // r14 = 2 * t[1]
    movq    0(%rsi), %rdx

    // Shift-and-add pass: t = 2*t + diagonal squares a[i]^2, 4 limb pairs
    // per iteration.
.align  32
.LoopShiftAddSqr4x:

    mulx    %rdx, %rax, %rbx        // (rbx, rax) = a[0] * a[0]
    adox    %r13, %r13              // r13 = 2 * t[2]
    adox    %r12, %r12              // r12 = 2 * t[3]

    adcx    %rax, %r15              // r15 = 2 * t[0] + lo(a[0] * a[0])
    adcx    %rbx, %r14              // r14 = 2 * t[1] + hi(a[0] * a[0])

    movq    %r15, (%rbp)
    movq    %r14, 8(%rbp)

    movq    8(%rsi), %rdx

    mulx    %rdx, %rax, %rbx        // (rbx, rax) = a[1] * a[1]
    adcx    %rax, %r13              // r13 = 2 * t[2] + lo(a[1] * a[1])
    adcx    %rbx, %r12              // r12 = 2 * t[3] + hi(a[1] * a[1])

    movq    %r13, 16(%rbp)
    movq    %r12, 24(%rbp)

    movq    32(%rbp), %r15          // r15 = t[4]
    movq    40(%rbp), %r14          // r14 = t[5]
    movq    48(%rbp), %r13          // r13 = t[6]
    movq    56(%rbp), %r12          // r12 = t[7]

    movq    16(%rsi), %rdx
    mulx    %rdx, %rax, %rbx        // (rbx, rax) = a[2] * a[2]
    adox    %r15, %r15              // r15 = 2 * t[4]
    adcx    %rax, %r15              // r15 = 2 * t[4] + lo(a[2] * a[2])

    adox    %r14, %r14              // r14 = 2 * t[5]
    adcx    %rbx, %r14              // r14 = 2 * t[5] + hi(a[2] * a[2])

    movq    %r15, 32(%rbp)
    movq    %r14, 40(%rbp)

    movq    24(%rsi), %rdx
    mulx    %rdx, %rax, %rbx        // (rbx, rax) = a[3] * a[3]
    adox    %r13, %r13              // r13 = 2 * t[6]
    adcx    %rax, %r13              // r13 = 2 * t[6] + lo(a[3] * a[3])

    adox    %r12, %r12              // r12 = 2 * t[7]
    adcx    %rbx, %r12              // r12 = 2 * t[7] + hi(a[3] * a[3])

    movq    %r13, 48(%rbp)
    movq    %r12, 56(%rbp)

    leaq    32(%rsi), %rsi          // a += 4

    leaq    -32(%rcx),%rcx
    jrcxz   .LoopReduceSqr8xBegin   // rcx == 0: shift-and-add finished, start reduction

    movq    64(%rbp), %r15          // r15 = t[8]
    movq    72(%rbp), %r14          // r14 = t[9]
    adox    %r15, %r15              // r15 = 2 * t[8]
    adox    %r14, %r14              // r14 = 2 * t[9]

    movq    80(%rbp), %r13          // r13 = t[10]
    movq    88(%rbp), %r12          // r12 = t[11]

    leaq    64(%rbp), %rbp

    movq    0(%rsi), %rdx

    jmp     .LoopShiftAddSqr4x      // more limbs remain

.LoopReduceSqr8xBegin:
    xorq    %rax,%rax               // rax = 0
    leaq    88(%rsp), %rdi          // rdi = &t[0]
    movq    $0, %r9                 // r9 = 0 (t-pointer offset for the first pass)
    movq    %xmm1, %rbp             // rbp = n (modulus pointer saved in xmm1)
    xorq    %rsi, %rsi              // rsi = 0

.align  32
.LoopReduceSqr8x:
    movq    %rax,80(%rsp)           // Store the highest carry bit.
    leaq    (%rdi,%r9),%rdi         // rdi = &t[0] of this window

    movq    (%rdi),%rdx             // rdx = t[0]
    movq    8(%rdi),%r9             // r9 = t[1]
    movq    16(%rdi),%r15           // r15 = t[2]
    movq    24(%rdi),%r14           // r14 = t[3]
    movq    32(%rdi),%r13           // r13 = t[4]
    movq    40(%rdi),%r12           // r12 = t[5]
    movq    48(%rdi),%r11           // r11 = t[6]
    movq    56(%rdi),%r10           // r10 = t[7]

    leaq    64(%rdi),%rdi           // rdi = &t[8]

    movq    %rdx,%r8                // r8 = t[0]
    imulq   40(%rsp),%rdx           // rdx = m' = k0 * t[0]
    xorq    %rbx,%rbx               // clear CF OF
    movl    $8,%ecx

.align  32
.LoopReduce8x:
    movq    %r8, %rbx
    movq    %rdx, 80(%rsp,%rcx,8)  // save this round's m' (slots 88..144(%rsp))
    mulx    (%rbp), %rax, %r8      // (r8, rax) = m' * n[0]
    adcx    %rbx, %rax
    adox    %r9, %r8               // r8 = hi(m' * n[0]) + t[1]

    mulx    8(%rbp), %rax, %r9     // (rax, r9) = m' * n[1]
    adcx    %rax,%r8               // r8 += lo(m' * n[1])
    adox    %r9, %r15              // r15 = hi(m' * n[1]) + t[2]

    mulx    16(%rbp), %r9, %rax    // (r9, rax) = m' * n[2]
    adcx    %r15, %r9              // r9 = hi(m' * n[1]) + lo(m' * n[2]) + t[2]
    adox    %rax, %r14             // r14 = hi(m' * n[2]) + t[3]

    mulx    24(%rbp), %r15, %rax   // (r15, rax) = m' * n[3]
    adcx    %r14,%r15              // r15 = hi(m' * n[2]) + lo(m' * n[3]) + t[3]
    adox    %rax,%r13              // r13 = hi(m' * n[3]) + t[4]

    mulx    32(%rbp), %r14, %rax   // (r14, rax) = m' * n[4]
    adcx    %r13,%r14              // r14 = hi(m' * n[3]) + lo(m' * n[4]) + t[4]
    adox    %rax,%r12              // r12 = hi(m' * n[4]) + t[5]

    mulx    40(%rbp), %r13, %rax   // (r13, rax) = m' * n[5]
    adcx    %r12,%r13              // r13 = hi(m' * n[4]) + lo(m' * n[5]) + t[5]
    adox    %rax,%r11              // r11 = hi(m' * n[5]) + t[6]

    mulx    48(%rbp), %r12, %rax   // (r12, rax) = m' * n[6]
    adcx    %r11,%r12              // r12 = hi(m' * n[5]) + lo(m' * n[6]) + t[6]
    adox    %r10,%rax              // rax = hi(m' * n[6]) + t[7]

    mulx    56(%rbp), %r11, %r10   // (r11, r10) = m' * n[7]
    adcx    %rax,%r11              // r11 = hi(m' * n[6]) + lo(m' * n[7]) + t[7]

    adcx    %rsi,%r10              // r10 = hi(m' * n[7]) + CF
    adox    %rsi,%r10              // r10 += OF

    movq    %r8, %rdx
    mulx    40(%rsp), %rdx, %rax   // rdx = next m' = t[0] * k0 (rax is discarded)

    decl    %ecx                   // ecx--
    jnz     .LoopReduce8x          // if ecx != 0

    leaq    64(%rbp),%rbp          // rbp += 64, n pointer offset.
    xorq    %rax,%rax              // rax = 0
    cmpq    8(%rsp),%rbp           // reached &n[size]?
    jae     .LoopEndCondMul8x

    addq    (%rdi),%r8             // r8 += t[0]
    adcq    8(%rdi),%r9            // r9 += t[1]
    adcq    16(%rdi),%r15          // r15 += t[2]
    adcq    24(%rdi),%r14          // r14 += t[3]
    adcq    32(%rdi),%r13          // r13 += t[4]
    adcq    40(%rdi),%r12          // r12 += t[5]
    adcq    48(%rdi),%r11          // r11 += t[6]
    adcq    56(%rdi),%r10          // r10 += t[7]
    sbbq    %rsi,%rsi              // rsi = -CF

    movq    144(%rsp),%rdx         // rdx = first saved m' (80 + 8*8)
    movl    $8,%ecx
    xor     %eax,%eax
.align  32
.LoopLastSqr8x:
    mulx    (%rbp), %rax, %rbx     // (rbx, rax) = m' * n[0]
    adcx    %rax,%r8               // r8 += lo(m' * n[0])
    movq    %r8,(%rdi)             // t[0] = r8
    leaq    8(%rdi),%rdi           // t++

    adox    %rbx,%r9               // r9 += hi(m' * n[0])

    mulx    8(%rbp), %r8, %rbx     // (r8, rbx) = m' * n[1]
    adcx    %r9,%r8                // r8 = t[1] + lo(m' * n[1])
    adox    %rbx, %r15             // r15 = hi(m' * n[1]) + t[2]

    mulx    16(%rbp), %r9, %rbx    // (r9, rbx) = m' * n[2]
    adcx    %r15, %r9              // r9 = hi(m' * n[1]) + lo(m' * n[2]) + t[2]
    adox    %rbx, %r14             // r14 = hi(m' * n[2]) + t[3]

    mulx    24(%rbp), %r15, %rbx   // (r15, rbx) = m' * n[3]
    adcx    %r14,%r15              // r15 = hi(m' * n[2]) + lo(m' * n[3]) + t[3]
    adox    %rbx,%r13              // r13 = hi(m' * n[3]) + t[4]

    mulx    32(%rbp), %r14, %rbx   // (r14, rbx) = m' * n[4]
    adcx    %r13,%r14              // r14 = hi(m' * n[3]) + lo(m' * n[4]) + t[4]
    adox    %rbx,%r12              // r12 = hi(m' * n[4]) + t[5]

    mulx    40(%rbp), %r13, %rbx   // (r13, rbx) = m' * n[5]
    adcx    %r12,%r13              // r13 = hi(m' * n[4]) + lo(m' * n[5]) + t[5]
    adox    %rbx,%r11              // r11 = hi(m' * n[5]) + t[6]

    mulx    48(%rbp), %r12, %rbx   // (r12, rbx) = m' * n[6]
    adcx    %r11,%r12              // r12 = hi(m' * n[5]) + lo(m' * n[6]) + t[6]
    adox    %r10,%rbx              // rbx = hi(m' * n[6]) + t[7]

    movq    $0, %rax

    mulx    56(%rbp), %r11, %r10   // (r11, r10) = m' * n[7]
    adcx    %rbx,%r11              // r11 = hi(m' * n[6]) + lo(m' * n[7]) + t[7]

    adcx    %rax,%r10              // r10 = hi(m' * n[7]) + CF
    adox    %rax,%r10              // r10 += OF

    movq    72(%rsp,%rcx,8),%rdx   // rdx = next saved m'

    decl    %ecx                   // ecx--
    jnz     .LoopLastSqr8x         // if ecx != 0

    leaq    64(%rbp),%rbp          // n += 8
    cmpq    8(%rsp),%rbp           // Check whether rbp is at the end of the n array. If yes, exit the loop.
    jae     .LoopSqrBreak8x

    movq    144(%rsp),%rdx          // rdx = first saved m'
    negq    %rsi                    // restore CF saved in rsi
    movq    (%rbp),%rax             // rax = n[0]
    adcq    (%rdi),%r8              // r8 += t[0]
    adcq    8(%rdi),%r9             // r9 += t[1]
    adcq    16(%rdi),%r15           // r15 += t[2]
    adcq    24(%rdi),%r14           // r14 += t[3]
    adcq    32(%rdi),%r13           // r13 += t[4]
    adcq    40(%rdi),%r12           // r12 += t[5]
    adcq    48(%rdi),%r11           // r11 += t[6]
    adcq    56(%rdi),%r10           // r10 += t[7]
    sbbq    %rsi,%rsi               // rsi = -CF

    movl    $8,%ecx                 // ecx = 8
    xorq    %rax, %rax
    jmp     .LoopLastSqr8x

.align  32
.LoopSqrBreak8x:
    xorq    %rax,%rax               // rax = 0
    addq    80(%rsp),%r8            // r8 += highest carry bit.
    adcq    $0,%r9                  // r9 += CF
    adcq    $0,%r15                 // r15 += CF
    adcq    $0,%r14                 // r14 += CF
    adcq    $0,%r13                 // r13 += CF
    adcq    $0,%r12                 // r12 += CF
    adcq    $0,%r11                 // r11 += CF
    adcq    $0,%r10                 // r10 += CF
    adcq    $0,%rax                 // rax += CF

    negq    %rsi                    // restore CF saved in rsi
.LoopEndCondMul8x:
    adcq    (%rdi),%r8              // r8 += t[0]
    adcq    8(%rdi),%r9             // r9 += t[1]
    adcq    16(%rdi),%r15           // r15 += t[2]
    adcq    24(%rdi),%r14           // r14 += t[3]
    adcq    32(%rdi),%r13           // r13 += t[4]
    adcq    40(%rdi),%r12           // r12 += t[5]
    adcq    48(%rdi),%r11           // r11 += t[6]
    adcq    56(%rdi),%r10           // r10 += t[7]
    adcq    $0,%rax                 // rax += CF
    movq    -8(%rbp),%rcx           // rcx = last processed n limb
    xorq    %rsi,%rsi               // rsi = 0

    movq    %xmm1,%rbp              // rbp = n
    movq    %r8,(%rdi)              // Save the calculated result back to t[].
    movq    %r9,8(%rdi)
    movq    %xmm5,%r9               // NOTE(review): r9 reloaded from caller-prepared xmm5 — confirm in MontSqr8x
    movq    %r15,16(%rdi)
    movq    %r14,24(%rdi)
    movq    %r13,32(%rdi)
    movq    %r12,40(%rdi)
    movq    %r11,48(%rdi)
    movq    %r10,56(%rdi)
    leaq    64(%rdi),%rdi           // t += 8

    cmpq    16(%rsp),%rdi           // Cycle the entire t[].
    jb      .LoopReduceSqr8x
    ret
.cfi_endproc
.size   MontSqr8Inner,.-MontSqr8Inner
1399
1400#endif
1401