/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *          http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#ifdef HITLS_CRYPTO_BN

/*
 * x86-64 Montgomery multiplication/squaring for the big-number module.
 * AT&T syntax, System V AMD64 ABI. Register arguments on entry:
 *   rdi = r (result), rsi = a, rdx = b, rcx = n (modulus),
 *   r8 = k0 (= -n^-1 mod 2^64), r9 = size (number of 64-bit words).
 */
.file   "bn_mont_x86_64.S"
.text

// ADD_CARRY a, b:  b += a, then fold the resulting carry into rdx
// (rdx must hold the pending high word of the current product).
.macro ADD_CARRY a b
    addq \a,\b
    adcq $0,%rdx
.endm

// Save every callee-saved GPR that the routines below clobber.
.macro SAVE_REGISTERS
    pushq %r15                       // Save non-volatile registers.
    pushq %r14
    pushq %r13
    pushq %r12
    pushq %rbp
    pushq %rbx
.endm

// Restore the callee-saved GPRs in reverse order of SAVE_REGISTERS.
.macro RESTORE_REGISTERS
    popq %rbx                        // Restore non-volatile registers.
    popq %rbp
    popq %r12
    popq %r13
    popq %r14
    popq %r15
.endm

/*
* void MontMulx_Asm(uint64_t *r, const uint64_t *a, const uint64_t *b,
* const uint64_t *n, const uint64_t k0, uint32_t size);
*
* Dispatcher: picks the fastest implementation for the given size.
*/
.globl MontMulx_Asm
.type MontMulx_Asm,@function
.align 16
MontMulx_Asm:
.cfi_startproc
    testl $3,%r9d
    jnz .LMontMul                    // size not divisible by 4: generic path.
    cmpl $8,%r9d
    jb .LMontMul                     // size < 8: generic path.
    cmpq %rsi,%rdx
    jne MontMul4x                    // a != b: 4-way multiplication path.
    testl $7,%r9d
    jz MontSqr8x                     // a == b and size divisible by 8: squaring path.
    jmp MontMul4x

// Generic schoolbook Montgomery multiplication (mul/adc based) for any
// word count; the temporary t[] buffer lives on the stack.
.align 16
.LMontMul:
    SAVE_REGISTERS                   // Save non-volatile registers.
    movq %rsp,%rax                   // rax keeps the original rsp.

    movq %r9, %r15
    negq %r15                        // r15 = -size
    leaq -16(%rsp, %r15, 8), %r15    // r15 = rsp - size * 8 - 16
    andq $-1024, %r15                // Align the target address down to 1 KB.
    movq %rsp, %r14                  // r14 = rsp

    subq %r15,%r14                   // __chkstk implementation, needed when the stack grows past 4096 bytes
                                     // (one page); the stack is then grown page by page below.
    andq $-4096,%r14                 // Round the adjustment down to a 4 KB multiple.
    leaq (%r15,%r14),%rsp            // rsp = r15 + r14
    cmpq %r15,%rsp                   // More than one page to allocate: walk pages in .LoopPage.
    ja .LoopPage
    jmp .LMulBody

.align 16
.LoopPage:
    leaq -4096(%rsp),%rsp            // rsp -= 4096 each time until rsp <= r15.
    cmpq %r15,%rsp
    ja .LoopPage

.LMulBody:
    movq %rax,8(%rsp,%r9,8)          // Save the original rsp on the stack.
    movq %rdx,%r13                   // r13 = b

    xorq %r11,%r11                   // r11 = i = 0 (index into b)
    xorq %r10,%r10                   // r10 = j = 0 (index into a and n)

    // First outer iteration (i == 0): t = a * b[0] + m' * n, m' = t[0] * k0.
    movq (%r13),%rbx                 // rbx = b[0]
    movq (%rsi),%rax                 // rax = a[0]
    mulq %rbx                        // (rdx, rax) = a[0] * b[0]
    movq %rax,%r15                   // r15 = t[0] = lo(a[0] * b[0])
    movq %rdx,%r14                   // r14 = hi(a[0] * b[0])

    movq %r8,%rbp                    // rbp = k0
    imulq %r15,%rbp                  // rbp = m' = t[0] * k0
    movq (%rcx),%rax                 // rax = n[0]
    mulq %rbp                        // (rdx, rax) = m' * n[0]
    ADD_CARRY %rax,%r15              // r15 = t[0] + lo(m' * n[0]); carry folds into rdx.

    leaq 1(%r10),%r10                // j++

.Loop1st:
    movq (%rsi,%r10,8),%rax          // rax = a[j]
    movq %rdx,%r12                   // r12 = hi(m' * n[j - 1])

    mulq %rbx                        // (rdx, rax) = a[j] * b[0]
    ADD_CARRY %rax,%r14              // r14 = hi(a[j - 1] * b[0]) + lo(a[j] * b[0])
    movq %rdx,%r15                   // r15 = hi(a[j] * b[0])

    movq (%rcx,%r10,8),%rax          // rax = n[j]
    mulq %rbp                        // (rdx, rax) = m' * n[j]
    leaq 1(%r10),%r10                // j++
    cmpq %r9,%r10                    // if j == size, leave the loop
    je .Loop1stSkip

    ADD_CARRY %rax,%r12              // r12 = hi(m' * n[j - 1]) + lo(m' * n[j])
    ADD_CARRY %r14,%r12              // r12 += lo(a[j] * b[0]) + hi(a[j - 1] * b[0])
    movq %r12,-16(%rsp,%r10,8)       // t[j - 2] = r12 (j is already incremented here)
    movq %r15,%r14                   // r14 = hi(a[j] * b[0])
    jmp .Loop1st

.Loop1stSkip:
    ADD_CARRY %rax,%r12              // r12 = hi(m' * n[j - 1]) + lo(m' * n[j])
    ADD_CARRY %r14,%r12              // r12 += hi(a[j - 1] * b[0]) + lo(a[j] * b[0])
    movq %r12,-16(%rsp,%r10,8)       // t[j - 2] = r12
    movq %r15,%r14                   // r14 = hi(a[j] * b[0])

    movq %rdx,%r12                   // r12 = hi(m' * n[j])
    xorq %rdx,%rdx                   // rdx = 0; also clears CF.
    ADD_CARRY %r14,%r12              // r12 = hi(m' * n[j]) + hi(a[j] * b[0])
    movq %r12,-8(%rsp,%r9,8)         // t[size - 1] = r12
    movq %rdx,(%rsp,%r9,8)           // t[size] = carry out

    leaq 1(%r11),%r11                // i++

.align 16
.LoopOuter:
    // Outer iterations (i >= 1): t = (t + a * b[i] + m' * n) / 2^64.
    xorq %r10,%r10                   // j = 0
    movq (%rsi),%rax                 // rax = a[0]
    movq (%r13,%r11,8),%rbx          // rbx = b[i]
    mulq %rbx                        // (rdx, rax) = a[0] * b[i]
    movq (%rsp),%r15                 // r15 = t[0]
    ADD_CARRY %rax,%r15              // r15 = t[0] + lo(a[0] * b[i])
    movq %rdx,%r14                   // r14 = hi(a[0] * b[i])

    movq %r8,%rbp                    // rbp = k0
    imulq %r15,%rbp                  // rbp = m' = t[0] * k0
    movq (%rcx),%rax                 // rax = n[0]
    mulq %rbp                        // (rdx, rax) = m' * n[0]
    ADD_CARRY %rax,%r15              // r15 += lo(m' * n[0]); carry folds into rdx.

    leaq 1(%r10),%r10                // j++

.align 16
.LoopInner:
    movq (%rsi,%r10,8),%rax          // rax = a[j]
    movq %rdx,%r12                   // r12 = hi(m' * n[j - 1])
    movq (%rsp,%r10,8),%r15          // r15 = t[j]

    mulq %rbx                        // (rdx, rax) = a[j] * b[i]
    ADD_CARRY %rax,%r14              // r14 = hi(a[j - 1] * b[i]) + lo(a[j] * b[i])
    movq (%rcx,%r10,8),%rax          // rax = n[j]
    ADD_CARRY %r14,%r15              // r15 = low part of a[j] * b[i] + t[j]
    movq %rdx,%r14                   // r14 = hi(a[j] * b[i])
    leaq 1(%r10),%r10                // j++

    mulq %rbp                        // (rdx, rax) = m' * n[j]
    cmpq %r9,%r10                    // if j == size, leave the loop
    je .LoopInnerSkip

    ADD_CARRY %rax,%r12              // r12 += lo(m' * n[j])
    ADD_CARRY %r15,%r12              // r12 = low part of a[j] * b[i] + t[j] + m' * n[j]
    movq %r12,-16(%rsp,%r10,8)       // t[j - 2] = r12 (j is already incremented here)
    jmp .LoopInner

.LoopInnerSkip:
    ADD_CARRY %rax,%r12              // r12 += lo(m' * n[j])
    ADD_CARRY %r15,%r12              // r12 = low part of m' * n[j] + a[j] * b[i] + t[j]
    movq (%rsp,%r10,8),%r15          // r15 = t[size] (carry word of the previous round)
    movq %r12,-16(%rsp,%r10,8)       // t[j - 2] = r12
    movq %rdx,%r12                   // r12 = hi(m' * n[j])

    xorq %rdx,%rdx                   // rdx = 0
    ADD_CARRY %r14,%r12              // r12 = hi(a[j] * b[i]) + hi(m' * n[j])
    ADD_CARRY %r15,%r12              // r12 += t[size]
    movq %r12,-8(%rsp,%r9,8)         // t[size - 1] = r12
    movq %rdx,(%rsp,%r9,8)           // t[size] = carry out

    leaq 1(%r11),%r11                // i++
    cmpq %r9,%r11                    // loop while i < size
    jne .LoopOuter

    // Conditional final subtraction: r = t - n if t >= n, else r = t.
    xorq %r11,%r11                   // r11 = 0; also clears CF.
    movq (%rsp),%rax                 // rax = t[0]
    movq %r9,%r10                    // r10 = size

.align 16
.LoopSub:
    sbbq (%rcx,%r11,8),%rax          // r[i] = t[i] - n[i] - CF
    movq %rax,(%rdi,%r11,8)
    movq 8(%rsp,%r11,8),%rax         // rax = t[i + 1]

    leaq 1(%r11),%r11                // i++
    decq %r10                        // j--
    jnz .LoopSub                     // while j != 0

    sbbq $0,%rax                     // rax = -1 if t < n (keep t), 0 otherwise (keep t - n)
    movq $-1,%rbx
    xorq %rax,%rbx                   // rbx = ~rax (complement mask)
    xorq %r11,%r11                   // r11 = 0
    movq %r9,%r10                    // r10 = size

    // Constant-time select r[i] = ((t - n)[i] & ~mask) | (t[i] & mask),
    // overwriting the temporary t[] with the size value as it goes.
.LoopCopy:
    movq (%rdi,%r11,8),%rcx          // rcx = (t - n)[i] & ~mask
    andq %rbx,%rcx
    movq (%rsp,%r11,8),%rdx          // rdx = t[i] & mask
    andq %rax,%rdx
    orq %rcx,%rdx
    movq %rdx,(%rdi,%r11,8)          // r[i] = selected word
    movq %r9,(%rsp,%r11,8)           // t[i] = size (overwrite the temporary)
    leaq 1(%r11),%r11                // i++
    subq $1,%r10                     // j--
    jnz .LoopCopy                    // while j != 0

    movq 8(%rsp,%r9,8),%rsi          // rsi = saved original rsp.
    movq $1,%rax                     // return 1
    leaq (%rsi),%rsp                 // restore rsp.
    RESTORE_REGISTERS                // Restore non-volatile registers.
    ret
.cfi_endproc
.size MontMulx_Asm,.-MontMulx_Asm

// Montgomery multiplication for size % 4 == 0 && size >= 8 using
// mulx/adcx/adox dual carry chains: b is consumed one word per outer
// pass, a and n four words per inner step.
.type MontMul4x,@function
.align 16
MontMul4x:
.cfi_startproc
    SAVE_REGISTERS
    movq %rsp,%rax                   // rax keeps the original rsp.

    movq %r9,%r15
    negq %r15
    leaq -48(%rsp,%r15,8),%r15       // Allocate space: size * 8 + 48 bytes.
    andq $-1024,%r15                 // Align down to 1 KB.
    movq %rsp,%r14

    subq %r15,%r14                   // __chkstk implementation, needed when the stack grows
                                     // past one page; pages are then touched one at a time.
    andq $-4096,%r14                 // Round down to a 4 KB multiple.
    leaq (%r15,%r14),%rsp
    cmpq %r15,%rsp                   // More than one page to allocate: walk pages.
    ja .LoopPage4x
    jmp .LoopMul4x

.LoopPage4x:
    leaq -4096(%rsp),%rsp            // rsp -= 4096 each time until rsp <= r15.
    cmpq %r15,%rsp
    ja .LoopPage4x

.LoopMul4x:
    // Frame layout: 0(rsp) saved rsp, 8 r, 16 k0, 24 size/4 - 1,
    // 32 size * 8 bytes, 40 &b[size]; the t[] buffer is written from 80(rsp).
    movq %rax, 0(%rsp)               // save the original stack pointer
    movq %rdi, 8(%rsp)               // save r
    movq %r8, 16(%rsp)               // save k0
    movq %r9, %r10
    shrq $2, %r9
    decq %r9
    movq %r9, 24(%rsp)               // save size / 4 - 1
    shlq $3, %r10
    movq %rdx, %r12                  // r12 = b
    movq %r10, 32(%rsp)              // save size * 8 (bytes)

    addq %r10, %r12                  // r12 = &b[size]
    leaq 80(%rsp),%rbp               // rbp = write base of the tmp buffer

    movq %rdx,%r13                   // r13 = b
    movq %r12, 40(%rsp)              // save &b[size] (outer-loop end marker)
    movq (%r13),%rdx                 // rdx = b[0]

    // First pass, head: a[0..3] * b[0].
    mulx (%rsi), %r12, %r14          // r14 = hi(a[0] * b[0]), r12 = lo(a[0] * b[0])
    mulx 8(%rsi), %rax, %r15         // (r15, rax) = a[1] * b[0]
    addq %rax, %r14                  // r14 = hi(a[0] * b[0]) + lo(a[1] * b[0])
    mulx 16(%rsi), %rax, %r11        // (r11, rax) = a[2] * b[0]
    adcq %rax, %r15                  // r15 = hi(a[1] * b[0]) + lo(a[2] * b[0])
    adcq $0, %r11                    // r11 = hi(a[2] * b[0]) + CF

    imulq %r12,%r8                   // r8 = m' = t[0] * k0 (imul clobbers CF)
    xorq %r10,%r10                   // r10 = 0

    mulx 24(%rsi), %rax, %rbx        // (rbx, rax) = a[3] * b[0]
    movq %r8, %rdx                   // rdx = m'
    adcx %rax, %r11                  // r11 = hi(a[2] * b[0]) + lo(a[3] * b[0])
    adcx %r10, %rbx                  // rbx = hi(a[3] * b[0]) + CF

    // Reduction head: add n[0..3] * m' (second flag chain via adox).
    mulx (%rcx), %rax, %rdi          // (rdi, rax) = n[0] * m'
    adcx %rax, %r12                  // r12 = lo(a[0] * b[0]) + lo(n[0] * m'), low word cancels
    adox %r14, %rdi                  // rdi = hi(n[0] * m') + hi(a[0] * b[0]) + lo(a[1] * b[0])

    mulx 8(%rcx), %rax, %r14         // (r14, rax) = n[1] * m'
    adcx %rax, %rdi
    adox %r15, %r14                  // r14 = hi(n[1] * m') + hi(a[1] * b[0]) + lo(a[2] * b[0])
    movq %rdi, -32(%rbp)             // t word 0

    mulx 16(%rcx), %rax, %r15        // (r15, rax) = n[2] * m'
    adcx %rax, %r14
    adox %r11, %r15                  // r15 = hi(n[2] * m') + hi(a[2] * b[0]) + lo(a[3] * b[0])
    movq %r14, -24(%rbp)             // t word 1

    mulx 24(%rcx), %rax, %r11        // (r11, rax) = n[3] * m'
    adcx %rax, %r15
    adox %r10, %r11                  // r11 = hi(n[3] * m')
    movq %r15, -16(%rbp)             // t word 2

    leaq 4*8(%rsi),%rsi              // advance a by 4 words
    leaq 4*8(%rcx),%rcx              // advance n by 4 words
    movq (%r13),%rdx                 // rdx = b[0]

.align 16
.Loop1st4x:
    // First pass, body: continue a * b[0] + n * m', 4 words per round.
    mulx (%rsi), %r12, %r14          // r14 = hi(a[4] * b[0]), r12 = lo(a[4] * b[0])
    adcx %r10, %r11                  // fold the pending carry into r11
    mulx 8(%rsi), %rax, %r15         // r15 = hi(a[5] * b[0]), rax = lo(a[5] * b[0])
    adcx %rbx, %r12                  // r12 = hi(a[3] * b[0]) + lo(a[4] * b[0])
    adcx %rax, %r14                  // r14 = hi(a[4] * b[0]) + lo(a[5] * b[0])
    mulx 16(%rsi), %rax, %rdi        // rdi = hi(a[6] * b[0]), rax = lo(a[6] * b[0])
    adcx %rax, %r15                  // r15 = hi(a[5] * b[0]) + lo(a[6] * b[0])
    mulx 24(%rsi), %rax, %rbx        // rbx = hi(a[7] * b[0]), rax = lo(a[7] * b[0])
    adcx %rax, %rdi                  // rdi = hi(a[6] * b[0]) + lo(a[7] * b[0])
    adcx %r10, %rbx                  // rbx = hi(a[7] * b[0]) + CF

    movq %r8, %rdx                   // rdx = m'
    adox %r11,%r12                   // r12 += hi(n[3] * m') (adox chain)
    mulx (%rcx), %rax, %r11          // (r11, rax) = n[4] * m'
    leaq 4*8(%rsi), %rsi             // advance a by 4 words
    adcx %rax,%r12                   // r12 += lo(n[4] * m')
    adox %r14, %r11                  // r11 = hi(n[4] * m') + a-side partial sum


    mulx 8(%rcx), %rax, %r14         // (r14, rax) = n[5] * m'
    leaq 4*8(%rbp), %rbp             // advance the tmp write pointer by 4 words
    adcx %rax, %r11                  // r11 += lo(n[5] * m')
    adox %r15, %r14                  // r14 = hi(n[5] * m') + a-side partial sum

    mulx 16(%rcx), %rax, %r15        // (r15, rax) = n[6] * m'
    movq %r12, -5*8(%rbp)            // store a finished t word
    adcx %rax, %r14                  // r14 += lo(n[6] * m')
    adox %rdi, %r15                  // r15 = hi(n[6] * m') + a-side partial sum
    movq %r11, -4*8(%rbp)

    mulx 24(%rcx), %rax, %r11        // (r11, rax) = n[7] * m'
    movq %r14, -3*8(%rbp)
    adcx %rax, %r15                  // r15 += lo(n[7] * m')

    adox %r10, %r11
    movq %r15, -2*8(%rbp)

    leaq 4*8(%rcx), %rcx             // advance n by 4 words
    movq (%r13),%rdx                 // reload rdx = b[0]
    dec %r9
    jnz .Loop1st4x

    movq 32(%rsp), %r15              // r15 = size * 8
    leaq 8(%r13), %r13               // advance b by 1 word

    adcx %r10, %r11                  // r11 = hi(n[7] * m') + CF; both flag chains folded here.
    addq %r11, %rbx                  // rbx = hi(a[7] * b[0]) + hi(n[7] * m')
    sbbq %r11,%r11                   // r11 = -(carry out of the pass)
    movq %rbx, -1*8(%rbp)

.align 4
.LoopOuter4x:
    // One pass per remaining b word: t = (t + a * b[i] + n * m') / 2^64.
    movq (%r13),%rdx                 // rdx = b[i]
    mov %r11, (%rbp)                 // keep the highest carry of the previous pass
    subq %r15, %rsi                  // rewind to a[0]
    subq %r15, %rcx                  // rewind to n[0]
    leaq 80(%rsp),%rbp               // rewind the tmp write pointer

    // Head: a[0..3] * b[i], accumulating the previous t words via adox.
    mulx (%rsi), %r12, %r14          // r14 = hi(a[0] * b[i]), r12 = lo(a[0] * b[i])
    xorq %r10,%r10                   // r10 = 0; clears both CF and OF

    mulx 8(%rsi), %rax, %r15         // (r15, rax) = a[1] * b[i]
    adox -4*8(%rbp), %r12            // r12 += t word 0
    adcx %rax, %r14                  // r14 = hi(a[0] * b[i]) + lo(a[1] * b[i])

    mulx 16(%rsi), %rax, %r11        // (r11, rax) = a[2] * b[i]
    adox -3*8(%rbp),%r14             // r14 += t word 1
    adcx %rax, %r15                  // r15 = hi(a[1] * b[i]) + lo(a[2] * b[i])

    mulx 24(%rsi), %rax, %rbx        // (rbx, rax) = a[3] * b[i]
    adox -2*8(%rbp),%r15             // r15 += t word 2
    adcx %rax, %r11                  // r11 = hi(a[2] * b[i]) + lo(a[3] * b[i])
    adox -1*8(%rbp),%r11             // r11 += t word 3
    adcx %r10,%rbx                   // rbx = hi(a[3] * b[i]) + CF
    movq %r12, %rdx                  // rdx = new t[0]
    adox %r10,%rbx                   // rbx += OF

    imulq 16(%rsp),%rdx              // rdx = m' = t[0] * k0 (16(rsp) holds k0; imul clobbers CF)
    mulx (%rcx), %rax, %r8           // (r8, rax) = n[0] * m'
    xorq %r10, %r10                  // clear CF and OF again

    adcx %rax, %r12                  // r12 = t[0] + lo(n[0] * m'), low word cancels
    adox %r14, %r8                   // r8 = hi(n[0] * m') + t-side word 1

    mulx 8(%rcx), %rax, %rdi         // (rdi, rax) = n[1] * m'
    leaq 4*8(%rsi),%rsi              // advance a by 4 words
    adcx %rax, %r8
    adox %r15, %rdi                  // rdi = hi(n[1] * m') + t-side word 2

    mulx 16(%rcx), %rax, %r15        // (r15, rax) = n[2] * m'
    movq %r8, -32(%rbp)              // store t word 0
    adcx %rax, %rdi
    adox %r11, %r15                  // r15 = hi(n[2] * m') + t-side word 3

    mulx 24(%rcx), %rax, %r11        // (r11, rax) = n[3] * m'
    movq %rdi, -24(%rbp)             // store t word 1
    adcx %rax, %r15
    adox %r10, %r11                  // r11 = hi(n[3] * m')
    movq %r15, -16(%rbp)             // store t word 2

    leaq 4*8(%rcx),%rcx              // advance n by 4 words

    movq %rdx, %r8                   // r8 = m'
    movq (%r13), %rdx                // rdx = b[i]
    movq 24(%rsp), %r9               // r9 = size / 4 - 1 inner iterations

.align 16
.Linner4x:
    mulx (%rsi), %r12, %r14          // r14 = hi(a[4] * b[i]), r12 = lo(a[4] * b[i])
    adcx %r10, %r11                  // fold the carry of the previous group

    adox %rbx, %r12                  // r12 += hi(a[3] * b[i])

    mulx 8(%rsi), %rax, %r15         // r15 = hi(a[5] * b[i]), rax = lo(a[5] * b[i])
    adcx (%rbp), %r12                // r12 += t word 4 (tmp pointer not yet advanced here)
    adox %rax, %r14                  // r14 = hi(a[4] * b[i]) + lo(a[5] * b[i])

    mulx 16(%rsi), %rax, %rdi        // rdi = hi(a[6] * b[i]), rax = lo(a[6] * b[i])
    adcx 8(%rbp), %r14               // r14 += t word 5
    adox %rax, %r15                  // r15 = hi(a[5] * b[i]) + lo(a[6] * b[i])

    mulx 24(%rsi), %rax, %rbx        // rbx = hi(a[7] * b[i]), rax = lo(a[7] * b[i])
    adcx 16(%rbp), %r15              // r15 += t word 6
    adox %rax, %rdi                  // rdi = hi(a[6] * b[i]) + lo(a[7] * b[i])

    adox %r10, %rbx                  // rbx += OF
    adcx 24(%rbp), %rdi              // rdi += t word 7
    adcx %r10, %rbx                  // rbx += CF

    // Switch rdx to m' and add n[4..7] * m'.
    adox %r11,%r12                   // r12 += hi(n[3] * m')
    movq %r8, %rdx
    mulx (%rcx), %rax, %r11          // (r11, rax) = n[4] * m'
    leaq 4*8(%rbp), %rbp             // advance the tmp pointer by 4 words
    adcx %rax,%r12                   // r12 += lo(n[4] * m')
    adox %r14, %r11                  // r11 = hi(n[4] * m') + partial sum

    mulx 8(%rcx), %rax, %r14         // (r14, rax) = n[5] * m'
    leaq 4*8(%rsi), %rsi             // advance a by 4 words
    adcx %rax, %r11                  // r11 += lo(n[5] * m')
    adox %r15, %r14                  // r14 = hi(n[5] * m') + partial sum

    mulx 16(%rcx), %rax, %r15        // (r15, rax) = n[6] * m'
    movq %r12, -5*8(%rbp)            // store a finished t word
    adcx %rax, %r14                  // r14 += lo(n[6] * m')
    movq %r11, -4*8(%rbp)
    adox %rdi, %r15                  // r15 = hi(n[6] * m') + partial sum

    mulx 24(%rcx), %rax, %r11        // (r11, rax) = n[7] * m'
    movq %r14, -3*8(%rbp)
    adcx %rax, %r15                  // r15 += lo(n[7] * m')

    adox %r10, %r11
    movq %r15, -2*8(%rbp)

    leaq 4*8(%rcx), %rcx             // advance n by 4 words
    movq (%r13), %rdx                // rdx = b[i]
    dec %r9
    jnz .Linner4x

    movq 32(%rsp), %r15              // r15 = size * 8
    leaq 8(%r13), %r13               // advance b by 1 word

    adcx %r10, %r11                  // r11 = hi(n[7] * m') + OF + CF
    subq 0*8(%rbp), %r10             // regenerate CF from the top carry saved at (rbp)
    adcx %r11, %rbx                  // rbx = hi(a[7] * b[i]) + hi(n[7] * m') + carry-in
    sbbq %r11,%r11                   // r11 = -(carry out of the pass)
    movq %rbx, -1*8(%rbp)
    cmp 40(%rsp), %r13               // loop until b has been fully consumed
    jne .LoopOuter4x

    // Final reduction: compute t - n into r, then select t or t - n.
    leaq 48(%rsp),%rbp               // rbp = &t[0] (t word 0 was stored at 48(rsp))
    subq %r15, %rcx                  // rewind to n[0]
    negq %r11                        // r11 = top carry (0 or 1)

    movq 24(%rsp), %rdx              // rdx = size / 4 - 1 remaining groups

    movq 8(%rsp), %rdi               // rdi = r

    // First 4-word group of t - n (subq starts the borrow chain).
    movq 0(%rbp), %rax               // rax = t[0]
    movq 8(%rbp), %rbx               // rbx = t[1]
    movq 16(%rbp), %r10              // r10 = t[2]
    movq 24(%rbp), %r12              // r12 = t[3]

    leaq 32(%rbp), %rbp              // t += 4

    subq 0(%rcx), %rax               // t[0] - n[0]
    sbbq 8(%rcx), %rbx               // t[1] - n[1] - CF
    sbbq 16(%rcx), %r10              // t[2] - n[2] - CF
    sbbq 24(%rcx), %r12              // t[3] - n[3] - CF

    leaq 32(%rcx), %rcx              // n += 4

    movq %rax, 0(%rdi)               // r holds t - n for now
    movq %rbx, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r12, 24(%rdi)

    leaq 32(%rdi), %rdi              // r += 4

.LoopSub4x:
    movq 0(%rbp), %rax               // load the next 4 t words
    movq 8(%rbp), %rbx
    movq 16(%rbp), %r10
    movq 24(%rbp), %r12

    leaq 32(%rbp), %rbp

    sbbq 0(%rcx), %rax               // t - n - CF, word by word
    sbbq 8(%rcx), %rbx
    sbbq 16(%rcx), %r10
    sbbq 24(%rcx), %r12

    leaq 32(%rcx), %rcx

    movq %rax, 0(%rdi)
    movq %rbx, 8(%rdi)
    movq %r10, 16(%rdi)
    movq %r12, 24(%rdi)

    leaq 32(%rdi), %rdi

    decq %rdx                        // j--
    jnz .LoopSub4x                   // while j != 0

    sbbq $0,%r11                     // fold the final borrow into the top carry:
                                     // r11 = -1 to keep t, 0 to keep t - n
    subq %r15, %rbp                  // rewind rbp to t[0]
    subq %r15, %rdi                  // rewind rdi to r[0]

    movq 24(%rsp), %r10              // r10 = size / 4 - 1 copy iterations remain

    // Build SSE select masks: xmm0 = mask (all ones => keep t),
    // xmm1 = ~mask; then constant-time copy while scrubbing t[].
    pxor %xmm2,%xmm2                 // xmm2 = 0
    movq %r11, %xmm0
    pcmpeqd %xmm1,%xmm1              // xmm1 = all ones
    pshufd $0,%xmm0,%xmm0            // broadcast the mask
    pxor %xmm0,%xmm1                 // xmm1 = ~mask
    xorq %rax,%rax

    // First 32-byte group: r = (t & mask) | ((t - n) & ~mask).
    movdqa (%rbp,%rax),%xmm5
    movdqu (%rdi,%rax),%xmm3
    pand %xmm0,%xmm5
    pand %xmm1,%xmm3
    movdqa 16(%rbp,%rax),%xmm4
    movdqu %xmm2,(%rbp,%rax)         // scrub t
    por %xmm3,%xmm5
    movdqu 16(%rdi,%rax),%xmm3
    movdqu %xmm5,(%rdi,%rax)
    pand %xmm0,%xmm4
    pand %xmm1,%xmm3
    movdqa %xmm2,16(%rbp,%rax)       // scrub t
    por %xmm3,%xmm4
    movdqu %xmm4,16(%rdi,%rax)
    leaq 32(%rax),%rax

.align 16
.LoopCopy4x:
    movdqa (%rbp,%rax),%xmm5         // same masked select for the remaining groups
    movdqu (%rdi,%rax),%xmm3
    pand %xmm0,%xmm5
    pand %xmm1,%xmm3
    movdqa 16(%rbp,%rax),%xmm4
    movdqu %xmm2,(%rbp,%rax)         // scrub t
    por %xmm3,%xmm5
    movdqu 16(%rdi,%rax),%xmm3
    movdqu %xmm5,(%rdi,%rax)
    pand %xmm0,%xmm4
    pand %xmm1,%xmm3
    movdqa %xmm2,16(%rbp,%rax)       // scrub t
    por %xmm3,%xmm4
    movdqu %xmm4,16(%rdi,%rax)
    leaq 32(%rax),%rax
    decq %r10                        // j--
    jnz .LoopCopy4x

    movq 0(%rsp),%rsi                // rsi = saved original rsp.
    movq $1,%rax                     // return 1
    leaq (%rsi),%rsp                 // restore rsp.
    RESTORE_REGISTERS
    ret
.cfi_endproc
.size MontMul4x,.-MontMul4x

// Montgomery squaring for a == b with size % 8 == 0. The squaring work
// is done by MontSqr8Inner into a stack buffer; this routine sets up the
// frame, then performs the final subtract/select into r.
.type MontSqr8x,@function
.align 32
MontSqr8x:
.cfi_startproc
    SAVE_REGISTERS
    movq %rsp,%rax                   // rax keeps the original rsp.

    movl %r9d,%r15d
    shll $3,%r9d                     // r9 = size * 8 (bytes)
    shlq $5,%r15                     // r15 = size * 32
    negq %r9                         // r9 = -size * 8

    // Choose a frame start relative to a's page offset;
    // NOTE(review): mirrors the usual sqrx8x frame setup — confirm details.
    leaq -64(%rsp,%r9,2),%r14        // r14 = rsp - 2 * size * 8 - 64
    subq %rsi,%r14
    andq $4095,%r14                  // distance from a, modulo one page
    movq %rsp,%rbp
    cmpq %r14,%r15
    jae .Loop8xCheckstk

    leaq 4032(,%r9,2),%r15           // r15 = 4032 - 2 * size * 8
    subq %r15,%r14
    movq $0,%r15
    cmovcq %r15,%r14

.Loop8xCheckstk:
    subq %r14,%rbp
    leaq -96(%rbp,%r9,2),%rbp        // allocate frame + 2 * size words

    andq $-64,%rbp                   // __chkstk implementation,
                                     // used when the stack grows past one page.
    movq %rsp,%r14
    subq %rbp,%r14
    andq $-4096,%r14
    leaq (%r14,%rbp),%rsp
    cmpq %rbp,%rsp
    jbe .LoopMul8x

.align 16
.LoopPage8x:
    leaq -4096(%rsp),%rsp            // rsp -= 4096 each time until rsp <= the target frame.
    cmpq %rbp,%rsp
    ja .LoopPage8x

.LoopMul8x:
    movq %r9,%r15                    // r15 = -size * 8
    negq %r9                         // r9 = size * 8 again
    movq %r8,32(%rsp)                // save k0
    movq %rax,40(%rsp)               // save the original rsp


    movq %rcx, %xmm1                 // stash the n pointer
    pxor %xmm2,%xmm2                 // xmm2 = 0
    movq %rdi, %xmm0                 // stash the r pointer
    movq %r15, %xmm5                 // stash -size * 8
    call MontSqr8Inner               // do the squaring work into the stack buffer
                                     // NOTE(review): the callee updates rdi/rbp/r9/rax
                                     // for the phases below — confirm against MontSqr8Inner.

    leaq (%rdi,%r9),%rbx             // rbx = t[size]
    movq %r9,%rcx                    // rcx = -size
    movq %r9,%rdx                    // rdx = -size
    movq %xmm0, %rdi                 // rdi = r
    sarq $5,%rcx                     // rcx >>= 5: negative count of 4-word groups

.align 32
/* T -= N: subtract the modulus; the borrow accumulates through CF. */
.LoopSub8x:
    movq (%rbx),%r13                 // r13 = t[i]
    movq 8(%rbx),%r12                // r12 = t[i + 1]
    movq 16(%rbx),%r11               // r11 = t[i + 2]
    movq 24(%rbx),%r10               // r10 = t[i + 3]

    sbbq (%rbp),%r13                 // r13 = t[i] - n[i] - CF
    sbbq 8(%rbp),%r12                // r12 = t[i + 1] - n[i + 1] - CF
    sbbq 16(%rbp),%r11               // r11 = t[i + 2] - n[i + 2] - CF
    sbbq 24(%rbp),%r10               // r10 = t[i + 3] - n[i + 3] - CF

    movq %r13,0(%rdi)                // store t - n into r for now
    movq %r12,8(%rdi)
    movq %r11,16(%rdi)
    movq %r10,24(%rdi)

    leaq 32(%rbp),%rbp               // n += 4
    leaq 32(%rdi),%rdi               // r += 4
    leaq 32(%rbx),%rbx               // t += 4
    incq %rcx
    jnz .LoopSub8x

    sbbq $0,%rax                     // rax -= final borrow: selection mask input
    leaq (%rbx,%r9),%rbx             // rewind t
    leaq (%rdi,%r9),%rdi             // rewind r

    movq %rax,%xmm0                  // broadcast the select mask
    pxor %xmm2,%xmm2
    pshufd $0,%xmm0,%xmm0
    movq 40(%rsp),%rsi               // rsi = saved original rsp.

.align 32
.LoopCopy8x:
    movdqa 0(%rbx),%xmm1             // constant-time select between t and t - n into r,
                                     // scrubbing the temporary buffer as it goes.
    movdqa 16(%rbx),%xmm5
    leaq 32(%rbx),%rbx
    movdqu 0(%rdi),%xmm3
    movdqu 16(%rdi),%xmm4
    leaq 32(%rdi),%rdi
    movdqa %xmm2,-32(%rbx)           // scrub the temporary words
    movdqa %xmm2,-16(%rbx)
    movdqa %xmm2,-32(%rbx,%rdx)
    movdqa %xmm2,-16(%rbx,%rdx)
    pcmpeqd %xmm0,%xmm2              // xmm2 = complement of the select mask
    pand %xmm0,%xmm1                 // keep t words where the mask is set
    pand %xmm0,%xmm5
    pand %xmm2,%xmm3                 // keep t - n words otherwise
    pand %xmm2,%xmm4
    pxor %xmm2,%xmm2                 // xmm2 = 0 again
    por %xmm1,%xmm3
    por %xmm5,%xmm4
    movdqu %xmm3,-32(%rdi)           // write the selected result to r
    movdqu %xmm4,-16(%rdi)
    addq $32,%r9
    jnz .LoopCopy8x

    movq $1,%rax                     // return 1
    leaq (%rsi),%rsp                 // restore rsp.
    RESTORE_REGISTERS                // Restore non-volatile registers.
    ret
.cfi_endproc
.size MontSqr8x,.-MontSqr8x

// Private helper for MontSqr8x: computes the off-diagonal products of a
// into the stack tmp buffer (the remainder of the routine continues
// beyond this excerpt). Register-based interface, internal to this file;
// NOTE(review): clobbers most GPRs — confirm against the full body.
.type MontSqr8Inner,@function
.align 32
MontSqr8Inner:
.cfi_startproc

    movq %rsi, %r8
    addq %r9, %r8
    movq %r8, 64(%rsp)               // save &a[size]
    movq %r9, 56(%rsp)               // save size * 8
    leaq 88(%rsp), %rbp              // rbp = start address of tmp

    leaq 88(%rsp,%r9,2),%rbx
    movq %rbx,16(%rsp)               // save &t[size * 2]
    leaq (%rcx,%r9),%rax
    movq %rax,8(%rsp)                // save &n[size]
    jmp .MontSqr8xBegin

// Zero the 2 * size word tmp area, 128 bytes per iteration (the first
// iteration enters at .MontSqr8xBegin to skip the low half).
.MontSqr8xInitStack:
    movdqa %xmm2,0*8(%rbp)
    movdqa %xmm2,2*8(%rbp)
    movdqa %xmm2,4*8(%rbp)
    movdqa %xmm2,6*8(%rbp)
.MontSqr8xBegin:
    movdqa %xmm2,8*8(%rbp)
    movdqa %xmm2,10*8(%rbp)
    movdqa %xmm2,12*8(%rbp)
    movdqa %xmm2,14*8(%rbp)
    lea 128(%rbp), %rbp
    subq $64, %r9
    jnz .MontSqr8xInitStack

    xorq %rbx, %rbx                  // clear CF and OF, rbx = 0
    movq $0, %r13
    movq $0, %r12
    movq $0, %r11
    movq $0, %rdi
    movq $0, %r15
    movq $0, %rcx

    leaq 88(%rsp), %rbp              // rbp = tmp[0]
    movq 0(%rsi), %rdx               // rdx = a[0]
    movq $0, %r10

.LoopOuterSqr8x:

    // Off-diagonal products a[0] * a[1..7] of the current 8-word block.
    mulx 8(%rsi), %rax, %r14         // rax = lo(a[1] * a[0]), r14 = hi(a[1] * a[0])
    adcx %rbx, %rax

    movq %rax, 8(%rbp)
    adox %r13, %r14

    mulx 16(%rsi), %rax, %r13        // (r13, rax) = a[2] * a[0]
    adcx %rax, %r14                  // r14 = hi(a[1] * a[0]) + lo(a[2] * a[0])
    adox %r12, %r13

mulx 24(%rsi), %rax, %r12 // (rax, r12) = a[3] * a[0] 807 movq %r14, 16(%rbp) 808 adcx %rax, %r13 // r13 = hi(a[2] * a[0]) + lo(a[3] * a[0]) 809 adox %r11, %r12 810 811 mulx 32(%rsi), %rax, %r11 // (rax, r11) = a[4] * a[0] 812 adcx %rax, %r12 // r12 = hi(a[3] * a[0]) + lo(a[4] * a[0]) 813 814 adox %rdi, %r11 815 mulx 40(%rsi), %rax, %rdi // (rax, rdi) = a[5] * a[0] 816 adcx %rax, %r11 // r11 = hi(a[4] * a[0]) + lo(a[5] * a[0]) 817 818 adox %r15, %rdi 819 mulx 48(%rsi), %rax, %r8 // (rax, r8) = a[6] * a[0] 820 adcx %rax, %rdi // rdi = hi(a[5] * a[0]) + lo(a[6] * a[0]) 821 adox %rcx, %r8 822 823 mulx 56(%rsi), %rax, %rbx // (rax, rbx) = a[7] * a[0] 824 adcx %rax, %r8 // r8 = hi(a[6] * a[0]) + lo(a[7] * a[0]) 825 adox %r10, %rbx // rbx += CF 826 adcq 64(%rbp), %rbx // rbx += CF 827 828 sbbq %r9, %r9 // get high CF 829 xorq %r10, %r10 // clear CF OF 830 831 // begin a[1] * a[2~7] 832 movq 8(%rsi), %rdx // rdx = a[1] 833 mulx 16(%rsi), %rax, %rcx // rax = lo(a[2] * a[1]), rcx = hi(a[2] * a[1]) 834 adcx %rax, %r13 // r13 = hi(a[2] * a[0]) + lo(a[3] * a[0]) + lo(a[2] * a[1]) 835 836 mulx 24(%rsi), %rax, %r14 // rax = lo(a[3] * a[1]), r14 = hi(a[3] * a[1]) 837 movq %r13, 24(%rbp) 838 839 adox %rax, %rcx // rcx = lo(a[3] * a[1]) + hi(a[2] * a[1]) 840 841 mulx 32(%rsi), %rax, %r13 // (rax, r13) = a[4] * a[1] 842 adcx %r12, %rcx // rcx = hi(a[3] * a[0]) + lo(a[4] * a[0]) + lo(a[3] * a[1]) + hi(a[2] * a[1]) 843 adox %rax, %r14 // r14 = lo(a[4] * a[1]) + hi(a[3] * a[1]) 844 845 mulx 40(%rsi), %rax, %r12 // (rax, r12) = a[5] * a[1] 846 movq %rcx, 32(%rbp) 847 adcx %r11, %r14 // r14 = lo(a[4] * a[1]) + hi(a[3] * a[1]) + hi(a[4] * a[0]) + lo(a[5] * a[0]) 848 adox %rax, %r13 // r13 = lo(a[5] * a[1]) + hi(a[4] * a[1]) 849 850 mulx 48(%rsi), %rax, %r11 // (rax, r11) = a[6] * a[1] 851 adcx %rdi, %r13 // r13 = lo(a[5] * a[1]) + hi(a[4] * a[1]) + hi(a[5] * a[0]) + lo(a[6] * a[0]) 852 adox %rax, %r12 // r12 = hi(a[5] * a[1]) + lo(a[6] * a[1]) 853 854 mulx 56(%rsi), %rax, %rdi // (rax, 
rdi) = a[7] * a[1] 855 adcx %r8, %r12 // r12 = hi(a[5] * a[1]) + lo(a[6] * a[1]) + hi(a[6] * a[0]) + lo(a[7] * a[0]) 856 adox %rax, %r11 // r11 = hi(a[6] * a[1]) + lo(a[7] * a[1]) 857 adcx %rbx, %r11 // r11 = hi(a[6] * a[1]) + lo(a[7] * a[1]) + hi(a[7] * a[0]) 858 859 adcx %r10, %rdi // rdi += CF 860 adox %r10, %rdi // rdi += OF 861 862 movq 16(%rsi), %rdx // rdx = a[2] 863 864 // begin a[2] * a[3~7] 865 mulx 24(%rsi), %rax, %rbx // rax = lo(a[2] * a[3]), rbx = hi(a[2] * a[3]) 866 adcx %rax, %r14 // r14 = lo(a[4] * a[1]) + hi(a[3] * a[1]) + hi(a[4] * a[0]) + lo(a[5] * a[0]) 867 // + lo(a[2] * a[3]) 868 869 mulx 32(%rsi), %rax, %rcx // rax = lo(a[2] * a[4]), rcx = hi(a[2] * a[4]) 870 871 movq %r14, 40(%rbp) 872 adox %rax, %rbx // r13 = lo(a[2] * a[4]) + hi(a[2] * a[3]) 873 874 mulx 40(%rsi), %rax, %r8 // rax = lo(a[2] * a[5]), rcx = hi(a[2] * a[5]) 875 adcx %r13, %rbx // rbx = lo(a[2] * a[4]) + hi(a[2] * a[3]) 876 // + lo(a[5] * a[1]) + hi(a[4] * a[1]) + hi(a[5] * a[0]) + lo(a[6] * a[0]) 877 878 adox %rax, %rcx // rcx = hi(a[2] * a[4]) + lo(a[2] * a[5]) 879 movq %rbx, 48(%rbp) 880 881 mulx 48(%rsi), %rax, %r13 // rax = lo(a[2] * a[6]), r13 = hi(a[2] * a[6]) 882 adcx %r12, %rcx // rcx = hi(a[5] * a[1]) + lo(a[6] * a[1]) + hi(a[6] * a[0]) 883 // + lo(a[7] * a[0]) + hi(a[2] * a[4]) + lo(a[2] * a[5]) 884 885 adox %rax, %r8 // r8 = hi(a[2] * a[5]) + lo(a[2] * a[6]) 886 887 mulx 56(%rsi), %rax, %r12 // rax = lo(a[2] * a[7]), r12 = hi(a[2] * a[7]) 888 889 adcx %r11, %r8 // r8 = hi(a[2] * a[5]) + lo(a[2] * a[6]) 890 // + hi(a[6] * a[1]) + lo(a[7] * a[1]) + hi(a[7] * a[0]) 891 892 adox %rax, %r13 // r13 = hi(a[2] * a[6]) + lo(a[2] * a[7]) 893 adcx %rdi, %r13 // r13 = hi(a[2] * a[6]) + lo(a[2] * a[7]) + hi(a[7] * a[1]) 894 895 adcx %r10, %r12 // r12 += CF 896 adox %r10, %r12 // r12 += OF 897 898 movq 24(%rsi), %rdx // rdx = a[3] 899 900 // begin a[3] * a[4~7] 901 mulx 32(%rsi), %rax, %r14 // rax = lo(a[3] * a[4]), r14 = hi(a[3] * a[4]) 902 adcx %rax, %rcx // rcx = hi(a[5] * 
a[1]) + lo(a[6] * a[1]) + hi(a[6] * a[0]) 903 // + lo(a[7] * a[0]) + hi(a[2] * a[4]) + lo(a[2] * a[5]) + lo(a[3] * a[4]) 904 905 mulx 40(%rsi), %rax, %rbx // rax = lo(a[3] * a[5]), rbx = hi(a[3] * a[5]) 906 adox %rax, %r14 // r14 = hi(a[3] * a[4]) + lo(a[3] * a[5]) 907 908 mulx 48(%rsi), %rax, %r11 // rax = lo(a[3] * a[6]), r11 = hi(a[3] * a[6]) 909 adcx %r8, %r14 // r14 = hi(a[3] * a[4]) + lo(a[3] * a[5])+ hi(a[2] * a[5]) + lo(a[2] * a[6]) 910 // + hi(a[6] * a[1]) + lo(a[7] * a[1]) + hi(a[7] * a[0]) 911 adox %rax, %rbx // rbx = hi(a[3] * a[5]) + lo(a[3] * a[6]) 912 913 mulx 56(%rsi), %rax, %rdi // rax = lo(a[3] * a[7]), rdi = hi(a[3] * a[7]) 914 adcx %r13, %rbx // rbx = hi(a[3] * a[5]) + lo(a[3] * a[6]) 915 // + hi(a[2] * a[6]) + lo(a[2] * a[7]) + hi(a[7] * a[1]) 916 adox %rax, %r11 // r11 = hi(a[3] * a[6]) + lo(a[3] * a[7]) 917 adcx %r12, %r11 // r11 = hi(a[2] * a[7]) + hi(a[3] * a[6]) + lo(a[3] * a[7]) 918 919 adcx %r10, %rdi // rdi += CF 920 adox %r10, %rdi // rdi += OF 921 922 movq %rcx, 56(%rbp) 923 movq %r14, 64(%rbp) 924 925 movq 32(%rsi), %rdx // rdx = a[4] 926 927 // begin a[4] * a[5~7] 928 mulx 40(%rsi), %rax, %r13 // rax = lo(a[4] * a[5]), r13 = hi(a[4] * a[5]) 929 adcx %rax, %rbx // rbx = hi(a[3] * a[5]) + lo(a[3] * a[6]) 930 // + hi(a[2] * a[6]) + lo(a[2] * a[7]) + hi(a[7] * a[1]) + lo(a[4] * a[5]) 931 932 mulx 48(%rsi), %rax, %r12 // rax = lo(a[4] * a[6]), r12 = hi(a[4] * a[6]) 933 adox %rax, %r13 // r13 = lo(a[4] * a[6]) + hi(a[4] * a[5]) 934 935 mulx 56(%rsi), %rax, %r14 // rax = lo(a[4] * a[7]), r14 = hi(a[4] * a[7]) 936 adcx %r11, %r13 // r13 = hi(a[4] * a[5]) + hi(a[2] * a[7]) + hi(a[3] * a[6]) 937 // + lo(a[3] * a[7]) 938 939 adox %rax, %r12 // r12 = hi(a[4] * a[6]) + lo(a[4] * a[7]) 940 adcx %rdi, %r12 // r12 = hi(a[4] * a[6]) + lo(a[4] * a[7]) + hi(a[3] * a[7]) 941 942 adcx %r10, %r14 // r14 += CF 943 adox %r10, %r14 // r14 += OF 944 945 movq 40(%rsi), %rdx // rdx = a[5] 946 947 // begin a[5] * a[6~7] 948 mulx 48(%rsi), %rax, %r11 // rax = 
lo(a[5] * a[6]), r11 = hi(a[5] * a[6]) 949 950 adcx %rax, %r12 // r14 = hi(a[4] * a[6]) + lo(a[4] * a[7]) + hi(a[3] * a[7]) + lo(a[5] * a[6]) 951 952 mulx 56(%rsi), %rax, %rdi // rax = lo(a[5] * a[7]), rdi = hi(a[5] * a[7]) 953 954 adox %rax, %r11 // r11 = hi(a[5] * a[6]) + lo(a[5] * a[7]) 955 adcx %r14, %r11 // r11 = hi(a[5] * a[6]) + lo(a[5] * a[7]) + hi(a[4] * a[7]) 956 adcx %r10, %rdi // rdi += CF 957 adox %r10, %rdi // rdi += OF 958 959 movq 48(%rsi), %rdx // rdx = a[6] 960 961 mulx 56(%rsi), %rax, %r15 // rax = lo(a[7] * a[6]), r15 = hi(a[7] * a[6]) 962 adcx %rax, %rdi // rdi = hi(a[5] * a[6]) + lo(a[7] * a[6]) 963 adcx %r10, %r15 // r15 += CF 964 965 leaq 64(%rsi), %rsi 966 967 cmp 64(%rsp), %rsi // cmpared with a[size] 968 je .Lsqrx8xEnd 969 970 neg %r9 971 movq $0, %rcx 972 movq 64(%rbp),%r14 973 974 adcx 9*8(%rbp),%rbx 975 adcx 10*8(%rbp),%r13 976 adcx 11*8(%rbp),%r12 977 adcx 12*8(%rbp),%r11 978 adcx 13*8(%rbp),%rdi 979 adcx 14*8(%rbp),%r15 980 adcx 15*8(%rbp),%rcx 981 982 leaq (%rsi), %r10 // r10 = a[8] 983 leaq 128(%rbp), %rbp 984 sbbq %rax,%rax 985 movq %rax, 72(%rsp) 986 movq %rbp, 80(%rsp) 987 988 xor %eax, %eax 989 990 991 movq -64(%rsi), %rdx 992 993 movq $-8, %r9 994 995.align 32 996.LoopSqr8x: 997 movq %r14,%r8 998 999 // begin a[0] * a[8~11] 1000 mulx 0(%r10), %rax, %r14 // rax = lo(a[8] * a[0]), r14 = hi(a[8] * a[0]) 1001 adcx %rax, %r8 1002 adox %rbx, %r14 1003 1004 mulx 8(%r10), %rax, %rbx // rax = lo(a[9] * a[0]), rbx = hi(a[8] * a[0]) 1005 adcx %rax, %r14 1006 adox %r13, %rbx 1007 1008 movq %r8,(%rbp,%r9,8) 1009 1010 mulx 16(%r10), %rax, %r13 // rax = lo(a[10] * a[0]), r13 = hi(a[10] * a[0]) 1011 adcx %rax, %rbx 1012 adox %r12, %r13 1013 1014 mulx 24(%r10), %rax, %r12 // rax = lo(a[11] * a[0]), r12 = hi(a[11] * a[0]) 1015 adcx %rax, %r13 1016 adox %r11, %r12 1017 1018 movq $0, %r8 1019 1020 mulx 32(%r10), %rax, %r11 // rax = lo(a[12] * a[0]), r11 = hi(a[12] * a[0]) 1021 adcx %rax, %r12 1022 adox %rdi, %r11 1023 1024 mulx 40(%r10), %rax, 
%rdi // rax = lo(a[13] * a[0]), rdi = hi(a[13] * a[0]) 1025 adcx %rax, %r11 1026 adox %r15, %rdi 1027 1028 mulx 48(%r10), %rax, %r15 // rax = lo(a[14] * a[0]), r15 = hi(a[14] * a[0]) 1029 adcx %rax, %rdi 1030 adox %rcx, %r15 1031 1032 mulx 56(%r10), %rax, %rcx // rax = lo(a[15] * a[0]), rcx = hi(a[15] * a[0]) 1033 adcx %rax, %r15 1034 adcx %r8, %rcx // here r8 = 0 1035 adox %r8, %rcx 1036 1037 movq 8(%rsi,%r9,8),%rdx 1038 1039 inc %r9 1040 jnz .LoopSqr8x 1041 1042 leaq 64(%r10), %r10 1043 movq $-8, %r9 1044 1045 cmp 64(%rsp), %r10 // cmpared with a[size] 1046 je .LoopSqr8xBreak 1047 1048 subq 72(%rsp), %r8 // read the CF of the previous round. 1049 1050 movq -64(%rsi), %rdx 1051 1052 adcx 0*8(%rbp),%r14 1053 adcx 1*8(%rbp),%rbx 1054 adcx 2*8(%rbp),%r13 1055 adcx 3*8(%rbp),%r12 1056 adcx 4*8(%rbp),%r11 1057 adcx 5*8(%rbp),%rdi 1058 adcx 6*8(%rbp),%r15 1059 adcx 7*8(%rbp),%rcx 1060 1061 leaq 8*8(%rbp),%rbp 1062 1063 sbbq %rax, %rax 1064 xorq %r8, %r8 1065 movq %rax, 72(%rsp) 1066 1067 jmp .LoopSqr8x 1068 1069.align 32 1070.LoopSqr8xBreak: 1071 1072 xorq %r10, %r10 1073 subq 72(%rsp),%r8 1074 adcx %r10, %r14 1075 movq 0(%rsi),%rdx 1076 movq %r14,0(%rbp) 1077 movq 80(%rsp), %r8 1078 1079 adcx %r10,%rbx 1080 adcx %r10,%r13 1081 adcx %r10,%r12 1082 adcx %r10,%r11 1083 adcx %r10,%rdi 1084 adcx %r10,%r15 1085 adcx %r10,%rcx 1086 1087 cmp %r8, %rbp 1088 je .LoopOuterSqr8x 1089 1090 // if tmp does not go to the end. The current value needs to be stored in tmp and updated. 
1091 movq %rbx,1*8(%rbp) 1092 movq 1*8(%r8),%rbx 1093 movq %r13,2*8(%rbp) 1094 movq 2*8(%r8),%r13 1095 movq %r12,3*8(%rbp) 1096 movq 3*8(%r8),%r12 1097 movq %r11,4*8(%rbp) 1098 movq 4*8(%r8),%r11 1099 movq %rdi,5*8(%rbp) 1100 movq 5*8(%r8),%rdi 1101 movq %r15,6*8(%rbp) 1102 movq 6*8(%r8),%r15 1103 movq %rcx,7*8(%rbp) 1104 movq 7*8(%r8),%rcx 1105 movq %r8,%rbp 1106 jmp .LoopOuterSqr8x 1107 1108.align 32 1109.Lsqrx8xEnd: 1110 mov %rbx,9*8(%rbp) 1111 mov %r13,10*8(%rbp) 1112 mov %r12,11*8(%rbp) 1113 mov %r11,12*8(%rbp) 1114 mov %rdi,13*8(%rbp) 1115 mov %r15,14*8(%rbp) 1116 1117 leaq 88(%rsp), %rbp // tmp[0] 1118 1119 movq 56(%rsp), %rcx // rcx = size * 8 1120 sbbq %rcx, %rsi // get a[0] 1121 1122 xorq %r15, %r15 // clear CF OF, r15 = tmp[0] = 0 1123 movq 8(%rbp), %r14 // r14 = tmp[1] 1124 movq 16(%rbp), %r13 // r13 = tmp[2] 1125 movq 24(%rbp), %r12 // r12 = tmp[3] 1126 1127 adox %r14, %r14 // r14 = 2 * tmp[1] 1128 movq 0(%rsi), %rdx 1129 1130.align 32 1131.LoopShiftAddSqr4x: 1132 1133 mulx %rdx, %rax, %rbx // (rbx, rax) = a[0] * a[0] 1134 adox %r13, %r13 // r13 = 2 * tmp[1] 1135 adox %r12, %r12 // r12 = 2 * tmp[3] 1136 1137 adcx %rax, %r15 // r15 = 2 * tmp[0] + lo(a[0] * a[0]) 1138 adcx %rbx, %r14 // r14 = 2 * tmp[1] + hi(a[0] * a[0]) 1139 1140 movq %r15, (%rbp) 1141 movq %r14, 8(%rbp) 1142 1143 movq 8(%rsi), %rdx 1144 1145 mulx %rdx, %rax, %rbx // (rbx, rax) = a[1] * a[1] 1146 adcx %rax, %r13 // r13 = 2 * tmp[2] + lo(a[1] * a[1]) 1147 adcx %rbx, %r12 // r12 = 2 * tmp[3] + hi(a[1] * a[1]) 1148 1149 movq %r13, 16(%rbp) 1150 movq %r12, 24(%rbp) 1151 1152 movq 32(%rbp), %r15 // r15 = tmp[4] 1153 movq 40(%rbp), %r14 // r14 = tmp[5] 1154 movq 48(%rbp), %r13 // r13 = tmp[6] 1155 movq 56(%rbp), %r12 // r12 = tmp[7] 1156 1157 movq 16(%rsi), %rdx 1158 mulx %rdx, %rax, %rbx // (rbx, rax) = a[2] * a[2] 1159 adox %r15, %r15 // r15 = 2 * tmp[4] 1160 adcx %rax, %r15 // r15 = 2 * tmp[4] + lo(a[2] * a[2]) 1161 1162 adox %r14, %r14 // r14 = 2 * tmp[4] 1163 adcx %rbx, %r14 // r14 = 2 * 
tmp[5] + hi(a[2] * a[2]) 1164 1165 movq %r15, 32(%rbp) 1166 movq %r14, 40(%rbp) 1167 1168 movq 24(%rsi), %rdx 1169 mulx %rdx, %rax, %rbx // (rbx, rax) = a[3] * a[3] 1170 adox %r13, %r13 // r13 = 2 * tmp[5] 1171 adcx %rax, %r13 // r13 = 2 * tmp[5] + lo(a[3] * a[3]) 1172 1173 adox %r12, %r12 // r12 = 2 * tmp[5] 1174 adcx %rbx, %r12 // rbx = 2 * tmp[5] + hi(a[3] * a[3]) 1175 1176 movq %r13, 48(%rbp) 1177 movq %r12, 56(%rbp) 1178 1179 leaq 32(%rsi), %rsi // a[4] 1180 1181 leaq -32(%rcx),%rcx 1182 jrcxz .LoopReduceSqr8xBegin // if i != 0 1183 1184 movq 64(%rbp), %r15 // r15 = tmp[8] 1185 movq 72(%rbp), %r14 // r14 = tmp[9] 1186 adox %r15, %r15 // r15 = 2 * tmp[8] 1187 adox %r14, %r14 // r14 = 2 * tmp[9] 1188 1189 movq 80(%rbp), %r13 // r13 = tmp[8] 1190 movq 88(%rbp), %r12 // r12 = tmp[9] 1191 1192 leaq 64(%rbp), %rbp 1193 1194 movq 0(%rsi), %rdx 1195 1196 jmp .LoopShiftAddSqr4x // if i != 0 1197 1198.LoopReduceSqr8xBegin: 1199 xorq %rax,%rax // rax = 0 1200 leaq 88(%rsp), %rdi // tmp[0] 1201 movq $0, %r9 // Save size. 1202 movq %xmm1, %rbp // get n[0] 1203 xorq %rsi, %rsi // rsi = 0 1204 1205.align 32 1206.LoopReduceSqr8x: 1207 movq %rax,80(%rsp) // Store the highest carry bit. 
1208 leaq (%rdi,%r9),%rdi // rdi = t[0] 1209 1210 movq (%rdi),%rdx // rdx = t[0] 1211 movq 8(%rdi),%r9 // r9 = t[1] 1212 movq 16(%rdi),%r15 // r15 = t[2] 1213 movq 24(%rdi),%r14 // r14 = t[3] 1214 movq 32(%rdi),%r13 // r13 = t[4] 1215 movq 40(%rdi),%r12 // r12 = t[5] 1216 movq 48(%rdi),%r11 // r11 = t[6] 1217 movq 56(%rdi),%r10 // r10 = t[7] 1218 1219 leaq 64(%rdi),%rdi // rdi = t[8] 1220 1221 movq %rdx,%r8 // r8 = t[0] 1222 imulq 40(%rsp),%rdx // rbx = k0 * t[0] 1223 xorq %rbx,%rbx // clear CF OF 1224 movl $8,%ecx 1225 1226.align 32 1227.LoopReduce8x: 1228 movq %r8, %rbx 1229 movq %rdx, 80(%rsp,%rcx,8) 1230 mulx (%rbp), %rax, %r8 // (r8, rax) = m' * n[0] 1231 adcx %rbx, %rax 1232 adox %r9, %r8 // r9 = hi(m' * n[]) + t[1] 1233 1234 mulx 8(%rbp), %rax, %r9 // (rdx, r9) = m' * n[0] 1235 adcx %rax,%r8 // r9 = t[1] + lo(m' * n[1]) 1236 adox %r9, %r15 // r15 = hi(m' * n[1]) + t[2] 1237 1238 mulx 16(%rbp), %r9, %rax // (r9, rax) = m' * n[2] 1239 adcx %r15, %r9 // r9 = hi(m' * n[1]) + lo(m' * n[2]) + t[2] 1240 adox %rax, %r14 // rbx = hi(m' * n[2]) + t[3] 1241 1242 mulx 24(%rbp), %r15, %rax // (r15, rax) = m' * n[3] 1243 adcx %r14,%r15 // r15 = hi(m' * n[2]) + lo(m' * n[3]) + t[3] 1244 adox %rax,%r13 // r13 = hi(m' * n[3]) + t[4] 1245 1246 mulx 32(%rbp), %r14, %rax // (r14, rax) = m' * n[4] 1247 adcx %r13,%r14 // r14 = hi(m' * n[3]) + lo(m' * n[4]) + t[4] 1248 adox %rax,%r12 // r12 = hi(m' * n[4]) + t[5] 1249 1250 mulx 40(%rbp), %r13, %rax // (r13, rax) = m' * n[5] 1251 adcx %r12,%r13 // r13 = hi(m' * n[4]) + lo(m' * n[5]) + t[5] 1252 adox %rax,%r11 // r12 = hi(m' * n[5]) + t[6] 1253 1254 mulx 48(%rbp), %r12, %rax // (r12, rax) = m' * n[6] 1255 adcx %r11,%r12 // r13 = hi(m' * n[5]) + lo(m' * n[6]) + t[6] 1256 adox %r10,%rax // r12 = hi(m' * n[5]) + t[7] 1257 1258 mulx 56(%rbp), %r11, %r10 // (r11, r10) = m' * n[7] 1259 adcx %rax,%r11 // r13 = hi(m' * n[6]) + lo(m' * n[7]) + t[7] 1260 1261 adcx %rsi,%r10 // r12 = hi(m' * n[7]) + t[8] 1262 adox %rsi,%r10 // r12 = hi(m' * 
n[7]) + t[8] 1263 1264 movq %r8, %rdx 1265 mulx 40(%rsp), %rdx, %rax // (rdx, rax) = m' * n[7] 1266 1267 decl %ecx // ecx-- 1268 jnz .LoopReduce8x // if ecx != 0 1269 1270 leaq 64(%rbp),%rbp // rbp += 64, n Pointer Offset. 1271 xorq %rax,%rax // rax = 0 1272 cmpq 8(%rsp),%rbp // rbp = n[size] 1273 jae .LoopEndCondMul8x 1274 1275 addq (%rdi),%r8 // r8 += t[0] 1276 adcq 8(%rdi),%r9 // r9 += t[1] 1277 adcq 16(%rdi),%r15 // r15 += t[2] 1278 adcq 24(%rdi),%r14 // r14 += t[3] 1279 adcq 32(%rdi),%r13 // r13 += t[4] 1280 adcq 40(%rdi),%r12 // r12 += t[5] 1281 adcq 48(%rdi),%r11 // r11 += t[6] 1282 adcq 56(%rdi),%r10 // r10 += t[7] 1283 sbbq %rsi,%rsi // rsi = -CF 1284 1285 movq 144(%rsp),%rdx // rbx = m', 80 + 64 1286 movl $8,%ecx 1287 xor %eax,%eax 1288.align 32 1289.LoopLastSqr8x: 1290 mulx (%rbp), %rax, %rbx // (rbx, rax) = m' * n[0] 1291 adcx %rax,%r8 // r8 = lo(m' * n[0]) + t[0] 1292 movq %r8,(%rdi) // t[0] = r8 1293 leaq 8(%rdi),%rdi // t++ 1294 1295 adox %rbx,%r9 // r9 = hi(m' * n[]) + t[2] 1296 1297 mulx 8(%rbp), %r8, %rbx // (r8, rbx) = m' * n[0] 1298 adcx %r9,%r8 // r9 = t[1] + lo(m' * n[1]) 1299 adox %rbx, %r15 // r15 = hi(m' * n[1]) + t[2] 1300 1301 mulx 16(%rbp), %r9, %rbx // (r9, rbx) = m' * n[2] 1302 adcx %r15, %r9 // r9 = hi(m' * n[1]) + lo(m' * n[2]) + t[2] 1303 adox %rbx, %r14 // r14 = hi(m' * n[2]) + t[3] 1304 1305 mulx 24(%rbp), %r15, %rbx // (r15, rbx) = m' * n[3] 1306 adcx %r14,%r15 // r15 = hi(m' * n[2]) + lo(m' * n[3]) + t[3] 1307 adox %rbx,%r13 // r13 = hi(m' * n[3]) + t[4] 1308 1309 mulx 32(%rbp), %r14, %rbx // (r14, rbx) = m' * n[4] 1310 adcx %r13,%r14 // r14 = hi(m' * n[3]) + lo(m' * n[4]) + t[4] 1311 adox %rbx,%r12 // r12 = hi(m' * n[4]) + t[5] 1312 1313 mulx 40(%rbp), %r13, %rbx // (r13, rbx) = m' * n[5] 1314 adcx %r12,%r13 // r13 = hi(m' * n[4]) + lo(m' * n[5]) + t[5] 1315 adox %rbx,%r11 // r11 = hi(m' * n[5]) + t[6] 1316 1317 mulx 48(%rbp), %r12, %rbx // (r12, rbx) = m' * n[6] 1318 adcx %r11,%r12 // r12 = hi(m' * n[5]) + lo(m' * n[6]) + t[6] 
1319 adox %r10,%rbx // rbx = hi(m' * n[5]) + t[7] 1320 1321 movq $0, %rax 1322 1323 mulx 56(%rbp), %r11, %r10 // (r11, r10) = m' * n[7] 1324 adcx %rbx,%r11 // r11 = hi(m' * n[6]) + lo(m' * n[7]) + t[7] 1325 1326 adcx %rax,%r10 // r10 = hi(m' * n[7]) + t[8] 1327 adox %rax,%r10 // r10 = hi(m' * n[7]) + t[8] 1328 1329 movq 72(%rsp,%rcx,8),%rdx // rbx = t[i] * k0 1330 1331 decl %ecx // ecx-- 1332 jnz .LoopLastSqr8x // if ecx != 0 1333 1334 leaq 64(%rbp),%rbp // n += 8 1335 cmpq 8(%rsp),%rbp // Check whether rbp is at the end of the n array. If yes, exit the loop. 1336 jae .LoopSqrBreak8x 1337 1338 movq 144(%rsp),%rdx // rbx = m' 1339 negq %rsi // rsi = CF 1340 movq (%rbp),%rax // rax = = n[0] 1341 adcq (%rdi),%r8 // r8 = t[0] 1342 adcq 8(%rdi),%r9 // r9 = t[1] 1343 adcq 16(%rdi),%r15 // r15 = t[2] 1344 adcq 24(%rdi),%r14 // r14 = t[3] 1345 adcq 32(%rdi),%r13 // r13 = t[4] 1346 adcq 40(%rdi),%r12 // r12 = t[5] 1347 adcq 48(%rdi),%r11 // r11 = t[6] 1348 adcq 56(%rdi),%r10 // r10 = t[7] 1349 sbbq %rsi,%rsi // rsi = -CF 1350 1351 movl $8,%ecx // ecx = 8 1352 xorq %rax, %rax 1353 jmp .LoopLastSqr8x 1354 1355.align 32 1356.LoopSqrBreak8x: 1357 xorq %rax,%rax // rax = 0 1358 addq 80(%rsp),%r8 // r8 += Highest carry bit. 
1359 adcq $0,%r9 // r9 += CF 1360 adcq $0,%r15 // r15 += CF 1361 adcq $0,%r14 // r14 += CF 1362 adcq $0,%r13 // r13 += CF 1363 adcq $0,%r12 // r12 += CF 1364 adcq $0,%r11 // r11 += CF 1365 adcq $0,%r10 // r10 += CF 1366 adcq $0,%rax // rax += CF 1367 1368 negq %rsi // rsi = CF 1369.LoopEndCondMul8x: 1370 adcq (%rdi),%r8 // r8 += t[0] 1371 adcq 8(%rdi),%r9 // r9 += t[1] 1372 adcq 16(%rdi),%r15 // r15 += t[2] 1373 adcq 24(%rdi),%r14 // r14 += t[3] 1374 adcq 32(%rdi),%r13 // r13 += t[4] 1375 adcq 40(%rdi),%r12 // r12 += t[5] 1376 adcq 48(%rdi),%r11 // r11 += t[6] 1377 adcq 56(%rdi),%r10 // r10 += t[7] 1378 adcq $0,%rax // rax += CF 1379 movq -8(%rbp),%rcx // rcx = n[7] 1380 xorq %rsi,%rsi // rsi = 0 1381 1382 movq %xmm1,%rbp // rbp = n 1383 movq %r8,(%rdi) // Save the calculated result back to t[]. 1384 movq %r9,8(%rdi) 1385 movq %xmm5,%r9 1386 movq %r15,16(%rdi) 1387 movq %r14,24(%rdi) 1388 movq %r13,32(%rdi) 1389 movq %r12,40(%rdi) 1390 movq %r11,48(%rdi) 1391 movq %r10,56(%rdi) 1392 leaq 64(%rdi),%rdi // t += 8 1393 1394 cmpq 16(%rsp),%rdi // Cycle the entire t[]. 1395 jb .LoopReduceSqr8x 1396 ret 1397.cfi_endproc 1398.size MontSqr8Inner,.-MontSqr8Inner 1399 1400#endif 1401