1/* 2 * This file is part of the openHiTLS project. 3 * 4 * openHiTLS is licensed under the Mulan PSL v2. 5 * You can use this software according to the terms and conditions of the Mulan PSL v2. 6 * You may obtain a copy of Mulan PSL v2 at: 7 * 8 * http://license.coscl.org.cn/MulanPSL2 9 * 10 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, 11 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, 12 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. 13 * See the Mulan PSL v2 for more details. 14 */ 15 16/* 17 * Description: Big number Montgomery modular multiplication in armv8 implementation, MontMul_Asm 18 * Ref: Montgomery Multiplication 19 * Process: To cal A * B mod n, we can convert to mont form, and cal A*B*R^(-1). 20 * Detail: 21 * intput:A = (An-1,...,A1,A0)b, B = (Bn-1,...,B1,B0)b, n, n' 22 * output:A*B*R^(-1) 23 * tmp = (tn,tn-1,...,t1,t0)b, initialize to 0 24 * for i: 0 -> (n-1) 25 * ui = (t0 + Ai*B0)m' mod b 26 * t = (t + Ai*B + ui * m) / b 27 * if t >= m 28 * t -= m 29 * return t; 30 * 31 * Deal process: 32 * i. size % 8 == 0 & a == b --> Sqr8x --> complete multiplication 33 * --> size == 8, goto single reduce step 34 * --> size >= 8, goto loop reduce process 35 * ii. size % 4 == 0 --> Mul4x 36 * --> size == 4, goto single step 37 * --> size >= 4, goto loop process 38 * iii. Ordinary --> Mul1x 39 * --> size == 2, goto single step 40 * --> size >= 2, goto loop process 41 * 42 */ 43 44#include "hitls_build.h" 45#ifdef HITLS_CRYPTO_BN 46 47#include "crypt_arm.h" 48 49.arch armv8-a+crypto 50.file "bn_mont_armv8.S" 51.text 52 53.global MontMul_Asm 54.type MontMul_Asm, %function 55.align 5 56MontMul_Asm: 57AARCH64_PACIASP 58 tst x5, #7 59 b.eq MontSqr8 60 tst x5, #3 61 b.eq MontMul4 62 63 stp x29, x30, [sp, #-64]! 
64 mov x29, sp 65 stp x23, x24, [sp, #16] 66 stp x21, x22, [sp, #32] 67 stp x19, x20, [sp, #48] 68 69 sub x21, x5 , #2 // j = size-2 70 cbnz x21,.LMul1xBegin 71 72 // if size == 2, goto single step 73 ldp x15, x16, [x1] // a[0], a[1] 74 ldp x19, x20, [x2] // b[0], b[1] 75 ldp x9, x10, [x3] // n[0], n[1] 76 77 mul x23 , x15 , x19 // x23 = lo(a[0] * b[0]) 78 umulh x24 , x15 , x19 // x24 = hi(a[0] * b[0]) 79 mul x6, x16 , x19 // x6 = lo(a[1] * b[0]) 80 umulh x7, x16 , x19 // x7 = hi(a[1] * b[0]) 81 82 mul x11, x23 , x4 // x11 = lo(t[0] * k0) 83 umulh x19, x9, x11 // x19 = hi(n[0] * t[0]*k0) 84 mul x12, x10, x11 // x12 = lo(n[1] * t[0]*k0) 85 umulh x13, x10, x11 // x13 = hi(n[1] * t[0]*k0) 86 87 // we knowns a*b + n'n = 0 (mod R) 88 // so lo(a[0] * b[0]) + lo(n[0] * t[0]*k0) = 0 (mod R) 89 // if lo(a[0] * b[0]) > 0, then 'lo(a[0] * b[0]) + lo(n[0] * t[0]*k0)' would overflow, 90 // else if lo(a[0] * b[0]) == 0, then lo(n[0] * t[0]*k0) == 0 91 cmp x23, #1 92 adc x19, x19, xzr 93 94 adds x23 , x6, x24 // x23 = lo(a[1] * b[0]) + hi(a[0] * b[0]) 95 adc x24 , x7, xzr // x24 = hi(a[1] * b[0]) + CF 96 97 adds x8, x12, x19 // x8 = lo(n[1] * t[0]*k0) + hi(n[0] * t[0]*k0) 98 adc x19, x13, xzr // x19 = hi(n[1] * t[0]*k0) + CF 99 100 adds x21, x8, x23 // x21 = lo(n[1] * t[0]*k0) + hi(n[0] * t[0]*k0) 101 adcs x22, x19, x24 102 adc x23, xzr, xzr // x23 = CF 103 104 mul x14 , x15 , x20 // a[0] * b[1] 105 umulh x15 , x15 , x20 106 mul x6, x16 , x20 // a[1] * b[1] 107 umulh x7, x16 , x20 108 adds x14 , x14 , x21 109 adc x15 , x15 , xzr 110 111 mul x11, x14 , x4 // t[0] * k0 112 113 umulh x20, x9, x11 114 mul x12, x10, x11 // n[1] * t[0]*k0 115 umulh x13, x10, x11 116 cmp x14 , #1 // Check whether the low location is carried. 
117 adc x20, x20, xzr 118 adds x14 , x6, x15 119 adc x15 , x7, xzr 120 121 adds x21, x12, x20 122 adcs x20, x13, x23 123 adc x23, xzr, xzr 124 125 adds x14 , x14 , x22 126 adc x15 , x15 , xzr 127 128 adds x21, x21, x14 129 adcs x20, x20, x15 130 adc x23, x23, xzr // x23 += CF 131 132 subs x12 , x21, x9 133 sbcs x13, x20, x10 134 sbcs x23, x23, xzr // update CF 135 136 csel x10, x21, x12, lo 137 csel x11, x20, x13, lo 138 stp x10, x11, [x0] 139 b .LMul1xEnd 140 141.LMul1xBegin: 142 mov x24, x5 // the outermost pointers of our loop 143 lsl x5 , x5 , #3 144 sub x22, sp , x5 // The space size needs to be applied for. 145 and x22, x22, #-16 // For 4-byte alignment 146 mov sp , x22 // Apply for Space. 147 mov x6, x5 148 mov x23, xzr 149 150.LMul1xInitstack: 151 sub x6, x6, #8 152 str xzr, [x22], #8 153 cbnz x6, .LMul1xInitstack 154 mov x22, sp 155 156.LMul1xLoopProces: 157 sub x24, x24, #1 158 sub x21, x5 , #16 // j = size-2 159 160 // Begin mulx 161 ldr x17, [x2], #8 // b[i] 162 ldp x15, x16, [x1], #16 // a[0], a[1] 163 ldp x9, x10, [x3], #16 // n[0], n[1] 164 ldr x19, [x22] // The sp val is 0 during initialization. 
165 166 mul x14 , x15 , x17 // a[0] * b[i] 167 umulh x15 , x15 , x17 168 mul x6, x16 , x17 // a[1] * b[i] 169 umulh x7, x16 , x17 170 adds x14 , x14 , x19 171 adc x15 , x15 , xzr 172 173 mul x11, x14 , x4 174 umulh x9, x9, x11 175 mul x12, x10, x11 // n[1] * t[0]*k0 176 cmp x14, #1 177 umulh x13, x10, x11 178 179.LMul1xPrepare: 180 sub x21, x21, #8 // index -= 1 181 ldr x16, [x1], #8 // a[i] 182 ldr x10, [x3], #8 // n[i] 183 ldr x19, [x22, #8] // t[j] 184 185 adc x9, x9, xzr 186 adds x14 , x6, x15 187 adc x15 , x7, xzr 188 189 adds x8, x12, x9 190 adc x9, x13, xzr 191 192 mul x6, x16, x17 // a[j] * b[i] 193 adds x14, x14 , x19 194 umulh x7, x16 , x17 195 adc x15, x15 , xzr 196 197 mul x12, x10, x11 // n[j] * t[0]*k0 198 adds x8, x8, x14 199 umulh x13, x10, x11 200 str x8, [x22], #8 // t[j-1] 201 cbnz x21, .LMul1xPrepare 202 203.LMul1xReduce: 204 ldr x19, [x22, #8] 205 adc x9, x9, xzr 206 adds x14 , x6, x15 207 adc x15 , x7, xzr 208 209 adds x8, x12, x9 210 adcs x9, x13, x23 211 adc x23, xzr, xzr 212 213 adds x14 , x14 , x19 214 adc x15 , x15 , xzr 215 216 adds x8, x8, x14 217 adcs x9, x9, x15 218 adc x23, x23, xzr // x23 += CF, carry of the most significant bit. 
219 stp x8, x9, [x22], #8 220 221 mov x22, sp 222 sub x1 , x1 , x5 223 subs x3 , x3 , x5 // x3 = &n[0] 224 cbnz x24, .LMul1xLoopProces 225 226 mov x1, x0 227 mov x21, x5 // get index 228 229.LMul1xSubMod: 230 ldr x19, [x22], #8 231 ldr x10, [x3], #8 232 sub x21, x21, #8 // j-- 233 sbcs x16, x19, x10 // t[j] - n[j] 234 str x16, [x1], #8 // r[j] = t[j] - n[j] 235 cbnz x21,.LMul1xSubMod 236 237 sbcs x23, x23, xzr // x23 -= CF 238 mov x22, sp 239.LMul1xCopy: 240 ldr x19, [x22], #8 241 ldr x16, [x0] 242 sub x5, x5, #8 // size-- 243 csel x10, x19, x16, lo 244 str x10, [x0], #8 245 cbnz x5 , .LMul1xCopy 246 247.LMul1xEnd: 248 ldp x23, x24, [x29, #16] 249 mov sp , x29 250 ldp x21, x22, [x29, #32] 251 ldp x19, x20, [x29, #48] 252 ldr x29, [sp], #64 253AARCH64_AUTIASP 254 ret 255.size MontMul_Asm, .-MontMul_Asm 256 257.type MontSqr8, %function 258MontSqr8: 259AARCH64_PACIASP 260 cmp x1, x2 261 b.ne MontMul4 262 263 stp x29, x30, [sp, #-128]! // sp = sp - 128(Modify the SP and then save the SP.), [sp] = x29, [sp + 8] = x30, 264 // !Indicates modification sp 265 mov x29, sp // x29 = sp, The sp here has been reduced by 128. 266 267 stp x27, x28, [sp, #16] 268 stp x25, x26, [sp, #32] 269 stp x23, x24, [sp, #48] 270 stp x21, x22, [sp, #64] 271 stp x19, x20, [sp, #80] 272 stp x0 , x3 , [sp, #96] // offload r and n, Push the pointers of r and n into the stack. 273 str x4 , [sp, #112] // store n0 274 275 lsl x5, x5, #3 // x5 = x5 * 8, Converts size to bytes. 276 sub x2, sp, x5, lsl#1 // x2 = sp - 2*x5*8, x5 = size, x2 points to the start address of a 2*size memory. *8 is to convert to bytes 277 mov sp, x2 // Alloca, Apply for Space. 278 mov x19, x5 // The lowest eight data blocks do not need to be cleared. 279 eor v0.16b,v0.16b,v0.16b 280 eor v1.16b,v1.16b,v1.16b 281 282.LSqr8xStackInit: 283 sub x19, x19, #8*8 // Offset 64, cyclic increment. 
284 st1 {v0.2d, v1.2d}, [x2], #32 285 st1 {v0.2d, v1.2d}, [x2], #32 286 st1 {v0.2d, v1.2d}, [x2], #32 287 st1 {v0.2d, v1.2d}, [x2], #32 288 cbnz x19, .LSqr8xStackInit // When x19 = 0, the loop exits. 289 290 mov x2 , sp // After clear to zero, assign sp back to x2. 291 292 ldp x27, x28, [x2] 293 ldp x25, x26, [x2] 294 ldp x23, x24, [x2] 295 ldp x21, x22, [x2] 296 297 add x3 , x1 , x5 // x3 = x1 + bytes(size * 8) 298 299 ldp x14 , x15 , [x1], #16 // x14 = a[0], x15 = a[1] 300 ldp x16 , x17 , [x1], #16 // x16 = a[2], x17 = a[3] 301 ldp x6, x7, [x1], #16 // x6 = a[4], x7 = a[5] 302 ldp x8, x9, [x1], #16 // x8 = a[6], x9 = a[7] 303 304.LSqr8xLoopMul: 305 mul x10, x14, x15 // a[0] * a[1~4] 306 mul x11, x14, x16 // keep cache hit ratio of x6 307 mul x12, x14, x17 308 mul x13, x14, x6 309 adds x28, x28, x10 // x27~x22 = t[0~7], x28 = t[1] = lo(a[0]*a[1]), adds is used to set CF to 0. 310 adcs x25, x25, x11 // x10~x17 Used to save subsequent calculation results 311 312 mul x10, x14 , x7 // lo(a[0] * a[5~7]), keep cache hit ratio of x14, the same below 313 mul x11, x14 , x8 314 adcs x26, x26, x12 315 adcs x23, x23, x13 // t[4] = lo(a[0] * a[4]) 316 adcs x24, x24, x10 // x24~x22 = t[5~7] 317 mul x12, x14 , x9 // lo(a[0] * a[7]) 318 319 stp x27, x28, [x2], #8*2 // t[0] = a[0]^2, Because the square term is not calculated temporarily, 320 // so t[0] = 0, t[1] = a[0] * a[1] + carry 321 322 adcs x21, x21, x11 323 adcs x22, x22, x12 // t[7] += lo(a[0] * a[7]), Carrying has to be given t[8] 324 adc x27, xzr, xzr // x27 = CF ( Set by t[7] += lo(a[0] * a[7]) ), 325 umulh x13, x14 , x15 // hi(a[0] * a[1~4]), Use x17 to keep the cache hit 326 umulh x10, x14 , x16 327 umulh x11, x14 , x17 328 umulh x12, x14 , x6 329 330 // In the new round, the first calculation does not need to be carried, but the CF bit needs to be modified. 
331 adds x25, x25, x13 // t[2] += hi(a[0] * a[1]) 332 adcs x26, x26, x10 333 adcs x23, x23, x11 334 adcs x24, x24, x12 // t[5] += hi(a[0] * a[4]) 335 umulh x13, x14 , x7 // hi(a[0] * a[5~7]) 336 umulh x10, x14 , x8 337 umulh x11, x14 , x9 338 339 //----- lo(a[1] * a[2~4]) ------ 340 adcs x21, x21, x13 // t[6] += hi(a[0] * a[5]) 341 adcs x22, x22, x10 // t[7] += hi(a[0] * a[6]) 342 adc x27, x27, x11 // t[8] += hi(a[0] * a[7]) 343 mul x12, x15, x16 // lo(a[1] * a[2]) 344 mul x13, x15, x17 345 mul x10, x15, x6 346 347 //----- lo(a[1] * a[5~7]) ------ 348 adds x26, x26, x12 // t[3] += lo(a[1] * a[2]), The first calculation of this round 349 // does not take into account the previous carry, and the CF is not modified in line 118. 350 adcs x23, x23, x13 // t[4] += lo(a[1] * a[3]) 351 adcs x24, x24, x10 // t[5] += lo(a[1] * a[4]) 352 mul x11, x15 , x7 353 mul x12, x15 , x8 354 mul x13, x15 , x9 355 356 //----- hi(a[1] * a[2~5]) ------ 357 adcs x21, x21, x11 // t[6] += lo(a[1] * a[5]) 358 adcs x22, x22, x12 // t[7] += lo(a[1] * a[6]) 359 adcs x27, x27, x13 // t[8] += lo(a[1] * a[7]) 360 umulh x10, x15, x16 // hi(a[1] * a[2]) 361 umulh x11, x15, x17 362 umulh x12, x15, x6 363 umulh x13, x15, x7 364 365 stp x25, x26, [x2], #8*2 // t[2] and t[3] are calculated and stored in the memory. 366 // x25 and x22 are used to store t[10] and t[11]. 367 adc x28, xzr, xzr // t[9] = CF ( Set by t[8] += lo(a[1] * a[7]) ) 368 //In the new round, the first calculation does not need to be carried, but the CF bit needs to be modified. 
369 //----- hi(a[1] * a[6~7]) ------ 370 adds x23, x23, x10 // t[4] += hi(a[1] * a[2]) 371 adcs x24, x24, x11 // t[5] += hi(a[1] * a[3]) 372 adcs x21, x21, x12 // t[6] += hi(a[1] * a[4]) 373 umulh x10, x15 , x8 // hi(a[1] * a[6]) 374 umulh x11, x15 , x9 // hi(a[1] * a[7]) 375 376 //----- lo(a[2] * a[3~7]) ------ 377 adcs x22, x22, x13 // t[7] += hi(a[1] * a[5]) 378 adcs x27, x27, x10 // t[8] += hi(a[1] * a[6]) 379 adc x28, x28, x11 // t[9] += hi(a[1] * a[7]), Here, only the carry of the previous round 380 mul x12, x16, x17 // lo(a[2] * a[3]) 381 mul x13, x16, x6 382 mul x10, x16, x7 383 // of calculation is retained before x20 calculation. Add x15 to the carry. 384 mul x11, x16 , x8 385 adds x24, x24, x12 // t[5] += lo(a[2] * a[3]), For the first calculation of this round, 386 // the previous carry is not considered. 387 mul x12, x16 , x9 388 adcs x21, x21, x13 // t[6] += lo(a[2] * a[4]) 389 390 //----- hi(a[2] * a[3~7]) ------ 391 adcs x22, x22, x10 // t[7] += lo(a[2] * a[5]) 392 umulh x13, x16, x17 // hi(a[2] * a[3]) 393 umulh x10, x16, x6 394 395 adcs x27, x27, x11 // t[8] += lo(a[2] * a[6]) 396 adcs x28, x28, x12 // t[9] += lo(a[2] * a[7]) 397 umulh x11, x16, x7 398 umulh x12, x16, x8 399 400 stp x23, x24, [x2], #8*2 // After t[4] and t[5] are calculated, they are stored in the memory. 401 // x23 and x24 are used to store t[12] and t[13]. 402 adc x25, xzr, xzr // t[10] = CF ( Set by t[9] += lo(a[2] * a[7]) ) 403 // In the new round, the first calculation does not need to be carried, but the CF bit needs to be modified. 
404 adds x21, x21, x13 // t[6] += hi(a[2] * a[3]) 405 adcs x22, x22, x10 // t[7] += hi(a[2] * a[4]) 406 umulh x13, x16, x9 407 408 //----- lo(a[3] * a[4~7]) ------ 409 adcs x27, x27, x11 // t[8] += hi(a[2] * a[5]) 410 adcs x28, x28, x12 // t[9] += hi(a[2] * a[6]) 411 adc x25, x25, x13 // t[10] += hi(a[2] * a[7]) 412 mul x10, x17, x6 413 mul x11, x17, x7 414 mul x12, x17, x8 415 mul x13, x17, x9 416 417 //----- hi(a[3] * a[4~7]) ------ 418 adds x22, x22, x10 // t[7] += lo(a[3] * a[4]) 419 adcs x27, x27, x11 // t[8] += lo(a[3] * a[5]) 420 adcs x28, x28, x12 // t[9] += lo(a[3] * a[6]) 421 adcs x25, x25, x13 // t[10] += lo(a[3] * a[7]) 422 umulh x10, x17, x6 423 umulh x11, x17, x7 424 umulh x12, x17, x8 425 umulh x13, x17, x9 426 427 stp x21, x22, [x2], #8*2 // t[6] and t[7] are calculated and stored in the memory. 428 // x21 and x26 are used to store t[14] and t[15]. 429 adc x26, xzr, xzr // t[11] = CF ( Set by t[10] += lo(a[3] * a[7]) ) 430 // In the new round, the first calculation does not need to be carried, but the CF bit needs to be modified. 431 adds x27, x27, x10 // t[8] += hi(a[3] * a[4]) 432 433 //----- lo(a[4] * a[5~7]) ------ 434 adcs x28, x28, x11 // t[9] += hi(a[3] * a[5]) 435 adcs x25, x25, x12 // t[10] += hi(a[3] * a[6]) 436 adc x26, x26, x13 // t[11] += hi(a[3] * a[7]) 437 mul x10, x6, x7 438 mul x11, x6, x8 439 mul x12, x6, x9 440 441 //----- hi(a[4] * a[5~7]) ------ 442 adds x28, x28, x10 // t[9] += lo(a[4] * a[5]) 443 adcs x25, x25, x11 // t[10] += lo(a[4] * a[6]) 444 adcs x26, x26, x12 // t[11] += lo(a[4] * a[7]) 445 umulh x13, x6, x7 446 umulh x10, x6, x8 447 umulh x11, x6, x9 448 449 //----- lo(a[5] * a[6~7]) ------ 450 mul x12, x7, x8 451 // This is actually a new round, but only t[0-7] can be calculated in each cycle, 452 // and t[8-15] retains the intermediate calculation result. 
453 adc x23, xzr, xzr // t[12] = CF( Set by t[11] += lo(a[4] * a[7]) ) 454 adds x25, x25, x13 // t[10] += hi(a[4] * a[5]) 455 adcs x26, x26, x10 // t[11] += hi(a[4] * a[6]) 456 mul x13, x7, x9 457 458 //----- hi(a[5] * a[6~7]) ------ 459 adc x23, x23, x11 // t[12] += hi(a[4] * a[7]) 460 umulh x10, x7, x8 461 umulh x11, x7, x9 462 463 adds x26, x26, x12 // t[11] += lo(a[5] * a[6]) 464 //----- lo(a[6] * a[7]) ------ 465 adcs x23, x23, x13 // t[12] += lo(a[5] * a[7]) 466 mul x12, x8, x9 467 468 //----- hi(a[6] * a[7]) ------ 469 adc x24, xzr, xzr // t[13] = CF ( Set by t[12] += lo(a[5] * a[7]) ), 470 umulh x13, x8, x9 471 // This operation is required when a new umulh is added. 472 adds x23, x23, x10 // t[12] += hi(a[5] * a[6]) 473 adc x24, x24, x11 // t[13] += hi(a[5] * a[7]) 474 sub x19, x3, x1 // x3 = &a[size], x1 = &a[8], x19 = (size - 8) * 8 475 476 adds x24, x24, x12 // t[13] += lo(a[6] * a[7]) 477 adc x21, xzr, xzr // t[14] = CF ( set by t[13] += lo(a[6] * a[7]) ) 478 add x21, x21, x13 // t[14] += hi(a[6] * a[7]), There must be no carry in the last step. 479 480 cbz x19, .LSqr8xLoopMulEnd 481 482 mov x0, x1 // x0 = &a[8] 483 mov x22, xzr 484 485//######################################## 486//# a[0~7] * a[8~15] # 487//######################################## 488.LSqr8xHighMulBegian: 489 mov x19, #-8*8 // Loop range. x0 can retrieve a[0–7] based on this offset. 
490 ldp x14 , x15 , [x2, #8*0] // x14 = t[8] , x15 = t[9] 491 adds x27, x27, x14 // t[8](t[8] reserved in the previous round of calculation) + = t[8] 492 // (t[8] taken from memory, initially 0) 493 adcs x28, x28, x15 // t[9] += t[9], be the same as the above 494 ldp x16 , x17 , [x2, #8*2] // x16 = t[10], x17 = t[11] 495 ldp x14 , x15 , [x1], #16 // x14 = a[8], x15 = a[9] 496 adcs x25, x25, x16 497 adcs x26, x26, x17 498 499 ldp x6, x7, [x2, #8*4] // x6 = t[12], x7 = t[13] 500 ldp x16 , x17 , [x1], #16 // x16 = a[10], x17 = a[11] 501 adcs x23, x23, x6 502 adcs x24, x24, x7 503 504 ldp x8, x9, [x2, #8*6] // x8 = t[14], x9 = t[15] 505 ldp x6, x7, [x1], #16 // x6 = a[12], x7 = a[13] 506 adcs x21, x21, x8 507 adcs x22, x22, x9 // t[15] = t[15] + CF, Because a[7]*a[7] is not calculated previously, t[15]=0 508 ldp x8, x9, [x1], #16 // x8 = a[14], x9 = a[15] 509 510.LSqr8xHighMulProces: 511 ldr x4 , [x0, x19] // x4 = [x0 + x19] = [x0 - 56] = [&a[8] - 56] = a[8 - 7] = a[1] 512 //-----lo(a[0] * a[8~11])----- 513 adc x20, xzr, xzr // x20 += CF, Save the carry of t[15]. The same operation is performed below. 
514 add x19, x19, #8 // x19 += 8, Loop step size 515 mul x10, x4 , x14 // x4 = a[0], x14 = a[8], x10 = lo(a[0] * a[8]) 516 mul x11, x4 , x15 // x11 = lo(a[0] * a[9]) 517 mul x12, x4 , x16 // x12 = lo(a[0] * a[10]) 518 mul x13, x4 , x17 // x13 = lo(a[0] * a[11]) 519 520 //-----lo(a[0] * a[12~15])----- 521 adds x27, x27, x10 // CF does not need to be added for the first calculation, 522 // t[8] += lo(a[0] * a[8]) 523 adcs x28, x28, x11 // t[9] += lo(a[0] * a[9]) 524 adcs x25, x25, x12 // t[10] += lo(a[0] * a[10]) 525 adcs x26, x26, x13 // t[11] += lo(a[0] * a[11]) 526 mul x10, x4 , x6 527 mul x11, x4 , x7 528 mul x12, x4 , x8 529 mul x13, x4 , x9 530 531 //-----hi(a[0] * a[8~11])----- 532 adcs x23, x23, x10 // t[12] += lo(a[0] * a[12]) 533 adcs x24, x24, x11 // t[13] += lo(a[0] * a[13]) 534 adcs x21, x21, x12 // t[14] += lo(a[0] * a[14]) 535 adcs x22, x22, x13 // t[15] += lo(a[0] * a[15]) 536 umulh x10, x4 , x14 537 umulh x11, x4 , x15 538 umulh x12, x4 , x16 539 umulh x13, x4 , x17 540 541 adc x20, x20, xzr // x20 += CF, Save the carry of t[15] 542 str x27, [x2], #8 // [x2] = t[8], x2 += 8, x27~x22 = t[9~16], 543 // Update the mapping relationship to facilitate cycling. 
544 // x27~x26 always correspond to t[m~m+7], and x19 is always the LSB of the window 545 546 //-----hi(a[0] * a[12~15])----- 547 adds x27, x28, x10 // t[9] += hi(a[0] * a[8]), The last calculation was to calculate t[15], 548 // so carry cannot be added to t[9], so adds is used 549 adcs x28, x25, x11 // t[10] += hi(a[0] * a[9]) 550 adcs x25, x26, x12 // t[11] += hi(a[0] * a[10]) 551 adcs x26, x23, x13 // t[12] += hi(a[0] * a[11]) 552 umulh x10, x4 , x6 553 umulh x11, x4 , x7 554 umulh x12, x4 , x8 555 umulh x13, x4 , x9 // x13 = hi(a[0] * a[15]) 556 557 adcs x23, x24, x10 // t[13] += hi(a[0] * a[12]) 558 adcs x24, x21, x11 // t[14] += hi(a[0] * a[13]) 559 adcs x21, x22, x12 // t[15] += hi(a[0] * a[14]) 560 adcs x22, x20, x13 // t[16] = hi(a[0] * a[15]) + CF 561 562 cbnz x19, .LSqr8xHighMulProces // When exiting the loop, x0 = &a[8], x2 = &t[16] 563 564 sub x16, x1, x3 // x3 = x1 + x5 * 8(Converted to bytes), When x1 = x3, the loop ends. 565 cbnz x16, .LSqr8xHighMulBegian // x0 is the outer loop, x1 is the inner loop, and the inner loop ends. 566 // In this case, x2 = &a[size], out-of-bounds position. 
567 568 mov x1, x0 // Outer Loop Increment, x1 = &a[16] 569 ldp x14 , x15 , [x1], #16 // x14 = a[8] , x15 = a[9] 570 ldp x16 , x17 , [x1], #16 // x16 = a[10], x17 = a[11] 571 ldp x6, x7, [x1], #16 // x6 = a[12], x7 = a[13] 572 ldp x8, x9, [x1], #16 // x8 = a[14], x9 = a[15] 573 574 sub x10, x3 , x1 // Check whether the outer loop ends, x3 = &a[size], x10 = (size - 16)*8 575 cbz x10, .LSqr8xLoopMul 576 577 sub x11, x2 , x10 // x2 = &t[24], x11 = &t[16] 578 stp x27, x28, [x2 , #8*0] // t[24] = x27, t[25] = x28 579 ldp x27, x28, [x11, #8*0] // x27 = t[16], x28 = t[17] 580 581 stp x25, x26, [x2 , #8*2] // t[26] = x25, t[27] = x26 582 ldp x25, x26, [x11, #8*2] // x25 = t[18], x26 = t[19] 583 584 stp x23, x24, [x2 , #8*4] // t[28] = x23, t[29] = x24 585 ldp x23, x24, [x11, #8*4] // x23 = t[20], x24 = t[21] 586 587 stp x21, x22, [x2 , #8*6] // t[30] = x21, t[31] = x22 588 ldp x21, x22, [x11, #8*6] // x21 = t[22], x22 = t[23] 589 590 mov x2 , x11 // x2 = &t[16] 591 b .LSqr8xLoopMul 592 593.align 4 594.LSqr8xLoopMulEnd: 595 //===== Calculate the squared term ===== 596 //----- sp = &t[0] , x2 = &t[24]----- 597 sub x10, x3, x5 // x10 = a[0] 598 stp x27, x28, [x2, #8*0] // t[24] = x27, t[25] = x28 599 stp x25, x26, [x2, #8*2] // When this step is performed, the calculation results reserved for x27–x26 600 // are not pushed to the stack. 
601 stp x23, x24, [x2, #8*4] 602 stp x21, x22, [x2, #8*6] 603 ldp x11, x12, [sp, #8*1] // x11 = t[1], x12 = t[2] 604 ldp x15, x17, [x10], #16 // x15 = a[0], x17 = a[1] 605 ldp x7, x9, [x10], #16 // x7 = a[2], x9 = a[3] 606 mov x1, x10 607 ldp x13, x10, [sp, #8*3] // x13 = t[3], x10 = t[4] 608 609 mul x27, x15, x15 // x27 = lo(a[0] * a[0]) 610 umulh x15, x15, x15 // x15 = hi(a[0] * a[0]) 611 mov x2 , sp // x2 = sp = &t[0] 612 mul x16, x17, x17 // x16 = lo(a[1] * a[1]) 613 adds x28, x15, x11, lsl#1 // x28 = x15 + (x11 * 2) = hi(a[0] * a[0]) + 2 * t[1] 614 umulh x17, x17, x17 // x17 = hi(a[1] * a[1]) 615 616 extr x11, x12, x11, #63 // Lower 63 bits of x11 = x16 | most significant bit of x15 617 // Cyclic right shift by 63 bits to obtain the lower bit, 618 // which is equivalent to cyclic left shift by 1 bit to obtain the upper bit. 619 // The purpose is to *2. 620 // x11 = 2*t[2](Ignore the overflowed part) + carry of (2*t[1]) 621 mov x19, x5 // x19 = size*8 622 623.LSqr8xDealSquare: 624 adcs x25, x16 , x11 // x25 = lo(a[1] * a[1]) + 2*t[2] 625 extr x12, x13, x12, #63 // x12 = 2*t[3](Ignore the overflowed part) + carry of (2*t[2]) 626 adcs x26, x17 , x12 // x26 = hi(a[1] * a[1]) + 2*t[3] 627 628 sub x19, x19, #8*4 // x19 = (size - 8)*8 629 stp x27, x28, [x2], #16 // t[0~3]Re-push stack 630 stp x25, x26, [x2], #16 631 632 mul x6, x7, x7 // x6 = lo(a[2] * a[2]) 633 umulh x7, x7, x7 // x7 = hi(a[2] * a[2]) 634 mul x8, x9, x9 // x6 = lo(a[3] * a[3]) 635 umulh x9, x9, x9 // x7 = hi(a[3] * a[3]) 636 ldp x11, x12, [x2, #8] // x11 = t[5], x12 = t[6] 637 638 extr x13, x10, x13, #63 // x13 = 2*t[4](Ignore the overflowed part) + carry of(2*t[3]) 639 extr x10, x11, x10, #63 // x10 = 2*t[5](Ignore the overflowed part) + carry of(2*t[4]) 640 adcs x23, x6, x13 // x23 = lo(a[2] * a[2]) + 2*t[4] 641 adcs x24, x7, x10 // x24 = hi(a[2] * a[2]) + 2*t[5] 642 643 cbz x19, .LSqr8xReduceStart 644 645 ldp x13, x10, [x2, #24] // x13 = t[7], x10 = t[8] 646 extr x11, x12, x11, #63 // x11 = 
2*t[6](Ignore the overflowed part) + carry of(2*t[5]) 647 extr x12, x13, x12, #63 // x12 = 2*t[7](Ignore the overflowed part) + carry of(2*t[6]) 648 adcs x21, x8, x11 // x21 = lo(a[3] * a[3]) + 2*t[6] 649 adcs x22, x9, x12 // x22 = hi(a[3] * a[3]) + 2*t[7] 650 stp x23, x24, [x2], #16 // t[4~7]re-push stack 651 stp x21, x22, [x2], #16 652 ldp x15, x17, [x1], #8*2 // x15 = a[4], x17 = a[5], x1 += 16 = &a[6] 653 ldp x11, x12, [x2, #8] // x11 = t[9], x12 = t[10] 654 655 mul x14 , x15 , x15 // x14 = lo(a[4] * a[4]) 656 umulh x15 , x15 , x15 // x15 = hi(a[4] * a[4]) 657 mul x16 , x17 , x17 // x16 = lo(a[5] * a[5]) 658 umulh x17 , x17 , x17 // x17 = hi(a[5] * a[5]) 659 660 extr x13, x10, x13, #63 // x13 = 2*t[8](Ignore the overflowed part) + carry of(2*t[7]) 661 adcs x27, x14 , x13 // x27 = lo(a[4] * a[4]) + 2*t[8] 662 extr x10, x11, x10, #63 // x10 = 2*t[9](Ignore the overflowed part) + carry of(2*t[8]) 663 adcs x28, x15 , x10 // x28 = hi(a[4] * a[4]) + 2*t[9] 664 665 extr x11, x12, x11, #63 // x11 = 2*t[10](Ignore the overflowed part) + carry of(2*t[9]) 666 667 ldp x13, x10, [x2, #8*3] // Line 438 has obtained t[9] and t[10], x13 = &t[11], x10 = &t[12] 668 ldp x7, x9, [x1], #8*2 // x7 = a[6], x9 = a[7], x1 += 16 = &a[8] 669 670 b .LSqr8xDealSquare 671 672.LSqr8xReduceStart: 673 extr x11, x12, x11, #63 // x11 = 2*t[2*size-2](Ignore the overflowed part) + carry of (2*t[2*size-3]) 674 adcs x21, x8, x11 // x21 = lo(a[size-1] * a[size-1]) + 2*t[2*size-2] 675 extr x12, xzr, x12, #63 // x12 = 2*t[2*size-1](Ignore the overflowed part) + carry of (2*t[2*size-2]) 676 adc x22, x9, x12 // x22 = hi(a[size-1] * a[size-1]) + 2*t[2*size-1] 677 678 ldp x1, x4, [x29, #104] // Pop n and k0 out of the stack, x1 = &n[0], x4 = k0 679 stp x23, x24, [x2] // t[2*size-4 ~ 2*size-1]re-push stack 680 stp x21, x22, [x2,#8*2] 681 682 cmp x5, #64 // if size == 8, we can goto Single step reduce 683 b.ne .LSqr8xReduceLoop 684 685 ldp x14 , x15 , [x1], #16 // x14~x9 = n[0~7] 686 ldp x16 , x17 , [x1], 
#16 687 ldp x6, x7, [x1], #16 688 ldp x8, x9, [x1], #16 689 690 ldp x27, x28, [sp] // x14~x9 = t[0~7] 691 ldp x25, x26, [sp,#8*2] 692 ldp x23, x24, [sp,#8*4] 693 ldp x21, x22, [sp,#8*6] 694 mov x19, #8 695 mov x2 , sp 696 697 // if size == 8, goto single reduce step 698.LSqr8xSingleReduce: 699 mul x20, x4, x27 700 sub x19, x19, #1 701 //----- lo(n[1~7] * lo(t[0]*k0)) ----- 702 mul x11, x15 , x20 703 mul x12, x16 , x20 704 mul x13, x17 , x20 705 mul x10, x6, x20 706 707 cmp x27, #1 708 adcs x27, x28, x11 709 adcs x28, x25, x12 710 adcs x25, x26, x13 711 adcs x26, x23, x10 712 mul x11, x7, x20 713 mul x12, x8, x20 714 mul x13, x9, x20 715 716 //----- hi(n[0~7] * lo(t[0]*k0)) ----- 717 adcs x23, x24, x11 718 adcs x24, x21, x12 719 adcs x21, x22, x13 720 adc x22, xzr, xzr // x22 += CF 721 umulh x10, x14 , x20 722 umulh x11, x15 , x20 723 umulh x12, x16 , x20 724 umulh x13, x17 , x20 725 726 adds x27, x27, x10 727 adcs x28, x28, x11 728 adcs x25, x25, x12 729 adcs x26, x26, x13 730 umulh x10, x6, x20 731 umulh x11, x7, x20 732 umulh x12, x8, x20 733 umulh x13, x9, x20 734 735 adcs x23, x23, x10 736 adcs x24, x24, x11 737 adcs x21, x21, x12 738 adc x22, x22, x13 739 cbnz x19, .LSqr8xSingleReduce // Need cycle 8 times 740 741 ldp x10, x11, [x2, #64] // x10 = t[8], x11 = t[9] 742 ldp x12, x13, [x2, #80] 743 adds x27, x27, x10 744 adcs x28, x28, x11 745 ldp x10, x11, [x2, #96] 746 adcs x25, x25, x12 747 adcs x26, x26, x13 748 adcs x23, x23, x10 749 ldp x12, x13, [x2, #112] 750 751 adcs x24, x24, x11 752 adcs x21, x21, x12 753 adcs x22, x22, x13 754 adc x20, xzr, xzr 755 ldr x0, [x29, #96] // r Pop-Stack 756 757 // t - mod 758 subs x14, x27, x14 759 sbcs x15, x28, x15 760 sbcs x16, x25, x16 761 sbcs x17, x26, x17 762 sbcs x6, x23, x6 763 sbcs x7, x24, x7 764 sbcs x8, x21, x8 765 sbcs x9, x22, x9 766 sbcs x20, x20, xzr // determine whether there is a borrowing 767 768 // according to CF choose our result 769 csel x14 , x27, x14 , lo 770 csel x15 , x28, x15 , lo 771 csel x16 , 
x25, x16 , lo 772 csel x17 , x26, x17 , lo 773 stp x14 , x15 , [x0, #8*0] 774 csel x6, x23, x6, lo 775 csel x7, x24, x7, lo 776 stp x16 , x17 , [x0, #8*2] 777 csel x8, x21, x8, lo 778 csel x9, x22, x9, lo 779 stp x6, x7, [x0, #8*4] 780 stp x8, x9, [x0, #8*6] 781 b .LMontSqr8xEnd 782 783.LSqr8xReduceLoop: 784 add x3, x1, x5 // x3 = &n[size] 785 mov x30, xzr 786 ldp x14 , x15 , [x1], #16 // x14~x9 = n[0~7] 787 ldp x16 , x17 , [x1], #16 788 ldp x6, x7, [x1], #16 789 ldp x8, x9, [x1], #16 790 791 ldp x27, x28, [sp] // x27 = t[0], x28 = t[1] 792 ldp x25, x26, [sp,#8*2] // x25~x22 = t[2~7] 793 ldp x23, x24, [sp,#8*4] 794 ldp x21, x22, [sp,#8*6] 795 mov x19, #8 796 mov x2 , sp 797 798.LSqr8xReduceProcess: 799 mul x20, x4, x27 // x20 = lo(k0 * t[0]) 800 sub x19, x19, #1 801 //----- lo(n[1~7] * lo(t[0]*k0)) ----- 802 mul x11, x15, x20 // x11 = n[1] * lo(t[0]*k0) 803 mul x12, x16, x20 // x12 = n[2] * lo(t[0]*k0) 804 mul x13, x17, x20 // x13 = n[3] * lo(t[0]*k0) 805 mul x10, x6, x20 // x10 = n[4] * lo(t[0]*k0) 806 807 str x20, [x2], #8 // Push lo(t[0]*k0) on the stack., x2 += 8 808 cmp x27, #1 // Check whether the low location is carried. 
809 810 adcs x27, x28, x11 // x27 = t[1] + lo(n[1] * lo(t[0]*k0)) 811 adcs x28, x25, x12 // x28 = t[2] + lo(n[2] * lo(t[0]*k0)) 812 adcs x25, x26, x13 // x25 = t[3] + lo(n[3] * lo(t[0]*k0)) 813 adcs x26, x23, x10 // x26 = t[4] + lo(n[4] * lo(t[0]*k0)) 814 mul x11, x7, x20 815 mul x12, x8, x20 816 mul x13, x9, x20 817 818 //----- hi(n[0~7] * lo(t[0]*k0)) ----- 819 adcs x23, x24, x11 // x23 = t[5] + lo(n[5] * lo(t[0]*k0)) 820 adcs x24, x21, x12 // x24 = t[6] + lo(n[6] * lo(t[0]*k0)) 821 adcs x21, x22, x13 // x21 = t[7] + lo(n[7] * lo(t[0]*k0)) 822 adc x22, xzr, xzr // x22 += CF 823 umulh x10, x14 , x20 824 umulh x11, x15 , x20 825 umulh x12, x16 , x20 826 umulh x13, x17 , x20 827 828 adds x27, x27, x10 // x27 += hi(n[0] * lo(t[0]*k0)) 829 adcs x28, x28, x11 // x28 += hi(n[1] * lo(t[0]*k0)) 830 adcs x25, x25, x12 // x25 += hi(n[2] * lo(t[0]*k0)) 831 adcs x26, x26, x13 // x26 += hi(n[3] * lo(t[0]*k0)) 832 umulh x10, x6, x20 833 umulh x11, x7, x20 834 umulh x12, x8, x20 835 umulh x13, x9, x20 836 837 adcs x23, x23, x10 // x23 += hi(n[4] * lo(t[0]*k0)) 838 adcs x24, x24, x11 // x24 += hi(n[5] * lo(t[0]*k0)) 839 adcs x21, x21, x12 // x21 += hi(n[6] * lo(t[0]*k0)) 840 adc x22, x22, x13 // x22 += hi(n[7] * lo(t[0]*k0)) 841 cbnz x19, .LSqr8xReduceProcess // Cycle 8 times, and at the end of the cycle, x2 += 8*8 842 843 ldp x10, x11, [x2, #8*0] // x10 = t[8], x11 = t[9] 844 ldp x12, x13, [x2, #8*2] 845 mov x0, x2 846 847 adds x27, x27, x10 848 adcs x28, x28, x11 849 ldp x10, x11, [x2,#8*4] 850 adcs x25, x25, x12 851 adcs x26, x26, x13 852 ldp x12, x13, [x2,#8*6] 853 adcs x23, x23, x10 854 adcs x24, x24, x11 855 adcs x21, x21, x12 856 adcs x22, x22, x13 857 858 ldr x4 , [x2, #-8*8] // x4 = t[0] 859 ldp x14 , x15 , [x1], #16 // x14~x9 = &n[8]~&n[15] 860 ldp x16 , x17 , [x1], #16 861 ldp x6, x7, [x1], #16 862 ldp x8, x9, [x1], #16 863 mov x19, #-8*8 864 865.LSqr8xReduce: 866 adc x20, xzr, xzr // x20 = CF 867 add x19, x19, #8 868 869 mul x10, x14 , x4 870 mul x11, x15 , x4 871 mul 
x12, x16 , x4 872 mul x13, x17 , x4 873 874 adds x27, x27, x10 875 adcs x28, x28, x11 876 adcs x25, x25, x12 877 adcs x26, x26, x13 878 mul x10, x6, x4 879 mul x11, x7, x4 880 mul x12, x8, x4 881 mul x13, x9, x4 882 883 adcs x23, x23, x10 884 adcs x24, x24, x11 885 adcs x21, x21, x12 886 adcs x22, x22, x13 887 umulh x10, x14 , x4 888 umulh x11, x15 , x4 889 umulh x12, x16 , x4 890 umulh x13, x17 , x4 891 892 adc x20, x20, xzr 893 894 str x27, [x2], #8 // x27 = t[8], x2 += 8 895 896 adds x27, x28, x10 // x27 = t[1] + lo(n[1] * lo(t[0]*k0)) 897 adcs x28, x25, x11 // x28 = t[2] + lo(n[2] * lo(t[0]*k0)) 898 adcs x25, x26, x12 // x25 = t[3] + lo(n[3] * lo(t[0]*k0)) 899 adcs x26, x23, x13 // x26 = t[4] + lo(n[4] * lo(t[0]*k0)) 900 umulh x10, x6, x4 901 umulh x11, x7, x4 902 umulh x12, x8, x4 903 umulh x13, x9, x4 904 905 // x0 = &t[8] 906 ldr x4 , [x0, x19] 907 908 adcs x23, x24, x10 909 adcs x24, x21, x11 910 adcs x21, x22, x12 911 adcs x22, x20, x13 912 913 cbnz x19, .LSqr8xReduce 914 915 ldp x14 , x15 , [x2, #8*0] 916 ldp x16 , x17 , [x2, #8*2] 917 sub x19, x3, x1 // x19 = (size-16)*8 918 919 ldp x6, x7, [x2, #8*4] 920 ldp x8, x9, [x2, #8*6] 921 cbz x19, .LSqr8xReduceBreak 922 923 ldr x4 , [x0, #-8*8] 924 925 adds x27, x27, x14 926 adcs x28, x28, x15 927 adcs x25, x25, x16 928 adcs x26, x26, x17 929 adcs x23, x23, x6 930 adcs x24, x24, x7 931 adcs x21, x21, x8 932 adcs x22, x22, x9 933 934 ldp x14 , x15 , [x1], #16 935 ldp x16 , x17 , [x1], #16 936 ldp x6, x7, [x1], #16 937 ldp x8, x9, [x1], #16 938 939 mov x19, #-8*8 940 b .LSqr8xReduce 941 942.align 4 943.LSqr8xReduceBreak: 944 sub x12, x3, x5 // x12 = n, reassign to n 945 ldr x4 , [x29, #112] // k0 pop-stack 946 947 cmp x30, #1 // Check whether the low location is carried. 
948 949 adcs x10, x27, x14 950 adcs x11, x28, x15 951 stp x10, x11, [x2] , #16 952 953 ldp x27 ,x28, [x0 , #8*0] 954 ldp x14 , x15, [x12], #16 // x12 = &n[0] (Line 638 assigns a value) 955 956 adcs x25, x25, x16 957 adcs x26, x26, x17 958 adcs x23, x23, x6 959 adcs x24, x24, x7 960 adcs x21, x21, x8 961 adcs x22, x22, x9 962 adc x30, xzr, xzr 963 964 ldp x16, x17, [x12], #16 965 ldp x6, x7, [x12], #16 966 ldp x8, x9, [x12], #16 967 968 stp x25, x26, [x2], #16 969 ldp x25, x26, [x0, #8*2] 970 971 stp x23, x24, [x2], #16 972 ldp x23, x24, [x0, #8*4] 973 974 stp x21, x22, [x2], #16 975 ldp x21, x22, [x0, #8*6] 976 977 sub x20, x2, x29 // Check whether the loop ends 978 mov x1, x12 979 mov x2, x0 // sliding window 980 mov x19, #8 981 cbnz x20, .LSqr8xReduceProcess 982 983 // Final step 984 ldr x0 , [x29, #96] // r Pop-Stack 985 add x2 , x2 , #8*8 986 subs x10, x27, x14 987 sbcs x11, x28, x15 988 sub x19, x5 , #8*8 989 mov x3 , x0 // backup x0 990 991.LSqr8xSubMod: 992 ldp x14 , x15, [x1], #16 993 sbcs x12, x25, x16 994 sbcs x13, x26, x17 995 ldp x16 , x17 , [x1], #16 996 stp x10, x11, [x0], #16 997 stp x12, x13, [x0], #16 998 999 sbcs x10, x23, x6 1000 sbcs x11, x24, x7 1001 ldp x6, x7, [x1], #16 1002 1003 sbcs x12, x21, x8 1004 sbcs x13, x22, x9 1005 ldp x8, x9, [x1], #16 1006 stp x10, x11, [x0], #16 1007 stp x12, x13, [x0], #16 1008 1009 ldp x27, x28, [x2], #16 1010 ldp x25, x26, [x2], #16 1011 ldp x23, x24, [x2], #16 1012 ldp x21, x22, [x2], #16 1013 1014 sub x19, x19, #8*8 1015 sbcs x10, x27, x14 1016 sbcs x11, x28, x15 1017 1018 cbnz x19, .LSqr8xSubMod 1019 1020 mov x2 , sp 1021 add x1 , sp , x5 1022 1023 sbcs x12, x25, x16 1024 sbcs x13, x26, x17 1025 stp x12, x13, [x0, #8*2] 1026 stp x10, x11, [x0, #8*0] 1027 sbcs x10, x23, x6 1028 sbcs x11, x24, x7 1029 stp x10, x11, [x0, #8*4] 1030 sbcs x12, x21, x8 1031 sbcs x13, x22, x9 1032 stp x12, x13, [x0, #8*6] 1033 sbcs xzr, x30, xzr // Determine whether there is a borrowing 1034 1035.LSqr8xCopy: 1036 ldp x14, x15, 
[x3, #8*0]                                  // x14~x17 = r[0~3], the t - n candidate written above
    ldp x16, x17, [x3, #8*2]
    ldp x6, x7, [x3, #8*4]                  // x6~x9 = r[4~7]
    ldp x8, x9, [x3, #8*6]

    ldp x27, x28, [x1], #16                 // x27,x28,x25,x26,x23,x24,x21,x22 = t[0~7] (unreduced)
    ldp x25, x26, [x1], #16
    ldp x23, x24, [x1], #16
    ldp x21, x22, [x1], #16

    sub x5, x5, #8*8                        // 8 words copied per iteration

    csel x10, x27, x14, lo                  // conditional select: x10 = (cond == lo) ? x27 : x14,
                                            // i.e. keep t when the final subtraction borrowed
                                            // (carry clear), otherwise keep t - n
    csel x11, x28, x15, lo
    csel x12, x25, x16 , lo
    csel x13, x26, x17, lo
    csel x14, x23, x6, lo
    csel x15, x24, x7, lo
    csel x16, x21, x8, lo
    csel x17, x22, x9, lo

    stp x10, x11, [x3], #16                 // write the reduced result back to r
    stp x12, x13, [x3], #16
    stp x14, x15, [x3], #16
    stp x16, x17, [x3], #16
    cbnz x5, .LSqr8xCopy

.LMontSqr8xEnd:
    ldr x30, [x29, #8]                      // restore link register
    ldp x27, x28, [x29, #16]                // restore callee-saved registers
    mov sp , x29
    ldp x25, x26, [x29, #32]
    mov x0 , #1                             // return value x0 = 1
    ldp x23, x24, [x29, #48]
    ldp x21, x22, [x29, #64]
    ldp x19, x20, [x29, #80]                // ldp restores two 8-byte words at a time:
                                            // x19 = [x29 + 80], x20 = [x29 + 88]
    ldr x29, [sp], #128                     // x29 = [sp], sp += 128 (drop the 128-byte frame)
AARCH64_AUTIASP
    ret
.size MontSqr8, .-MontSqr8


// MontMul4: Montgomery multiplication path taken when size % 4 == 0
// (and the size % 8 == 0 squaring path did not apply).
// Register assignment on entry (unchanged since the MontMul_Asm dispatch):
//   x0 = r (result), x1 = a, x2 = b, x3 = n (modulus),
//   x4 = k0 (n' mod 2^64), x5 = size in 64-bit words.
.type MontMul4, %function
MontMul4:
AARCH64_PACIASP
    stp x29, x30, [sp, #-128]!              // allocate 128-byte frame, save FP/LR
    mov x29, sp                             // establish frame pointer
    stp x27, x28, [sp, #16]                 // save callee-saved registers
    stp x25, x26, [sp, #32]
    stp x23, x24, [sp, #48]
    stp x21, x22, [sp, #64]
    stp x19, x20, [sp, #80]
    mov x27, xzr                            // x27,x28,x25,x26 = t[0~3] accumulator, cleared
    mov x28, xzr
    mov x25, xzr
    mov x26, xzr
    mov x30, xzr                            // x30 = running top-word carry

    lsl x5 , x5 , #3                        // x5 = size * 8 (bytes)
    sub x22, sp , x5
    sub sp , x22, #8*4                      // reserve (size + 4) words of stack workspace
    mov x22, sp

    sub x6, x5, #32
    cbnz x6, .LMul4xProcesStart             // size != 4: take the looped path

    // size == 4: single-pass path, both operands fit in registers
    ldp x14 , x15 , [x1, #8*0]
    ldp x16 , x17 , [x1, #8*2]              // x14~x17 = a[0~3]
    ldp x10, x11, [x3]                      // x10~x13 = n[0~3]
    ldp x12, x13, [x3, #8*2]

    mov x1 , xzr                            // x1 = inter-round carry, cleared
    mov x20, #4                             // x20 = remaining rounds (one per word of b)

.LMul4xSingleStep:
    sub x20, x20, #0x1
    ldr x24, [x2], #8                       // x24 = b[i]

    //----- lo(a[0~3] * b[i]) -----
    mul x6, x14 , x24
    mul x7, x15 , x24
    mul x8, x16 , x24
    mul x9, x17 , x24

    //----- hi(a[0~3] * b[i]) -----
    adds x27, x27, x6                       // t[0] += lo(a[0] * b[i])
    umulh x6, x14 , x24
    adcs x28, x28, x7                       // t[1] += lo(a[1] * b[i])
    umulh x7, x15 , x24
    adcs x25, x25, x8                       // t[2] += lo(a[2] * b[i])
    umulh x8, x16 , x24
    adcs x26, x26, x9                       // t[3] += lo(a[3] * b[i])
    umulh x9, x17 , x24

    mul x21, x27, x4                        // x21 = t[0] * k0 mod 2^64; recomputed every round
    adc x23, xzr, xzr                       // x23 = t[4] = carry out of t[3]
    //----- lo(n[0~3] * t[0]*k0) -----
    adds x28, x28, x6                       // t[1] += hi(a[0] * b[i])
    adcs x25, x25, x7
    mul x7, x11, x21
    adcs x26, x26, x8
    mul x8, x12, x21
    adc x23, x23, x9
    mul x9, x13, x21
    cmp x27, #1                             // t[0] + lo(n[0] * t[0]*k0) == 0 mod 2^64 by
                                            // construction, so a carry is produced iff t[0] != 0
    adcs x27, x28, x7                       // shift right one word (divide by b = 2^64):
                                            // new t[0] = t[1] + lo(n[1] * t[0]*k0)

    //----- hi(n[0~3] *t[0]*k0) -----
    umulh x6, x10, x21
    umulh x7, x11, x21
    adcs x28, x25, x8
    umulh x8, x12, x21
    adcs x25, x26, x9
    umulh x9, x13, x21
    adcs x26, x23, x1
    adc x1 , xzr, xzr                       // x1 = carry into the next round

    adds x27, x27, x6
    adcs x28, x28, x7
    adcs x25, x25, x8
    adcs x26, x26, x9
    adc x1 , x1 , xzr

    cbnz x20, .LMul4xSingleStep

    // conditional final subtraction: r = (t >= n) ? t - n : t
    subs x14 , x27, x10
    sbcs x15 , x28, x11
    sbcs x16 , x25, x12
    sbcs x17 , x26, x13
    sbcs xzr, x1 , xzr                      // update CF: borrow (carry clear) iff t < n

    csel x14 , x27, x14 , lo                // if the subtraction borrowed keep t, else keep t - n
    csel x15 , x28, x15 , lo
    csel x16 , x25, x16 , lo
    csel x17 , x26, x17 , lo
    stp x14 , x15 , [x0, #8*0]
    stp x16 , x17 , [x0, #8*2]
    b .LMontMul4xEnd

.LMul4xProcesStart:
    add x6, x5, #32
    eor v0.16b,v0.16b,v0.16b                // v0 = v1 = 0, used to clear the workspace
    eor v1.16b,v1.16b,v1.16b
.LMul4xInitstack:                           // zero the (size + 4)-word workspace, 32 bytes per pass
    sub x6, x6, #32
    st1 {v0.2d, v1.2d}, [x22], #32
    cbnz x6, .LMul4xInitstack
    mov x22, sp

    add x6, x2 , x5                         // x6 = &b[size]
    adds x19, x1 , x5                       // x19 = &a[size]
    stp x0 , x6, [x29, #96]                 // push r and &b[size]
    mov x0 , xzr
    mov x20, #0

 .LMul4xLoopProces:
    ldr x24, [x2]                           // x24 = b[0]
    ldp x14 , x15 , [x1], #16
    ldp x16 , x17 , [x1], #16               // x14~x17 = a[0~3]

    ldp x10 , x11 , [x3], #16
    ldp x12 , x13 , [x3], #16               // x10~x13 = n[0~3]

.LMul4xPrepare:
    //----- lo(a[0~3] * b[i]) -----
    adc x0 , x0 , xzr                       // x0 += CF
    add x20, x20, #8
    and x20, x20, #31                       // x20 &= 0x1f: byte offset of b[i] within the current
                                            // 4-word window; wraps to 0 when x20 reaches 32
    mul x6, x14 , x24
    mul x7, x15 , x24
    mul x8, x16 , x24
    mul x9, x17 , x24

    //----- hi(a[0~3] * b[i]) -----
    adds x27, x27, x6                       // t[0] += lo(a[0] * b[i])
    adcs x28, x28, x7                       // t[1] += lo(a[1] * b[i])
    adcs x25, x25, x8                       // t[2] += lo(a[2] * b[i])
    adcs x26, x26, x9                       // t[3] += lo(a[3] * b[i])
    umulh x6, x14 , x24                     // x6 = hi(a[0] * b[i])
    umulh x7, x15 , x24
    umulh x8, x16 , x24
    umulh x9, x17 , x24

    mul x21, x27, x4                        // x21 = t[0] * k0; must be recomputed each round
                                            // because t[0] changes each round
    adc x23, xzr, xzr                       // x23 = t[4] = carry out of t[3]

    ldr x24, [x2, x20]                      // preload next b[i]

    str x21, [x22], #8                      // push t[0]*k0 for the reduction pass, x22 += 8

    //----- lo(n[0~3] * t[0]*k0) -----
    adds x28, x28, x6                       // t[1] += hi(a[0] * b[i])
    adcs x25, x25, x7                       // t[2] += hi(a[1] * b[i])
    adcs x26, x26, x8                       // t[3] += hi(a[2] * b[i])
    adc x23, x23, x9                        // t[4] += hi(a[3] * b[i])
    mul x7, x11, x21                        // x7 = lo(n[1] * t[0]*k0)
    mul x8, x12, x21                        // x8 = lo(n[2] * t[0]*k0)
    mul x9, x13, x21                        // x9 = lo(n[3] * t[0]*k0)
    cmp x27, #1                             // t[0] + lo(n[0] * t[0]*k0) == 0 mod 2^64 by
                                            // construction: set CF iff t[0] != 0
    adcs x27, x28, x7                       // shift right one word (divide by b = 2^64):
                                            // new t[0] = t[1] + lo(n[1] * t[0]*k0)

    //----- hi(n[0~3] *t[0]*k0) -----
    adcs x28, x25, x8                       // x28 = t[2] + lo(n[2] * t[0]*k0)
    adcs x25, x26, x9                       // x25 = t[3] + lo(n[3] * t[0]*k0)
    adcs x26, x23, x0                       // x26 = t[4] + carry saved in x0
    adc x0 , xzr, xzr                       // x0 = CF
    umulh x6, x10, x21                      // x6 = hi(n[0] * t[0]*k0)
    umulh x7, x11, x21                      // x7 = hi(n[1] * t[0]*k0)
    umulh x8, x12, x21                      // x8 = hi(n[2] * t[0]*k0)
    umulh x9, x13, x21                      // x9 = hi(n[3] * t[0]*k0)

    adds x27, x27, x6                       // x27 = t[1] + hi(n[0] * t[0]*k0)
    adcs x28, x28, x7                       // x28 = t[2] + hi(n[1] * t[0]*k0)
    adcs x25, x25, x8                       // x25 = t[3] + hi(n[2] * t[0]*k0)
    adcs x26, x26, x9                       // x26 = t[4] + hi(n[3] * t[0]*k0)

    cbnz x20, .LMul4xPrepare

    // Four t[0]*k0 values were pushed to the stack during the pass above.
    adc x0 , x0 , xzr
    ldp x6, x7, [x22, #8*4]                 // load partial sums computed earlier
    ldp x8, x9, [x22, #8*6]

    adds x27, x27, x6
    adcs x28, x28, x7
    adcs x25, x25, x8
    adcs x26, x26, x9
    ldr x21, [sp]                           // x21 = first saved t[0]*k0

.LMul4xReduceBegin:
    ldp x14 , x15 , [x1], #16
    ldp x16 , x17 , [x1], #16               // x14~x17 = a[4~7]
    ldp x10 , x11 , [x3], #16
    ldp x12 , x13 , [x3], #16               // x10~x13 = n[4~7]

.LMul4xReduceProces:
    adc x0 , x0 , xzr
    add x20, x20, #8
    and x20, x20, #31                       // keep the window byte-offset in [0, 31]; wraps at 32
    //----- lo(a[4~7] * b[i]) -----
    mul x6, x14 , x24
    mul x7, x15 , x24
    mul x8, x16 , x24
    mul x9, x17 , x24

    //----- hi(a[4~7] * b[i]) -----
    adds x27, x27, x6                       // x27 += lo(a[4~7] * b[i])
    adcs x28, x28, x7
    adcs x25, x25, x8
    adcs x26, x26, x9
    adc x23, xzr, xzr
    umulh x6, x14 , x24
    umulh x7, x15 , x24
    umulh x8, x16 , x24
    umulh x9, x17 , x24

    ldr x24, [x2, x20]                      // preload next b[i]

    //----- lo(n[4~7] * t[0]*k0) -----
    adds x28, x28, x6
    adcs x25, x25, x7
    adcs x26, x26, x8
    adc x23, x23, x9
    mul x6, x10, x21
    mul x7, x11, x21
    mul x8, x12, x21
    mul x9, x13, x21

    //----- hi(n[4~7] * t[0]*k0) -----
    adds x27, x27, x6
    adcs x28, x28, x7
    adcs x25, x25, x8
    adcs x26, x26, x9
    adcs x23, x23, x0
    umulh x6, x10, x21
    umulh x7, x11, x21
    umulh x8, x12, x21
    umulh x9, x13, x21

    ldr x21, [sp, x20]                      // next saved t[0]*k0

    adc x0 , xzr, xzr                       // x0 = CF, record carry
    str x27, [x22], #8                      // t[i] is complete, write the result, x22 += 8

    adds x27, x28, x6
    adcs x28, x25, x7
    adcs x25, x26, x8
    adcs x26, x23, x9

    cbnz x20, .LMul4xReduceProces
    sub x6, x19, x1                         // x6 = &a[size] - &a[i]; zero once the inner pass has
                                            // consumed all of a (x1 advances each pass) —
                                            // checks whether the inner loop ends
    adc x0 , x0 , xzr
    cbz x6, .LMul4xLoopExitCheck

    ldp x6, x7, [x22, #8*4]                 // t[4~7]
    ldp x8, x9, [x22, #8*6]

    adds x27, x27, x6
    adcs x28, x28, x7
    adcs x25, x25, x8
    adcs x26, x26, x9

    b .LMul4xReduceBegin

.LMul4xLoopExitCheck:
    ldr x9, [x29, #104]                     // pop &b[size]
    add x2 , x2 , #8*4                      // b advances 4 words per outer pass: &b[4], &b[8], ...
    sub x9 , x9, x2                         // zero when the outer loop is done

    adds x27, x27, x30
    adcs x28, x28, xzr
    stp x27, x28, [x22, #8*0]               // x27, x28 finished: store them to the workspace
    ldp x27, x28, [sp , #8*4]               // t[0], t[1]

    adcs x25, x25, xzr
    adcs x26, x26, xzr
    stp x25, x26, [x22, #8*2]               // x25, x26 finished: store them to the workspace
    ldp x25, x26, [sp , #8*6]               // t[2], t[3]
    adc x30, x0 , xzr
    sub x3 , x3 , x5                        // x3 = &n[0]
    cbz x9, .LMul4xEnd

    sub x1 , x1 , x5                        // x1 = &a[0]

    mov x22, sp
    mov x0 , xzr
    b .LMul4xLoopProces

.LMul4xEnd:
    ldp x10, x11, [x3], #16
    ldp x12, x13, [x3], #16
    ldr x0, [x29, #96]                      // pop r
    mov x19, x0                             // backup, x19 = &r[0]
    subs x6, x27, x10                       // t[0] - n[0], sets CF
    sbcs x7, x28, x11                       // t[1] - n[1] - borrow
    add x22, sp , #8*8                      // x22 = &t[8]
    sub x20, x5 , #8*4                      // x20 = (size - 4) * 8

    // t - n, x22 = &t[8], x3 = &n[0]
.LMul4xSubMod:
    sbcs x8, x25, x12
    sbcs x9, x26, x13

    ldp x10, x11, [x3], #16
    ldp x12, x13, [x3], #16
    ldp x27, x28, [x22], #16
    ldp x25, x26, [x22], #16

    sub x20, x20, #8*4

    stp x6, x7, [x0], 16
    stp x8, x9, [x0], 16

    sbcs x6, x27, x10
    sbcs x7, x28, x11

    cbnz x20, .LMul4xSubMod

    sbcs x8, x25, x12
    sbcs x9, x26, x13
    sbcs xzr, x30, xzr                      // CF = x30 - borrow; x30 recorded the top-word carry

    add x1 , sp , #8*4                      // the workspace holds size + 4 words;
                                            // x1 = sp + 8*4 = &t[0]
    stp x6, x7, [x0]
    stp x8, x9, [x0, #8*2]

.LMul4xCopy:
    ldp x14 , x15 , [x19]                   // x14~x17 = r[0~3], the t - n candidate
    ldp x16 , x17 , [x19, #8*2]
    ldp x27, x28, [x1], #16                 // x27,x28,x25,x26 = t[0~3]
    ldp x25, x26, [x1], #16
    sub x5, x5, #8*4
    csel x6, x27, x14, lo                   // if the subtraction borrowed (carry clear) keep t,
                                            // else keep t - n
    csel x7, x28, x15, lo
    csel x8, x25, x16, lo
    csel x9, x26, x17, lo
    stp x6, x7, [x19], #16
    stp x8, x9, [x19], #16

    cbnz x5, .LMul4xCopy

.LMontMul4xEnd:
    // NOTE(review): unlike MontSqr8, x0 is not set to 1 before returning here —
    // the return value is whatever x0 last held; confirm callers ignore it.
    ldr x30, [x29, #8]                      // restore link register
    ldp x27, x28, [x29, #16]                // restore callee-saved registers
    ldp x25, x26, [x29, #32]
    ldp x23, x24, [x29, #48]
    ldp x21, x22, [x29, #64]
    ldp x19, x20, [x29, #80]
    mov sp , x29
    ldr x29, [sp], #128                     // x29 = [sp], sp += 128 (drop the 128-byte frame)
AARCH64_AUTIASP
    ret
.size MontMul4, .-MontMul4

#endif