/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#if defined(HITLS_CRYPTO_CURVE_NISTP256) && defined(HITLS_CRYPTO_NIST_USE_ACCEL)

#include "crypt_arm.h"
#include "ecp256_pre_comp_table.s"

.arch armv8-a+crypto
.file "ecp256_armv8.S"

.section .rodata
.align 4
.Lpoly: // P
.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
.Lone_mont: // R mod P, R = 2^256, = 2^256 - P
.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe
.Lord: // Order, n
.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000
.LordK: // ordK = -ord[0]^(-1) mod 2^64, i.e. (2^64 - ord[0]) * ordK == 1 (mod 2^64),
        // so the lower 64 bits of LordK * Lord are all Fs.
.quad 0xccd1c8aaee00bc4f
.Lone:
.quad 1, 0, 0, 0
/*
Initial z is not set to 1:
0000000300000000, 00000001FFFFFFFE, FFFFFFFD00000002, FFFFFFFE00000003
*/

.text
.globl ECP256_GetPreCompTable
.type ECP256_GetPreCompTable,%function
.align 4
ECP256_GetPreCompTable:
AARCH64_PACIASP
    adrp x0, g_preCompTable
    add x0, x0, :lo12:g_preCompTable
AARCH64_AUTIASP
    ret
.size ECP256_GetPreCompTable,.-ECP256_GetPreCompTable

.globl ECP256_FromMont
.type ECP256_FromMont,%function
.align 4
ECP256_FromMont:
AARCH64_PACIASP
    stp x29, x30, [sp, #-32]!
    add x29, sp , #0
    stp x19, x20, [sp, #16]

    ldp x4 , x5 , [x1]
    ldp x6 , x7 , [x1, #16]
    adrp x14, .Lpoly+8
    add x14,x14,:lo12:.Lpoly+8
    ldr x12,[x14]
    ldr x13,[x14,#16]
    adrp x2, .Lone // &bp[0]
    add x2, x2, :lo12:.Lone

    bl ECP256_MulCore

    ldp x19, x20, [sp , #16]
    ldp x29, x30, [sp], #32
AARCH64_AUTIASP
    ret
.size ECP256_FromMont,.-ECP256_FromMont

.type ECP256_AddCore,%function
.align 4
ECP256_AddCore:
AARCH64_PACIASP
    adds x14, x14, x8 // ret = a+b
    adcs x15, x15, x9
    adcs x16, x16, x10
    adcs x17, x17, x11
    adc x1 , xzr, xzr

    // r -= p, p = 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
    adds x8 , x14, #1 // x8 = x14 - 0xffffffffffffffff = x14 + 1
    sbcs x9 , x15, x12 // x9 = x15 - 0x00000000ffffffff
    sbcs x10, x16, xzr // x10 = x16 - 0
    sbcs x11, x17, x13 // x11 = x17 - 0xffffffff00000001
    sbcs xzr, x1 , xzr // Determine whether r-p borrows

    // r = r < p ? r : r-p
    // x8~x11 hold r-p, x14~x17 hold r
    csel x14, x14, x8 , lo
    csel x15, x15, x9 , lo
    csel x16, x16, x10, lo
    csel x17, x17, x11, lo

    stp x14, x15, [x0]
    stp x16, x17, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP256_AddCore,.-ECP256_AddCore
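
/*
 Illustrative sketch (not part of the build): a C model of the constant-time
 modular addition in ECP256_AddCore above. The names p256_add_model and
 P256_P are hypothetical; unsigned __int128 stands in for the adds/adcs and
 sbcs flag chains, and the mask select mirrors the csel ..., lo pair.

 #include <stdint.h>

 // p = 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian 64-bit limbs
 static const uint64_t P256_P[4] = {
     0xffffffffffffffffULL, 0x00000000ffffffffULL,
     0x0000000000000000ULL, 0xffffffff00000001ULL
 };

 // r = (a + b) mod p: both a+b and (a+b)-p are computed, then one is
 // selected by a mask, so no branch depends on the operand values.
 static void p256_add_model(uint64_t r[4], const uint64_t a[4], const uint64_t b[4])
 {
     uint64_t s[4], t[4], carry, borrow = 0;
     unsigned __int128 acc = 0;

     for (int i = 0; i < 4; i++) {              // s = a + b  (adds/adcs)
         acc += (unsigned __int128)a[i] + b[i];
         s[i] = (uint64_t)acc;
         acc >>= 64;
     }
     carry = (uint64_t)acc;                     // 257th bit, like x1

     for (int i = 0; i < 4; i++) {              // t = s - p  (adds #1/sbcs)
         unsigned __int128 d = (unsigned __int128)s[i] - P256_P[i] - borrow;
         t[i] = (uint64_t)d;
         borrow = (uint64_t)(d >> 64) & 1;
     }

     uint64_t keep_s = (uint64_t)0 - (carry < borrow); // all-ones if a+b < p
     for (int i = 0; i < 4; i++)                       // csel ..., lo
         r[i] = (s[i] & keep_s) | (t[i] & ~keep_s);
 }
*/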

///////////////////////////////////////
// x14~x17 = a[0~3], x8~x11 = b[0~3] //
///////////////////////////////////////
.type ECP256_SubCore1,%function
.align 4
ECP256_SubCore1:
AARCH64_PACIASP
    subs x14, x14, x8
    sbcs x15, x15, x9
    sbcs x16, x16, x10
    sbcs x17, x17, x11
    sbc x1 , xzr, xzr // x1 = all-ones if the subtraction borrowed, else 0

    // r += p
    subs x8 , x14, #1 // x8 = x14 + 0xffffffffffffffff = x14 - 1
    adcs x9 , x15, x12 // x9 = x15 + 0x00000000ffffffff
    adcs x10, x16, xzr // x10 = x16 + 0
    adc x11, x17, x13 // x11 = x17 + 0xffffffff00000001
    cmp x1 , xzr

    // r = borrow==0 ? r : r+p
    csel x14, x14, x8 , eq
    csel x15, x15, x9 , eq
    csel x16, x16, x10, eq
    csel x17, x17, x11, eq
    stp x14, x15, [x0]
    stp x16, x17, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP256_SubCore1,.-ECP256_SubCore1

.type ECP256_SubCore2,%function
.align 4
ECP256_SubCore2:
AARCH64_PACIASP
    subs x14, x8 , x14
    sbcs x15, x9 , x15
    sbcs x16, x10, x16
    sbcs x17, x11, x17
    sbc x1 , xzr, xzr // x1 = all-ones if the subtraction borrowed, else 0

    // r += p
    subs x8 , x14, #1 // x8 = x14 + 0xffffffffffffffff = x14 - 1
    adcs x9 , x15, x12 // x9 = x15 + 0x00000000ffffffff
    adcs x10, x16, xzr // x10 = x16 + 0
    adc x11, x17, x13 // x11 = x17 + 0xffffffff00000001
    cmp x1 , xzr

    // r = borrow==0 ? r : r+p
    csel x14, x14, x8 , eq
    csel x15, x15, x9 , eq
    csel x16, x16, x10, eq
    csel x17, x17, x11, eq
    stp x14, x15, [x0]
    stp x16, x17, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP256_SubCore2,.-ECP256_SubCore2


.type ECP256_DivBy2Core,%function
.align 4
ECP256_DivBy2Core:
AARCH64_PACIASP
    // a+p
    subs x8 , x14, #1 // x8 = x14 + 0xffffffffffffffff = x14 - 1
    adcs x9 , x15, x12 // x9 = x15 + 0x00000000ffffffff
    adcs x10, x16, xzr // x10 = x16 + 0
    adcs x11, x17, x13 // x11 = x17 + 0xffffffff00000001
    adc x1 , xzr, xzr // x1 = CF
    tst x14, #1 // Test the lowest bit of a; eq means a is even.

    // a = (a is even) ? a : a+p
    csel x14, x14, x8 , eq
    csel x15, x15, x9 , eq
    csel x16, x16, x10, eq
    csel x17, x17, x11, eq
    csel x1 , xzr, x1 , eq // If a is kept, also clear the carry held in x1.

    lsr x14, x14, #1 // r >>= 1: divide by 2.
    orr x14, x14, x15, lsl#63 // x15 shifted left by 63 leaves only its lowest bit,
                              // which becomes the new top bit of x14.
    lsr x15, x15, #1 // Each word's old lowest bit moves into the previous word's
                     // top bit, giving a multi-word right shift.
    orr x15, x15, x16, lsl#63
    lsr x16, x16, #1
    orr x16, x16, x17, lsl#63
    lsr x17, x17, #1
    orr x17, x17, x1 , lsl#63 // The top bit of the highest word comes from x1.
    stp x14, x15, [x0]
    stp x16, x17, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP256_DivBy2Core,.-ECP256_DivBy2Core
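
/*
 Illustrative sketch (not part of the build): a C model of ECP256_DivBy2Core.
 Halving mod p works because p is odd: if a is odd then a+p is even, and the
 possible 257th bit (the carry kept in x1 above) is shifted back into the top
 word. p256_div_by_2_model is a hypothetical name; P256_P is the constant
 from the sketch after ECP256_AddCore, and the mask plays the role of the
 csel group.

 static void p256_div_by_2_model(uint64_t r[4], const uint64_t a[4])
 {
     uint64_t mask = 0 - (a[0] & 1);          // all-ones if a is odd (tst #1)
     unsigned __int128 acc = 0;
     uint64_t t[4], carry;

     for (int i = 0; i < 4; i++) {            // t = a + (a odd ? p : 0)
         acc += (unsigned __int128)a[i] + (P256_P[i] & mask);
         t[i] = (uint64_t)acc;
         acc >>= 64;
     }
     carry = (uint64_t)acc;                   // 257th bit; 0 when a is even

     for (int i = 0; i < 3; i++)              // multi-word right shift by 1
         r[i] = (t[i] >> 1) | (t[i + 1] << 63);
     r[3] = (t[3] >> 1) | (carry << 63);
 }
*/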
.globl ECP256_Neg
.type ECP256_Neg,%function
.align 4
ECP256_Neg:
AARCH64_PACIASP
    stp x29, x30, [sp, #-16]!
    add x29, sp , #0

    ldp x14, x15, [x1]
    ldp x16, x17, [x1, #16]

    mov x8 , xzr
    mov x9 , xzr
    mov x10, xzr
    mov x11, xzr

    adrp x7, .Lpoly+8
    add x7,x7,:lo12:.Lpoly+8
    ldr x12,[x7]
    ldr x13,[x7,#16]
    // -a = 0 - a
    bl ECP256_SubCore2

    ldp x29, x30, [sp], #16
AARCH64_AUTIASP
    ret
.size ECP256_Neg,.-ECP256_Neg

.type ECP256_MulCore,%function
.align 4
ECP256_MulCore:
AARCH64_PACIASP
    ldr x3 , [x2] // b[0]

    mul x14, x4 , x3
    umulh x8 , x4 , x3
    mul x15, x5 , x3
    umulh x9 , x5 , x3
    mul x16, x6 , x3
    umulh x10, x6 , x3
    mul x17, x7 , x3
    umulh x11, x7 , x3

    adds x15, x15, x8
    adcs x16, x16, x9
    adcs x17, x17, x10
    adc x19, xzr, x11 // x19 = x11 + CF
    mov x20, xzr

    // Montgomery reduction round: q = acc[0] = x14 (valid since -p^(-1) mod 2^64 = 1).
    // {x8, x9, x10, x11} = q*(p+1)/2^64 = { q<<32, q>>32, q-(q<<32), q-(q>>32) },
    // so adding it to acc>>64 equals adding q*p to acc and dropping the low word.
    lsl x8 , x14, #32 // x8 = q << 32
    lsr x9 , x14, #32 // x9 = q >> 32
    subs x10, x14, x8
    sbc x11, x14, x9

    adds x14, x15, x8
    adcs x15, x16, x9
    adcs x16, x17, x10
    adcs x17, x19, x11
    adc x19, x20, xzr

    ldr x3 , [x2, #8] // b[1]

    // lo(a[0~3] * b[i])
    mul x8 , x4 , x3
    mul x9 , x5 , x3
    mul x10, x6 , x3
    mul x11, x7 , x3

    adds x14, x14, x8
    adcs x15, x15, x9
    adcs x16, x16, x10
    adcs x17, x17, x11
    adc x19, x19, xzr

    // hi(a[0~3] * b[i])
    umulh x8 , x4 , x3
    umulh x9 , x5 , x3
    umulh x10, x6 , x3
    umulh x11, x7 , x3

    adds x15, x15, x8
    adcs x16, x16, x9
    adcs x17, x17, x10
    adcs x19, x19, x11
    adc x20, xzr, xzr

    // reduction round, as above
    lsl x8 , x14, #32
    lsr x9 , x14, #32
    subs x10, x14, x8
    sbc x11, x14, x9

    adds x14, x15, x8
    adcs x15, x16, x9
    adcs x16, x17, x10
    adcs x17, x19, x11
    adc x19, x20, xzr

    ldr x3 , [x2, #8*2] // b[2]

    // lo(a[0~3] * b[i])
    mul x8 , x4 , x3
    mul x9 , x5 , x3
    mul x10, x6 , x3
    mul x11, x7 , x3

    adds x14, x14, x8
    adcs x15, x15, x9
    adcs x16, x16, x10
    adcs x17, x17, x11
    adc x19, x19, xzr

    // hi(a[0~3] * b[i])
    umulh x8 , x4 , x3
    umulh x9 , x5 , x3
    umulh x10, x6 , x3
    umulh x11, x7 , x3

    adds x15, x15, x8
    adcs x16, x16, x9
    adcs x17, x17, x10
    adcs x19, x19, x11
    adc x20, xzr, xzr

    lsl x8 , x14, #32 // x8 = lq<<32
    lsr x9 , x14, #32 // x9 = hq
    subs x10, x14, x8 // x10 = q - lq<<32
    sbc x11, x14, x9 // x11 = q - hq

    adds x14, x15, x8
    adcs x15, x16, x9
    adcs x16, x17, x10
    adcs x17, x19, x11
    adc x19, x20, xzr

    ldr x3 , [x2, #8*3] // b[3]

    // lo(a[0~3] * b[i])
    mul x8 , x4 , x3
    mul x9 , x5 , x3
    mul x10, x6 , x3
    mul x11, x7 , x3

    adds x14, x14, x8
    adcs x15, x15, x9
    adcs x16, x16, x10
    adcs x17, x17, x11
    adc x19, x19, xzr

    // hi(a[0~3] * b[i])
    umulh x8 , x4 , x3
    umulh x9 , x5 , x3
    umulh x10, x6 , x3
    umulh x11, x7 , x3

    adds x15, x15, x8
    adcs x16, x16, x9
    adcs x17, x17, x10
    adcs x19, x19, x11
    adc x20, xzr, xzr

    // last reduction round, as above
    lsl x8 , x14, #32
    lsr x9 , x14, #32
    subs x10, x14, x8
    sbc x11, x14, x9

    adds x14, x15, x8
    adcs x15, x16, x9
    adcs x16, x17, x10
    adcs x17, x19, x11
    adc x19, x20, xzr

    // x8~x11 = r - p (r may be greater than p)
    adds x8 , x14, #1 // x8 = x14 - 0xffffffffffffffff = x14 + 1
    sbcs x9 , x15, x12 // x9 = x15 - 0x00000000ffffffff
    sbcs x10, x16, xzr // x10 = x16 - 0
    sbcs x11, x17, x13 // x11 = x17 - 0xffffffff00000001
    sbcs xzr, x19, xzr // Determine whether r-p borrows

    csel x14, x14, x8 , lo
    csel x15, x15, x9 , lo
    csel x16, x16, x10, lo
    csel x17, x17, x11, lo

    stp x14, x15, [x0]
    stp x16, x17, [x0, #8*2]
AARCH64_AUTIASP
    ret
.size ECP256_MulCore, .-ECP256_MulCore
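
/*
 Illustrative sketch (not part of the build): one Montgomery reduction round
 of ECP256_MulCore in C. Since p[0] = 2^64 - 1, -p^(-1) mod 2^64 is 1 and
 the quotient is simply q = acc[0]. Using
     (acc + q*p) / 2^64 = (acc >> 64) + q*(p+1)/2^64
 with p+1 = 2^256 - 2^224 + 2^192 + 2^96, the 4-word value
 q*(p+1)/2^64 = { q<<32, q>>32, q-(q<<32), q-(q>>32) } is exactly what the
 lsl/lsr/subs/sbc group computes. p256_reduce_round_model is a hypothetical
 name; acc holds six words (the two top words mirror x19/x20).

 #include <stdint.h>

 static void p256_reduce_round_model(uint64_t acc[6])
 {
     uint64_t q = acc[0];                     // Montgomery quotient
     uint64_t t[4];
     t[0] = q << 32;
     t[1] = q >> 32;
     t[2] = q - t[0];
     t[3] = q - t[1] - (t[0] > q);            // borrow from t[2]; never underflows

     unsigned __int128 s = 0;
     for (int i = 0; i < 4; i++) {            // acc = (acc >> 64) + t
         s += (unsigned __int128)acc[i + 1] + t[i];
         acc[i] = (uint64_t)s;
         s >>= 64;
     }
     acc[4] = acc[5] + (uint64_t)s;           // adc x19, x20, xzr
     acc[5] = 0;
 }
*/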

.type ECP256_SqrCore,%function
.align 4
ECP256_SqrCore:
AARCH64_PACIASP
/******************************************
    x7    x1   x20   x19   x17   x16   x15   x14
                                h0*1  l0*1
                          h0*2  l0*2
                    h0*3  l0*3
                    h1*2  l1*2
              h1*3  l1*3
        h2*3  l2*3
  h3*3  l3*3  h2*2  l2*2  h1*1  l1*1  h0*0  l0*0
*******************************************/

    // a[1~3] * a[0]
    mul x15, x5 , x4 // lo(a[1] * a[0])
    umulh x8 , x5 , x4 // hi(a[1] * a[0])
    mul x16, x6 , x4 // lo(a[2] * a[0])
    umulh x9 , x6 , x4 // hi(a[2] * a[0])
    mul x17, x7 , x4 // lo(a[3] * a[0])
    umulh x19, x7 , x4 // hi(a[3] * a[0])

    adds x16, x16, x8
    adcs x17, x17, x9
    adc x19, x19, xzr // No more carry is possible.

    // a[2~3] * a[1]
    mul x8 , x6 , x5 // lo(a[2] * a[1])
    umulh x9 , x6 , x5 // hi(a[2] * a[1])
    mul x10, x7 , x5 // lo(a[3] * a[1])
    umulh x11, x7 , x5 // hi(a[3] * a[1])

    // a[3] * a[2]
    mul x20, x7 , x6
    umulh x1 , x7 , x6

    // Combine the products of this round, then add them into the
    // accumulator (the registers in the diagram above).
    adds x9 , x10, x9
    adc x10, x11, xzr

    adds x17, x17, x8
    adcs x19, x19, x9
    adcs x20, x20, x10
    adc x1 , x1 , xzr // a[0~3] is still held in x4~x7.

    // a[0~3] ^ 2
    mul x14, x4 , x4
    umulh x4 , x4 , x4
    mul x8 , x5 , x5
    umulh x5 , x5 , x5
    mul x9 , x6 , x6
    umulh x6 , x6 , x6
    mul x10, x7 , x7
    umulh x7 , x7 , x7

    // acc[1~6] << 1
    adds x15, x15, x15
    adcs x16, x16, x16
    adcs x17, x17, x17
    adcs x19, x19, x19
    adcs x20, x20, x20
    adcs x1 , x1 , x1
    adc x2 , xzr, xzr

    // acc[] += a[] ^ 2
    adds x15, x15, x4
    adcs x16, x16, x8
    adcs x17, x17, x5
    adcs x19, x19, x9
    adcs x20, x20, x6
    adcs x1 , x1 , x10
    adc x7 , x7 , x2

    // Four rounds of reduction.
    lsl x8 , x14, #32
    lsr x9 , x14, #32
    subs x10, x14, x8
    sbc x11, x14, x9
    adds x14, x15, x8
    adcs x15, x16, x9
    adcs x16, x17, x10
    adc x17, x11, xzr

    lsl x8 , x14, #32
    lsr x9 , x14, #32
    subs x10, x14, x8
    sbc x11, x14, x9
    adds x14, x15, x8
    adcs x15, x16, x9
    adcs x16, x17, x10
    adc x17, x11, xzr

    lsl x8 , x14, #32
    lsr x9 , x14, #32
    subs x10, x14, x8
    sbc x11, x14, x9
    adds x14, x15, x8
    adcs x15, x16, x9
    adcs x16, x17, x10
    adc x17, x11, xzr

    lsl x8 , x14, #32
    lsr x9 , x14, #32
    subs x10, x14, x8
    sbc x11, x14, x9
    adds x14, x15, x8
    adcs x15, x16, x9
    adcs x16, x17, x10
    adc x17, x11, xzr

    // Add acc[4~7]. x14~x17, x19 hold the current result r.
    adds x14, x14, x19
    adcs x15, x15, x20
    adcs x16, x16, x1
    adcs x17, x17, x7
    adc x19, xzr, xzr

    // r -= p
    adds x8 , x14, #1 // subs x8,x14,#-1
    sbcs x9 , x15, x12
    sbcs x10, x16, xzr
    sbcs x11, x17, x13
    sbcs xzr, x19, xzr // The carry flag now encodes the relation between r and p.

    // r = r-p < 0 ? r : r-p
    csel x14, x14, x8 , lo
    csel x15, x15, x9 , lo
    csel x16, x16, x10, lo
    csel x17, x17, x11, lo

    stp x14, x15, [x0]
    stp x16, x17, [x0, #16]
AARCH64_AUTIASP
    ret
.size ECP256_SqrCore,.-ECP256_SqrCore

.globl ECP256_Mul
.type ECP256_Mul,%function
.align 4
ECP256_Mul:
AARCH64_PACIASP
    stp x29, x30, [sp, #-32]!
    add x29, sp , #0
    stp x19, x20, [sp, #16]

    ldp x4 , x5 , [x1]
    ldp x6 , x7 , [x1,#16]

    adrp x3, .Lpoly+8
    add x3,x3,:lo12:.Lpoly+8
    ldr x12,[x3]
    ldr x13,[x3,#16]

    bl ECP256_MulCore

    ldp x19, x20, [sp , #16]
    ldp x29, x30, [sp], #32
AARCH64_AUTIASP
    ret
.size ECP256_Mul,.-ECP256_Mul

.globl ECP256_Sqr
.type ECP256_Sqr,%function
.align 4
ECP256_Sqr:
AARCH64_PACIASP
    stp x29, x30, [sp, #-32]!
    add x29, sp , #0
    stp x19, x20, [sp, #16]

    ldp x4 , x5 , [x1]
    ldp x6 , x7 , [x1, #16]
    adrp x3, .Lpoly+8
    add x3,x3,:lo12:.Lpoly+8
    ldr x12,[x3]
    ldr x13,[x3,#16]

    bl ECP256_SqrCore

    ldp x19, x20, [sp , #16]
    ldp x29, x30, [sp], #32
AARCH64_AUTIASP
    ret
.size ECP256_Sqr,.-ECP256_Sqr
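
/*
 Illustrative sketch (not part of the build): the three phases of
 ECP256_SqrCore in C. Each off-diagonal product a[i]*a[j] (i < j) is formed
 once, the partial sum is doubled with a single carry chain, and the
 diagonal squares a[i]^2 are folded in; the result then goes through the
 same four reduction rounds as ECP256_MulCore. p256_sqr_cross_model is a
 hypothetical name and reproduces only the phase structure, not the exact
 instruction schedule.

 #include <stdint.h>

 static void p256_sqr_cross_model(const uint64_t a[4], uint64_t acc[8])
 {
     unsigned __int128 s;
     for (int k = 0; k < 8; k++) acc[k] = 0;

     for (int i = 0; i < 4; i++)              // each cross product once
         for (int j = i + 1; j < 4; j++) {
             s = (unsigned __int128)a[i] * a[j];
             for (int k = i + j; s != 0; k++) {   // ripple the carries
                 s += acc[k];
                 acc[k] = (uint64_t)s;
                 s >>= 64;
             }
         }

     s = 0;
     for (int k = 0; k < 8; k++) {            // acc <<= 1 (acc[1~6] << 1)
         s += (unsigned __int128)acc[k] + acc[k];
         acc[k] = (uint64_t)s;
         s >>= 64;
     }

     for (int i = 0; i < 4; i++) {            // acc[] += a[] ^ 2
         s = (unsigned __int128)a[i] * a[i];
         for (int k = 2 * i; s != 0; k++) {
             s += acc[k];
             acc[k] = (uint64_t)s;
             s >>= 64;
         }
     }
 }
*/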

# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-madd-2007-bl
# Process:
# Z1Z1 = Z1^2
# U2 = X2*Z1Z1
# S2 = Y2*Z1*Z1Z1
# H = U2-X1
# HH = H^2
# I = 4*HH
# J = H*I
# r = 2*(S2-Y1)
# V = X1*I
# X3 = r^2-J-2*V
# Y3 = r*(V-X3)-2*Y1*J
# Z3 = (Z1+H)^2-Z1Z1-HH
.globl ECP256_AddAffine
.type ECP256_AddAffine,%function
.align 5
ECP256_AddAffine:
AARCH64_PACIASP

    stp x29, x30, [sp, #-80]!
    add x29, sp , #0
    stp x19, x20, [sp, #16]
    stp x21, x22, [sp, #32]
    stp x23, x24, [sp, #48]
    stp x25, x26, [sp, #64]
    sub sp , sp , #32*10 // Each coordinate is 256 bits (32 bytes); space for 10
                         // coordinates is reserved. Following the C implementation,
                         // the lifetimes of z1sqr and S2 do not overlap, so they
                         // share one slot and one slot of memory is saved.

    // x0 to x2 will be used to pass parameters to the helper routines,
    // so the initial arguments are saved in x21 to x23.
    // Structure members are addressed by offset; each member is 256 bits (32 bytes).
    mov x21, x0 // CRYPT_P256_Point *r
    mov x22, x1 // CRYPT_P256_Point *a
    mov x23, x2 // CRYPT_P256_AffinePoint *b


    adrp x14, .Lpoly+8 // &p[1]
    add x14,x14,:lo12:.Lpoly+8
    ldr x12,[x14] // x12 = p[1]
    ldr x13,[x14,#16] // x13 = p[3]


    ldp x4, x5, [x1, #64] // x1+64 = &(a->z[0]); a->z[0] is written z1[0]
    ldp x6, x7, [x1, #64+16] // x1+64+16 = &(a->z[2])

    orr x24, x4 , x5 // x24 = z1[0] | z1[1]
    orr x24, x24, x6 // x24 |= z1[2]
    orr x24, x24, x7 // x24 |= z1[3]
    cmp x24, #0
    csetm x24, ne // x24 = x24!=0 ? -1 : 0, i.e. ~iszero(z1):
                  // is point a of the form (x, y, 0)?

    ldp x8 , x9 , [x2] // x2 = &(b->x[0]); b->x[0] is written x2[0]
    ldp x10, x11, [x2, #16] // x2+16 = &(b->x[2])
    ldp x14, x15, [x2, #32] // x2+32 = &(b->y[0])
    ldp x16, x17, [x2, #32+16] // x2+32+16 = &(b->y[2])

    // x25 = x2[0] | x2[1] | x2[2] | x2[3] | y2[0] | y2[1] | y2[2] | y2[3]
    orr x25, x8 , x9
    orr x25, x25, x10
    orr x25, x25, x11
    orr x25, x25, x14
    orr x25, x25, x15
    orr x25, x25, x16
    orr x25, x25, x17
    cmp x25, #0
    csetm x25, ne // x25 = x25!=0 ? -1 : 0, i.e. ~iszero(x2|y2):
                  // is point b (0, 0)?

    add x0 , sp , #32*3 // zsqr = sp[32*3]
    // x4~x7 = z1[0~3] (already loaded)
    bl ECP256_SqrCore // zsqr = z1^2
    // x14~x17 also hold the result temporarily.

    // The next computation needs the zsqr result as input. Copy it from
    // x14~x17 into x4~x7 now, before x14~x17 are overwritten by the next
    // call; otherwise it would have to be reloaded from memory.
    mov x4 , x14
    mov x5 , x15
    mov x6 , x16
    mov x7 , x17

    add x0 , sp , #32*4 // u2 = sp[32*4]
    add x2 , x23, #0 // x2 points to b->x.
    // x4~x7 * [x2]
    bl ECP256_MulCore // u2 = z1^2 * x2
    // x14~x17 also hold the result temporarily.

    ldp x8 , x9 , [x22] // a->x
    ldp x10, x11, [x22, #16]
    add x0 , sp , #32*5 // h = sp[32*5]
    // x14~x17 - x8~x11
    bl ECP256_SubCore1 // h = u2 - x1

    ldp x4 , x5 , [sp, #32*3] // zsqr = sp[32*3]
    ldp x6 , x7 , [sp, #32*3+16]
    add x2 , x22, #64 // x2 points to a->z
    add x0 , sp , #32*3 // s2 = sp[32*3]: zsqr is no longer needed,
                        // so its slot is reused for s2.
    bl ECP256_MulCore // s2 = zsqr * z1 = z1^3

    ldp x4 , x5 , [sp, #32*5] // x4~x7 = h = sp[32*5]
    ldp x6 , x7 , [sp, #32*5+16]
    add x2 , x22, #64
    add x0 , sp , #32*2 // x0 points to res_z.
    bl ECP256_MulCore // res_z = h * z1

    ldp x4 , x5 , [sp, #32*3] // x4~x7 = s2 = sp[32*3]
    ldp x6 , x7 , [sp, #32*3+16]
    add x2 , x23, #32 // x2 points to b->y.
    add x0 , sp , #32*3 // s2 = sp[32*3]
    bl ECP256_MulCore // s2 = s2 * y2 = y2 * z1^3

    ldp x8 , x9 , [x22, #32]
    ldp x10, x11, [x22, #32+16]
    add x0 , sp , #32*6 // R = sp[32*6]
    // x14~x17 - x8~x11
    bl ECP256_SubCore1 // R = s2 - y1

    // Rsqr = R^2 and hsqr = h^2 are independent, so their order is free.
    // Computing Rsqr first saves one memory read here, and keeping hsqr in
    // registers saves another read in hcub = hsqr * h. (Pipeline bubbles
    // remain to be optimized.)
    mov x4 , x14
    mov x5 , x15
    mov x6 , x16
    mov x7 , x17
    add x0 , sp , #32*7 // Rsqr = sp[32*7]
    bl ECP256_SqrCore // Rsqr = R^2

    ldp x4 , x5 , [sp, #32*5] // h = sp[32*5]
    ldp x6 , x7 , [sp, #32*5+16]
    add x0 , sp , #32*8 // hsqr = sp[32*8]
    bl ECP256_SqrCore // hsqr = h^2

    // x14~x17 hold the hsqr result, so no memory read is needed.
    // (A pipeline bubble remains to be optimized.)
    mov x4 , x14
    mov x5 , x15
    mov x6 , x16
    mov x7 , x17
    add x2 , sp , #32*5 // h = sp[32*5]
    add x0 , sp , #32*9 // hcub = sp[32*9]
    bl ECP256_MulCore // hcub = hsqr * h = h^3

    // The multiplication uses too many registers to keep a backup of hsqr.
    // Using registers beyond x19 would require extra pushes and pops in the
    // prologue and epilogue, which would outweigh the gain, so hsqr has to
    // be reloaded from memory.
    ldp x4 , x5 , [sp, #32*8] // hsqr = sp[32*8]
    ldp x6 , x7 , [sp, #32*8+16]
    add x2 , x22, #0 // x2 points to a->x
    add x0 , sp , #32*4 // u2 = sp[32*4]
    bl ECP256_MulCore // u2 = hsqr * x1

    mov x8 , x14
    mov x9 , x15
    mov x10, x16
    mov x11, x17
    add x0 , sp , #32*8 // hsqr = sp[32*8]
    // x14~x17 + x8~x11
    bl ECP256_AddCore // hsqr = 2 * u2

    // x14~x17 hold hsqr, so ECP256_SubCore2 is called.
    ldp x8 , x9 , [sp, #32*7] // Rsqr = sp[32*7]
    ldp x10, x11, [sp, #32*7+16]
    // x8~x11 - x14~x17
    add x0 , sp , #0 // x0 points to res_x
    bl ECP256_SubCore2 // res_x = Rsqr - hsqr

    // x14~x17 hold res_x, so ECP256_SubCore1 is called.
    ldp x8 , x9 , [sp, #32*9] // hcub = sp[32*9]
    ldp x10, x11, [sp, #32*9+16]
    // x14~x17 - x8~x11
    bl ECP256_SubCore1 // res_x = res_x - hcub

    // x14~x17 hold res_x, so ECP256_SubCore2 is called.
    ldp x8 , x9 , [sp, #32*4] // u2 = sp[32*4]
    ldp x10, x11, [sp, #32*4+16]
    add x0 , sp , #32*1
    bl ECP256_SubCore2 // res_y = u2 - res_x

    ldp x4 , x5 , [sp, #32*9] // hcub = sp[32*9]
    ldp x6 , x7 , [sp, #32*9+16]
    add x2 , x22, #32 // y1 = a->y
    add x0 , sp , #32*3 // s2 = sp[32*3]
    bl ECP256_MulCore // s2 = y1 * hcub

    add x2 , sp , #32*6 // R = sp[32*6]
    add x0 , sp , #32*1 // res_y = sp[32*1]
    ldp x4 , x5 , [x0]
    ldp x6 , x7 , [x0, #16]
    bl ECP256_MulCore // res_y = res_y * R

    // x14~x17 hold res_y and x0 still points to res_y; no reassignment is needed.
    ldp x8 , x9 , [sp, #32*3] // s2 = sp[32*3]
    ldp x10, x11, [sp, #32*3+16]
    // x14~x17 - x8~x11
    bl ECP256_SubCore1 // res_y = res_y - s2


    // res_y is still in x14~x17 and need not be reloaded; the other four
    // blocks are read from memory. (Pipeline bubbles remain to be optimized.)
    // copy y
    ldp x4 , x5 , [x23, #32] // in2_y, b->y
    ldp x6 , x7 , [x23, #32+16]
    ldp x8 , x9 , [x22, #32] // in1_y, a->y
    ldp x10, x11, [x22, #32+16]

    // res_y = in1infty == 0 ? res_y : in2_y
    cmp x24, #0 // x24 = ~in1infty
    csel x14, x14, x4 , ne
    csel x15, x15, x5 , ne
    csel x16, x16, x6 , ne
    csel x17, x17, x7 , ne
    // res_y = in2infty == 0 ? res_y : in1_y
    cmp x25, #0 // x25 = ~in2infty
    csel x14, x14, x8 , ne
    csel x15, x15, x9 , ne
    csel x16, x16, x10, ne
    csel x17, x17, x11, ne

    stp x14, x15, [x21, #32]
    stp x16, x17, [x21, #32+16]

    // copy x
    ldp x4 , x5 , [x23] // in2_x, b->x
    ldp x6 , x7 , [x23, #16]
    ldp x8 , x9 , [x22] // in1_x, a->x
    ldp x10, x11, [x22, #16]
    ldp x14, x15, [sp] // res_x
    ldp x16, x17, [sp , #16]

    // res_x = in1infty == 0 ? res_x : in2_x
    cmp x24, #0 // x24 = ~in1infty
    csel x14, x14, x4 , ne
    csel x15, x15, x5 , ne
    csel x16, x16, x6 , ne
    csel x17, x17, x7 , ne
    // res_x = in2infty == 0 ? res_x : in1_x
    cmp x25, #0 // x25 = ~in2infty
    csel x14, x14, x8 , ne
    csel x15, x15, x9 , ne
    csel x16, x16, x10, ne
    csel x17, x17, x11, ne

    stp x14, x15, [x21]
    stp x16, x17, [x21, #16]

    // copy z
    adrp x23, .Lone_mont
    add x23, x23, :lo12:.Lone_mont
    ldp x4 , x5 , [x23] // one_mont, 1 * RR * R' = R (mod p)
    ldp x6 , x7 , [x23, #16]
    ldp x8 , x9 , [x22, #64] // in1_z, a->z
    ldp x10, x11, [x22, #64+16]
    ldp x14, x15, [sp , #64] // res_z
    ldp x16, x17, [sp , #64+16]

    // res_z = in1infty == 0 ? res_z : ONE
    cmp x24, #0 // x24 = ~in1infty
    csel x14, x14, x4 , ne
    csel x15, x15, x5 , ne
    csel x16, x16, x6 , ne
    csel x17, x17, x7 , ne
    // res_z = in2infty == 0 ? res_z : in1_z
    cmp x25, #0 // x25 = ~in2infty
    csel x14, x14, x8 , ne
    csel x15, x15, x9 , ne
    csel x16, x16, x10, ne
    csel x17, x17, x11, ne

    stp x14, x15, [x21, #64]
    stp x16, x17, [x21, #64+16]

    add sp , x29, #0
    ldp x19, x20, [x29, #16]
    ldp x21, x22, [x29, #32]
    ldp x23, x24, [x29, #48]
    ldp x25, x26, [x29, #64]
    ldp x29, x30, [sp], #80
AARCH64_AUTIASP
    ret
.size ECP256_AddAffine,.-ECP256_AddAffine
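
/*
 Illustrative sketch (not part of the build): the csel tail of
 ECP256_AddAffine in C. x24/x25 are csetm masks (all-ones when the
 corresponding input is NOT the point at infinity); the formula result is
 kept only when both inputs are finite, otherwise the other input (or
 one_mont for z) is copied, all without a secret-dependent branch.
 p256_masked_select is a hypothetical helper.

 #include <stdint.h>

 static void p256_masked_select(uint64_t r[4], const uint64_t res[4],
                                const uint64_t alt[4], uint64_t keep_res)
 {
     for (int i = 0; i < 4; i++)              // keep_res is 0 or all-ones
         r[i] = (res[i] & keep_res) | (alt[i] & ~keep_res);
 }

 Usage mirroring the y-coordinate copy above (in1fin = x24, in2fin = x25):
     p256_masked_select(ry, res_y, b_y, in1fin);  // a infinite -> take b
     p256_masked_select(ry, ry,    a_y, in2fin);  // b zero     -> take a
*/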

# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo
# Process:
# U1 = X1*Z2^2
# U2 = X2*Z1^2
# S1 = Y1*Z2^3
# S2 = Y2*Z1^3
# H = U2-U1
# r = S2-S1
# X3 = r^2-H^3-2*U1*H^2
# Y3 = r*(U1*H^2-X3)-S1*H^3
# Z3 = Z1*Z2*H
.globl ECP256_PointAdd
.type ECP256_PointAdd,%function
.align 5
ECP256_PointAdd:
AARCH64_PACIASP
    stp x29, x30, [sp, #-96]!
    add x29, sp , #0
    stp x19, x20, [sp, #16]
    stp x21, x22, [sp, #32]
    stp x23, x24, [sp, #48]
    stp x25, x26, [sp, #64]
    stp x27, x28, [sp, #80]
    sub sp , sp , #32*12 // Each coordinate is 256 bits (32 bytes);
                         // space for 12 coordinates is reserved.

    adrp x14, .Lpoly+8 // &p[1]
    add x14,x14,:lo12:.Lpoly+8
    ldr x12,[x14] // x12 = p[1]
    ldr x13,[x14,#16] // x13 = p[3]

    // Back up the input parameter pointers first:
    // x0 to x2 are used to pass parameters to the helper routines.
    mov x21, x0
    mov x22, x1
    mov x23, x2

    ldp x4 , x5 , [x22, #64] // z1
    ldp x6 , x7 , [x22, #64+16]
    // x24 = z1[0] | z1[1] | z1[2] | z1[3]
    orr x24, x4 , x5
    orr x24, x24, x6
    orr x24, x24, x7 // x24 = ~in1infty
    cmp x24, #0
    csetm x24, ne // x24 = (x24 != 0) ? -1 : 0, i.e. ~iszero(z1):
                  // is point a of the form (x, y, 0)?

    // x4~x7 already hold z1, so z1^2 can be computed right away.
    add x0 , sp , #0 // z1sqr = sp[32*0]
    bl ECP256_SqrCore // z1sqr = z1^2

    // x14~x17 hold the z1sqr result (a pipeline bubble remains to be optimized).
    mov x4 , x14
    mov x5 , x15
    mov x6 , x16
    mov x7 , x17
    add x2 , x22, #64 // x2 points to z1, i.e. a->z
    add x0 , sp , #32*1 // s2 = sp[32*1]
    bl ECP256_MulCore // s2 = z1^3

    // x14~x17 hold the s2 result. (A pipeline bubble remains to be optimized.)
    mov x4 , x14
    mov x5 , x15
    mov x6 , x16
    mov x7 , x17
    add x2 , x23, #32 // x2 points to y2, i.e. b->y
    // add x0 , sp , #32*1 // s2 = sp[32*1]: x0 already points to s2,
    // so it does not need to be set. (Pipeline bubbles remain to be optimized.)
    bl ECP256_MulCore // s2 = y2 * z1^3

    ldp x4 , x5 , [x23, #64] // z2
    ldp x6 , x7 , [x23, #64+16]
    // x25 = z2[0] | z2[1] | z2[2] | z2[3]
    orr x25, x4 , x5
    orr x25, x25, x6
    orr x25, x25, x7 // x25 = ~in2infty
    cmp x25, #0
    csetm x25, ne // x25 = (x25 != 0) ? -1 : 0, i.e. ~iszero(z2):
                  // is point b of the form (x, y, 0)?

    // x4~x7 happen to hold z2, so z2^2 can be computed directly.
    add x0 , sp , #32*2 // z2sqr = sp[32*2]
    bl ECP256_SqrCore // z2sqr = z2^2

    // x14~x17 hold the z2sqr result. (Pipeline bubbles remain to be optimized.)
    mov x4 , x14
    mov x5 , x15
    mov x6 , x16
    mov x7 , x17
    add x2 , x23, #64 // x2 points to z2, i.e. b->z
    add x0 , sp , #32*3 // s1 = sp[32*3]
    bl ECP256_MulCore // s1 = z2^3

    // x14~x17 hold the s1 result. (A pipeline bubble remains to be optimized.)
    mov x4 , x14
    mov x5 , x15
    mov x6 , x16
    mov x7 , x17
    add x2 , x22, #32 // x2 points to y1, i.e. a->y
    // add x0 , sp , #32*3 // s1 = sp[32*3]: x0 already points to s1,
    // so it does not need to be set. (Pipeline bubbles remain to be optimized.)
    bl ECP256_MulCore // s1 = y1 * z2^3


    // x14~x17 hold the s1 result, so ECP256_SubCore2 is selected.
    ldp x8 , x9 , [sp, #32*1] // s2 = sp[32*1]
    ldp x10, x11, [sp, #32*1+16]
    // x8~x11 - x14~x17
    add x0 , sp , #32*4 // R = sp[32*4]
    bl ECP256_SubCore2 // R = s2 - s1

    // x14~x17 hold the result of R; (R==0) <==> (s2 == s1)
    orr x26, x14, x15
    orr x26, x26, x16
    orr x26, x26, x17 // x26 = ~(s1 == s2); saves reading R back from memory later.

    ldp x4 , x5 , [x22] // Load x1, i.e. a->x
    ldp x6 , x7 , [x22, #16]
    add x2 , sp , #32*2 // z2sqr = sp[32*2]
    add x0 , sp , #32*5 // u1 = sp[32*5]
    bl ECP256_MulCore // u1 = x1 * z2sqr

    ldp x4 , x5 , [x23] // Load x2, i.e. b->x
    ldp x6 , x7 , [x23, #16]
    add x2 , sp , #0 // z1sqr = sp[32*0]
    add x0 , sp , #32*6 // u2 = sp[32*6]
    bl ECP256_MulCore // u2 = x2 * z1sqr

    // x14~x17 hold the u2 result, so ECP256_SubCore1 is selected.
    ldp x8 , x9 , [sp, #32*5] // u1 = sp[32*5]
    ldp x10, x11, [sp, #32*5+16]
    // x14~x17 - x8~x11
    add x0 , sp , #32*7 // h = sp[32*7]
    bl ECP256_SubCore1 // h = u2 - u1

    // x14~x17 hold the result of h; (h==0) <==> (u2 == u1)
    orr x14, x14, x15
    orr x14, x14, x16
    orr x14, x14, x17 // x14 = ~(u1 == u2), used for the test below;
                      // saves reading h back from memory.

    // x24 = ~in1infty
    // x25 = ~in2infty
    // x26 = ~(s1 == s2) = ~is_equal(S1, S2)
    // x14 = ~(u1 == u2) = ~is_equal(U1, U2)
    // Denote (is_equal(U1, U2) & ~in1infty & ~in2infty & is_equal(S1, S2)) as flag:
    // if (flag)
    // <==>
    // if ((~is_equal(U1, U2) | in1infty | in2infty | ~is_equal(S1, S2)) == 0)

    mvn x27, x24 // x27 = in1infty
    mvn x28, x25 // x28 = in2infty

    orr x14, x14, x26
    orr x14, x14, x27
    orr x14, x14, x28
    cbnz x14, .Lpoint_add // Not the a == b corner case: take the general
                          // addition path (same branch as the C implementation).

.Ladd_double:
    mov x0 , x21 // x0 = r
    mov x1 , x22 // x1 = a
    ldp x23, x24, [x29, #48] // The doubling code uses only x19~x22 and pops only
                             // x19~x22 at the end, so x23~x28 are popped here in advance.
    ldp x25, x26, [x29, #64]
    ldp x27, x28, [x29, #80]
    add sp , sp , #32*(12-4) // The two paths use different amounts of stack. The
                             // doubling code only releases 32*4 bytes, so the rest
                             // of the stack space is released here in advance.
    b .Lpoint_double

.align 4
.Lpoint_add:
    ldp x4 , x5 , [sp, #32*4] // R = sp[32*4]
    ldp x6 , x7 , [sp, #32*4+16]
    add x0 , sp , #0 // Rsqr = sp[0]: z1sqr is no longer needed,
                     // so its slot is reused for Rsqr.
    bl ECP256_SqrCore // Rsqr = R^2

    ldp x4 , x5 , [sp, #32*7] // h = sp[32*7]
    ldp x6 , x7 , [sp, #32*7+16]
    add x2 , x22, #64 // x2 points to z1
    add x0 , sp , #32*11 // res_z = sp[32*11]
    bl ECP256_MulCore // res_z = h * z1

    // x14~x17 hold the res_z result. (A pipeline bubble remains to be optimized.)
    mov x4 , x14
    mov x5 , x15
    mov x6 , x16
    mov x7 , x17
    add x2 , x23, #64 // x2 points to z2
    // x0 still points to res_z and does not need to be reassigned.
    // add x0 , sp , #32*11 // res_z = sp[32*11]
    bl ECP256_MulCore // res_z = res_z * z2

    ldp x4 , x5 , [sp, #32*7] // h = sp[32*7]
    ldp x6 , x7 , [sp, #32*7+16]
    add x0 , sp , #32*2 // hsqr = sp[32*2]: z2sqr is no longer needed,
                        // so its slot is reused for hsqr.
    bl ECP256_SqrCore

    // x14~x17 hold the hsqr result. (A pipeline bubble remains to be optimized.)
    mov x4 , x14
    mov x5 , x15
    mov x6 , x16
    mov x7 , x17
    add x2 , sp , #32*7 // h = sp[32*7]
    add x0 , sp , #32*8 // hcub = sp[32*8]
    bl ECP256_MulCore // hcub = hsqr * h

    ldp x4 , x5 , [sp, #32*5] // u1 = sp[32*5]
    ldp x6 , x7 , [sp, #32*5+16]
    add x2 , sp , #32*2 // hsqr = sp[32*2]
    add x0 , sp , #32*6 // u2 = sp[32*6]
    bl ECP256_MulCore // u2 = u1 * hsqr

    // x14~x17 hold the u2 result.
    mov x8 , x14
    mov x9 , x15
    mov x10, x16
    mov x11, x17
    // add x0 , sp , #32*2 // hsqr = sp[32*2]
    // x2 from the previous call still holds the address of hsqr.
    mov x0 , x2
    // x14~x17 + x8~x11
    bl ECP256_AddCore // hsqr = 2 * u2

    // x14~x17 hold the hsqr result, so ECP256_SubCore2 is called.
    ldp x8 , x9 , [sp] // Rsqr = sp[0]
    ldp x10, x11, [sp, #16]
    // x8~x11 - x14~x17
    add x0 , sp , #32*9 // res_x = sp[32*9]
    bl ECP256_SubCore2 // res_x = Rsqr - hsqr

    // x14~x17 hold the res_x result, so ECP256_SubCore1 is called.
    ldp x8 , x9 , [sp, #32*8] // hcub = sp[32*8]
    ldp x10, x11, [sp, #32*8+16]
    // x14~x17 - x8~x11
    // x0 from the previous call still holds the address of res_x.
    bl ECP256_SubCore1 // res_x = res_x - hcub

    // x14~x17 hold the res_x result, so ECP256_SubCore2 is called.
    ldp x8 , x9 , [sp, #32*6] // u2 = sp[32*6]
    ldp x10, x11, [sp, #32*6+16]
    // x8~x11 - x14~x17
    add x0 , sp , #32*10
    bl ECP256_SubCore2 // res_y = u2 - res_x

    ldp x4 , x5 , [sp, #32*3] // s1 = sp[32*3]
    ldp x6 , x7 , [sp, #32*3+16]
    add x2 , sp , #32*8 // hcub = sp[32*8]
    add x0 , sp , #32*1 // s2 = sp[32*1]
    bl ECP256_MulCore // s2 = s1 * hcub

    add x2 , sp , #32*4 // R = sp[32*4]
    add x0 , sp , #32*10 // res_y = sp[32*10]
    ldp x4 , x5 , [x0] // res_y = sp[32*10], loaded via x0 with no offset
    ldp x6 , x7 , [x0, #16]
    bl ECP256_MulCore // res_y = res_y * R

    // x14~x17 hold the res_y result, so ECP256_SubCore1 is called.
    ldp x8 , x9 , [sp, #32] // s2 = sp[32*1]
    ldp x10, x11, [sp, #32+16]
    // x14~x17 - x8~x11
    // x0 from the previous call still holds the address of res_y.
    bl ECP256_SubCore1 // res_y = res_y - s2

    // res_y is still in x14~x17 and need not be reloaded; the other four
    // blocks are read from memory. (Pipeline bubbles remain to be optimized.)
    // copy y
    ldp x4 , x5 , [x23, #32] // in2_y, b->y
    ldp x6 , x7 , [x23, #32+16]
    ldp x8 , x9 , [x22, #32] // in1_y, a->y
    ldp x10, x11, [x22, #32+16]

    // res_y = in1infty == 0 ? res_y : in2_y
    cmp x24, #0 // x24 = ~in1infty
    csel x14, x14, x4 , ne
    csel x15, x15, x5 , ne
    csel x16, x16, x6 , ne
    csel x17, x17, x7 , ne
    // res_y = in2infty == 0 ? res_y : in1_y
    cmp x25, #0 // x25 = ~in2infty
    csel x14, x14, x8 , ne
    csel x15, x15, x9 , ne
    csel x16, x16, x10, ne
    csel x17, x17, x11, ne

    stp x14, x15, [x21, #32]
    stp x16, x17, [x21, #32+16]

    // copy x
    ldp x4 , x5 , [x23] // in2_x, b->x
    ldp x6 , x7 , [x23, #16]
    ldp x8 , x9 , [x22] // in1_x, a->x
    ldp x10, x11, [x22, #16]
    ldp x14, x15, [sp , #32*9] // res_x
    ldp x16, x17, [sp , #32*9+16]

    // res_x = in1infty == 0 ? res_x : in2_x
    cmp x24, #0 // x24 = ~in1infty
    csel x14, x14, x4 , ne
    csel x15, x15, x5 , ne
    csel x16, x16, x6 , ne
    csel x17, x17, x7 , ne
    // res_x = in2infty == 0 ? res_x : in1_x
    cmp x25, #0 // x25 = ~in2infty
    csel x14, x14, x8 , ne
    csel x15, x15, x9 , ne
    csel x16, x16, x10, ne
    csel x17, x17, x11, ne

    stp x14, x15, [x21]
    stp x16, x17, [x21, #16]

    // copy z
    ldp x4 , x5 , [x23, #64] // in2_z, b->z
    ldp x6 , x7 , [x23, #64+16]
    ldp x8 , x9 , [x22, #64] // in1_z, a->z
    ldp x10, x11, [x22, #64+16]
    ldp x14, x15, [sp , #32*11] // res_z
    ldp x16, x17, [sp , #32*11+16]

    // res_z = in1infty == 0 ? res_z : in2_z
    cmp x24, #0 // x24 = ~in1infty
    csel x14, x14, x4 , ne
    csel x15, x15, x5 , ne
    csel x16, x16, x6 , ne
    csel x17, x17, x7 , ne
    // res_z = in2infty == 0 ? res_z : in1_z
    cmp x25, #0 // x25 = ~in2infty
    csel x14, x14, x8 , ne
    csel x15, x15, x9 , ne
    csel x16, x16, x10, ne
    csel x17, x17, x11, ne

    stp x14, x15, [x21, #64]
    stp x16, x17, [x21, #64+16]

    add sp , x29, #0 // Free the temporary variables
    ldp x19, x20, [x29, #16]
    ldp x21, x22, [x29, #32]
    ldp x23, x24, [x29, #48]
    ldp x25, x26, [x29, #64]
    ldp x27, x28, [x29, #80]
    ldp x29, x30, [sp], #96
AARCH64_AUTIASP
    ret
.size ECP256_PointAdd,.-ECP256_PointAdd
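
/*
 Illustrative sketch (not part of the build): the corner-case test of
 ECP256_PointAdd in C. The add-1998-cmo formulas degenerate when both
 inputs are the same finite point (U1 == U2 and S1 == S2, i.e. H == 0 and
 R == 0), so that case is redirected to .Lpoint_double.
 p256_is_doubling_case is a hypothetical name.

 #include <stdint.h>

 static int p256_is_doubling_case(const uint64_t h[4], const uint64_t r[4],
                                  uint64_t in1infty, uint64_t in2infty)
 {
     // in1infty/in2infty are all-ones masks when the input is infinite
     uint64_t t = (h[0] | h[1] | h[2] | h[3]) |   // U1 != U2 ?
                  (r[0] | r[1] | r[2] | r[3]) |   // S1 != S2 ?
                  in1infty | in2infty;
     return t == 0;       // cbnz takes the general path when t != 0
 }
*/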

# ref. https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
# Process:
# delta = Z1^2
# gamma = Y1^2
# beta = X1*gamma
# alpha = 3*(X1-delta)*(X1+delta)
# X3 = alpha^2-8*beta
# Z3 = (Y1+Z1)^2-gamma-delta
# Y3 = alpha*(4*beta-X3)-8*gamma^2
.globl ECP256_PointDouble
.type ECP256_PointDouble,%function
.align 5
ECP256_PointDouble:
AARCH64_PACIASP
    stp x29, x30, [sp, #-96]! // A 96-byte frame is reserved even though only x19~x22
                              // are saved (48 bytes would suffice on their own):
    add x29, sp , #0          // ECP256_PointAdd branches to .Lpoint_double, and the
                              // shared epilogue below unwinds the same 96-byte frame.
    stp x19, x20, [sp, #16]
    stp x21, x22, [sp, #32]
    sub sp , sp , #32*4

.Lpoint_double:
    ldp x14, x15, [x1 , #32] // a->y
    ldp x16, x17, [x1 , #32+16]
    // Back up the input parameter pointers r and a.
    mov x21, x0
    mov x22, x1
    adrp x3, .Lpoly+8 // &p[1]
    add x3,x3,:lo12:.Lpoly+8
    ldr x12,[x3] // x12 = p[1]
    ldr x13,[x3,#16] // x13 = p[3]

    mov x8 , x14
    mov x9 , x15
    ldp x4 , x5 , [x22, #64] // a->z
    mov x10, x16
    mov x11, x17
    ldp x6 , x7 , [x22, #64+16]
    add x0 , sp , #0 // s = sp[0]
    bl ECP256_AddCore // s = 2 * a->y

    add x0 , sp , #32 // zsqr = sp[32*1]
    bl ECP256_SqrCore // zsqr = (a->z)^2

    // x14~x17 now hold zsqr, so s is reloaded from memory and squared.
    ldp x4 , x5 , [sp]
    add x0 , sp , #0 // s = sp[0]
    ldp x6 , x7 , [sp, #16]
    bl ECP256_SqrCore // s = s^2

    ldp x14, x15, [sp, #32] // zsqr
    ldp x8 , x9 , [x22] // a->x
    ldp x16, x17, [sp, #32+16]
    add x0 , sp , #64 // m = sp[32*2]
    ldp x10, x11, [x22, #16]
    bl ECP256_AddCore // m = a->x + zsqr

    ldp x14 , x15 , [sp, #32] // zsqr
    ldp x8 , x9 , [x22] // a->x
    ldp x16, x17, [sp, #32+16]
    add x0 , sp , #32
    ldp x10, x11, [x22, #16]
    bl ECP256_SubCore2 // zsqr = a->x - zsqr

    ldp x4 , x5 , [x22, #64] // a->z
    ldp x6 , x7 , [x22, #64+16]
    add x0 , x21, #64 // res_z
    add x2 , x22, #32 // a->y
    bl ECP256_MulCore // res_z = a->z * a->y

    // x14~x17 hold the res_z result.
    mov x8 , x14
    mov x9 , x15
    ldp x4 , x5 , [sp] // s = sp[0], preloaded for the SqrCore below
    mov x10, x16
    mov x11, x17
    ldp x6 , x7 , [sp, #16]
    add x0 , x21, #64 // res_z
    bl ECP256_AddCore // res_z = 2 * res_z

    add x0 , x21, #32 // x0 points to res_y, i.e. r->y
    bl ECP256_SqrCore // res_y = s^2

    add x0 , x21, #32
    bl ECP256_DivBy2Core // res_y = res_y / 2

    ldp x4 , x5 , [sp, #64] // m = sp[64]
    ldp x6 , x7 , [sp, #64+16]
    add x2 , sp , #32 // zsqr = sp[32]
    add x0 , sp , #64
    bl ECP256_MulCore // m = m * zsqr

    // x14~x17 hold the result of m; keep an extra copy in x4~x7.
    mov x8 , x14
    mov x4 , x14
    mov x9 , x15
    mov x5 , x15
    mov x10, x16
    mov x6 , x16
    mov x11, x17
    mov x7 , x17
    add x0 , sp , #64
    bl ECP256_AddCore // m = 2 * m
    mov x8 , x4
    mov x9 , x5
    mov x10, x6
    mov x11, x7
    bl ECP256_AddCore // m = 3 * m

    ldp x4 , x5 , [sp] // s = sp[0]
    add x2 , x22, #0 // a->x
    ldp x6 , x7 , [sp, #16]
    add x0 , sp , #0 // s = sp[0]
    bl ECP256_MulCore // s = s * a->x

    // x14~x17 hold the result of s
    mov x8 , x14
    mov x9 , x15
    ldp x4 , x5 , [sp, #64] // m, preloaded for the SqrCore below
    mov x10, x16
    mov x11, x17
    ldp x6 , x7 , [sp, #64+16]
    add x0 , sp , #96 // tmp = sp[96]
    bl ECP256_AddCore // tmp = 2 * s

    // x4~x7 hold m (preloaded above).
    add x0 , x21, #0 // res_x = r->x
    bl ECP256_SqrCore // res_x = m^2

    // x14~x17 hold the res_x result.
    ldp x8 , x9 , [sp, #96] // tmp = sp[96]
    ldp x10, x11, [sp, #96+16]
    bl ECP256_SubCore1 // res_x = res_x - tmp

    // x14~x17 hold the res_x result, so ECP256_SubCore2 is called.
    ldp x8 , x9 , [sp] // s = sp[0]
    ldp x10, x11, [sp, #16]
    add x0 , sp , #0
    bl ECP256_SubCore2 // s = s - res_x

    // x14~x17 hold the result of s; x0 still points to s.
    mov x4 , x14
    mov x5 , x15
    mov x6 , x16
    mov x7 , x17
    add x2 , sp , #64 // m = sp[64]
    bl ECP256_MulCore // s = s * m

    // x14~x17 hold the result of s, so ECP256_SubCore1 is called.
    add x0 , x21, #32 // x0 points to res_y
    ldp x8 , x9 , [x0] // res_y
    ldp x10, x11, [x0, #16]
    bl ECP256_SubCore1 // res_y = s - res_y

    add sp , x29, #0 // Free the temporary variables
    ldp x19, x20, [x29, #16]
    ldp x21, x22, [x29, #32]
    ldp x29, x30, [sp], #96
AARCH64_AUTIASP
    ret
.size ECP256_PointDouble,.-ECP256_PointDouble
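
/*
 Illustrative sketch (not part of the build): the dbl-2001-b schedule of
 ECP256_PointDouble in C, in the same operation order as the code above.
 P256Point and the fe_* helpers are hypothetical stand-ins for the *Core
 routines (all mod p, Montgomery domain, in-place aliasing allowed).

 #include <stdint.h>

 typedef struct { uint64_t x[4], y[4], z[4]; } P256Point;

 extern void fe_add(uint64_t *r, const uint64_t *a, const uint64_t *b);
 extern void fe_sub(uint64_t *r, const uint64_t *a, const uint64_t *b);
 extern void fe_mul(uint64_t *r, const uint64_t *a, const uint64_t *b);
 extern void fe_sqr(uint64_t *r, const uint64_t *a);
 extern void fe_div_by_2(uint64_t *r, const uint64_t *a);

 static void p256_point_double_model(P256Point *r, const P256Point *a)
 {
     uint64_t s[4], zsqr[4], m[4], tmp[4];

     fe_add(s, a->y, a->y);        // s     = 2*y1
     fe_sqr(zsqr, a->z);           // zsqr  = z1^2                 (delta)
     fe_sqr(s, s);                 // s     = 4*y1^2               (4*gamma)
     fe_add(m, a->x, zsqr);        // m     = x1 + delta
     fe_sub(zsqr, a->x, zsqr);     // zsqr  = x1 - delta
     fe_mul(r->z, a->z, a->y);     // res_z = z1*y1
     fe_add(r->z, r->z, r->z);     // res_z = 2*z1*y1 = (y1+z1)^2 - gamma - delta
     fe_sqr(r->y, s);              // res_y = 16*gamma^2
     fe_div_by_2(r->y, r->y);      // res_y = 8*gamma^2
     fe_mul(m, m, zsqr);           // m     = (x1+delta)*(x1-delta)
     fe_add(tmp, m, m);            // 2*m ...
     fe_add(m, tmp, m);            // m     = 3*(x1+delta)*(x1-delta)  (alpha)
     fe_mul(s, s, a->x);           // s     = 4*gamma*x1              (4*beta)
     fe_add(tmp, s, s);            // tmp   = 8*beta
     fe_sqr(r->x, m);              // res_x = alpha^2
     fe_sub(r->x, r->x, tmp);      // res_x = alpha^2 - 8*beta
     fe_sub(s, s, r->x);           // s     = 4*beta - res_x
     fe_mul(s, s, m);              // s     = alpha*(4*beta - res_x)
     fe_sub(r->y, s, r->y);        // res_y = s - 8*gamma^2
 }
*/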

.globl ECP256_OrdMul
.type ECP256_OrdMul,%function
.align 4
ECP256_OrdMul:
AARCH64_PACIASP
    stp x29, x30, [sp, #-64]!
    add x29, sp , #0
    stp x19, x20, [sp, #16]
    stp x21, x22, [sp, #32]
    stp x23, x24, [sp, #48]

    adrp x23, .Lord
    add x23, x23, :lo12:.Lord // x23 = &n
    ldp x12, x13, [x23] // n[0~3]
    ldp x21, x22, [x23, #16]
    ldr x23, [x23, #32] // x23 = LordK

    ldp x4 , x5 , [x1] // a[0~3]
    ldp x6 , x7 , [x1, #16]

    ldr x3 , [x2] // x3 = b[0]

    mul x14, x4 , x3 // a[0~3] * b[0]
    umulh x8 , x4 , x3
    mul x15, x5 , x3
    umulh x9 , x5 , x3
    mul x16, x6 , x3
    umulh x10, x6 , x3
    mul x17, x7 , x3
    umulh x11, x7 , x3

    // a[0~3] * b[0] is accumulated into x14~x17, x19, x20.
    adds x15, x15, x8
    adcs x16, x16, x9
    adcs x17, x17, x10
    adc x19, xzr, x11 // x19 = x11 + CF
    mov x20, xzr

    // reduce
    // n[0] = 0xf3b9cac2fc632551
    // n[1] = 0xbce6faada7179e84
    // n[2] = 0xffffffffffffffff, lo(q*n[2]) = -q , hi(q*n[2]) = q
    // n[3] = 0xffffffff00000000, lo(q*n[3]) = -lq<<32, hi(q*n[3]) = q-hq
    mul x24, x14, x23 // x24 = lo(acc[0]*ordk), the quotient q of
                      // Montgomery modular multiplication

    lsl x8 , x24, #32 // lq << 32
    lsr x9 , x24, #32 // hq
    subs x16, x16, x24 // x16 = x16 + lo(q*n[2]) = x16 - q
    sbcs x17, x17, x8 // x17 = x17 + lo(q*n[3]) = x17 - lq<<32
    sbcs x19, x19, x9 // x19 = x19 - hq; q-hq should have been added, so hq is
                      // subtracted here and q is added below.
    sbc x20, x20, xzr // x20 = x20 - CF

    umulh x9 , x12, x24 // hi(q*n[0])
    mul x10, x13, x24 // lo(q*n[1])
    umulh x11, x13, x24 // hi(q*n[1])

    // First combine the values headed for the same accumulator word.
    subs xzr, x14, #1 // CF = (x14 != 0): the carry of x14 + lo(q*n[0]),
                      // since lo(q*n[0]) = -x14 mod 2^64.
    adcs x10, x10, x9 // x10 = hi(q*n[0]) + lo(q*n[1])
    adc x11, x11, xzr // x11 = hi(q*n[1]) + CF

    // S /= r, accumulation of staggered words
    adds x14, x15, x10 // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs x15, x16, x11 // x15 = x16 + hi(q*n[1])
    adcs x16, x17, x24 // x16 = x17 + hi(q*n[2])
    adcs x17, x19, x24 // x17 = x19 + q; hq was already subtracted from x19
                       // above, so this completes hi(q*n[3])
    adc x19, x20, xzr // x19 = x20 + CF

    ldr x3 , [x2, #8] // b[1]

    mul x8 , x4 , x3
    mul x9 , x5 , x3
    mul x10, x6 , x3
    mul x11, x7 , x3

    // add lo(a[0~3] * b[i])
    adds x14, x14, x8
    adcs x15, x15, x9
    adcs x16, x16, x10
    adcs x17, x17, x11
    adc x19, x19, xzr

    umulh x8 , x4 , x3
    umulh x9 , x5 , x3
    umulh x10, x6 , x3
    umulh x11, x7 , x3
    // add hi(a[0~3] * b[i])
    adds x15, x15, x8
    adcs x16, x16, x9
    adcs x17, x17, x10
    adcs x19, x19, x11
    adc x20, xzr, xzr

    // reduce
    mul x24, x14, x23 // x24 = lo(acc[0]*ordk), the quotient q

    lsl x8 , x24, #32
    lsr x9 , x24, #32
    subs x16, x16, x24
    sbcs x17, x17, x8
    sbcs x19, x19, x9
    sbc x20, x20, xzr

    umulh x9 , x12, x24 // hi(q*n[0])
    mul x10, x13, x24 // lo(q*n[1])
    umulh x11, x13, x24 // hi(q*n[1])

    subs xzr, x14, #1
    adcs x10, x10, x9
    adc x11, x11, xzr

    // S /= r, accumulation of staggered words
    adds x14, x15, x10 // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs x15, x16, x11 // x15 = x16 + hi(q*n[1])
    adcs x16, x17, x24 // x16 = x17 + hi(q*n[2])
    adcs x17, x19, x24 // x17 = x19 + q (completing hi(q*n[3]), see above)
    adc x19, x20, xzr // x19 = x20 + CF

    ldr x3 , [x2, #16] // b[2]

    mul x8 , x4 , x3
    mul x9 , x5 , x3
    mul x10, x6 , x3
    mul x11, x7 , x3

    // add lo(a[0~3] * b[i])
    adds x14, x14, x8
    adcs x15, x15, x9
    adcs x16, x16, x10
    adcs x17, x17, x11
    adc x19, x19, xzr

    umulh x8 , x4 , x3
    umulh x9 , x5 , x3
    umulh x10, x6 , x3
    umulh x11, x7 , x3
    // add hi(a[0~3] * b[i])
    adds x15, x15, x8
    adcs x16, x16, x9
    adcs x17, x17, x10
    adcs x19, x19, x11
    adc x20, xzr, xzr

    // reduce
    mul x24, x14, x23 // x24 = lo(acc[0]*ordk), the quotient q of
                      // Montgomery modular multiplication
    lsl x8 , x24, #32
    lsr x9 , x24, #32
    subs x16, x16, x24
    sbcs x17, x17, x8
    sbcs x19, x19, x9
    sbc x20, x20, xzr

    umulh x9 , x12, x24 // hi(q*n[0])
    mul x10, x13, x24 // lo(q*n[1])
    umulh x11, x13, x24 // hi(q*n[1])

    subs xzr, x14, #1
    adcs x10, x10, x9
    adc x11, x11, xzr

    // S /= r, accumulation of staggered words
    adds x14, x15, x10 // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs x15, x16, x11 // x15 = x16 + hi(q*n[1])
    adcs x16, x17, x24 // x16 = x17 + hi(q*n[2])
    adcs x17, x19, x24 // x17 = x19 + q (completing hi(q*n[3]), see above)
    adc x19, x20, xzr // x19 = x20 + CF

    ldr x3 , [x2, #24] // b[3]

    mul x8 , x4 , x3
    mul x9 , x5 , x3
    mul x10, x6 , x3
    mul x11, x7 , x3

    // add lo(a[0~3] * b[i])
    adds x14, x14, x8
    adcs x15, x15, x9
    adcs x16, x16, x10
    adcs x17, x17, x11
    adc x19, x19, xzr

    umulh x8 , x4 , x3
    umulh x9 , x5 , x3
    umulh x10, x6 , x3
    umulh x11, x7 , x3
    // add hi(a[0~3] * b[i])
    adds x15, x15, x8
    adcs x16, x16, x9
    adcs x17, x17, x10
    adcs x19, x19, x11
    adc x20, xzr, xzr

    // reduce
    mul x24, x14, x23 // x24 = lo(acc[0]*ordk), the quotient q

    lsl x8 , x24, #32
    lsr x9 , x24, #32
    subs x16, x16, x24
    sbcs x17, x17, x8
    sbcs x19, x19, x9
    sbc x20, x20, xzr

    umulh x9 , x12, x24 // hi(q*n[0])
    mul x10, x13, x24 // lo(q*n[1])
    umulh x11, x13, x24 // hi(q*n[1])

    subs xzr, x14, #1
    adcs x10, x10, x9
    adc x11, x11, xzr

    // S /= r, accumulation of staggered words
    adds x14, x15, x10 // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs x15, x16, x11 // x15 = x16 + hi(q*n[1])
    adcs x16, x17, x24 // x16 = x17 + hi(q*n[2])
    adcs x17, x19, x24 // x17 = x19 + q (completing hi(q*n[3]), see above)
    adc x19, x20, xzr // x19 = x20 + CF

    // At this point x14~x17 and x19 hold the result.
    // x8~x11 = r - n
    subs x8 , x14, x12
    sbcs x9 , x15, x13
    sbcs x10, x16, x21
    sbcs x11, x17, x22
    sbcs xzr, x19, xzr // The final carry flag indicates whether r-n borrows.

    // r = r-n < 0 ? r : r-n
    csel x14, x14, x8 , lo
    csel x15, x15, x9 , lo
    csel x16, x16, x10, lo
    csel x17, x17, x11, lo

    stp x14, x15, [x0] // Write the result to r
    stp x16, x17, [x0, #16]

    ldp x19, x20, [sp, #16]
    ldp x21, x22, [sp, #32]
    ldp x23, x24, [sp, #48]
    ldr x29, [sp], #64
AARCH64_AUTIASP
    ret
.size ECP256_OrdMul,.-ECP256_OrdMul
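
/*
 Illustrative sketch (not part of the build): one reduction round of
 ECP256_OrdMul in C. Unlike the field case, n[0] is not 2^64 - 1, so the
 quotient is q = acc[0] * ordK mod 2^64 with ordK = -n[0]^(-1) mod 2^64;
 then acc = (acc + q*n) / 2^64 and the low word vanishes because
 lo(q*n[0]) = -acc[0] mod 2^64 (the carry that "subs xzr, x14, #1"
 recovers without computing the product).
 p256_ord_reduce_round_model is a hypothetical name.

 #include <stdint.h>

 static const uint64_t P256_N[4] = {
     0xf3b9cac2fc632551ULL, 0xbce6faada7179e84ULL,
     0xffffffffffffffffULL, 0xffffffff00000000ULL
 };
 static const uint64_t P256_ORDK = 0xccd1c8aaee00bc4fULL;

 static void p256_ord_reduce_round_model(uint64_t acc[6])
 {
     uint64_t q = acc[0] * P256_ORDK;     // quotient, mod 2^64
     unsigned __int128 s = acc[0] + (unsigned __int128)q * P256_N[0]; // low word 0

     for (int i = 1; i < 4; i++) {
         s >>= 64;
         s += acc[i] + (unsigned __int128)q * P256_N[i];
         acc[i - 1] = (uint64_t)s;
     }
     s >>= 64;
     s += acc[4];
     acc[3] = (uint64_t)s;
     acc[4] = acc[5] + (uint64_t)(s >> 64);
     acc[5] = 0;
 }
*/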

.globl ECP256_OrdSqr
.type ECP256_OrdSqr,%function
.align 4
ECP256_OrdSqr:
AARCH64_PACIASP
    stp x29, x30, [sp, #-64]!
    add x29, sp , #0
    stp x19, x20, [sp, #16]
    stp x21, x22, [sp, #32]
    stp x23, x24, [sp, #48]

    adrp x23, .Lord
    add x23, x23, :lo12:.Lord // x23 = &n
    ldp x12, x13, [x23] // n[0~3]
    ldp x21, x22, [x23, #16]
    ldr x23, [x23, #32] // x23 = LordK

    ldp x4 , x5 , [x1] // x4~x7 = a[0~3]
    ldp x6 , x7 , [x1, #16]

.align 4
.Lord_sqr:
    sub x2 , x2 , #1
/******************************************
    x7    x1   x20   x19   x17   x16   x15   x14
                                h0*1  l0*1
                          h0*2  l0*2
                    h0*3  l0*3
                    h1*2  l1*2
              h1*3  l1*3
        h2*3  l2*3
  h3*3  l3*3  h2*2  l2*2  h1*1  l1*1  h0*0  l0*0
*******************************************/

    // a[1~3] * a[0]
    mul x15, x5 , x4 // lo(a[1] * a[0])
    umulh x8 , x5 , x4 // hi(a[1] * a[0])
    mul x16, x6 , x4 // lo(a[2] * a[0])
    umulh x9 , x6 , x4 // hi(a[2] * a[0])
    mul x17, x7 , x4 // lo(a[3] * a[0])
    umulh x19, x7 , x4 // hi(a[3] * a[0])

    adds x16, x16, x8
    adcs x17, x17, x9
    adc x19, x19, xzr // No more carry is possible.

    // a[2~3] * a[1]
    mul x8 , x6 , x5 // lo(a[2] * a[1])
    umulh x9 , x6 , x5 // hi(a[2] * a[1])
    mul x10, x7 , x5 // lo(a[3] * a[1])
    umulh x11, x7 , x5 // hi(a[3] * a[1])

    // a[3] * a[2]
    mul x20, x7 , x6
    umulh x1 , x7 , x6

    // Combine the products of this round, then add them into the
    // accumulator (the registers in the diagram above).
    adds x9 , x10, x9
    adc x10, x11, xzr

    adds x17, x17, x8
    adcs x19, x19, x9
    adcs x20, x20, x10
    adc x1 , x1 , xzr // a[0~3] is still held in x4~x7.

    // a[0~3] ^ 2
    mul x14, x4 , x4
    umulh x4 , x4 , x4
    mul x8 , x5 , x5
    umulh x5 , x5 , x5
    mul x9 , x6 , x6
    umulh x6 , x6 , x6
    mul x10, x7 , x7
    umulh x7 , x7 , x7

    // acc[1~6] << 1
    adds x15, x15, x15
    adcs x16, x16, x16
    adcs x17, x17, x17
    adcs x19, x19, x19
    adcs x20, x20, x20
    adcs x1 , x1 , x1
    adc x3 , xzr, xzr

    // acc[] += a[] ^ 2
    adds x15, x15, x4
    adcs x16, x16, x8
    adcs x17, x17, x5
    adcs x19, x19, x9
    adcs x20, x20, x6
    adcs x1 , x1 , x10
    adc x7 , x7 , x3

    // Four rounds of reduction.
    mul x24, x14, x23 // x24 = lo(acc[0]*ordk), the quotient q of
                      // Montgomery modular multiplication

    umulh x9 , x12, x24 // hi(q*n[0])
    mul x10, x13, x24 // lo(q*n[1])
    umulh x11, x13, x24 // hi(q*n[1])

    // First combine the values headed for the same accumulator word.
    subs xzr, x14, #1 // CF = (x14 != 0): the carry of x14 + lo(q*n[0])
    adcs x10, x10, x9 // x10 = hi(q*n[0]) + lo(q*n[1])
    adc x11, x11, xzr // x11 = hi(q*n[1]) + CF

    // S /= r, accumulation of staggered words
    adds x14, x15, x10 // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs x15, x16, x11 // x15 = x16 + hi(q*n[1])
    adcs x16, x17, x24 // x16 = x17 + hi(q*n[2])
    adc x17, xzr, x24 // x17 = q + CF; hi(q*n[3]) = q-hq should be added,
                      // and hq is subtracted below.

    lsl x8 , x24, #32 // lq << 32
    lsr x9 , x24, #32 // hq

    subs x15, x15, x24 // x15 = x15 + lo(q*n[2]) = x15 - q
    sbcs x16, x16, x8 // x16 = x16 + lo(q*n[3]) = x16 - lq<<32
    sbc x17, x17, x9 // x17 = x17 - hq (the remaining part of hi(q*n[3]))

    // Round 2 of reduction
    mul x24, x14, x23 // x24 = lo(acc[0]*ordk), the quotient q

    umulh x9 , x12, x24 // hi(q*n[0])
    mul x10, x13, x24 // lo(q*n[1])
    umulh x11, x13, x24 // hi(q*n[1])

    // First combine the values headed for the same accumulator word.
    subs xzr, x14, #1 // CF = (x14 != 0): the carry of x14 + lo(q*n[0])
    adcs x10, x10, x9 // x10 = hi(q*n[0]) + lo(q*n[1])
    adc x11, x11, xzr // x11 = hi(q*n[1]) + CF

    // S /= r, accumulation of staggered words
    adds x14, x15, x10 // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs x15, x16, x11 // x15 = x16 + hi(q*n[1])
    adcs x16, x17, x24 // x16 = x17 + hi(q*n[2])
    adc x17, xzr, x24 // x17 = q + CF; hi(q*n[3]) = q-hq should be added,
                      // and hq is subtracted below.

    lsl x8 , x24, #32 // lq << 32
    lsr x9 , x24, #32 // hq

    subs x15, x15, x24 // x15 = x15 + lo(q*n[2]) = x15 - q
    sbcs x16, x16, x8 // x16 = x16 + lo(q*n[3]) = x16 - lq<<32
    sbc x17, x17, x9 // x17 = x17 - hq (the remaining part of hi(q*n[3]))

    // Round 3 of reduction
    mul x24, x14, x23 // x24 = lo(acc[0]*ordk), the quotient q

    umulh x9 , x12, x24 // hi(q*n[0])
    mul x10, x13, x24 // lo(q*n[1])
    umulh x11, x13, x24 // hi(q*n[1])

    // First combine the values headed for the same accumulator word.
    subs xzr, x14, #1 // CF = (x14 != 0): the carry of x14 + lo(q*n[0])
    adcs x10, x10, x9 // x10 = hi(q*n[0]) + lo(q*n[1])
    adc x11, x11, xzr // x11 = hi(q*n[1]) + CF

    // S /= r, accumulation of staggered words
    adds x14, x15, x10 // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs x15, x16, x11 // x15 = x16 + hi(q*n[1])
    adcs x16, x17, x24 // x16 = x17 + hi(q*n[2])
    adc x17, xzr, x24 // x17 = q + CF; hi(q*n[3]) = q-hq should be added,
                      // and hq is subtracted below.

    lsl x8 , x24, #32 // lq << 32
    lsr x9 , x24, #32 // hq

    subs x15, x15, x24 // x15 = x15 + lo(q*n[2]) = x15 - q
    sbcs x16, x16, x8 // x16 = x16 + lo(q*n[3]) = x16 - lq<<32
    sbc x17, x17, x9 // x17 = x17 - hq (the remaining part of hi(q*n[3]))

    // Round 4 of reduction
    mul x24, x14, x23 // x24 = lo(acc[0]*ordk), the quotient q

    umulh x9 , x12, x24 // hi(q*n[0])
    mul x10, x13, x24 // lo(q*n[1])
    umulh x11, x13, x24 // hi(q*n[1])

    // First combine the values headed for the same accumulator word.
    subs xzr, x14, #1 // CF = (x14 != 0): the carry of x14 + lo(q*n[0])
    adcs x10, x10, x9 // x10 = hi(q*n[0]) + lo(q*n[1])
    adc x11, x11, xzr // x11 = hi(q*n[1]) + CF

    // S /= r, accumulation of staggered words
    adds x14, x15, x10 // x14 = x15 + hi(q*n[0]) + lo(q*n[1])
    adcs x15, x16, x11 // x15 = x16 + hi(q*n[1])
    adcs x16, x17, x24 // x16 = x17 + hi(q*n[2])
    adc x17, xzr, x24 // x17 = q + CF; hi(q*n[3]) = q-hq should be added,
                      // and hq is subtracted below.

    lsl x8 , x24, #32 // lq << 32
    lsr x9 , x24, #32 // hq

    subs x15, x15, x24 // x15 = x15 + lo(q*n[2]) = x15 - q
    sbcs x16, x16, x8 // x16 = x16 + lo(q*n[3]) = x16 - lq<<32
    sbc x17, x17, x9 // x17 = x17 - hq (the remaining part of hi(q*n[3]))

    // Add acc[4~7]; x14~x17, x19 hold the current result r.
    adds x14, x14, x19
    adcs x15, x15, x20
    adcs x16, x16, x1
    adcs x17, x17, x7
    adc x19, xzr, xzr

    // r -= n
    subs x8 , x14, x12
    sbcs x9 , x15, x13
    sbcs x10, x16, x21
    sbcs x11, x17, x22
    sbcs xzr, x19, xzr // The carry flag now encodes the relation between r and n.

    // r = r-n < 0 ? r : r-n
    // The result is kept in x4~x7 so the squaring can loop.
    csel x4 , x14, x8 , lo
    csel x5 , x15, x9 , lo
    csel x6 , x16, x10, lo
    csel x7 , x17, x11, lo

    cbnz x2 , .Lord_sqr // Repeat count: computes a^(2^rep)

    stp x4 , x5 , [x0]
    stp x6 , x7 , [x0, #16]

    ldp x19, x20, [sp, #16]
    ldp x21, x22, [sp, #32]
    ldp x23, x24, [sp, #48]
    ldr x29, [sp], #64
AARCH64_AUTIASP
    ret
.size ECP256_OrdSqr,.-ECP256_OrdSqr
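
/*
 Usage sketch (not part of the build): ECP256_OrdSqr takes a repetition
 count in x2 and loops .Lord_sqr without reloading the operand, i.e. it
 computes r = a^(2^rep) mod n in the Montgomery domain. ord_mont_sqr and
 ord_sqr_model are hypothetical names; rep >= 1, as in the assembly loop.

 #include <stdint.h>

 extern void ord_mont_sqr(uint64_t r[4], const uint64_t a[4]); // one .Lord_sqr pass

 static void ord_sqr_model(uint64_t r[4], const uint64_t a[4], uint64_t rep)
 {
     uint64_t t[4] = { a[0], a[1], a[2], a[3] };
     do {
         ord_mont_sqr(t, t);
     } while (--rep != 0);
     for (int i = 0; i < 4; i++)
         r[i] = t[i];
 }
*/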

.globl ECP256_Scatterw5
.type ECP256_Scatterw5,%function
.align 4
ECP256_Scatterw5:
AARCH64_PACIASP
    stp x29, x30, [sp, #-16]!
    add x29, sp , #0

    add x0 , x0 , x2 , lsl#3 // x0 = x0 + (x2-1)*8 = x0 + x2*8 - 8 is needed;
                             // the -8 is folded into the str offsets.

    ldp x4 , x5 , [x1] // x
    ldp x6 , x7 , [x1, #16]

    str x4 , [x0, #128*0-8] // 8*16*0
    str x5 , [x0, #128*1-8] // 8*16*1
    str x6 , [x0, #128*2-8] // 8*16*2
    str x7 , [x0, #128*3-8] // 8*16*3

    add x0 , x0 , #64*8

    ldp x4 , x5 , [x1, #32] // y
    ldp x6 , x7 , [x1, #32+16]

    str x4 , [x0, #128*0-8] // 8*16*0
    str x5 , [x0, #128*1-8] // 8*16*1
    str x6 , [x0, #128*2-8] // 8*16*2
    str x7 , [x0, #128*3-8] // 8*16*3

    add x0 , x0 , #64*8

    ldp x4 , x5 , [x1, #64] // z
    ldp x6 , x7 , [x1, #64+16]

    str x4 , [x0, #128*0-8] // 8*16*0
    str x5 , [x0, #128*1-8] // 8*16*1
    str x6 , [x0, #128*2-8] // 8*16*2
    str x7 , [x0, #128*3-8] // 8*16*3

    ldr x29, [sp], #16
AARCH64_AUTIASP
    ret
.size ECP256_Scatterw5,.-ECP256_Scatterw5
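
/*
 Illustrative sketch (not part of the build): the w = 5 table layout used
 by ECP256_Scatterw5/ECP256_Gatherw5. The table is limb-major: limb L
 (0..11, covering x, y, z) of entry idx (1..16) lives at byte offset
 L*128 + (idx-1)*8, which is what the -8 folded into the str offsets and
 the 64*8 row advances implement. p256_scatter_w5_model is a hypothetical
 name reusing the hypothetical P256Point type from the earlier sketch.

 #include <stdint.h>
 #include <string.h>

 static void p256_scatter_w5_model(uint64_t tbl[12 * 16],
                                   const P256Point *p, unsigned idx) // idx in 1..16
 {
     uint64_t src[12];
     memcpy(src, p->x, 32);
     memcpy(src + 4, p->y, 32);
     memcpy(src + 8, p->z, 32);
     for (unsigned limb = 0; limb < 12; limb++)
         tbl[limb * 16 + (idx - 1)] = src[limb];
 }

 ECP256_Gatherw5 below reads the same pattern back and, when the index is
 0, masks the result to zero (the point at infinity).
*/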

.globl ECP256_Gatherw5
.type ECP256_Gatherw5,%function
.align 4
ECP256_Gatherw5:
AARCH64_PACIASP
    stp x29, x30, [sp, #-16]!
    add x29, sp , #0

    cmp x2 , xzr
    csetm x3 , ne // x3 = (x2 == 0) ? 0 : -1
    add x2 , x2 , x3 // x2 += x3: if x2 != 0 then x2 = x2 - 1, otherwise x2 = 0

    add x1 , x1 , x2 , lsl#3 // x1 += x2*8, the entry offset

    ldr x4 , [x1, #128*0]
    ldr x5 , [x1, #128*1]
    ldr x6 , [x1, #128*2]
    ldr x7 , [x1, #128*3]

    csel x4 , x4 , xzr, ne // If x2 == 0, return 0; otherwise return the
                           // point coordinates that were read.
    csel x5 , x5 , xzr, ne
    csel x6 , x6 , xzr, ne
    csel x7 , x7 , xzr, ne

    stp x4 , x5 , [x0] // r->x
    stp x6 , x7 , [x0, #16]

    add x1 , x1 , #64*8

    ldr x4 , [x1, #128*0]
    ldr x5 , [x1, #128*1]
    ldr x6 , [x1, #128*2]
    ldr x7 , [x1, #128*3]

    csel x4 , x4 , xzr, ne // If x2 == 0, return 0; otherwise return the
                           // point coordinates that were read.
    csel x5 , x5 , xzr, ne
    csel x6 , x6 , xzr, ne
    csel x7 , x7 , xzr, ne

    stp x4 , x5 , [x0, #32] // r->y
    stp x6 , x7 , [x0, #48]

    add x1 , x1 , #64*8

    ldr x4 , [x1, #128*0]
    ldr x5 , [x1, #128*1]
    ldr x6 , [x1, #128*2]
    ldr x7 , [x1, #128*3]

    csel x4 , x4 , xzr, ne // If x2 == 0, return 0; otherwise return the
                           // point coordinates that were read.
    csel x5 , x5 , xzr, ne
    csel x6 , x6 , xzr, ne
    csel x7 , x7 , xzr, ne

    stp x4 , x5 , [x0, #64] // r->z
    stp x6 , x7 , [x0, #80]

    ldr x29, [sp], #16
AARCH64_AUTIASP
    ret
.size ECP256_Gatherw5,.-ECP256_Gatherw5

.globl ECP256_Scatterw7
.type ECP256_Scatterw7,%function
.align 4
ECP256_Scatterw7:
AARCH64_PACIASP
    stp x29, x30, [sp, #-16]!
    add x29, sp , #0

    // add x0 , x0 , x2
    sub x2 , x2 , #63 // x2 = x2 - 63
    sub x0 , x0 , x2 // x0 = x0 - (x2(original) - 63) = x0 + (63 - x2(original))
    mov x2 , #8 // Loop count
.Lscatter_w7:
    ldr x3 , [x1], #8
    subs x2 , x2 , #1

    strb w3 , [x0, #64*0]
    lsr x3 , x3 , #8
    strb w3 , [x0, #64*1]
    lsr x3 , x3 , #8
    strb w3 , [x0, #64*2]
    lsr x3 , x3 , #8
    strb w3 , [x0, #64*3]
    lsr x3 , x3 , #8
    strb w3 , [x0, #64*4]
    lsr x3 , x3 , #8
    strb w3 , [x0, #64*5]
    lsr x3 , x3 , #8
    strb w3 , [x0, #64*6]
    lsr x3 , x3 , #8
    strb w3 , [x0, #64*7]
    add x0 , x0 , #64*8
    b.ne .Lscatter_w7

    ldr x29, [sp], #16
AARCH64_AUTIASP
    ret
.size ECP256_Scatterw7,.-ECP256_Scatterw7

// Cache-attack-resistant gather, to be used together with ECP256_Scatterw7.
.globl ECP256_Gatherw7
.type ECP256_Gatherw7,%function
.align 4
ECP256_Gatherw7:
AARCH64_PACIASP
    stp x29, x30, [sp, #-16]!
    add x29, sp , #0

    cmp x2 , xzr
    csetm x3 , ne // x3 = (x2 == 0) ? 0 : -1
    add x2 , x2 , x3 // x2 = (x2 == 0) ? 0 : (x2 - 1), the entry index
    sub x2 , x2 , #63 // x2 = x2 - 63
    sub x1 , x1 , x2 // x1 = x1 - (x2(original) - 63) = x1 + (63 - x2(original))
    mov x2 , #8 // Loop count: x and y together are 64 bytes and 8 bytes are
                // assembled per iteration, so eight iterations are needed.
    nop
.Lgather_w7:

    ldrb w4 , [x1, #64*0]
    ldrb w5 , [x1, #64*1]
    ldrb w6 , [x1, #64*2]
    ldrb w7 , [x1, #64*3]
    ldrb w8 , [x1, #64*4]
    ldrb w9 , [x1, #64*5]
    ldrb w10, [x1, #64*6]
    ldrb w11, [x1, #64*7]

    // x4 = x4 | x5 << 8
    // x6 = x6 | x7 << 8
    // x4 = x4 | x6 << 16
    // x4 = [x7][x6][x5][x4]
    orr x4 , x4 , x5 , lsl#8
    orr x6 , x6 , x7 , lsl#8
    orr x4 , x4 , x6 , lsl#16

    // x8 = x8 | x9 << 8
    // x10 = x10 | x11 << 8
    // x8 = x8 | x10 << 16
    // x8 = [x11][x10][x9][x8]
    orr x8 , x8 , x9 , lsl#8
    orr x10, x10, x11, lsl#8
    orr x8 , x8 , x10, lsl#16

    // x4 = x4 | x8 << 32
    // x4 = [x11][x10][x9][x8][x7][x6][x5][x4]
    orr x4 , x4 , x8 , lsl#32

    and x4 , x4 , x3
    str x4 , [x0], #8

    add x1 , x1 , #64*8
    subs x2 , x2, #1
    b.ne .Lgather_w7

    ldr x29,[sp],#16
AARCH64_AUTIASP
    ret
.size ECP256_Gatherw7,.-ECP256_Gatherw7

#endif
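
/*
 Illustrative sketch (not part of the build): the byte-sliced w = 7 gather
 above. Byte m (0..63) of table entry e (0..63) is stored at offset
 64*m + (63 - e), so for every m the 64 candidate bytes share one 64-byte
 span: the set of cache lines touched does not depend on the secret index,
 which is the anti-cache-attack property. Index 0 again selects the point
 at infinity via the csetm mask. p256_gather_w7_model is a hypothetical
 name.

 #include <stdint.h>

 static void p256_gather_w7_model(uint64_t r[8], const uint8_t *tbl, unsigned idx)
 {
     uint64_t mask = idx ? ~(uint64_t)0 : 0;  // csetm x3, ne
     unsigned e = idx ? idx - 1 : 0;

     for (int i = 0; i < 8; i++)
         r[i] = 0;
     for (unsigned m = 0; m < 64; m++) {      // x and y: 64 bytes in total
         uint64_t b = tbl[64 * m + (63 - e)] & mask;
         r[m / 8] |= b << (8 * (m % 8));
     }
 }
*/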