/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#if defined(HITLS_CRYPTO_CHACHA20) && defined(HITLS_CRYPTO_CHACHA20POLY1305)

#include "crypt_arm.h"

.arch   armv8-a

/**
 * Byte offsets of the members of the C structure Poly1305Ctx.
 */

.set    CTX_acc, 0
.set    CTX_r, 24
.set    CTX_s, 40
.set    CTX_table, 56
.set    CTX_data, 200
.set    CTX_lastLen, 216
.set    CTX_flag, 220
.set    CTX_size, 224

/* Bit of CTX_flag that is set while CTX_acc holds five base 2^26 limbs instead of three base 2^64 words. */
.equ    FLAG_BASE2_26, 1

/*
 *  limb | out4          | out3       | out2          | out1       | out0
 *  bit  | 104           | 78         | 52            | 26         | 0
 *  in0  |               |            | in0[63:52]    | in0[51:26] | in0[25:0]
 *  in1  | in1[63:40]    | in1[39:14] | in1[13:0]<<12 |            |
 *  in2  | in2[39:0]<<24 |            |               |            |
 */
/**
 *  Macro description: converts the large number format. Three words of base 2^64 data are passed in,
 *                     and five limbs of base 2^26 data are produced.
 *  Input register:
 *      in0: bits 0 to 63 of the large number in the original format
 *      in1: bits 64 to 127 of the large number in the original format
 *      in2: bits 128 and above of the large number in the original format
 *  Modified register: None
 *  Output register:
 *      out0: bits 0 to 25 of the converted large number
 *      out1: bits 26 to 51 of the converted large number
 *      out2: bits 52 to 77 of the converted large number
 *      out3: bits 78 to 103 of the converted large number
 *      out4: bits 104 and above of the converted large number
 *  Function/Macro Call: None
 *  Restriction: Note that the valid bits of in2 cannot exceed 40 bits.
 *               Otherwise, data will be lost.
 *               Each output is written before the later inputs are read, so an output register
 *               must not alias an input that is still needed by a following instruction.
 */
    .macro CONVERT_64TO26 out0 out1 out2 out3 out4 in0 in1 in2
    and     \out0, \in0, #0x03ffffff
    ubfx    \out1, \in0, #26, #26
    extr    \out2, \in1, \in0, #52
    and     \out2, \out2, #0x03ffffff
    ubfx    \out3, \in1, #14, #26
    extr    \out4, \in2, \in1, #40
    .endm

/*
 *  word | out2      | out1      | out0
 *  bit  | 128       | 64        | 0
 *  in0  |           |           | in0
 *  in1  |           | (in1>>38) | in1<<26
 *  in2  |           | in2>>12   | in2<<52
 *  in3  | (in3>>50) | in3<<14   |
 *  in4  | in4>>24   | in4<<40   |
 *
 *  The terms in parentheses are not computed by the macro; see the restriction below.
 */
/**
 *  Macro description: converts the large number format. Five limbs of base 2^26 data are passed in,
 *                     and three words of base 2^64 data are produced.
 *  Input register:
 *      in0: limb 0 of the large number in the original format
 *      in1: limb 1 of the large number in the original format
 *      in2: limb 2 of the large number in the original format
 *      in3: limb 3 of the large number in the original format
 *      in4: limb 4 of the large number in the original format
 *  Modified register: None (the condition flags are changed)
 *  Output register:
 *      out0: bits 0 to 63 of the converted large number
 *      out1: bits 64 to 127 of the converted large number
 *      out2: bits 128 and above of the converted large number
 *  Function/Macro Call: None
 *  Restriction: Ensure that the valid bits of in0-in4 do not exceed 38 bits. Otherwise,
 *               data will be lost.
 */
    .macro CONVERT_26TO64 out0 out1 out2 in0 in1 in2 in3 in4
    add     \out0, \in0, \in1, lsl#26
    adds    \out0, \out0, \in2, lsl#52
    lsr     \out1, \in2, #12
    add     \out1, \out1, \in3, lsl#14      // plain add: keeps the carry of the adds above
    adc     \out1, \out1, xzr
    adds    \out1, \out1, \in4, lsl#40
    lsr     \out2, \in4, #24                // lsr does not touch the flags either
    adc     \out2, \out2, xzr
    .endm


/*
 *  step | t_0 (x11) | t_1 (x12) | t_2 (x13)
 *   1   | r0*a0(lo) | r0*a1(lo) | r0*a2(lo)
 *   2   |           | r0*a0(hi) | r0*a1(hi)
 *   3   | s1*a1(lo) | r1*a0(lo) |
 *   4   |           | s1*a1(hi) | r1*a0(hi)
 *   5   |           | s1*a2(lo) |
 */
/**
 *  Macro description: Multiply large numbers and perform (partial) modulo reduction
 *                     (a0|a1|a2) = (a0|a1|a2) * (r0|r1) mod P
 *  Input register:
 *        a_0: bits 0 to 63 of the large number a
 *        a_1: bits 64 to 127 of the large number a
 *        a_2: bits 128 and above of the large number a
 *        r_0: bits 0 to 63 of the large number r
 *        r_1: bits 64 to 127 of the large number r
 *        s_1: 5/4 times the large number r_1
 *  Change register: x11-x15 (and the condition flags)
 *  Output register:
 *        a_0: bits 0 to 63 of the multiplication result
 *        a_1: bits 64 to 127 of the multiplication result
 *        a_2: bits 128 and above of the multiplication result
 *  Function/Macro Call: None
 *  Restriction: The relationship between s1 and r1 is s1 = r1 + r1 >> 2.
 *               The macro arguments must not be any of x11-x15.
 */
    .macro POLY1305_MOD_MUL a_0, a_1, a_2, r_0, r_1, s_1
    /* 1 */
    mul     x11, \r_0, \a_0
    mul     x12, \r_0, \a_1
    mul     x13, \r_0, \a_2
    /* 2 */
    umulh   x14, \r_0, \a_0
    umulh   x15, \r_0, \a_1
    adds    x12, x12, x14
    adc     x13, x13, x15
    /* 3 */
    mul     x14, \s_1, \a_1
    mul     x15, \r_1, \a_0
    adds    x11, x11, x14
    adcs    x12, x12, x15
    adc     x13, x13, xzr
    /* 4 */
    umulh   x14, \s_1, \a_1
    umulh   x15, \r_1, \a_0
    adds    x12, x12, x14
    adc     x13, x13, x15
    /* 5 */
    mul     x15, \s_1, \a_2
    adds    x12, x12, x15
    adc     x13, x13, xzr
    /* Split x13 and add 5/4 of the high-order part to x11. */
    bic     x15, x13, #3                    // x15 = bits 130 and above, still shifted left by 2
    and     x13, x13, #3                    // x13 = bits 128-129
    add     x15, x15, x15, lsr#2            // x15 = 5 * (bits 130 and above)
    adds    \a_0, x11, x15
    adcs    \a_1, x12, xzr
    adc     \a_2, x13, xzr
    .endm

/**
 *  Macro description: Convert the content of a large number (r_0|r_1|r_2) into the base 2^26 format,
 *                     and then fill the memory pointed to by ptr at intervals of 16 bytes.
 *                     The five limbs are stored at ptr + 16*0 .. 16*4, and 5 times limbs 1-4
 *                     are stored at ptr + 16*5 .. 16*8.
 *  Input register:
 *       r_0: bits 0 to 63 of the large number
 *       r_1: bits 64 to 127 of the large number
 *       r_2: bits 128 to 191 of the large number
 *       ptr: start address of the memory to be filled
 *  Change register: x11-x15
 *  Output register: None
 *  Function/Macro call: CONVERT_64TO26
 *
 */
    .macro Fill_TABLE r_0, r_1, r_2, ptr
    /* base 2^64 -> base 2^26 */
    /* r_0 r_1 r_2 --> x11 x12 x13 x14 x15 */
    CONVERT_64TO26 x11, x12, x13, x14, x15, \r_0, \r_1, \r_2
    /* Stores the converted value. */
    str     w11, [\ptr, #16*0]
    str     w12, [\ptr, #16*1]
    str     w13, [\ptr, #16*2]
    str     w14, [\ptr, #16*3]
    str     w15, [\ptr, #16*4]
    /* Multiply by 5 and continue to store */
    add     w12, w12, w12, lsl#2
    add     w13, w13, w13, lsl#2
    add     w14, w14, w14, lsl#2
    add     w15, w15, w15, lsl#2

    str     w12, [\ptr, #16*5]
    str     w13, [\ptr, #16*6]
    str     w14, [\ptr, #16*7]
    str     w15, [\ptr, #16*8]
    .endm

/**
 *  Function description: This function is used to initialize the pre-computation table.
 *                        Column 3 of every 16-byte table row receives r^1, column 2 r^2,
 *                        column 1 r^3 and column 0 r^4. The member flag is cleared.
 *  Function prototype: void Poly1305InitForAsm(Poly1305Ctx *ctx);
 *  Input register:
 *         x0: address of the context structure
 *  Change register x0 and x5-x15.
 *  Output register: None
 *  Function/Macro Call: POLY1305_MOD_MUL Fill_TABLE
 */
.text
.balign 64
.global Poly1305InitForAsm
.type Poly1305InitForAsm, %function
Poly1305InitForAsm:
AARCH64_PACIASP
    stp     x29, x30, [sp, #-16]!
    add     x29, sp, #0

    /* Clearing the member flag */
    str     wzr, [x0, #CTX_flag]

    /* Initialize the r table. */
    ldp     x8, x9, [x0, #CTX_r]

#ifdef HITLS_BIG_ENDIAN
    /* The r value needs to be reversed in the big-endian case. */
    ror     x8, x8, #32
    ror     x9, x9, #32
#endif

    add     x10, x9, x9, lsr#2              // x10 = s1 = r1 + (r1 >> 2)
    /* padding r^1 */
    add     x0, x0, #CTX_table + 12         // column 3 of the table rows
    mov     x5, x8
    mov     x6, x9
    mov     x7, xzr
    Fill_TABLE x5, x6, x7, x0

    /* Calculate and populate r^2 */
    sub     x0, x0, #4                      // column 2
    POLY1305_MOD_MUL x5, x6, x7, x8, x9, x10
    Fill_TABLE x5, x6, x7, x0

    /* Calculate and populate r^3 */
    sub     x0, x0, #4                      // column 1
    POLY1305_MOD_MUL x5, x6, x7, x8, x9, x10
    Fill_TABLE x5, x6, x7, x0

    /* Calculate and populate r^4 */
    sub     x0, x0, #4                      // column 0
    POLY1305_MOD_MUL x5, x6, x7, x8, x9, x10
    Fill_TABLE x5, x6, x7, x0

    /* Clear the registers that held r and its powers. */
    eor     x5, x5, x5
    eor     x6, x6, x6
    eor     x7, x7, x7
    eor     x8, x8, x8
    eor     x9, x9, x9
    eor     x10, x10, x10

    ldp     x29, x30, [sp], #16
AARCH64_AUTIASP
    ret
.size Poly1305InitForAsm, .-Poly1305InitForAsm

/**
 *  Function description: Outputs the final result value to the specified memory.
 *                        If the accumulator is stored in base 2^26 (FLAG_BASE2_26 set), the work
 *                        is handed over to Poly1305LastNeon by a direct branch.
 *  Function prototype: void Poly1305Last(Poly1305Ctx *ctx, uint8_t mac[POLY1305_TAGSIZE]);
 *  Input register:
 *         x0: address of the context structure
 *         x1: pointer to the output buffer
 *  Change register: x3-x15
 *  Output register: None
 *  Function/Macro Call: Poly1305LastNeon
 */
.text
.balign 64
.global Poly1305Last
.type Poly1305Last, %function
Poly1305Last:
AARCH64_PACIASP
    ldr     w15, [x0, #CTX_flag]
    and     w15, w15, #FLAG_BASE2_26
    /* Tail branch: x30 is already signed here and is authenticated by Poly1305LastNeon. */
    cbnz    w15, Poly1305LastNeon

    ldp     x3, x4, [x0, #CTX_acc]
    ldr     x5, [x0, #CTX_acc + 16]
    ldp     x12, x13, [x0, #CTX_s]

    adds    x9, x3, #5                      // Compute acc + 5
    adcs    x10, x4, xzr
    adc     x11, x5, xzr
    /* Test for more than 2 ^ 130 */
    /* NOTE(review): 'le' is a signed condition; this presumes x11 stays a small non-negative value. */
    cmp     x11, #3
    /* If yes, use the value after adding 5 (equal to the value after modulo operation).
       If no, use the original value. */
    csel    x3, x3, x9, le
    csel    x4, x4, x10, le
    /* Plus the s value */
#ifdef HITLS_BIG_ENDIAN
    /* In the big-endian scenario, the s value needs to be reversed. */
    ror     x12, x12, #32
    ror     x13, x13, #32
#endif
    adds    x3, x3, x12
    adc     x4, x4, x13
    mov     x12, xzr                        // zero out.
    mov     x13, xzr
#ifdef HITLS_BIG_ENDIAN
    /* In big-endian mode, the data is converted to little-endian and then output to the memory. */
    rev     x3, x3
    rev     x4, x4
#endif
    stp     x3, x4, [x1]
AARCH64_AUTIASP
    ret
.size Poly1305Last, .-Poly1305Last

/**
 *  Function description: Outputs the final result value to the specified memory. The accumulator
 *                        is read as five base 2^26 limbs.
 *  Function prototype: void Poly1305LastNeon(Poly1305Ctx *ctx, uint8_t mac[POLY1305_TAGSIZE]);
 *  Input register:
 *         x0: address of the context structure
 *         x1: pointer to the output buffer
 *  Change register: x2-x15
 *  Output register: None
 *  Function/Macro Call: CONVERT_26TO64
 *  Restriction: This is a file-local continuation of Poly1305Last and is entered only through the
 *               branch in that function, after Poly1305Last has executed AARCH64_PACIASP.
 *               It therefore must NOT execute AARCH64_PACIASP again: signing x30 twice and
 *               authenticating it once (below) would leave an invalid return address whenever the
 *               macros expand to real pointer-authentication instructions. Poly1305BlockNeon
 *               follows the same convention with respect to Poly1305Block.
 */
.text
.balign 64
.type Poly1305LastNeon, %function
Poly1305LastNeon:
    /* Load the value of base 2^26. */
    ldp     w11, w12, [x0, #CTX_acc]
    ldp     w13, w14, [x0, #CTX_acc + 8]
    ldr     w15, [x0, #CTX_acc + 16]
    /* Converted to base 2^64; x11 to x15 were loaded as 32-bit values. */
    CONVERT_26TO64 x5, x6, x7, x11, x12, x13, x14, x15
    /* Load the s value. */
    ldp     x2, x3, [x0, #CTX_s]

    /* Multiply the part above 130 bits by 5 and add it to the lower bits. */
    bic     x15, x7, #3
    and     x7, x7, #3
    add     x15, x15, x15, lsr#2
    adds    x5, x5, x15
    adcs    x6, x6, xzr
    adc     x7, x7, xzr

    /* Modulo P, subtract directly */
    /* subtraction: acc - (2^130 - 5) = acc + 5 - 2^130 */
    adds    x11, x5, #5
    adcs    x12, x6, xzr
    adc     x13, x7, xzr
    /* Test for more than 2 ^ 130 */
    cmp     x13, #4
    /* If acc is greater than or equal to P, the new value is used. */
    csel    x5, x11, x5, ge
    csel    x6, x12, x6, ge

    /* Value of s plus acc */
#ifdef HITLS_BIG_ENDIAN
    /* In the big-endian scenario, the s value needs to be reversed. */
    ror     x2, x2, #32
    ror     x3, x3, #32
#endif

    adds    x2, x2, x5
    adc     x3, x3, x6

#ifdef HITLS_BIG_ENDIAN
    /* In big-endian mode, the data is converted to little-endian and then output to the memory. */
    rev     x2, x2
    rev     x3, x3
#endif

    stp     x2, x3, [x1]
    /* Authenticates the x30 that was signed at the entry of Poly1305Last. */
AARCH64_AUTIASP
    ret
.size Poly1305LastNeon, .-Poly1305LastNeon


/**
 *  Function description: Compresses the input data and stores it in the context structure.
 *  Function prototype: uint32_t Poly1305Block(Poly1305Ctx *ctx, const uint8_t *data,
 *                                             uint32_t dataLen, uint32_t padbit);
 *  Input register:
 *         x0: address of the context structure
 *         x1: pointer to the input data
 *         x2: length of the input data
 *         x3: padded bits, 0 or 1.
 *  Change register: x4-x15
 *  Output register:
 *         x0: length of the remaining data to be processed
 *  Function/Macro Call: CONVERT_26TO64 POLY1305_MOD_MUL Poly1305BlockNeon
 *  NOTE(review): x2 and x3 are used as full 64-bit registers although the prototype above declares
 *                them uint32_t; this presumes the caller passes them zero-extended - confirm.
 */
.text
.balign 64
.global Poly1305Block
.type Poly1305Block, %function
Poly1305Block:
AARCH64_PACIASP
    /* x4 indicates the length to be processed by the basic instruction set,
       and x2 indicates the remaining length after that processing. */
    /* If the value is less than 16, no processing is required. If NEON is supported,
       the part that is greater than or equal to 256 is reserved for NEON. */
    and     x4, x2, #0xF0                   // x4 is the processing length of the basic instruction set.
    bic     x2, x2, #0xF0                   // x2 is the remaining length after the basic instruction set is processed.
    cbz     x4, .Lskip_process
    /* Load the ACC value. */
    ldr     w15, [x0, #CTX_flag]
    and     w14, w15, #FLAG_BASE2_26
    cbz     w14, .Lload_acc_64
    /* acc is stored in base 2^26: clear the flag and convert it to base 2^64. */
    bic     w15, w15, #FLAG_BASE2_26
    str     w15, [x0, #CTX_flag]
    ldp     w10, w11, [x0, #CTX_acc]
    ldp     w12, w13, [x0, #CTX_acc + 8]
    ldr     w14, [x0, #CTX_acc + 16]
    CONVERT_26TO64 x5, x6, x7, x10, x11, x12, x13, x14
    b       .Lend_load_acc_64
.Lload_acc_64:
    ldp     x5, x6, [x0, #CTX_acc]
    ldr     x7, [x0, #CTX_acc + 16]
.Lend_load_acc_64:

    /* Load the r value. */
    ldp     x8, x9, [x0, #CTX_r]

#ifdef HITLS_BIG_ENDIAN
    /* The r value needs to be reversed in the big-endian case. */
    ror     x8, x8, #32
    ror     x9, x9, #32
#endif

    add     x10, x9, x9, lsr#2              // x10 = s1 = r1 + (r1 >> 2)

.Lloop_64:
    /* Accumulator acc plus plaintext block with padding x3 */
    ldp     x11, x12, [x1], #16

#ifdef HITLS_BIG_ENDIAN
    rev     x11, x11
    rev     x12, x12
#endif

    adds    x5, x5, x11
    adcs    x6, x6, x12
    adc     x7, x7, x3
    /* Multiply large numbers and take modulo (x5|x6|x7) = (x5|x6|x7) * (x8|x9) mod P */
    /* x10 = x9 + x9 >> 2 */
    POLY1305_MOD_MUL x5, x6, x7, x8, x9, x10
    /* End of loop, update iteration information */
    sub     x4, x4, #16
    cbnz    x4, .Lloop_64

    stp     x5, x6, [x0, #CTX_acc]
    str     x7, [x0, #CTX_acc + 16]
.Lskip_process:
    /* If the remaining length is 256 bytes or more, the NEON processes the remaining length. */
    bic     x4, x2, #0xFF
    /* Tail branch: x30 is already signed here and is authenticated by Poly1305BlockNeon. */
    cbnz    x4, Poly1305BlockNeon

    /* function returns */
    and     x0, x2, #15                     // The return value is the unprocessed length.
    eor     x8, x8, x8
    eor     x9, x9, x9
AARCH64_AUTIASP
    ret
.size Poly1305Block, .-Poly1305Block

/**
 *  Function description: Compresses the input data, stores the data in the context structure, and uses the NEON register.
 *  Function prototype: uint32_t Poly1305BlockNeon(Poly1305Ctx *ctx, const uint8_t *data, uint32_t dataLen, uint32_t padbit);
 *  Input register:
 *         x0: context structure address
 *         x1: pointer to the input data
 *         x2: length of the input data
 *         x3: padding bit, 0 or 1.
 *         x4: number of bytes to process here (set by Poly1305Block to x2 with the low 8 bits cleared)
 *  Modify the register x0-x15,v0-v7,v16-v31.
 *  Output register:
 *         x0: length of the remaining data to be processed
 *  Function/Macro call: CONVERT_64TO26
 *  Restriction: File-local continuation of Poly1305Block. There is no AARCH64_PACIASP at the entry
 *               because Poly1305Block has already executed it before branching here; the
 *               AARCH64_AUTIASP before ret pairs with that one.
 */
.text
.balign 64
.type Poly1305BlockNeon, %function
Poly1305BlockNeon:
    stp     x29, x30, [sp, #-16]!
    stp     d8, d9, [sp, #-16]!
    stp     d10, d11, [sp, #-16]!
    stp     d12, d13, [sp, #-16]!
    stp     d14, d15, [sp, #-16]!

    /* Load the acc value, which is stored in v24-v28. */
    ldr     w15, [x0, #CTX_flag]
    and     w14, w15, #FLAG_BASE2_26
    cbnz    w14, .Lload_acc_26
    /* acc is stored in base 2^64: set the flag and convert it to base 2^26. */
    orr     w15, w15, #FLAG_BASE2_26
    str     w15, [x0, #CTX_flag]
    ldp     x5, x6, [x0, #CTX_acc]
    ldr     x7, [x0, #CTX_acc + 16]
    CONVERT_64TO26 x11, x12, x13, x14, x15, x5, x6, x7
    fmov    s24, w11
    fmov    s25, w12
    fmov    s26, w13
    fmov    s27, w14
    fmov    s28, w15
    b       .Lend_load_acc_26
.Lload_acc_26:
    ldp     s24, s25, [x0, #CTX_acc]
    ldp     s26, s27, [x0, #CTX_acc + 8]
    ldr     s28, [x0, #CTX_acc + 16]
.Lend_load_acc_26:

    /* Load r-value table */
    add     x15, x0, #CTX_table
    ld1     {v0.4s}, [x15], #16                         // r^n[0] mod P, n = 1, 2, 3, 4
    ld1     {v1.4s, v2.4s, v3.4s, v4.4s}, [x15], #64    // r^n[1:4] mod P
    ld1     {v5.4s, v6.4s, v7.4s, v8.4s}, [x15], #64    // 5 * r^n[1:4] mod P

    /* Pre-treatment before start of cycle */
    add     x1, x1, #64
    sub     x4, x4, #64
    /* v31.2d is {0x3ffffff, 0x3ffffff} */
    movi    v31.16b, #0xFF
    ushr    v31.2d, v31.2d, #38

    /* Load (m[2], m[3]), convert the format, and save it to v14-v18. */
    ldp     x9, x10, [x1, #-32]
    ldp     x14, x15, [x1, #-16]

#ifdef HITLS_BIG_ENDIAN
    rev     x9, x9
    rev     x10, x10
    rev     x14, x14
    rev     x15, x15
#endif

    /* Same conversion as CONVERT_64TO26, with the padding bit x3 as the third input word. */
    and     x6, x9, #0x03ffffff
    ubfx    x7, x9, #26, #26
    extr    x8, x10, x9, #52
    and     x8, x8, #0x03ffffff
    ubfx    x9, x10, #14, #26
    extr    x10, x3, x10, #40

    and     x11, x14, #0x03ffffff
    ubfx    x12, x14, #26, #26
    extr    x13, x15, x14, #52
    and     x13, x13, #0x03ffffff
    ubfx    x14, x15, #14, #26
    extr    x15, x3, x15, #40

    /* Pack the limbs of the two blocks into the two 32-bit lanes of one 64-bit value. */
    add     x6, x6, x11, lsl#32
    add     x7, x7, x12, lsl#32
    add     x8, x8, x13, lsl#32
    add     x9, x9, x14, lsl#32
    add     x10, x10, x15, lsl#32

    fmov    d14, x6
    fmov    d15, x7
    fmov    d16, x8
    fmov    d17, x9
    fmov    d18, x10

    /* Load (m[0], m[1]) and save the converted format in v9-v13. */
    ldp     x9, x10, [x1, #-64]
    ldp     x14, x15, [x1, #-48]

#ifdef HITLS_BIG_ENDIAN
    rev     x9, x9
    rev     x10, x10
    rev     x14, x14
    rev     x15, x15
#endif

    and     x6, x9, #0x03ffffff
    ubfx    x7, x9, #26, #26
    extr    x8, x10, x9, #52
    and     x8, x8, #0x03ffffff
    ubfx    x9, x10, #14, #26
    extr    x10, x3, x10, #40

    and     x11, x14, #0x03ffffff
    ubfx    x12, x14, #26, #26
    extr    x13, x15, x14, #52
    and     x13, x13, #0x03ffffff
    ubfx    x14, x15, #14, #26
    extr    x15, x3, x15, #40

    add     x6, x6, x11, lsl#32
    add     x7, x7, x12, lsl#32
    add     x8, x8, x13, lsl#32
    add     x9, x9, x14, lsl#32
    add     x10, x10, x15, lsl#32

    fmov    d9, x6
    fmov    d10, x7
    fmov    d11, x8
    fmov    d12, x9
    fmov    d13, x10

    /*
        See NEON Crypto by Daniel J. Bernstein and Peter Schwabe
        Use base 2^26 to represent a large number: f = f[0] + f[1]<<26 + f[2]<<52 + f[3]<<78 + f[4]<<104
        Calculate h = (f * g) mod (2^130 - 5), using the NEON register
        h[0] = f[0]g[0] + 5f[1]g[4] + 5f[2]g[3] + 5f[3]g[2] + 5f[4]g[1]
        h[1] = f[0]g[1] +  f[1]g[0] + 5f[2]g[4] + 5f[3]g[3] + 5f[4]g[2]
        h[2] = f[0]g[2] +  f[1]g[1] +  f[2]g[0] + 5f[3]g[4] + 5f[4]g[3]
        h[3] = f[0]g[3] +  f[1]g[2] +  f[2]g[1] +  f[3]g[0] + 5f[4]g[4]
        h[4] = f[0]g[4] +  f[1]g[3] +  f[2]g[2] +  f[3]g[1] +  f[4]g[0]

        NEON Polynomial Calculation Process:
          ((m[0]r^4 + m[2]r^2 + m[4])*r^4 + m[6]r^2 + m[8])*r^4 + m[10]r^2
        + ((m[1]r^4 + m[3]r^2 + m[5])*r^4 + m[7]r^2 + m[9])*r^3 + m[11]r^1

        Calculated inside the loop:
        (x[0],y[0]) = (acc, 0)
        (x[1],y[1]) = (m[2],m[3])*(r^2,r^2) + ((m[0],m[1]) + (x[0],y[0]))*(r^4,r^4)
        (x[2],y[2]) = (m[6],m[7])*(r^2,r^2) + ((m[4],m[5]) + (x[1],y[1]))*(r^4,r^4)
    */
    /* Start loop, vector register has used v0-v8 to hold r value precalculated table, v24-v28 to hold ACC value */
    /* Scalar loads/conversions of the next 64 bytes are interleaved with the vector multiplications. */
.Lloop_neon:
    add     x1, x1, #64
    sub     x4, x4, #64

    /* Compute (m[2 + 4i], m[3 + 4i])*(r^2, r^2), stored in v19-v23 */
    /* Load (m[6 + 4i], m[7 + 4i]) and save it in v14-v18. */
    ldp     x9, x10, [x1, #-32]

    umull   v19.2d, v14.2s, v0.s[2]
    umull   v20.2d, v14.2s, v1.s[2]
    umull   v21.2d, v14.2s, v2.s[2]
    umull   v22.2d, v14.2s, v3.s[2]
    umull   v23.2d, v14.2s, v4.s[2]

    ldp     x14, x15, [x1, #-16]

    umlal   v19.2d, v15.2s, v8.s[2]
    umlal   v20.2d, v15.2s, v0.s[2]
    umlal   v21.2d, v15.2s, v1.s[2]
    umlal   v22.2d, v15.2s, v2.s[2]
    umlal   v23.2d, v15.2s, v3.s[2]

#ifdef HITLS_BIG_ENDIAN
    rev     x9, x9
    rev     x10, x10
    rev     x14, x14
    rev     x15, x15
#endif

    and     x6, x9, #0x03ffffff
    and     x11, x14, #0x03ffffff
    ubfx    x7, x9, #26, #26
    ubfx    x12, x14, #26, #26
    extr    x8, x10, x9, #52
    extr    x13, x15, x14, #52

    umlal   v19.2d, v16.2s, v7.s[2]
    umlal   v20.2d, v16.2s, v8.s[2]
    umlal   v21.2d, v16.2s, v0.s[2]
    umlal   v22.2d, v16.2s, v1.s[2]
    umlal   v23.2d, v16.2s, v2.s[2]

    and     x8, x8, #0x03ffffff
    and     x13, x13, #0x03ffffff
    ubfx    x9, x10, #14, #26
    ubfx    x14, x15, #14, #26
    extr    x10, x3, x10, #40
    extr    x15, x3, x15, #40

    umlal   v19.2d, v17.2s, v6.s[2]
    umlal   v20.2d, v17.2s, v7.s[2]
    umlal   v21.2d, v17.2s, v8.s[2]
    umlal   v22.2d, v17.2s, v0.s[2]
    umlal   v23.2d, v17.2s, v1.s[2]

    add     x6, x6, x11, lsl#32
    add     x7, x7, x12, lsl#32
    add     x8, x8, x13, lsl#32
    add     x9, x9, x14, lsl#32
    add     x10, x10, x15, lsl#32

    umlal   v19.2d, v18.2s, v5.s[2]
    umlal   v20.2d, v18.2s, v6.s[2]
    umlal   v21.2d, v18.2s, v7.s[2]
    umlal   v22.2d, v18.2s, v8.s[2]
    umlal   v23.2d, v18.2s, v0.s[2]

    /* v14-v18 are overwritten only after their last use above. */
    fmov    d14, x6
    fmov    d15, x7
    fmov    d16, x8
    fmov    d17, x9
    fmov    d18, x10

    /* It is not placed at the beginning of the loop because it depends on v24 to v28. */
    /* Compute ((m[0 + 4i], m[1 + 4i]) + (x[i], y[i]))*(r^4, r^4), accumulated into v19-v23 */
    /* Load (m[4 + 4i], m[5 + 4i]) and save it in v9-v13. */
    add     v9.2s, v9.2s, v24.2s
    add     v10.2s, v10.2s, v25.2s
    add     v11.2s, v11.2s, v26.2s
    add     v12.2s, v12.2s, v27.2s
    add     v13.2s, v13.2s, v28.2s

    ldp     x9, x10, [x1, #-64]

    umlal   v19.2d, v9.2s, v0.s[0]
    umlal   v20.2d, v9.2s, v1.s[0]
    umlal   v21.2d, v9.2s, v2.s[0]
    umlal   v22.2d, v9.2s, v3.s[0]
    umlal   v23.2d, v9.2s, v4.s[0]

    ldp     x14, x15, [x1, #-48]

    umlal   v19.2d, v10.2s, v8.s[0]
    umlal   v20.2d, v10.2s, v0.s[0]
    umlal   v21.2d, v10.2s, v1.s[0]
    umlal   v22.2d, v10.2s, v2.s[0]
    umlal   v23.2d, v10.2s, v3.s[0]

#ifdef HITLS_BIG_ENDIAN
    rev     x9, x9
    rev     x10, x10
    rev     x14, x14
    rev     x15, x15
#endif

    and     x6, x9, #0x03ffffff
    and     x11, x14, #0x03ffffff
    ubfx    x7, x9, #26, #26
    ubfx    x12, x14, #26, #26
    extr    x8, x10, x9, #52
    extr    x13, x15, x14, #52

    umlal   v19.2d, v11.2s, v7.s[0]
    umlal   v20.2d, v11.2s, v8.s[0]
    umlal   v21.2d, v11.2s, v0.s[0]
    umlal   v22.2d, v11.2s, v1.s[0]
    umlal   v23.2d, v11.2s, v2.s[0]

    and     x8, x8, #0x03ffffff
    and     x13, x13, #0x03ffffff
    ubfx    x9, x10, #14, #26
    ubfx    x14, x15, #14, #26
    extr    x10, x3, x10, #40
    extr    x15, x3, x15, #40

    umlal   v19.2d, v12.2s, v6.s[0]
    umlal   v20.2d, v12.2s, v7.s[0]
    umlal   v21.2d, v12.2s, v8.s[0]
    umlal   v22.2d, v12.2s, v0.s[0]
    umlal   v23.2d, v12.2s, v1.s[0]

    add     x6, x6, x11, lsl#32
    add     x7, x7, x12, lsl#32
    add     x8, x8, x13, lsl#32
    add     x9, x9, x14, lsl#32
    add     x10, x10, x15, lsl#32

    umlal   v19.2d, v13.2s, v5.s[0]
    umlal   v20.2d, v13.2s, v6.s[0]
    umlal   v21.2d, v13.2s, v7.s[0]
    umlal   v22.2d, v13.2s, v8.s[0]
    umlal   v23.2d, v13.2s, v0.s[0]

    fmov    d9, x6
    fmov    d10, x7
    fmov    d11, x8
    fmov    d12, x9
    fmov    d13, x10

    /* Because v19-v23 significant bits may exceed 56 bits, to ensure that subsequent multiplication
       does not overflow, two rounds of carry are processed. */
    ushr    v24.2d, v19.2d, #26
    ushr    v25.2d, v20.2d, #26
    ushr    v26.2d, v21.2d, #26
    ushr    v27.2d, v22.2d, #26
    ushr    v28.2d, v23.2d, #26
    /* The part above 130 bits is multiplied by 5 and added to the lower bits */
    shl     v29.2d, v28.2d, #2
    add     v28.2d, v28.2d, v29.2d
    /* Use the AND operation to truncate the lower 26 bits. */
    and     v19.16b, v19.16b, v31.16b
    and     v20.16b, v20.16b, v31.16b
    and     v21.16b, v21.16b, v31.16b
    and     v22.16b, v22.16b, v31.16b
    and     v23.16b, v23.16b, v31.16b
    /* Add the carry of the next lower limb */
    add     v19.2d, v19.2d, v28.2d
    add     v20.2d, v20.2d, v24.2d
    add     v21.2d, v21.2d, v25.2d
    add     v22.2d, v22.2d, v26.2d
    add     v23.2d, v23.2d, v27.2d
    /* Continue carry processing */
    ushr    v24.2d, v19.2d, #26
    ushr    v25.2d, v20.2d, #26
    ushr    v26.2d, v21.2d, #26
    ushr    v27.2d, v22.2d, #26
    ushr    v28.2d, v23.2d, #26
    shl     v29.2d, v28.2d, #2
    add     v28.2d, v28.2d, v29.2d

    and     v19.16b, v19.16b, v31.16b
    and     v20.16b, v20.16b, v31.16b
    and     v21.16b, v21.16b, v31.16b
    and     v22.16b, v22.16b, v31.16b
    and     v23.16b, v23.16b, v31.16b

    add     v19.2d, v19.2d, v28.2d
    add     v20.2d, v20.2d, v24.2d
    add     v21.2d, v21.2d, v25.2d
    add     v22.2d, v22.2d, v26.2d
    add     v23.2d, v23.2d, v27.2d

    /* The calculated (x[i + 1], y[i + 1]) is stored in v24-v28 and is reserved for the next cycle. */
    xtn     v24.2s, v19.2d
    xtn     v25.2s, v20.2d
    xtn     v26.2s, v21.2d
    xtn     v27.2s, v22.2d
    xtn     v28.2s, v23.2d

    /* End of loop, skip */
    cbnz    x4, .Lloop_neon

    /* Dealing with the tail */
    /* Compute (m[6 + 4i], m[7 + 4i])*(r^2, r^1), stored in v19-v23 */
    /* Duplicate the block pair into the upper half so that umull2/umlal2 pair it with table columns 2 and 3. */
    dup     v14.2d, v14.d[0]
    dup     v15.2d, v15.d[0]
    dup     v16.2d, v16.d[0]
    dup     v17.2d, v17.d[0]
    dup     v18.2d, v18.d[0]

    umull2  v19.2d, v14.4s, v0.4s
    umull2  v20.2d, v14.4s, v1.4s
    umull2  v21.2d, v14.4s, v2.4s
    umull2  v22.2d, v14.4s, v3.4s
    umull2  v23.2d, v14.4s, v4.4s

    umlal2  v19.2d, v15.4s, v8.4s
    umlal2  v20.2d, v15.4s, v0.4s
    umlal2  v21.2d, v15.4s, v1.4s
    umlal2  v22.2d, v15.4s, v2.4s
    umlal2  v23.2d, v15.4s, v3.4s

    umlal2  v19.2d, v16.4s, v7.4s
    umlal2  v20.2d, v16.4s, v8.4s
    umlal2  v21.2d, v16.4s, v0.4s
    umlal2  v22.2d, v16.4s, v1.4s
    umlal2  v23.2d, v16.4s, v2.4s

    umlal2  v19.2d, v17.4s, v6.4s
    umlal2  v20.2d, v17.4s, v7.4s
    umlal2  v21.2d, v17.4s, v8.4s
    umlal2  v22.2d, v17.4s, v0.4s
    umlal2  v23.2d, v17.4s, v1.4s

    umlal2  v19.2d, v18.4s, v5.4s
    umlal2  v20.2d, v18.4s, v6.4s
    umlal2  v21.2d, v18.4s, v7.4s
    umlal2  v22.2d, v18.4s, v8.4s
    umlal2  v23.2d, v18.4s, v0.4s

    /* Compute ((m[4 + 4i], m[5 + 4i]) + (x, y))*(r^4, r^3), accumulated into v19-v23 */
    add     v9.2s, v9.2s, v24.2s
    add     v10.2s, v10.2s, v25.2s
    add     v11.2s, v11.2s, v26.2s
    add     v12.2s, v12.2s, v27.2s
    add     v13.2s, v13.2s, v28.2s

    umlal   v19.2d, v9.2s, v0.2s
    umlal   v20.2d, v9.2s, v1.2s
    umlal   v21.2d, v9.2s, v2.2s
    umlal   v22.2d, v9.2s, v3.2s
    umlal   v23.2d, v9.2s, v4.2s

    umlal   v19.2d, v10.2s, v8.2s
    umlal   v20.2d, v10.2s, v0.2s
    umlal   v21.2d, v10.2s, v1.2s
    umlal   v22.2d, v10.2s, v2.2s
    umlal   v23.2d, v10.2s, v3.2s

    umlal   v19.2d, v11.2s, v7.2s
    umlal   v20.2d, v11.2s, v8.2s
    umlal   v21.2d, v11.2s, v0.2s
    umlal   v22.2d, v11.2s, v1.2s
    umlal   v23.2d, v11.2s, v2.2s

    umlal   v19.2d, v12.2s, v6.2s
    umlal   v20.2d, v12.2s, v7.2s
    umlal   v21.2d, v12.2s, v8.2s
    umlal   v22.2d, v12.2s, v0.2s
    umlal   v23.2d, v12.2s, v1.2s

    umlal   v19.2d, v13.2s, v5.2s
    umlal   v20.2d, v13.2s, v6.2s
    umlal   v21.2d, v13.2s, v7.2s
    umlal   v22.2d, v13.2s, v8.2s
    umlal   v23.2d, v13.2s, v0.2s

    /* Base 2^26 carry; the two lanes are then added and stored in v24-v28. */
    ushr    v24.2d, v19.2d, #26
    ushr    v25.2d, v20.2d, #26
    ushr    v26.2d, v21.2d, #26
    ushr    v27.2d, v22.2d, #26
    ushr    v28.2d, v23.2d, #26
    shl     v29.2d, v28.2d, #2
    add     v28.2d, v28.2d, v29.2d

    and     v19.16b, v19.16b, v31.16b
    and     v20.16b, v20.16b, v31.16b
    and     v21.16b, v21.16b, v31.16b
    and     v22.16b, v22.16b, v31.16b
    and     v23.16b, v23.16b, v31.16b

    add     v19.2d, v19.2d, v28.2d
    add     v20.2d, v20.2d, v24.2d
    add     v21.2d, v21.2d, v25.2d
    add     v22.2d, v22.2d, v26.2d
    add     v23.2d, v23.2d, v27.2d
    /* Continue carry processing */
    ushr    v24.2d, v19.2d, #26
    ushr    v25.2d, v20.2d, #26
    ushr    v26.2d, v21.2d, #26
    ushr    v27.2d, v22.2d, #26
    ushr    v28.2d, v23.2d, #26
    shl     v29.2d, v28.2d, #2
    add     v28.2d, v28.2d, v29.2d

    and     v19.16b, v19.16b, v31.16b
    and     v20.16b, v20.16b, v31.16b
    and     v21.16b, v21.16b, v31.16b
    and     v22.16b, v22.16b, v31.16b
    and     v23.16b, v23.16b, v31.16b

    add     v19.2d, v19.2d, v28.2d
    add     v20.2d, v20.2d, v24.2d
    add     v21.2d, v21.2d, v25.2d
    add     v22.2d, v22.2d, v26.2d
    add     v23.2d, v23.2d, v27.2d

    /* Add the two lanes of every limb. */
    addp    v24.2d, v19.2d, v19.2d
    addp    v25.2d, v20.2d, v20.2d
    addp    v26.2d, v21.2d, v21.2d
    addp    v27.2d, v22.2d, v22.2d
    addp    v28.2d, v23.2d, v23.2d
    /* After the processing is complete, save the data. Note that the carry may not be completely processed. */
    stp     s24, s25, [x0, #CTX_acc]
    stp     s26, s27, [x0, #CTX_acc + 8]
    str     s28, [x0, #CTX_acc + 16]

    /* return */
    mov     x5, xzr
    ldp     d14, d15, [sp], #16
    ldp     d12, d13, [sp], #16
    ldp     d10, d11, [sp], #16
    ldp     d8, d9, [sp], #16
    ldp     x29, x30, [sp], #16
    and     x0, x2, #15                     // The return value is the unprocessed length.
    /* Authenticates the x30 that was signed at the entry of Poly1305Block. */
AARCH64_AUTIASP
    ret
.size Poly1305BlockNeon, .-Poly1305BlockNeon

/**
 *  Function description: This function is used to clear residual sensitive information in registers.
 *  Function prototype: void Poly1305CleanRegister();
 *  Input register: None
 *  Modify the registers v0-v7, v16-v31.
 *  Output register: None
 *  Function/Macro Call: None
 */
.text
.balign 64
.global Poly1305CleanRegister
.type Poly1305CleanRegister, %function
Poly1305CleanRegister:
AARCH64_PACIASP
    movi    v0.16b, #0
    /* AND with the zeroed v0 clears each of the following registers. */
    and     v1.16b, v1.16b, v0.16b
    and     v2.16b, v2.16b, v0.16b
    and     v3.16b, v3.16b, v0.16b
    and     v4.16b, v4.16b, v0.16b
    and     v5.16b, v5.16b, v0.16b
    and     v6.16b, v6.16b, v0.16b
    and     v7.16b, v7.16b, v0.16b
    /* V8 to V15 are overwritten during register recovery and do not need to be cleared. */
    and     v16.16b, v16.16b, v0.16b
    and     v17.16b, v17.16b, v0.16b
    and     v18.16b, v18.16b, v0.16b
    and     v19.16b, v19.16b, v0.16b
    and     v20.16b, v20.16b, v0.16b
    and     v21.16b, v21.16b, v0.16b
    and     v22.16b, v22.16b, v0.16b
    and     v23.16b, v23.16b, v0.16b
    and     v24.16b, v24.16b, v0.16b
    and     v25.16b, v25.16b, v0.16b
    and     v26.16b, v26.16b, v0.16b
    and     v27.16b, v27.16b, v0.16b
    and     v28.16b, v28.16b, v0.16b
    and     v29.16b, v29.16b, v0.16b
    and     v30.16b, v30.16b, v0.16b
    and     v31.16b, v31.16b, v0.16b
AARCH64_AUTIASP
    ret
.size Poly1305CleanRegister, .-Poly1305CleanRegister

#endif