/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *     http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include "hitls_build.h"
#if defined(HITLS_CRYPTO_CHACHA20) && defined(HITLS_CRYPTO_CHACHA20POLY1305)

#include "poly1305_x86_64.S"

.file "poly1305_x86_64_avx512.S"
.text

.set ZH0, %zmm0
.set ZH1, %zmm1
.set ZH2, %zmm2
.set ZH3, %zmm3
.set ZH4, %zmm4
.set ZT0, %zmm5
.set ZT1, %zmm6
.set ZT2, %zmm7
.set ZT3, %zmm8
.set ZT4, %zmm9
.set ZMASK, %zmm10
.set ZB0, %zmm11
.set ZB1, %zmm12
.set ZB2, %zmm13
.set ZB3, %zmm14
.set ZB4, %zmm15
.set ZR0, %zmm16
.set ZR1, %zmm17
.set ZR2, %zmm18
.set ZR3, %zmm19
.set ZR4, %zmm20
.set ZS1, %zmm21
.set ZS2, %zmm22
.set ZS3, %zmm23
.set ZS4, %zmm24
.set ZM0, %zmm25
.set ZM1, %zmm26
.set ZM2, %zmm27
.set ZM3, %zmm28
.set ZM4, %zmm29
.set PADBIT_ZMM, %zmm30

.align 64
g_permd_avx512:
    .long 0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7
.size g_permd_avx512, .-g_permd_avx512

/**
 * Function description: x86_64 Poly1305 block processing; the accumulated result is stored in ctx->acc.
 * Function prototype: uint32_t Poly1305Block(Poly1305_Ctx *ctx, const uint8_t *data, uint32_t dataLen, uint32_t padbit);
 * Input registers:
 * CTX: address of the Poly1305_Ctx structure
 * INP: pointer to the input data
 * LEN: length of the input data
 * PADBIT: padding bit, 0 or 1
 * Clobbered registers: r8-r15, rbx, rbp, rdx, rax
 * Output register:
 * %rax: length of the remaining data to be processed
 * Function/Macro Call: POLY1305_MOD_MUL
 */
.globl Poly1305Block
.type Poly1305Block, @function
Poly1305Block:
.cfi_startproc
.align 32
    cmp $256, LEN
    jae .Lblock_avx_pre
    call Poly1305Block64Bit
    ret

.Lblock_avx_pre:
    andq $-16, LEN
    test $63, LEN
    jz Poly1305BlockAVX512

.Lbase2_64_avx_body:

    push %rbx
    push %rbp
    push %r12
    push %r13
    push %r14
    push %r15

    movq LEN, %r15
    movq (CTX), ACC1                // load acc
    LOAD_ACC_R CTX, R0, R1, R2, ACC1, ACC2, ACC3, %r8d, %rax
    test %r8d, %r8d
    jz .Lbase2_64_avx_loop

    CONVERT_26TO64_PRE ACC1, ACC2, D1, D2, D3
    CONVERT_26TO64 ACC1, D1, ACC2, D2, D3, ACC3
    movl $0, 220(CTX)

.align 32
.Lbase2_64_avx_loop:
    addq (INP), ACC1
    adcq 8(INP), ACC2
    adcq PADBIT, ACC3
    lea 16(INP), INP

    POLY1305_MOD_MUL ACC1, ACC2, ACC3, R0, R1, R2

    subq $16, %r15
    test $63, %r15
    movq R1, %rax
    jnz .Lbase2_64_avx_loop

    movq ACC1, (CTX)
    movq ACC2, 8(CTX)
    movq ACC3, 16(CTX)
    movq %r15, LEN
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rbp
    pop %rbx

    jmp Poly1305BlockAVX512
    ret
.cfi_endproc
.size Poly1305Block, .-Poly1305Block
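
/*
 * Explanatory note (added comment; it only restates standard Poly1305 arithmetic
 * that is visible in the code above): the base 2^64 loop folds each 16-byte block
 * into a 130-bit accumulator held in ACC1/ACC2/ACC3 using the recurrence
 *     acc = (acc + block + padbit * 2^128) * r  mod (2^130 - 5)
 * The AVX-512 path below evaluates the same polynomial eight blocks at a time,
 * which is why it precomputes r^1..r^8 and uses the Horner grouping shown in the
 * comments of .Lblock_avx512_loop.
 */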

/**
 * Function description: x86_64 Poly1305 AVX-512 assembly acceleration implementation
 * Input registers:
 * CTX: address of the Poly1305_Ctx structure
 * INP: pointer to the input data
 * LEN: length of the input data
 * PADBIT: padding bit, 0 or 1
 * Clobbered registers: zmm0-31, rax, rsp, r11, rcx, rdi, k1-k3
 * Output register:
 * rax: length of the remaining data to be processed
 * Function/Macro Call:
 * CONVERT_64TO26
 */
.globl Poly1305BlockAVX512
.type Poly1305BlockAVX512, @function
.align 32
Poly1305BlockAVX512:
.cfi_startproc
    push %rbx
    push %rbp
    push %r12
    push %r13
    push %r14
    push %r15

    vzeroupper
    movq (CTX), ACC1
    movq 8(CTX), ACC2
    movq 16(CTX), ACC3
    movl 220(CTX), %r8d
    test %r8d, %r8d
    jnz .Lblock_avx512_pre
    movq LEN, %r15
    CONVERT_64TO26 ACC1, ACC2, ACC3, %rax, %rdx
    movq %r15, LEN
    jmp .Lblock_avx512_body

.Lblock_avx512_pre:
    movd %r14, %xmm0
    movd %rbx, %xmm2
    shrq $32, %r14
    shrq $32, %rbx
    movd %r14, %xmm1
    movd %rbx, %xmm3
    movd %rbp, %xmm4

.Lblock_avx512_body:

    movl $15, %eax
    kmovw %eax, %k2
    leaq -8(%rsp), %r11
    subq $0x128, %rsp
    leaq 56(CTX), CTX
    vmovdqa g_permd_avx2(%rip), YT2         // g_permd_avx2

    // Extend the precomputed table of powers of r up to r^8
    andq $-512, %rsp
    movq $0x20, %rax
    vmovdqu (CTX), %xmm11
    vmovdqu 16(CTX), %xmm12
    vmovdqu 32(CTX), %xmm5
    vmovdqu 48(CTX), %xmm13
    vmovdqu 64(CTX), %xmm6
    vmovdqu 80(CTX), %xmm14
    vpermd ZB0, ZT2, ZR0                    // 00 00 34 12 -> 14 24 34 44
    vmovdqu 96(CTX), %xmm8
    vpbroadcastq g_mask26(%rip), ZMASK      // g_mask26
    vmovdqu 112(CTX), %xmm15
    vpermd ZB1, ZT2, ZR1
    vmovdqu 128(CTX), %xmm9
    vpermd ZT0, ZT2, ZS1
    vpermd ZB2, ZT2, ZR2
    vmovdqa64 ZR0, (%rsp){%k2}
    vpsrlq $32, ZR0, ZT0                    // 14 24 34 44 -> 01 02 03 04
    vpermd ZT1, ZT2, ZS2
    vmovdqu64 ZR1, (%rsp, %rax){%k2}
    vpsrlq $32, ZR1, ZT1
    vpermd ZB3, ZT2, ZR3
    vmovdqa64 ZS1, 0x40(%rsp){%k2}
    vpermd ZT3, ZT2, ZS3
    vmovdqu64 ZR2, 0x40(%rsp, %rax){%k2}
    vpermd ZB4, ZT2, ZR4
    vmovdqa64 ZS2, 0x80(%rsp){%k2}
    vpermd ZT4, ZT2, ZS4
    vmovdqu64 ZR3, 0x80(%rsp, %rax){%k2}
    vmovdqa64 ZS3, 0xc0(%rsp){%k2}
    vmovdqu64 ZR4, 0xc0(%rsp, %rax){%k2}
    vmovdqa64 ZS4, 0x100(%rsp){%k2}

    vpmuludq ZT0, ZR0, ZB0
    vpmuludq ZT0, ZR1, ZB1
    vpmuludq ZT0, ZR2, ZB2
    vpmuludq ZT0, ZR3, ZB3
    vpmuludq ZT0, ZR4, ZB4
    vpsrlq $32, ZR2, ZT2

    vpmuludq ZT1, ZS4, ZM0
    vpmuludq ZT1, ZR0, ZM1
    vpmuludq ZT1, ZR1, ZM2
    vpmuludq ZT1, ZR2, ZM3
    vpmuludq ZT1, ZR3, ZM4
    vpsrlq $32, ZR3, ZT3
    vpaddq ZM0, ZB0, ZB0
    vpaddq ZM1, ZB1, ZB1
    vpaddq ZM2, ZB2, ZB2
    vpaddq ZM3, ZB3, ZB3
    vpaddq ZM4, ZB4, ZB4

    vpmuludq ZT2, ZS3, ZM0
    vpmuludq ZT2, ZS4, ZM1
    vpmuludq ZT2, ZR0, ZM2
    vpmuludq ZT2, ZR1, ZM3
    vpmuludq ZT2, ZR2, ZM4
    vpsrlq $32, ZR4, ZT4
    vpaddq ZM0, ZB0, ZB0
    vpaddq ZM1, ZB1, ZB1
    vpaddq ZM2, ZB2, ZB2
    vpaddq ZM3, ZB3, ZB3
    vpaddq ZM4, ZB4, ZB4

    vpmuludq ZT3, ZS2, ZM0
    vpmuludq ZT3, ZS3, ZM1
    vpmuludq ZT3, ZS4, ZM2
    vpmuludq ZT3, ZR0, ZM3
    vpmuludq ZT3, ZR1, ZM4
    vpaddq ZM0, ZB0, ZB0
    vpaddq ZM1, ZB1, ZB1
    vpaddq ZM2, ZB2, ZB2
    vpaddq ZM3, ZB3, ZB3
    vpaddq ZM4, ZB4, ZB4

    vpmuludq ZT4, ZS1, ZM0
    vpmuludq ZT4, ZS2, ZM1
    vpmuludq ZT4, ZS3, ZM2
    vpmuludq ZT4, ZS4, ZM3
    vpmuludq ZT4, ZR0, ZM4
    vpaddq ZM0, ZB0, ZB0
    vpaddq ZM1, ZB1, ZB1
    vpaddq ZM2, ZB2, ZB2
    vpaddq ZM3, ZB3, ZB3
    vpaddq ZM4, ZB4, ZB4
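
    /*
     * Explanatory note (added comment): the limbs are kept in base 2^26, so after
     * the multiplications above each 64-bit lane can exceed 26 bits. The carry
     * chain below propagates d0 -> d1 -> ... -> d4, and the carry out of d4 wraps
     * back into d0 multiplied by 5, because 2^130 is congruent to 5 modulo 2^130 - 5:
     *     d0 += (d4 >> 26) * 5;   d4 &= 0x3ffffff;
     * The "* 5" is implemented as carry + 4*carry (vpaddq, then vpsllq $2 and a
     * second vpaddq).
     */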
    // reduction
    vpsrlq $26, ZB3, ZM3
    vpandq ZMASK, ZB3, ZB3
    vpaddq ZM3, ZB4, ZB4                    // d3 -> d4
    vpsrlq $26, ZB0, ZM0
    vpandq ZMASK, ZB0, ZB0
    vpaddq ZM0, ZB1, ZB1                    // d0 -> d1
    vpsrlq $26, ZB4, ZM4
    vpandq ZMASK, ZB4, ZB4
    vmovdqu64 (INP), ZT3
    vmovdqu64 64(INP), ZT4
    leaq 128(INP), INP
    vpsrlq $26, ZB1, ZM1
    vpandq ZMASK, ZB1, ZB1
    vpaddq ZM1, ZB2, ZB2                    // d1 -> d2
    vpaddq ZM4, ZB0, ZB0
    vpsllq $2, ZM4, ZM4
    vpaddq ZM4, ZB0, ZB0                    // d4 -> d0
    vpsrlq $26, ZB2, ZM2
    vpandq ZMASK, ZB2, ZB2
    vpaddq ZM2, ZB3, ZB3                    // d2 -> d3
    vpsrlq $26, ZB0, ZM0
    vpandq ZMASK, ZB0, ZB0
    vpaddq ZM0, ZB1, ZB1                    // d0 -> d1
    vpsrlq $26, ZB3, ZM3
    vpandq ZMASK, ZB3, ZB3
    vpaddq ZM3, ZB4, ZB4                    // d3 -> d4

    vpunpcklqdq ZT4, ZT3, ZT0
    vpunpckhqdq ZT4, ZT3, ZT4

    // Construct R and S in the layout required by the vector multiplications.
    vmovdqu32 g_permd_avx512(%rip), ZM0     // g_permd_avx512
    movl $0x7777, %eax
    kmovw %eax, %k1
    vpermd ZR0, ZM0, ZR0                    // 14 24 34 44 -> 1444 2444 3444 4444
    vpermd ZR1, ZM0, ZR1
    vpermd ZR2, ZM0, ZR2
    vpermd ZR3, ZM0, ZR3
    vpermd ZR4, ZM0, ZR4
    vpermd ZB0, ZM0, ZR0{%k1}               // 05 06 07 08 and 1444 2444 3444 4444 -> 1858 2868 3878 4888
    vpermd ZB1, ZM0, ZR1{%k1}
    vpermd ZB2, ZM0, ZR2{%k1}
    vpermd ZB3, ZM0, ZR3{%k1}
    vpermd ZB4, ZM0, ZR4{%k1}

    vpslld $2, ZR1, ZS1
    vpslld $2, ZR2, ZS2
    vpslld $2, ZR3, ZS3
    vpslld $2, ZR4, ZS4
    vpaddd ZR1, ZS1, ZS1
    vpaddd ZR2, ZS2, ZS2
    vpaddd ZR3, ZS3, ZS3
    vpaddd ZR4, ZS4, ZS4

    // Split the input message blocks into base 2^26 limbs.
    vpbroadcastq g_129(%rip), PADBIT_ZMM    // g_129
    vpsrlq $52, ZT0, ZT2
    vpsllq $12, ZT4, ZT3
    vporq ZT3, ZT2, ZT2
    vpsrlq $26, ZT0, ZT1
    vpsrlq $14, ZT4, ZT3
    vpsrlq $40, ZT4, ZT4                    // 4
    vpandq ZMASK, ZT0, ZT0                  // 0
    vpandq ZMASK, ZT2, ZT2                  // 2

    vpaddq ZH2, ZT2, ZH2
    subq $192, LEN
    jbe .Lblock_avx512_tail
    jmp .Lblock_avx512_loop

.align 32
.Lblock_avx512_loop:

    // ((inp[0] * r^8 + inp[ 8]) * r^8 + inp[16]) * r^8
    // ((inp[1] * r^8 + inp[ 9]) * r^8 + inp[17]) * r^7
    // ((inp[2] * r^8 + inp[10]) * r^8 + inp[18]) * r^6
    // ((inp[3] * r^8 + inp[11]) * r^8 + inp[19]) * r^5
    // ((inp[4] * r^8 + inp[12]) * r^8 + inp[20]) * r^4
    // ((inp[5] * r^8 + inp[13]) * r^8 + inp[21]) * r^3
    // ((inp[6] * r^8 + inp[14]) * r^8 + inp[22]) * r^2
    // ((inp[7] * r^8 + inp[15]) * r^8 + inp[23]) * r^1

    // b3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
    // b4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
    // b0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
    // b1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
    // b2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
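
    /*
     * Explanatory note (added comment): the b0..b4 formulas above are the
     * schoolbook product of h = h0 + h1*2^26 + ... + h4*2^104 and
     * r = r0 + r1*2^26 + ... + r4*2^104, reduced using the congruence of 2^130
     * with 5 modulo 2^130 - 5. Every partial product h_i*r_j with i + j >= 5
     * therefore appears multiplied by 5, which is exactly what the precomputed
     * values s_i = 5*r_i (built above with vpslld $2 + vpaddd, i.e. 4*r_i + r_i)
     * supply, e.g.
     *     b0 = h0*r0 + h1*(5*r4) + h2*(5*r3) + h3*(5*r2) + h4*(5*r1)
     */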
    vpmuludq ZH2, ZR1, ZB3
    vpandq ZMASK, ZT1, ZT1                  // 1
    vpmuludq ZH2, ZR2, ZB4
    vpandq ZMASK, ZT3, ZT3                  // 3
    vpmuludq ZH2, ZS3, ZB0
    vporq PADBIT_ZMM, ZT4, ZT4
    vpmuludq ZH2, ZS4, ZB1
    vpaddq ZH0, ZT0, ZH0
    vpmuludq ZH2, ZR0, ZB2
    vpaddq ZH1, ZT1, ZH1
    vpaddq ZH3, ZT3, ZH3
    vpaddq ZH4, ZT4, ZH4
    vmovdqu64 (INP), ZT3
    vmovdqu64 64(INP), ZT4
    lea 128(INP), INP

    vpmuludq ZH0, ZR3, ZM3
    vpmuludq ZH0, ZR4, ZM4
    vpmuludq ZH0, ZR0, ZM0
    vpmuludq ZH0, ZR1, ZM1
    vpaddq ZM3, ZB3, ZB3
    vpaddq ZM4, ZB4, ZB4
    vpaddq ZM0, ZB0, ZB0
    vpaddq ZM1, ZB1, ZB1

    vpmuludq ZH1, ZR2, ZM3
    vpmuludq ZH1, ZR3, ZM4
    vpmuludq ZH1, ZS4, ZM0
    vpmuludq ZH0, ZR2, ZM2
    vpaddq ZM3, ZB3, ZB3
    vpaddq ZM4, ZB4, ZB4
    vpaddq ZM0, ZB0, ZB0
    vpaddq ZM2, ZB2, ZB2
    vpunpcklqdq ZT4, ZT3, ZT0
    vpunpckhqdq ZT4, ZT3, ZT4

    vpmuludq ZH3, ZR0, ZM3
    vpmuludq ZH3, ZR1, ZM4
    vpmuludq ZH1, ZR0, ZM1
    vpmuludq ZH1, ZR1, ZM2
    vpaddq ZM3, ZB3, ZB3
    vpaddq ZM4, ZB4, ZB4
    vpaddq ZM1, ZB1, ZB1
    vpaddq ZM2, ZB2, ZB2

    vpmuludq ZH4, ZS4, ZM3
    vpmuludq ZH4, ZR0, ZM4
    vpmuludq ZH3, ZS2, ZM0
    vpmuludq ZH3, ZS3, ZM1
    vpmuludq ZH3, ZS4, ZM2
    vpaddq ZM3, ZB3, ZB3
    vpaddq ZM4, ZB4, ZB4
    vpaddq ZM0, ZB0, ZB0
    vpaddq ZM1, ZB1, ZB1
    vpaddq ZM2, ZB2, ZB2

    vpmuludq ZH4, ZS1, ZM0
    vpmuludq ZH4, ZS2, ZM1
    vpmuludq ZH4, ZS3, ZM2
    vpaddq ZM0, ZB0, ZH0
    vpaddq ZM1, ZB1, ZH1
    vpaddq ZM2, ZB2, ZH2
    vpsrlq $52, ZT0, ZT2
    vpsllq $12, ZT4, ZT3

    // reduction
    vpsrlq $26, ZB3, ZH3
    vpandq ZMASK, ZB3, ZB3
    vpaddq ZH3, ZB4, ZH4
    vporq ZT3, ZT2, ZT2

    vpsrlq $26, ZH0, ZB0
    vpandq ZMASK, ZH0, ZH0
    vpaddq ZB0, ZH1, ZH1
    vpandq ZMASK, ZT2, ZT2

    vpsrlq $26, ZH4, ZB4
    vpandq ZMASK, ZH4, ZH4
    vpsrlq $26, ZH1, ZB1
    vpandq ZMASK, ZH1, ZH1
    vpaddq ZB1, ZH2, ZH2

    vpaddq ZB4, ZH0, ZH0
    vpsllq $2, ZB4, ZB4
    vpaddq ZB4, ZH0, ZH0
    vpaddq ZT2, ZH2, ZH2
    vpsrlq $26, ZT0, ZT1

    vpsrlq $26, ZH2, ZB2
    vpandq ZMASK, ZH2, ZH2
    vpaddq ZB2, ZB3, ZH3
    vpsrlq $14, ZT4, ZT3
    vpsrlq $40, ZT4, ZT4
    vpandq ZMASK, ZT0, ZT0

    vpsrlq $26, ZH0, ZB0
    vpandq ZMASK, ZH0, ZH0
    vpaddq ZB0, ZH1, ZH1

    vpsrlq $26, ZH3, ZB3
    vpandq ZMASK, ZH3, ZH3
    vpaddq ZB3, ZH4, ZH4

    subq $128, LEN
    ja .Lblock_avx512_loop

.align 32
.Lblock_avx512_tail:

    vpsrlq $32, ZR0, ZR0                    // 1858286838784888 -> 0105020603070408
    vpsrlq $32, ZR1, ZR1
    vpsrlq $32, ZS1, ZS1
    vpsrlq $32, ZR2, ZR2
    vpsrlq $32, ZS2, ZS2
    vpsrlq $32, ZR3, ZR3
    vpsrlq $32, ZS3, ZS3
    vpsrlq $32, ZR4, ZR4
    vpsrlq $32, ZS4, ZS4

    lea (INP, LEN), INP
    vpaddq ZH0, ZT0, ZH0
    vpmuludq ZH2, ZR1, ZB3
    vpandq ZMASK, ZT1, ZT1
    vpmuludq ZH2, ZR2, ZB4
    vpandq ZMASK, ZT3, ZT3
    vpmuludq ZH2, ZS3, ZB0
    vporq PADBIT_ZMM, ZT4, ZT4
    vpmuludq ZH2, ZS4, ZB1
    vpaddq ZH1, ZT1, ZH1
    vpmuludq ZH2, ZR0, ZB2
    vpaddq ZH3, ZT3, ZH3
    vpaddq ZH4, ZT4, ZH4

    vmovdqu (INP), %xmm5
    vmovdqu 16(INP), %xmm6
    vpmuludq ZH0, ZR3, ZM3
    vpmuludq ZH0, ZR4, ZM4
    vpmuludq ZH0, ZR0, ZM0
    vpmuludq ZH0, ZR1, ZM1
    vpaddq ZM3, ZB3, ZB3
    vpaddq ZM4, ZB4, ZB4
    vpaddq ZM0, ZB0, ZB0
    vpaddq ZM1, ZB1, ZB1

    vinserti128 $1, 32(INP), YT0, YT0
    vinserti128 $1, 48(INP), YT1, YT1
    vpmuludq ZH1, ZR2, ZM3
    vpmuludq ZH1, ZR3, ZM4
    vpmuludq ZH1, ZS4, ZM0
    vpmuludq ZH0, ZR2, ZM2
    vpaddq ZM3, ZB3, ZB3
    vpaddq ZM4, ZB4, ZB4
    vpaddq ZM0, ZB0, ZB0
    vpaddq ZM2, ZB2, ZB2

    vpmuludq ZH3, ZR0, ZM3
    vpmuludq ZH3, ZR1, ZM4
    vpmuludq ZH1, ZR0, ZM1
    vpmuludq ZH1, ZR1, ZM2
    vpaddq ZM3, ZB3, ZB3
    vpaddq ZM4, ZB4, ZB4
    vpaddq ZM1, ZB1, ZB1
    vpaddq ZM2, ZB2, ZB2

    vpmuludq ZH4, ZS4, ZM3
    vpmuludq ZH4, ZR0, ZM4
    vpmuludq ZH3, ZS2, ZM0
    vpmuludq ZH3, ZS3, ZM1
    vpmuludq ZH3, ZS4, ZM2
    vpaddq ZM3, ZB3, ZH3
    vpaddq ZM4, ZB4, ZB4
    vpaddq ZM0, ZB0, ZB0
    vpaddq ZM1, ZB1, ZB1
    vpaddq ZM2, ZB2, ZB2

    vpmuludq ZH4, ZS1, ZM0
    vpmuludq ZH4, ZS2, ZM1
    vpmuludq ZH4, ZS3, ZM2
    vpaddq ZM0, ZB0, ZH0
    vpaddq ZM1, ZB1, ZH1
    vpaddq ZM2, ZB2, ZH2
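
    /*
     * Explanatory note (added comment): each vector lane now holds a partial
     * accumulator that has already been weighted by its power of r, so the final
     * hash state is simply the lane-wise sum of h0..h4. The vpermq/vpaddq and
     * vextracti64x4 sequence below folds the lanes: swap adjacent 64-bit lanes
     * and add, bring the upper 128-bit half of each 256-bit lane down and add,
     * then add the upper 256 bits into lane 0 (merge-masked with %k3).
     */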

    // Accumulate the per-lane results across blocks
    movl $1, %eax
    kmovw %eax, %k3
    vpermq $0xb1, ZH0, ZB0
    vpermq $0xb1, ZH1, ZB1
    vpermq $0xb1, ZH2, ZB2
    vpermq $0xb1, ZH3, ZB3
    vpermq $0xb1, ZB4, ZH4
    vpaddq ZB0, ZH0, ZH0
    vpaddq ZB1, ZH1, ZH1
    vpaddq ZB2, ZH2, ZH2
    vpaddq ZB3, ZH3, ZH3
    vpaddq ZB4, ZH4, ZH4
    vpermq $0x2, ZH0, ZB0
    vpermq $0x2, ZH1, ZB1
    vpermq $0x2, ZH2, ZB2
    vpermq $0x2, ZH3, ZB3
    vpermq $0x2, ZH4, ZB4
    vpaddq ZB0, ZH0, ZH0
    vpaddq ZB1, ZH1, ZH1
    vpaddq ZB2, ZH2, ZH2
    vpaddq ZB3, ZH3, ZH3
    vpaddq ZB4, ZH4, ZH4

    vextracti64x4 $0x1, ZH0, YB0
    vextracti64x4 $0x1, ZH1, YB1
    vextracti64x4 $0x1, ZH2, YB2
    vextracti64x4 $0x1, ZH3, YB3
    vextracti64x4 $0x1, ZH4, YB4
    vpaddq ZB0, ZH0, ZH0{%k3}{z}
    vpaddq ZB1, ZH1, ZH1{%k3}{z}
    vpaddq ZB2, ZH2, ZH2{%k3}{z}
    vpaddq ZB3, ZH3, ZH3{%k3}{z}
    vpaddq ZB4, ZH4, ZH4{%k3}{z}

    // reduction
    vpsrlq $26, YH3, YB3
    vpandq YMASK, YH3, YH3
    vpaddq YB3, YH4, YH4
    vpsrldq $6, YT0, YT2
    vpsrldq $6, YT1, YT3

    vpsrlq $26, YH0, YB0
    vpandq YMASK, YH0, YH0
    vpaddq YB0, YH1, YH1
    vpunpckhqdq YT1, YT0, YT4
    vpunpcklqdq YT1, YT0, YT0
    vpunpcklqdq YT3, YT2, YT2

    vpsrlq $26, YH4, YB4
    vpandq YMASK, YH4, YH4
    vpsrlq $26, YH1, YB1
    vpandq YMASK, YH1, YH1
    vpaddq YB1, YH2, YH2
    vpsrlq $30, YT2, YT3
    vpsrlq $4, YT2, YT2

    vpaddq YB4, YH0, YH0
    vpsllq $2, YB4, YB4
    vpaddq YB4, YH0, YH0
    vpsrlq $26, YT0, YT1
    vpsrlq $40, YT4, YT4

    vpsrlq $26, YH2, YB2
    vpandq YMASK, YH2, YH2
    vpaddq YB2, YH3, YH3
    vpand YMASK, YT2, YT2
    vpand YMASK, YT3, YT3

    vpsrlq $26, YH0, YB0
    vpandq YMASK, YH0, YH0
    vpaddq YB0, YH1, YH1
    vpaddq YH2, YT2, YH2
    vpand YMASK, YT1, YT1

    vpsrlq $26, YH3, YB3
    vpand YMASK, YH3, YH3
    vpaddq YB3, YH4, YH4
    vpand YMASK, YT0, YT0
    vpor g_129(%rip), YT4, YT4

    addq $64, LEN
    jnz .Lblock_4_tail

    vpsubq YT2, YH2, YH2
    jmp .Lblock_avx512_end

.align 32
.Lblock_4_tail:
    BLOCK4_AVX2_TAIL YT0, YT1, YT2, YT3, YT4, YH0, YH1, YH2, YH3, YH4, YB0, YB1, YB2, YB3, YB4, YMASK, %rsp

.Lblock_avx512_end:
    vmovd %xmm0, -56(CTX)
    vmovd %xmm1, -52(CTX)
    vmovd %xmm2, -48(CTX)
    vmovd %xmm3, -44(CTX)
    vmovd %xmm4, -40(CTX)
    vzeroall
    lea 8(%r11), %rsp

    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rbp
    pop %rbx
    movq LEN, %rax
    ret
.cfi_endproc
.size Poly1305BlockAVX512, .-Poly1305BlockAVX512

/**
 * Function description: clears residual sensitive data from the vector registers.
 * Function prototype: void Poly1305CleanRegister();
 * Input register: None
 * Modified registers: zmm0-zmm15 (via vzeroall) and zmm16-zmm29 (cleared explicitly)
 * Output register: None
 * Function/Macro Call: None
 */
.globl Poly1305CleanRegister
.type Poly1305CleanRegister, @function
Poly1305CleanRegister:
.cfi_startproc
    vzeroall
    vpxorq ZR0, ZR0, ZR0
    vpxorq ZR1, ZR1, ZR1
    vpxorq ZR2, ZR2, ZR2
    vpxorq ZR3, ZR3, ZR3
    vpxorq ZR4, ZR4, ZR4
    vpxorq ZS1, ZS1, ZS1
    vpxorq ZS2, ZS2, ZS2
    vpxorq ZS3, ZS3, ZS3
    vpxorq ZS4, ZS4, ZS4
    vpxorq ZM0, ZM0, ZM0
    vpxorq ZM1, ZM1, ZM1
    vpxorq ZM2, ZM2, ZM2
    vpxorq ZM3, ZM3, ZM3
    vpxorq ZM4, ZM4, ZM4
    ret
.cfi_endproc
.size Poly1305CleanRegister, .-Poly1305CleanRegister

#endif