/*
 * This file is part of the openHiTLS project.
 *
 * openHiTLS is licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *          http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 */
#include "hitls_build.h"
#if defined(HITLS_CRYPTO_CHACHA20) && defined(HITLS_CRYPTO_CHACHA20POLY1305)

#include "poly1305_x86_64.S"

.file "poly1305_x86_64_avx2.S"
.text

/**
 * Function description: x86_64 implementation of Poly1305 block processing. The result is
 *                       stored in ctx->acc.
 * Function prototype: uint32_t Poly1305Block(Poly1305_Ctx *ctx, const uint8_t *data,
 *                                            uint32_t dataLen, uint32_t padbit);
 * Input registers:
 *     CTX: address of the Poly1305_Ctx structure
 *     INP: pointer to the input data
 *     LEN: length of the input data
 *     PADBIT: padding bit, 0 or 1
 * Modified registers: r8-r14, rbx, rbp
 * Output register:
 *     %rax: length of the remaining data to be processed
 * Function/Macro Call: POLY1305_MOD_MUL
 */
.globl Poly1305Block
.type Poly1305Block,@function
Poly1305Block:
.cfi_startproc
.align 32
    cmp $256, LEN
    jae .Lblock_avx_pre
    jmp Poly1305Block64Bit

.Lblock_avx_pre:
    andq $-16, LEN
    test $63, LEN
    jz Poly1305BlockAVX2

.Lbase2_64_avx_body:

    push %rbx
    push %rbp
    push %r12
    push %r13
    push %r14
    push %r15

    movq LEN, %r15
    LOAD_ACC_R CTX, R0, R1, R2, ACC1, ACC2, ACC3, %r8d, %rax
    test %r8d, %r8d
    jz .Lbase2_64_avx_loop

    CONVERT_26TO64_PRE ACC1, ACC2, D1, D2, D3
    CONVERT_26TO64 ACC1, D1, ACC2, D2, D3, ACC3
    movl $0, 220(CTX)

.align 32
.Lbase2_64_avx_loop:
    addq (INP), ACC1
    adcq 8(INP), ACC2
    adcq PADBIT, ACC3
    lea 16(INP), INP

    POLY1305_MOD_MUL ACC1, ACC2, ACC3, R0, R1, R2

    subq $16, %r15
    movq R1, %rax
    test $63, %r15
    jnz .Lbase2_64_avx_loop

    movq ACC1, (CTX)
    movq ACC2, 8(CTX)
    movq ACC3, 16(CTX)
    movq %r15, LEN
    pop %r15
    pop %r14
    pop %r13
    pop %r12
    pop %rbp
    pop %rbx

    jmp Poly1305BlockAVX2
    ret
.cfi_endproc
.size Poly1305Block, .-Poly1305Block
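
/*
 * For reference, each pass of .Lbase2_64_avx_loop above is one scalar Poly1305 step on a 16-byte
 * block m (a sketch of the arithmetic only, with p = 2^130 - 5; the limb bookkeeping itself is
 * done by POLY1305_MOD_MUL, presumably defined in the included poly1305_x86_64.S):
 *
 *     acc = acc + (m + padbit * 2^128)     // addq/adcq into ACC1:ACC2:ACC3
 *     acc = (acc * r) mod p                // POLY1305_MOD_MUL
 *
 * The AVX2 routine below evaluates the same polynomial four blocks per iteration, which is why
 * the accumulator has to move between the three 64-bit limbs used here (base2_64) and the five
 * 26-bit limbs used there (base2_26).
 */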

/**
 * Function description: x86_64 Poly1305 AVX2 implementation
 * Input registers:
 *     CTX: address of the Poly1305_Ctx structure
 *     INP: pointer to the input data
 *     LEN: length of the input data
 *     PADBIT: padding bit, 0 or 1
 * Modified registers: ymm0-15, r8, r9, r14, r15, rax, rbx, rdx, rbp
 * Output register:
 *     rax: length of the remaining data to be processed
 * Function/Macro Call:
 *     CONVERT_64TO26
 */
.globl Poly1305BlockAVX2
.type Poly1305BlockAVX2, @function
.align 32
Poly1305BlockAVX2:
.cfi_startproc
    push %rbx
    push %rbp
    push %r14
    push %r15

    vzeroupper
    movq (CTX), ACC1                        // load acc
    movq 8(CTX), ACC2
    movq 16(CTX), ACC3
    movl 220(CTX), %r8d
    test %r8d, %r8d
    jnz .Lblock_avx2_pre
    movq LEN, %r15
    CONVERT_64TO26 ACC1, ACC2, ACC3, %rax, %rdx     // base2_64 --> base2_26
    movq %r15, LEN
    jmp .Lblock_avx2_body

.Lblock_avx2_pre:
    movd %r14, %xmm0
    movd %rbx, %xmm2
    movd %rbp, %xmm4
    shrq $32, %r14
    shrq $32, %rbx
    movd %r14, %xmm1
    movd %rbx, %xmm3

.align 32
.Lblock_avx2_body:

    leaq 56(CTX), CTX                       // CTX now points to the precomputed table at offset 56
    vmovdqu g_permd_avx2(%rip), YT0         // g_permd_avx2
    leaq -8(%rsp), %r11

    /* Transform the contents of the precomputation table into a computable form and place them on the stack. */
    vmovdqu (CTX), %xmm7
    vmovdqu 16(CTX), %xmm8
    subq $0x128, %rsp
    vmovdqu 32(CTX), %xmm9
    vmovdqu 48(CTX), %xmm11
    andq $-512, %rsp
    vmovdqu 64(CTX), %xmm12
    vmovdqu 80(CTX), %xmm13
    vpermd YT2, YT0, YT2                    // 00 00 34 12 --> 14 24 34 44
    vmovdqu 96(CTX), %xmm14
    vpermd YT3, YT0, YT3
    vmovdqu 112(CTX), %xmm15
    vpermd YT4, YT0, YT4
    vmovdqu 128(CTX), %xmm10
    vpermd YB0, YT0, YB0
    vmovdqa YT2, (%rsp)                     // r0
    vpermd YB1, YT0, YB1
    vmovdqa YT3, 0x20(%rsp)                 // r1
    vpermd YB2, YT0, YB2
    vmovdqa YT4, 0x40(%rsp)                 // s1
    vpermd YB3, YT0, YB3
    vmovdqa YB0, 0x60(%rsp)                 // r2
    vpermd YB4, YT0, YB4
    vmovdqa YB1, 0x80(%rsp)                 // s2
    vpermd YMASK, YT0, YMASK
    vmovdqa YB2, 0xa0(%rsp)                 // r3
    vmovdqa YB3, 0xc0(%rsp)                 // s3
    vmovdqa YB4, 0xe0(%rsp)                 // r4
    vmovdqa YMASK, 0x100(%rsp)              // s4

    /* Load 4 blocks of data and convert them to base2_26. */
    vmovdqu g_mask26(%rip), YMASK           // g_mask26
    vmovdqu (INP), %xmm5
    vmovdqu 16(INP), %xmm6
    vinserti128 $1, 32(INP), YT0, YT0
    vinserti128 $1, 48(INP), YT1, YT1
    leaq 64(INP), INP

    vpsrldq $6, YT0, YT2
    vpsrldq $6, YT1, YT3
    vpunpckhqdq YT1, YT0, YT4
    vpunpcklqdq YT1, YT0, YT0
    vpunpcklqdq YT3, YT2, YT2

    vpsrlq $26, YT0, YT1
    vpsrlq $30, YT2, YT3
    vpsrlq $4, YT2, YT2
    vpsrlq $40, YT4, YT4                    // 4
    vpand YMASK, YT3, YT3                   // 3
    vpand YMASK, YT2, YT2                   // 2
    vpor g_129(%rip), YT4, YT4              // padbit
    vpand YMASK, YT1, YT1                   // 1
    vpand YMASK, YT0, YT0                   // 0

    vpaddq YH2, YT2, YH2
    sub $64, LEN
    jz .Lblock_avx2_tail
    jmp .Lblock_avx2_loop
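
/*
 * At this point each 16-byte block m is held as five 26-bit limbs t0..t4 such that
 *
 *     m + 2^128 = t0 + t1*2^26 + t2*2^52 + t3*2^78 + t4*2^104
 *
 * which is what the vpsrldq/vpsrlq/vpand sequence above produces; the vpor with g_129 sets the
 * 2^128 pad bit in limb 4 (this assumes g_129 holds 1 << 24 in each lane). Keeping limbs at
 * 26 bits leaves headroom in each 64-bit lane for the vpmuludq products and the deferred carries
 * in the loop below.
 */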

.align 32
.Lblock_avx2_loop:

    // ((inp[0]*r^4 + inp[4])*r^4 + inp[ 8])*r^4
    // ((inp[1]*r^4 + inp[5])*r^4 + inp[ 9])*r^3
    // ((inp[2]*r^4 + inp[6])*r^4 + inp[10])*r^2
    // ((inp[3]*r^4 + inp[7])*r^4 + inp[11])*r^1
    vpaddq YH0, YT0, YH0
    vpaddq YH1, YT1, YH1
    vpaddq YH3, YT3, YH3
    vpaddq YH4, YT4, YH4
    vmovdqa (%rsp), YT0                     // r0^4
    vmovdqa 0x20(%rsp), YT1                 // r1^4
    vmovdqa 0x60(%rsp), YT2                 // r2^4
    vmovdqa 0xc0(%rsp), YT3                 // s3^4
    vmovdqa 0x100(%rsp), YMASK              // s4^4

    // b4 = h4*r0^4 + h3*r1^4 + h2*r2^4 + h1*r3^4 + h0*r4^4
    // b3 = h3*r0^4 + h2*r1^4 + h1*r2^4 + h0*r3^4 + h4*s4^4
    // b2 = h2*r0^4 + h1*r1^4 + h0*r2^4 + h4*s3^4 + h3*s4^4
    // b1 = h1*r0^4 + h0*r1^4 + h4*s2^4 + h3*s3^4 + h2*s4^4
    // b0 = h0*r0^4 + h4*s1^4 + h3*s2^4 + h2*s3^4 + h1*s4^4
    //
    // h2 is multiplied first, so the formulas above can be rearranged as
    //
    // b4 = h2*r2^4 + h4*r0^4 + h3*r1^4            + h1*r3^4 + h0*r4^4
    // b3 = h2*r1^4 + h3*r0^4            + h1*r2^4 + h0*r3^4 + h4*s4^4
    // b2 = h2*r0^4            + h1*r1^4 + h0*r2^4 + h4*s3^4 + h3*s4^4
    // b1 = h2*s4^4 + h1*r0^4 + h0*r1^4 + h4*s2^4 + h3*s3^4
    // b0 = h2*s3^4 + h0*r0^4 + h4*s1^4 + h3*s2^4            + h1*s4^4

    vpmuludq YH2, YT0, YB2                  // b2 = h2 * r0^4
    vpmuludq YH2, YT1, YB3                  // b3 = h2 * r1^4
    vpmuludq YH2, YT2, YB4                  // b4 = h2 * r2^4
    vpmuludq YH2, YT3, YB0                  // b0 = h2 * s3^4
    vpmuludq YH2, YMASK, YB1                // b1 = h2 * s4^4

    vpmuludq YH1, YT1, YT4                  // h1 * r1^4 (available scratch registers: T4, H2)
    vpmuludq YH0, YT1, YH2                  // h0 * r1^4
    vpaddq YT4, YB2, YB2                    // b2 += h1 * r1^4
    vpaddq YH2, YB1, YB1                    // b1 += h0 * r1^4
    vpmuludq YH3, YT1, YT4                  // h3 * r1^4
    vpmuludq 0x40(%rsp), YH4, YH2           // h4 * s1^4
    vpaddq YT4, YB4, YB4                    // b4 += h3 * r1^4
    vpaddq YH2, YB0, YB0                    // b0 += h4 * s1^4
    vmovdqa 0x80(%rsp), YT1                 // load s2^4

    vpmuludq YH4, YT0, YT4                  // h4 * r0^4 (available scratch registers: T4, H2)
    vpmuludq YH3, YT0, YH2                  // h3 * r0^4
    vpaddq YT4, YB4, YB4                    // b4 += h4 * r0^4
    vpaddq YH2, YB3, YB3                    // b3 += h3 * r0^4
    vpmuludq YH0, YT0, YT4                  // h0 * r0^4
    vpmuludq YH1, YT0, YH2                  // h1 * r0^4
    vpaddq YT4, YB0, YB0                    // b0 += h0 * r0^4
    vpaddq YH2, YB1, YB1                    // b1 += h1 * r0^4
    vmovdqu (INP), %xmm5                    // load input (YT0)

    vpmuludq YH4, YT1, YT4                  // h4 * s2^4
    vpmuludq YH3, YT1, YH2                  // h3 * s2^4
    vinserti128 $1, 32(INP), YT0, YT0
    vpaddq YT4, YB1, YB1                    // b1 += h4 * s2^4
    vpaddq YH2, YB0, YB0                    // b0 += h3 * s2^4
    vpmuludq YH1, YT2, YT4                  // h1 * r2^4 (available scratch registers: T4, H2)
    vpmuludq YH0, YT2, YH2                  // h0 * r2^4
    vmovdqu 16(INP), %xmm6                  // load input (YT1)
    vpaddq YT4, YB3, YB3                    // b3 += h1 * r2^4
    vpaddq YH2, YB2, YB2                    // b2 += h0 * r2^4
    vinserti128 $1, 48(INP), YT1, YT1
    vmovdqa 0xa0(%rsp), YH2                 // load r3^4
    leaq 64(INP), INP

    vpmuludq YH1, YH2, YT4                  // h1 * r3^4 (available scratch registers: T4, H2)
    vpmuludq YH0, YH2, YH2                  // h0 * r3^4
    vpsrldq $6, YT0, YT2
    vpaddq YT4, YB4, YB4                    // b4 += h1 * r3^4
    vpaddq YH2, YB3, YB3                    // b3 += h0 * r3^4
    vpmuludq YH4, YT3, YT4                  // h4 * s3^4
    vpmuludq YH3, YT3, YH2                  // h3 * s3^4
    vpsrldq $6, YT1, YT3
    vpaddq YT4, YB2, YB2                    // b2 += h4 * s3^4
    vpaddq YH2, YB1, YB1                    // b1 += h3 * s3^4 (finish)
    vpunpckhqdq YT1, YT0, YT4

    vpmuludq YH3, YMASK, YH3                // h3 * s4^4
    vpmuludq YH4, YMASK, YH4                // h4 * s4^4
    vpunpcklqdq YT1, YT0, YT0
    vpaddq YB2, YH3, YH2                    // h2 += h3 * s4^4 (finish)
    vpaddq YB3, YH4, YH3                    // h3 += h4 * s4^4 (finish)
    vpunpcklqdq YT3, YT2, YT3
    vpmuludq 0xe0(%rsp), YH0, YH4           // h0 * r4^4
    vpmuludq YH1, YMASK, YH0                // h1 * s4^4
    vmovdqu g_mask26(%rip), YMASK
    vpaddq YH4, YB4, YH4                    // h4 += h0 * r4^4 (finish)
    vpaddq YH0, YB0, YH0                    // h0 += h1 * s4^4 (finish)

    // reduction
    vpsrlq $26, YH3, YB3
    vpand YMASK, YH3, YH3
    vpaddq YB3, YH4, YH4                    // h3 -> h4
    vpsrlq $26, YH0, YB0
    vpand YMASK, YH0, YH0
    vpaddq YB0, YB1, YH1                    // h0 -> h1
    vpsrlq $26, YH4, YB4
    vpand YMASK, YH4, YH4
    vpsrlq $4, YT3, YT2
    vpsrlq $26, YH1, YB1
    vpand YMASK, YH1, YH1
    vpaddq YB1, YH2, YH2                    // h1 -> h2
    vpaddq YB4, YH0, YH0
    vpsllq $2, YB4, YB4
    vpaddq YB4, YH0, YH0                    // h4 -> h0
    vpand YMASK, YT2, YT2
    vpsrlq $26, YT0, YT1
    vpsrlq $26, YH2, YB2
    vpand YMASK, YH2, YH2
    vpaddq YB2, YH3, YH3                    // h2 -> h3
    vpaddq YT2, YH2, YH2                    // prepare next 4 blocks
    vpsrlq $30, YT3, YT3
    vpsrlq $26, YH0, YB0
    vpand YMASK, YH0, YH0
    vpaddq YB0, YH1, YH1                    // h0 -> h1
    vpsrlq $40, YT4, YT4
    vpsrlq $26, YH3, YB3
    vpand YMASK, YH3, YH3
    vpaddq YB3, YH4, YH4                    // h3 -> h4

    vpand YMASK, YT0, YT0                   // new input 0
    vpand YMASK, YT1, YT1                   // new input 1
    vpand YMASK, YT3, YT3                   // new input 3
    vpor g_129(%rip), YT4, YT4              // new input 4, padbit

    subq $64, LEN
    jnz .Lblock_avx2_loop
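
/*
 * The reduction inside the loop is lazy: it does not produce a canonical value, it only moves
 * carries far enough to keep every limb small before the next round of vpmuludq. Per 64-bit lane,
 * with mask = 2^26 - 1 and following the order of the "hX -> hY" comments above, the steps are:
 *
 *     c = h3 >> 26; h3 &= mask; h4 += c
 *     c = h0 >> 26; h0 &= mask; h1 += c
 *     c = h1 >> 26; h1 &= mask; h2 += c
 *     c = h4 >> 26; h4 &= mask; h0 += 5*c      // 2^130 == 5 mod p, done as c + (c << 2)
 *     c = h2 >> 26; h2 &= mask; h3 += c
 *     c = h0 >> 26; h0 &= mask; h1 += c
 *     c = h3 >> 26; h3 &= mask; h4 += c
 *
 * interleaved with splitting the next four input blocks. The final horizontal add across lanes and
 * the last full carry propagation are presumably handled by BLOCK4_AVX2_TAIL below.
 */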

.Lblock_avx2_tail:
    BLOCK4_AVX2_TAIL YT0, YT1, YT2, YT3, YT4, YH0, YH1, YH2, YH3, YH4, YB0, YB1, YB2, YB3, YB4, YMASK, %rsp

    vmovd %xmm0, -56(CTX)
    vmovd %xmm1, -52(CTX)
    vmovd %xmm2, -48(CTX)
    vmovd %xmm3, -44(CTX)
    vmovd %xmm4, -40(CTX)
    vzeroupper
    leaq 8(%r11), %rsp
    pop %r15
    pop %r14
    pop %rbp
    pop %rbx
    movq LEN, %rax
    ret
.cfi_endproc
.size Poly1305BlockAVX2, .-Poly1305BlockAVX2

/**
 * Function description: This function clears residual sensitive information from the registers.
 * Function prototype: void Poly1305CleanRegister();
 */
.globl Poly1305CleanRegister
.type Poly1305CleanRegister, @function
Poly1305CleanRegister:
.cfi_startproc
    vzeroall
    ret
.cfi_endproc
.size Poly1305CleanRegister, .-Poly1305CleanRegister

#endif