# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
.text

.extern GFp_ia32cap_P
.hidden GFp_ia32cap_P

.align 64
.Lzero:
.long 0,0,0,0
.Lone:
.long 1,0,0,0
.Linc:
.long 0,1,2,3
.Lfour:
.long 4,4,4,4
.Lincy:
.long 0,2,4,6,1,3,5,7
.Leight:
.long 8,8,8,8,8,8,8,8
.Lrot16:
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.align 64
.Lzeroz:
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.globl GFp_ChaCha20_ctr32
.hidden GFp_ChaCha20_ctr32
.type GFp_ChaCha20_ctr32,@function
.align 64
GFp_ChaCha20_ctr32:
.cfi_startproc
	cmpq $0,%rdx
	je .Lno_data
	movq GFp_ia32cap_P+4(%rip),%r10
	testl $512,%r10d
	jnz .LChaCha20_ssse3

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset r15,-56
	subq $64+24,%rsp
.cfi_adjust_cfa_offset 88
.Lctr32_body:

	movdqu (%rcx),%xmm1
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa .Lone(%rip),%xmm4

	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	movq %rdx,%rbp
	jmp .Loop_outer

.align 32
.Loop_outer:
	movl $0x61707865,%eax
	movl $0x3320646e,%ebx
	movl $0x79622d32,%ecx
	movl $0x6b206574,%edx
	movl 16(%rsp),%r8d
	movl 20(%rsp),%r9d
	movl 24(%rsp),%r10d
	movl 28(%rsp),%r11d
	movd %xmm3,%r12d
	movl 52(%rsp),%r13d
	movl 56(%rsp),%r14d
	movl 60(%rsp),%r15d

	movq %rbp,64+0(%rsp)
	movl $10,%ebp
	movq %rsi,64+8(%rsp)
.byte 102,72,15,126,214
	movq %rdi,64+16(%rsp)
	movq %rsi,%rdi
	shrq $32,%rdi
	jmp .Loop
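# The ten .Loop iterations below are ChaCha20 double rounds: four column
# quarter-rounds, then four diagonal quarter-rounds, each the usual
# add/xor/rotate sequence with rotate counts 16, 12, 8, 7.  The 16-word
# state is juggled across 14 GPRs: x0-x3 in %eax..%edx, x4-x7 in
# %r8d..%r11d, x8/x9 in %esi/%edi (with x10/x11 spilled to 40/44(%rsp)
# and swapped in mid-round), and x12-x15 in %r12d..%r15d.  The raw
# .byte 102,72,15,126,214 above encodes movq %xmm2,%rsi, emitted as
# bytes by the perlasm tooling.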
.align 32
.Loop:
	addl %r8d,%eax
	xorl %eax,%r12d
	roll $16,%r12d
	addl %r9d,%ebx
	xorl %ebx,%r13d
	roll $16,%r13d
	addl %r12d,%esi
	xorl %esi,%r8d
	roll $12,%r8d
	addl %r13d,%edi
	xorl %edi,%r9d
	roll $12,%r9d
	addl %r8d,%eax
	xorl %eax,%r12d
	roll $8,%r12d
	addl %r9d,%ebx
	xorl %ebx,%r13d
	roll $8,%r13d
	addl %r12d,%esi
	xorl %esi,%r8d
	roll $7,%r8d
	addl %r13d,%edi
	xorl %edi,%r9d
	roll $7,%r9d
	movl %esi,32(%rsp)
	movl %edi,36(%rsp)
	movl 40(%rsp),%esi
	movl 44(%rsp),%edi
	addl %r10d,%ecx
	xorl %ecx,%r14d
	roll $16,%r14d
	addl %r11d,%edx
	xorl %edx,%r15d
	roll $16,%r15d
	addl %r14d,%esi
	xorl %esi,%r10d
	roll $12,%r10d
	addl %r15d,%edi
	xorl %edi,%r11d
	roll $12,%r11d
	addl %r10d,%ecx
	xorl %ecx,%r14d
	roll $8,%r14d
	addl %r11d,%edx
	xorl %edx,%r15d
	roll $8,%r15d
	addl %r14d,%esi
	xorl %esi,%r10d
	roll $7,%r10d
	addl %r15d,%edi
	xorl %edi,%r11d
	roll $7,%r11d
	addl %r9d,%eax
	xorl %eax,%r15d
	roll $16,%r15d
	addl %r10d,%ebx
	xorl %ebx,%r12d
	roll $16,%r12d
	addl %r15d,%esi
	xorl %esi,%r9d
	roll $12,%r9d
	addl %r12d,%edi
	xorl %edi,%r10d
	roll $12,%r10d
	addl %r9d,%eax
	xorl %eax,%r15d
	roll $8,%r15d
	addl %r10d,%ebx
	xorl %ebx,%r12d
	roll $8,%r12d
	addl %r15d,%esi
	xorl %esi,%r9d
	roll $7,%r9d
	addl %r12d,%edi
	xorl %edi,%r10d
	roll $7,%r10d
	movl %esi,40(%rsp)
	movl %edi,44(%rsp)
	movl 32(%rsp),%esi
	movl 36(%rsp),%edi
	addl %r11d,%ecx
	xorl %ecx,%r13d
	roll $16,%r13d
	addl %r8d,%edx
	xorl %edx,%r14d
	roll $16,%r14d
	addl %r13d,%esi
	xorl %esi,%r11d
	roll $12,%r11d
	addl %r14d,%edi
	xorl %edi,%r8d
	roll $12,%r8d
	addl %r11d,%ecx
	xorl %ecx,%r13d
	roll $8,%r13d
	addl %r8d,%edx
	xorl %edx,%r14d
	roll $8,%r14d
	addl %r13d,%esi
	xorl %esi,%r11d
	roll $7,%r11d
	addl %r14d,%edi
	xorl %edi,%r8d
	roll $7,%r8d
	decl %ebp
	jnz .Loop
	movl %edi,36(%rsp)
	movl %esi,32(%rsp)
	movq 64(%rsp),%rbp
	movdqa %xmm2,%xmm1
	movq 64+8(%rsp),%rsi
	paddd %xmm4,%xmm3
	movq 64+16(%rsp),%rdi

	addl $0x61707865,%eax
	addl $0x3320646e,%ebx
	addl $0x79622d32,%ecx
	addl $0x6b206574,%edx
	addl 16(%rsp),%r8d
	addl 20(%rsp),%r9d
	addl 24(%rsp),%r10d
	addl 28(%rsp),%r11d
	addl 48(%rsp),%r12d
	addl 52(%rsp),%r13d
	addl 56(%rsp),%r14d
	addl 60(%rsp),%r15d
	paddd 32(%rsp),%xmm1

	cmpq $64,%rbp
	jb .Ltail

	xorl 0(%rsi),%eax
	xorl 4(%rsi),%ebx
	xorl 8(%rsi),%ecx
	xorl 12(%rsi),%edx
	xorl 16(%rsi),%r8d
	xorl 20(%rsi),%r9d
	xorl 24(%rsi),%r10d
	xorl 28(%rsi),%r11d
	movdqu 32(%rsi),%xmm0
	xorl 48(%rsi),%r12d
	xorl 52(%rsi),%r13d
	xorl 56(%rsi),%r14d
	xorl 60(%rsi),%r15d
	leaq 64(%rsi),%rsi
	pxor %xmm1,%xmm0

	movdqa %xmm2,32(%rsp)
	movd %xmm3,48(%rsp)

	movl %eax,0(%rdi)
	movl %ebx,4(%rdi)
	movl %ecx,8(%rdi)
	movl %edx,12(%rdi)
	movl %r8d,16(%rdi)
	movl %r9d,20(%rdi)
	movl %r10d,24(%rdi)
	movl %r11d,28(%rdi)
	movdqu %xmm0,32(%rdi)
	movl %r12d,48(%rdi)
	movl %r13d,52(%rdi)
	movl %r14d,56(%rdi)
	movl %r15d,60(%rdi)
	leaq 64(%rdi),%rdi

	subq $64,%rbp
	jnz .Loop_outer

	jmp .Ldone

.align 16
.Ltail:
	movl %eax,0(%rsp)
	movl %ebx,4(%rsp)
	xorq %rbx,%rbx
	movl %ecx,8(%rsp)
	movl %edx,12(%rsp)
	movl %r8d,16(%rsp)
	movl %r9d,20(%rsp)
	movl %r10d,24(%rsp)
	movl %r11d,28(%rsp)
	movdqa %xmm1,32(%rsp)
	movl %r12d,48(%rsp)
	movl %r13d,52(%rsp)
	movl %r14d,56(%rsp)
	movl %r15d,60(%rsp)

.Loop_tail:
	movzbl (%rsi,%rbx,1),%eax
	movzbl (%rsp,%rbx,1),%edx
	leaq 1(%rbx),%rbx
	xorl %edx,%eax
	movb %al,-1(%rdi,%rbx,1)
	decq %rbp
	jnz .Loop_tail

.Ldone:
	leaq 64+24+48(%rsp),%rsi
	movq -48(%rsi),%r15
.cfi_restore r15
	movq -40(%rsi),%r14
.cfi_restore r14
	movq -32(%rsi),%r13
.cfi_restore r13
	movq -24(%rsi),%r12
.cfi_restore r12
	movq -16(%rsi),%rbp
.cfi_restore rbp
	movq -8(%rsi),%rbx
.cfi_restore rbx
	leaq (%rsi),%rsp
.cfi_adjust_cfa_offset -136
.Lno_data:
	.byte 0xf3,0xc3
.cfi_endproc
.size GFp_ChaCha20_ctr32,.-GFp_ChaCha20_ctr32
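# One-block SSSE3 path; inputs longer than 128 bytes are forwarded to the
# 4-way code below.  The whole state sits in %xmm0-%xmm3, one four-word row
# per register: rotates by 16 and 24 are byte shuffles through the
# .Lrot16/.Lrot24 masks (the raw .byte 102,15,56,0,NN sequences encode
# pshufb), rotates by 12 and 7 use shift/shift/or, and the pshufd triples
# re-diagonalize the rows between half-rounds.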
.type ChaCha20_ssse3,@function
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
.cfi_startproc
	movq %rsp,%r9
.cfi_def_cfa_register r9
	cmpq $128,%rdx
	ja .LChaCha20_4x

.Ldo_sse3_after_all:
	subq $64+8,%rsp
	movdqa .Lsigma(%rip),%xmm0
	movdqu (%rcx),%xmm1
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa .Lrot16(%rip),%xmm6
	movdqa .Lrot24(%rip),%xmm7

	movdqa %xmm0,0(%rsp)
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	movq $10,%r8
	jmp .Loop_ssse3

.align 32
.Loop_outer_ssse3:
	movdqa .Lone(%rip),%xmm3
	movdqa 0(%rsp),%xmm0
	movdqa 16(%rsp),%xmm1
	movdqa 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3
	movq $10,%r8
	movdqa %xmm3,48(%rsp)
	jmp .Loop_ssse3

.align 32
.Loop_ssse3:
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,222
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $20,%xmm1
	pslld $12,%xmm4
	por %xmm4,%xmm1
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,223
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $25,%xmm1
	pslld $7,%xmm4
	por %xmm4,%xmm1
	pshufd $78,%xmm2,%xmm2
	pshufd $57,%xmm1,%xmm1
	pshufd $147,%xmm3,%xmm3
	nop
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,222
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $20,%xmm1
	pslld $12,%xmm4
	por %xmm4,%xmm1
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,223
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $25,%xmm1
	pslld $7,%xmm4
	por %xmm4,%xmm1
	pshufd $78,%xmm2,%xmm2
	pshufd $147,%xmm1,%xmm1
	pshufd $57,%xmm3,%xmm3
	decq %r8
	jnz .Loop_ssse3
	paddd 0(%rsp),%xmm0
	paddd 16(%rsp),%xmm1
	paddd 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3

	cmpq $64,%rdx
	jb .Ltail_ssse3

	movdqu 0(%rsi),%xmm4
	movdqu 16(%rsi),%xmm5
	pxor %xmm4,%xmm0
	movdqu 32(%rsi),%xmm4
	pxor %xmm5,%xmm1
	movdqu 48(%rsi),%xmm5
	leaq 64(%rsi),%rsi
	pxor %xmm4,%xmm2
	pxor %xmm5,%xmm3

	movdqu %xmm0,0(%rdi)
	movdqu %xmm1,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm3,48(%rdi)
	leaq 64(%rdi),%rdi

	subq $64,%rdx
	jnz .Loop_outer_ssse3

	jmp .Ldone_ssse3

.align 16
.Ltail_ssse3:
	movdqa %xmm0,0(%rsp)
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	xorq %r8,%r8

.Loop_tail_ssse3:
	movzbl (%rsi,%r8,1),%eax
	movzbl (%rsp,%r8,1),%ecx
	leaq 1(%r8),%r8
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r8,1)
	decq %rdx
	jnz .Loop_tail_ssse3

.Ldone_ssse3:
	leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.Lssse3_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_ssse3,.-ChaCha20_ssse3
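# 4-way SSSE3 path: four blocks are processed in parallel with the state
# held transposed, i.e. each xmm register carries one state word from all
# four blocks (the pshufd broadcasts below) and the four block counters
# differ by .Linc = 0,1,2,3.  Bit 5 in the upper half of GFp_ia32cap_P
# (AVX2) diverts to the 8-way code; the MOVBE-without-XSAVE check looks
# like an Atom-class tune that sends inputs of at most 192 bytes back to
# the one-block path.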
.type ChaCha20_4x,@function
.align 32
ChaCha20_4x:
.LChaCha20_4x:
.cfi_startproc
	movq %rsp,%r9
.cfi_def_cfa_register r9
	movq %r10,%r11
	shrq $32,%r10
	testq $32,%r10
	jnz .LChaCha20_8x
	cmpq $192,%rdx
	ja .Lproceed4x

	andq $71303168,%r11
	cmpq $4194304,%r11
	je .Ldo_sse3_after_all

.Lproceed4x:
	subq $0x140+8,%rsp
	movdqa .Lsigma(%rip),%xmm11
	movdqu (%rcx),%xmm15
	movdqu 16(%rcx),%xmm7
	movdqu (%r8),%xmm3
	leaq 256(%rsp),%rcx
	leaq .Lrot16(%rip),%r10
	leaq .Lrot24(%rip),%r11

	pshufd $0x00,%xmm11,%xmm8
	pshufd $0x55,%xmm11,%xmm9
	movdqa %xmm8,64(%rsp)
	pshufd $0xaa,%xmm11,%xmm10
	movdqa %xmm9,80(%rsp)
	pshufd $0xff,%xmm11,%xmm11
	movdqa %xmm10,96(%rsp)
	movdqa %xmm11,112(%rsp)

	pshufd $0x00,%xmm15,%xmm12
	pshufd $0x55,%xmm15,%xmm13
	movdqa %xmm12,128-256(%rcx)
	pshufd $0xaa,%xmm15,%xmm14
	movdqa %xmm13,144-256(%rcx)
	pshufd $0xff,%xmm15,%xmm15
	movdqa %xmm14,160-256(%rcx)
	movdqa %xmm15,176-256(%rcx)

	pshufd $0x00,%xmm7,%xmm4
	pshufd $0x55,%xmm7,%xmm5
	movdqa %xmm4,192-256(%rcx)
	pshufd $0xaa,%xmm7,%xmm6
	movdqa %xmm5,208-256(%rcx)
	pshufd $0xff,%xmm7,%xmm7
	movdqa %xmm6,224-256(%rcx)
	movdqa %xmm7,240-256(%rcx)

	pshufd $0x00,%xmm3,%xmm0
	pshufd $0x55,%xmm3,%xmm1
	paddd .Linc(%rip),%xmm0
	pshufd $0xaa,%xmm3,%xmm2
	movdqa %xmm1,272-256(%rcx)
	pshufd $0xff,%xmm3,%xmm3
	movdqa %xmm2,288-256(%rcx)
	movdqa %xmm3,304-256(%rcx)

	jmp .Loop_enter4x

.align 32
.Loop_outer4x:
	movdqa 64(%rsp),%xmm8
	movdqa 80(%rsp),%xmm9
	movdqa 96(%rsp),%xmm10
	movdqa 112(%rsp),%xmm11
	movdqa 128-256(%rcx),%xmm12
	movdqa 144-256(%rcx),%xmm13
	movdqa 160-256(%rcx),%xmm14
	movdqa 176-256(%rcx),%xmm15
	movdqa 192-256(%rcx),%xmm4
	movdqa 208-256(%rcx),%xmm5
	movdqa 224-256(%rcx),%xmm6
	movdqa 240-256(%rcx),%xmm7
	movdqa 256-256(%rcx),%xmm0
	movdqa 272-256(%rcx),%xmm1
	movdqa 288-256(%rcx),%xmm2
	movdqa 304-256(%rcx),%xmm3
	paddd .Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa %xmm6,32(%rsp)
	movdqa %xmm7,48(%rsp)
	movdqa (%r10),%xmm7
	movl $10,%eax
	movdqa %xmm0,256-256(%rcx)
	jmp .Loop4x
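# Double round over four interleaved blocks.  Sixteen state words plus
# temporaries do not fit in sixteen xmm registers, so half of the third-row
# words are parked at 0-48(%rsp) and swapped in mid-round, while %xmm6/%xmm7
# double as rotate temporaries and as holders of the rot16/rot24 pshufb
# masks, reloaded from (%r10)/(%r11) as the masks alternate; the raw
# .byte 102,15,56,0,NN sequences again encode pshufb.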
.align 32
.Loop4x:
	paddd %xmm12,%xmm8
	paddd %xmm13,%xmm9
	pxor %xmm8,%xmm0
	pxor %xmm9,%xmm1
.byte 102,15,56,0,199
.byte 102,15,56,0,207
	paddd %xmm0,%xmm4
	paddd %xmm1,%xmm5
	pxor %xmm4,%xmm12
	pxor %xmm5,%xmm13
	movdqa %xmm12,%xmm6
	pslld $12,%xmm12
	psrld $20,%xmm6
	movdqa %xmm13,%xmm7
	pslld $12,%xmm13
	por %xmm6,%xmm12
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm13
	paddd %xmm12,%xmm8
	paddd %xmm13,%xmm9
	pxor %xmm8,%xmm0
	pxor %xmm9,%xmm1
.byte 102,15,56,0,198
.byte 102,15,56,0,206
	paddd %xmm0,%xmm4
	paddd %xmm1,%xmm5
	pxor %xmm4,%xmm12
	pxor %xmm5,%xmm13
	movdqa %xmm12,%xmm7
	pslld $7,%xmm12
	psrld $25,%xmm7
	movdqa %xmm13,%xmm6
	pslld $7,%xmm13
	por %xmm7,%xmm12
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm13
	movdqa %xmm4,0(%rsp)
	movdqa %xmm5,16(%rsp)
	movdqa 32(%rsp),%xmm4
	movdqa 48(%rsp),%xmm5
	paddd %xmm14,%xmm10
	paddd %xmm15,%xmm11
	pxor %xmm10,%xmm2
	pxor %xmm11,%xmm3
.byte 102,15,56,0,215
.byte 102,15,56,0,223
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	pxor %xmm4,%xmm14
	pxor %xmm5,%xmm15
	movdqa %xmm14,%xmm6
	pslld $12,%xmm14
	psrld $20,%xmm6
	movdqa %xmm15,%xmm7
	pslld $12,%xmm15
	por %xmm6,%xmm14
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm15
	paddd %xmm14,%xmm10
	paddd %xmm15,%xmm11
	pxor %xmm10,%xmm2
	pxor %xmm11,%xmm3
.byte 102,15,56,0,214
.byte 102,15,56,0,222
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	pxor %xmm4,%xmm14
	pxor %xmm5,%xmm15
	movdqa %xmm14,%xmm7
	pslld $7,%xmm14
	psrld $25,%xmm7
	movdqa %xmm15,%xmm6
	pslld $7,%xmm15
	por %xmm7,%xmm14
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm15
	paddd %xmm13,%xmm8
	paddd %xmm14,%xmm9
	pxor %xmm8,%xmm3
	pxor %xmm9,%xmm0
.byte 102,15,56,0,223
.byte 102,15,56,0,199
	paddd %xmm3,%xmm4
	paddd %xmm0,%xmm5
	pxor %xmm4,%xmm13
	pxor %xmm5,%xmm14
	movdqa %xmm13,%xmm6
	pslld $12,%xmm13
	psrld $20,%xmm6
	movdqa %xmm14,%xmm7
	pslld $12,%xmm14
	por %xmm6,%xmm13
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm14
	paddd %xmm13,%xmm8
	paddd %xmm14,%xmm9
	pxor %xmm8,%xmm3
	pxor %xmm9,%xmm0
.byte 102,15,56,0,222
.byte 102,15,56,0,198
	paddd %xmm3,%xmm4
	paddd %xmm0,%xmm5
	pxor %xmm4,%xmm13
	pxor %xmm5,%xmm14
	movdqa %xmm13,%xmm7
	pslld $7,%xmm13
	psrld $25,%xmm7
	movdqa %xmm14,%xmm6
	pslld $7,%xmm14
	por %xmm7,%xmm13
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm14
	movdqa %xmm4,32(%rsp)
	movdqa %xmm5,48(%rsp)
	movdqa 0(%rsp),%xmm4
	movdqa 16(%rsp),%xmm5
	paddd %xmm15,%xmm10
	paddd %xmm12,%xmm11
	pxor %xmm10,%xmm1
	pxor %xmm11,%xmm2
.byte 102,15,56,0,207
.byte 102,15,56,0,215
	paddd %xmm1,%xmm4
	paddd %xmm2,%xmm5
	pxor %xmm4,%xmm15
	pxor %xmm5,%xmm12
	movdqa %xmm15,%xmm6
	pslld $12,%xmm15
	psrld $20,%xmm6
	movdqa %xmm12,%xmm7
	pslld $12,%xmm12
	por %xmm6,%xmm15
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm12
	paddd %xmm15,%xmm10
	paddd %xmm12,%xmm11
	pxor %xmm10,%xmm1
	pxor %xmm11,%xmm2
.byte 102,15,56,0,206
.byte 102,15,56,0,214
	paddd %xmm1,%xmm4
	paddd %xmm2,%xmm5
	pxor %xmm4,%xmm15
	pxor %xmm5,%xmm12
	movdqa %xmm15,%xmm7
	pslld $7,%xmm15
	psrld $25,%xmm7
	movdqa %xmm12,%xmm6
	pslld $7,%xmm12
	por %xmm7,%xmm15
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm12
	decl %eax
	jnz .Loop4x

	paddd 64(%rsp),%xmm8
	paddd 80(%rsp),%xmm9
	paddd 96(%rsp),%xmm10
	paddd 112(%rsp),%xmm11

	movdqa %xmm8,%xmm6
	punpckldq %xmm9,%xmm8
	movdqa %xmm10,%xmm7
	punpckldq %xmm11,%xmm10
	punpckhdq %xmm9,%xmm6
	punpckhdq %xmm11,%xmm7
	movdqa %xmm8,%xmm9
	punpcklqdq %xmm10,%xmm8
	movdqa %xmm6,%xmm11
	punpcklqdq %xmm7,%xmm6
	punpckhqdq %xmm10,%xmm9
	punpckhqdq %xmm7,%xmm11
	paddd 128-256(%rcx),%xmm12
	paddd 144-256(%rcx),%xmm13
	paddd 160-256(%rcx),%xmm14
	paddd 176-256(%rcx),%xmm15

	movdqa %xmm8,0(%rsp)
	movdqa %xmm9,16(%rsp)
	movdqa 32(%rsp),%xmm8
	movdqa 48(%rsp),%xmm9

	movdqa %xmm12,%xmm10
	punpckldq %xmm13,%xmm12
	movdqa %xmm14,%xmm7
	punpckldq %xmm15,%xmm14
	punpckhdq %xmm13,%xmm10
	punpckhdq %xmm15,%xmm7
	movdqa %xmm12,%xmm13
	punpcklqdq %xmm14,%xmm12
	movdqa %xmm10,%xmm15
	punpcklqdq %xmm7,%xmm10
	punpckhqdq %xmm14,%xmm13
	punpckhqdq %xmm7,%xmm15
	paddd 192-256(%rcx),%xmm4
	paddd 208-256(%rcx),%xmm5
	paddd 224-256(%rcx),%xmm8
	paddd 240-256(%rcx),%xmm9

	movdqa %xmm6,32(%rsp)
	movdqa %xmm11,48(%rsp)

	movdqa %xmm4,%xmm14
	punpckldq %xmm5,%xmm4
	movdqa %xmm8,%xmm7
	punpckldq %xmm9,%xmm8
	punpckhdq %xmm5,%xmm14
	punpckhdq %xmm9,%xmm7
	movdqa %xmm4,%xmm5
	punpcklqdq %xmm8,%xmm4
	movdqa %xmm14,%xmm9
	punpcklqdq %xmm7,%xmm14
	punpckhqdq %xmm8,%xmm5
	punpckhqdq %xmm7,%xmm9
	paddd 256-256(%rcx),%xmm0
	paddd 272-256(%rcx),%xmm1
	paddd 288-256(%rcx),%xmm2
	paddd 304-256(%rcx),%xmm3

	movdqa %xmm0,%xmm8
	punpckldq %xmm1,%xmm0
	movdqa %xmm2,%xmm7
	punpckldq %xmm3,%xmm2
	punpckhdq %xmm1,%xmm8
	punpckhdq %xmm3,%xmm7
	movdqa %xmm0,%xmm1
	punpcklqdq %xmm2,%xmm0
	movdqa %xmm8,%xmm3
	punpcklqdq %xmm7,%xmm8
	punpckhqdq %xmm2,%xmm1
	punpckhqdq %xmm7,%xmm3
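# The paddd/punpck sequences above added the input state back and
# transposed the word-sliced registers into four consecutive 64-byte
# keystream blocks; below they are XORed against the input in natural byte
# order, 256 bytes per outer iteration.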
	cmpq $256,%rdx
	jb .Ltail4x

	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7

	movdqu %xmm6,64(%rdi)
	movdqu 0(%rsi),%xmm6
	movdqu %xmm11,80(%rdi)
	movdqu 16(%rsi),%xmm11
	movdqu %xmm2,96(%rdi)
	movdqu 32(%rsi),%xmm2
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi
	movdqu 48(%rsi),%xmm7
	pxor 32(%rsp),%xmm6
	pxor %xmm10,%xmm11
	pxor %xmm14,%xmm2
	pxor %xmm8,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 48(%rsp),%xmm6
	pxor %xmm15,%xmm11
	pxor %xmm9,%xmm2
	pxor %xmm3,%xmm7
	movdqu %xmm6,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm2,96(%rdi)
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi

	subq $256,%rdx
	jnz .Loop_outer4x

	jmp .Ldone4x
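# Tail handling: any whole 64-byte blocks still remaining are XORed and
# stored directly, then the keystream for the final partial block is parked
# at (%rsp) and applied one byte at a time in .Loop_tail4x.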
.Ltail4x:
	cmpq $192,%rdx
	jae .L192_or_more4x
	cmpq $128,%rdx
	jae .L128_or_more4x
	cmpq $64,%rdx
	jae .L64_or_more4x

	xorq %r10,%r10

	movdqa %xmm12,16(%rsp)
	movdqa %xmm4,32(%rsp)
	movdqa %xmm0,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L64_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7
	movdqu %xmm6,0(%rdi)
	movdqu %xmm11,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm7,48(%rdi)
	je .Ldone4x

	movdqa 16(%rsp),%xmm6
	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm13,16(%rsp)
	leaq 64(%rdi),%rdi
	movdqa %xmm5,32(%rsp)
	subq $64,%rdx
	movdqa %xmm1,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L128_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7
	movdqu %xmm6,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm2,96(%rdi)
	movdqu %xmm7,112(%rdi)
	je .Ldone4x

	movdqa 32(%rsp),%xmm6
	leaq 128(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm10,16(%rsp)
	leaq 128(%rdi),%rdi
	movdqa %xmm14,32(%rsp)
	subq $128,%rdx
	movdqa %xmm8,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L192_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7

	movdqu %xmm6,64(%rdi)
	movdqu 0(%rsi),%xmm6
	movdqu %xmm11,80(%rdi)
	movdqu 16(%rsi),%xmm11
	movdqu %xmm2,96(%rdi)
	movdqu 32(%rsi),%xmm2
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi
	movdqu 48(%rsi),%xmm7
	pxor 32(%rsp),%xmm6
	pxor %xmm10,%xmm11
	pxor %xmm14,%xmm2
	pxor %xmm8,%xmm7
	movdqu %xmm6,0(%rdi)
	movdqu %xmm11,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm7,48(%rdi)
	je .Ldone4x

	movdqa 48(%rsp),%xmm6
	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm15,16(%rsp)
	leaq 64(%rdi),%rdi
	movdqa %xmm9,32(%rsp)
	subq $192,%rdx
	movdqa %xmm3,48(%rsp)

.Loop_tail4x:
	movzbl (%rsi,%r10,1),%eax
	movzbl (%rsp,%r10,1),%ecx
	leaq 1(%r10),%r10
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r10,1)
	decq %rdx
	jnz .Loop_tail4x

.Ldone4x:
	leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.L4x_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_4x,.-ChaCha20_4x
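# AVX2 8-way path: eight blocks in parallel, each ymm register holding one
# state word from all eight blocks (vbroadcasti128 + vpshufd below).  The
# broadcast state is staged at %rsp+256 (addressed via %rcx) and %rsp+512
# (via %rax); the eight counters are offset by .Lincy = 0,2,4,6,1,3,5,7 to
# match the lane ordering produced by the final vperm2i128 shuffles.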
.type ChaCha20_8x,@function
.align 32
ChaCha20_8x:
.LChaCha20_8x:
.cfi_startproc
	movq %rsp,%r9
.cfi_def_cfa_register r9
	subq $0x280+8,%rsp
	andq $-32,%rsp
	vzeroupper

	vbroadcasti128 .Lsigma(%rip),%ymm11
	vbroadcasti128 (%rcx),%ymm3
	vbroadcasti128 16(%rcx),%ymm15
	vbroadcasti128 (%r8),%ymm7
	leaq 256(%rsp),%rcx
	leaq 512(%rsp),%rax
	leaq .Lrot16(%rip),%r10
	leaq .Lrot24(%rip),%r11

	vpshufd $0x00,%ymm11,%ymm8
	vpshufd $0x55,%ymm11,%ymm9
	vmovdqa %ymm8,128-256(%rcx)
	vpshufd $0xaa,%ymm11,%ymm10
	vmovdqa %ymm9,160-256(%rcx)
	vpshufd $0xff,%ymm11,%ymm11
	vmovdqa %ymm10,192-256(%rcx)
	vmovdqa %ymm11,224-256(%rcx)

	vpshufd $0x00,%ymm3,%ymm0
	vpshufd $0x55,%ymm3,%ymm1
	vmovdqa %ymm0,256-256(%rcx)
	vpshufd $0xaa,%ymm3,%ymm2
	vmovdqa %ymm1,288-256(%rcx)
	vpshufd $0xff,%ymm3,%ymm3
	vmovdqa %ymm2,320-256(%rcx)
	vmovdqa %ymm3,352-256(%rcx)

	vpshufd $0x00,%ymm15,%ymm12
	vpshufd $0x55,%ymm15,%ymm13
	vmovdqa %ymm12,384-512(%rax)
	vpshufd $0xaa,%ymm15,%ymm14
	vmovdqa %ymm13,416-512(%rax)
	vpshufd $0xff,%ymm15,%ymm15
	vmovdqa %ymm14,448-512(%rax)
	vmovdqa %ymm15,480-512(%rax)

	vpshufd $0x00,%ymm7,%ymm4
	vpshufd $0x55,%ymm7,%ymm5
	vpaddd .Lincy(%rip),%ymm4,%ymm4
	vpshufd $0xaa,%ymm7,%ymm6
	vmovdqa %ymm5,544-512(%rax)
	vpshufd $0xff,%ymm7,%ymm7
	vmovdqa %ymm6,576-512(%rax)
	vmovdqa %ymm7,608-512(%rax)

	jmp .Loop_enter8x

.align 32
.Loop_outer8x:
	vmovdqa 128-256(%rcx),%ymm8
	vmovdqa 160-256(%rcx),%ymm9
	vmovdqa 192-256(%rcx),%ymm10
	vmovdqa 224-256(%rcx),%ymm11
	vmovdqa 256-256(%rcx),%ymm0
	vmovdqa 288-256(%rcx),%ymm1
	vmovdqa 320-256(%rcx),%ymm2
	vmovdqa 352-256(%rcx),%ymm3
	vmovdqa 384-512(%rax),%ymm12
	vmovdqa 416-512(%rax),%ymm13
	vmovdqa 448-512(%rax),%ymm14
	vmovdqa 480-512(%rax),%ymm15
	vmovdqa 512-512(%rax),%ymm4
	vmovdqa 544-512(%rax),%ymm5
	vmovdqa 576-512(%rax),%ymm6
	vmovdqa 608-512(%rax),%ymm7
	vpaddd .Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa %ymm14,64(%rsp)
	vmovdqa %ymm15,96(%rsp)
	vbroadcasti128 (%r10),%ymm15
	vmovdqa %ymm4,512-512(%rax)
	movl $10,%eax
	jmp .Loop8x
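# AVX2 double round.  Rotates by 16 and 24 go through vpshufb with the
# broadcast .Lrot16/.Lrot24 masks kept in %ymm15/%ymm14 and re-broadcast
# from (%r10)/(%r11) as the two alternate; rotates by 12 and 7 use
# vpslld/vpsrld/vpor.  As in the 4-way code, two of the third-row registers
# are staged through 0-96(%rsp) mid-round.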
.align 32
.Loop8x:
	vpaddd %ymm0,%ymm8,%ymm8
	vpxor %ymm4,%ymm8,%ymm4
	vpshufb %ymm15,%ymm4,%ymm4
	vpaddd %ymm1,%ymm9,%ymm9
	vpxor %ymm5,%ymm9,%ymm5
	vpshufb %ymm15,%ymm5,%ymm5
	vpaddd %ymm4,%ymm12,%ymm12
	vpxor %ymm0,%ymm12,%ymm0
	vpslld $12,%ymm0,%ymm14
	vpsrld $20,%ymm0,%ymm0
	vpor %ymm0,%ymm14,%ymm0
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm5,%ymm13,%ymm13
	vpxor %ymm1,%ymm13,%ymm1
	vpslld $12,%ymm1,%ymm15
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm1,%ymm15,%ymm1
	vpaddd %ymm0,%ymm8,%ymm8
	vpxor %ymm4,%ymm8,%ymm4
	vpshufb %ymm14,%ymm4,%ymm4
	vpaddd %ymm1,%ymm9,%ymm9
	vpxor %ymm5,%ymm9,%ymm5
	vpshufb %ymm14,%ymm5,%ymm5
	vpaddd %ymm4,%ymm12,%ymm12
	vpxor %ymm0,%ymm12,%ymm0
	vpslld $7,%ymm0,%ymm15
	vpsrld $25,%ymm0,%ymm0
	vpor %ymm0,%ymm15,%ymm0
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm5,%ymm13,%ymm13
	vpxor %ymm1,%ymm13,%ymm1
	vpslld $7,%ymm1,%ymm14
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm1,%ymm14,%ymm1
	vmovdqa %ymm12,0(%rsp)
	vmovdqa %ymm13,32(%rsp)
	vmovdqa 64(%rsp),%ymm12
	vmovdqa 96(%rsp),%ymm13
	vpaddd %ymm2,%ymm10,%ymm10
	vpxor %ymm6,%ymm10,%ymm6
	vpshufb %ymm15,%ymm6,%ymm6
	vpaddd %ymm3,%ymm11,%ymm11
	vpxor %ymm7,%ymm11,%ymm7
	vpshufb %ymm15,%ymm7,%ymm7
	vpaddd %ymm6,%ymm12,%ymm12
	vpxor %ymm2,%ymm12,%ymm2
	vpslld $12,%ymm2,%ymm14
	vpsrld $20,%ymm2,%ymm2
	vpor %ymm2,%ymm14,%ymm2
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm7,%ymm13,%ymm13
	vpxor %ymm3,%ymm13,%ymm3
	vpslld $12,%ymm3,%ymm15
	vpsrld $20,%ymm3,%ymm3
	vpor %ymm3,%ymm15,%ymm3
	vpaddd %ymm2,%ymm10,%ymm10
	vpxor %ymm6,%ymm10,%ymm6
	vpshufb %ymm14,%ymm6,%ymm6
	vpaddd %ymm3,%ymm11,%ymm11
	vpxor %ymm7,%ymm11,%ymm7
	vpshufb %ymm14,%ymm7,%ymm7
	vpaddd %ymm6,%ymm12,%ymm12
	vpxor %ymm2,%ymm12,%ymm2
	vpslld $7,%ymm2,%ymm15
	vpsrld $25,%ymm2,%ymm2
	vpor %ymm2,%ymm15,%ymm2
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm7,%ymm13,%ymm13
	vpxor %ymm3,%ymm13,%ymm3
	vpslld $7,%ymm3,%ymm14
	vpsrld $25,%ymm3,%ymm3
	vpor %ymm3,%ymm14,%ymm3
	vpaddd %ymm1,%ymm8,%ymm8
	vpxor %ymm7,%ymm8,%ymm7
	vpshufb %ymm15,%ymm7,%ymm7
	vpaddd %ymm2,%ymm9,%ymm9
	vpxor %ymm4,%ymm9,%ymm4
	vpshufb %ymm15,%ymm4,%ymm4
	vpaddd %ymm7,%ymm12,%ymm12
	vpxor %ymm1,%ymm12,%ymm1
	vpslld $12,%ymm1,%ymm14
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm1,%ymm14,%ymm1
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm4,%ymm13,%ymm13
	vpxor %ymm2,%ymm13,%ymm2
	vpslld $12,%ymm2,%ymm15
	vpsrld $20,%ymm2,%ymm2
	vpor %ymm2,%ymm15,%ymm2
	vpaddd %ymm1,%ymm8,%ymm8
	vpxor %ymm7,%ymm8,%ymm7
	vpshufb %ymm14,%ymm7,%ymm7
	vpaddd %ymm2,%ymm9,%ymm9
	vpxor %ymm4,%ymm9,%ymm4
	vpshufb %ymm14,%ymm4,%ymm4
	vpaddd %ymm7,%ymm12,%ymm12
	vpxor %ymm1,%ymm12,%ymm1
	vpslld $7,%ymm1,%ymm15
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm1,%ymm15,%ymm1
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm4,%ymm13,%ymm13
	vpxor %ymm2,%ymm13,%ymm2
	vpslld $7,%ymm2,%ymm14
	vpsrld $25,%ymm2,%ymm2
	vpor %ymm2,%ymm14,%ymm2
	vmovdqa %ymm12,64(%rsp)
	vmovdqa %ymm13,96(%rsp)
	vmovdqa 0(%rsp),%ymm12
	vmovdqa 32(%rsp),%ymm13
	vpaddd %ymm3,%ymm10,%ymm10
	vpxor %ymm5,%ymm10,%ymm5
	vpshufb %ymm15,%ymm5,%ymm5
	vpaddd %ymm0,%ymm11,%ymm11
	vpxor %ymm6,%ymm11,%ymm6
	vpshufb %ymm15,%ymm6,%ymm6
	vpaddd %ymm5,%ymm12,%ymm12
	vpxor %ymm3,%ymm12,%ymm3
	vpslld $12,%ymm3,%ymm14
	vpsrld $20,%ymm3,%ymm3
	vpor %ymm3,%ymm14,%ymm3
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm6,%ymm13,%ymm13
	vpxor %ymm0,%ymm13,%ymm0
	vpslld $12,%ymm0,%ymm15
	vpsrld $20,%ymm0,%ymm0
	vpor %ymm0,%ymm15,%ymm0
	vpaddd %ymm3,%ymm10,%ymm10
	vpxor %ymm5,%ymm10,%ymm5
	vpshufb %ymm14,%ymm5,%ymm5
	vpaddd %ymm0,%ymm11,%ymm11
	vpxor %ymm6,%ymm11,%ymm6
	vpshufb %ymm14,%ymm6,%ymm6
	vpaddd %ymm5,%ymm12,%ymm12
	vpxor %ymm3,%ymm12,%ymm3
	vpslld $7,%ymm3,%ymm15
	vpsrld $25,%ymm3,%ymm3
	vpor %ymm3,%ymm15,%ymm3
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm6,%ymm13,%ymm13
	vpxor %ymm0,%ymm13,%ymm0
	vpslld $7,%ymm0,%ymm14
	vpsrld $25,%ymm0,%ymm0
	vpor %ymm0,%ymm14,%ymm0
	decl %eax
	jnz .Loop8x

	leaq 512(%rsp),%rax
	vpaddd 128-256(%rcx),%ymm8,%ymm8
	vpaddd 160-256(%rcx),%ymm9,%ymm9
	vpaddd 192-256(%rcx),%ymm10,%ymm10
	vpaddd 224-256(%rcx),%ymm11,%ymm11

	vpunpckldq %ymm9,%ymm8,%ymm14
	vpunpckldq %ymm11,%ymm10,%ymm15
	vpunpckhdq %ymm9,%ymm8,%ymm8
	vpunpckhdq %ymm11,%ymm10,%ymm10
	vpunpcklqdq %ymm15,%ymm14,%ymm9
	vpunpckhqdq %ymm15,%ymm14,%ymm14
	vpunpcklqdq %ymm10,%ymm8,%ymm11
	vpunpckhqdq %ymm10,%ymm8,%ymm8
	vpaddd 256-256(%rcx),%ymm0,%ymm0
	vpaddd 288-256(%rcx),%ymm1,%ymm1
	vpaddd 320-256(%rcx),%ymm2,%ymm2
	vpaddd 352-256(%rcx),%ymm3,%ymm3

	vpunpckldq %ymm1,%ymm0,%ymm10
	vpunpckldq %ymm3,%ymm2,%ymm15
	vpunpckhdq %ymm1,%ymm0,%ymm0
	vpunpckhdq %ymm3,%ymm2,%ymm2
	vpunpcklqdq %ymm15,%ymm10,%ymm1
	vpunpckhqdq %ymm15,%ymm10,%ymm10
	vpunpcklqdq %ymm2,%ymm0,%ymm3
	vpunpckhqdq %ymm2,%ymm0,%ymm0
	vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
	vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
	vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
	vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
	vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
	vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
	vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
	vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
	vmovdqa %ymm15,0(%rsp)
	vmovdqa %ymm9,32(%rsp)
	vmovdqa 64(%rsp),%ymm15
	vmovdqa 96(%rsp),%ymm9

	vpaddd 384-512(%rax),%ymm12,%ymm12
	vpaddd 416-512(%rax),%ymm13,%ymm13
	vpaddd 448-512(%rax),%ymm15,%ymm15
	vpaddd 480-512(%rax),%ymm9,%ymm9

	vpunpckldq %ymm13,%ymm12,%ymm2
	vpunpckldq %ymm9,%ymm15,%ymm8
	vpunpckhdq %ymm13,%ymm12,%ymm12
	vpunpckhdq %ymm9,%ymm15,%ymm15
	vpunpcklqdq %ymm8,%ymm2,%ymm13
	vpunpckhqdq %ymm8,%ymm2,%ymm2
	vpunpcklqdq %ymm15,%ymm12,%ymm9
	vpunpckhqdq %ymm15,%ymm12,%ymm12
	vpaddd 512-512(%rax),%ymm4,%ymm4
	vpaddd 544-512(%rax),%ymm5,%ymm5
	vpaddd 576-512(%rax),%ymm6,%ymm6
	vpaddd 608-512(%rax),%ymm7,%ymm7

	vpunpckldq %ymm5,%ymm4,%ymm15
	vpunpckldq %ymm7,%ymm6,%ymm8
	vpunpckhdq %ymm5,%ymm4,%ymm4
	vpunpckhdq %ymm7,%ymm6,%ymm6
	vpunpcklqdq %ymm8,%ymm15,%ymm5
	vpunpckhqdq %ymm8,%ymm15,%ymm15
	vpunpcklqdq %ymm6,%ymm4,%ymm7
	vpunpckhqdq %ymm6,%ymm4,%ymm4
	vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
	vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
	vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
	vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
	vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
	vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
	vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
	vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
	vmovdqa 0(%rsp),%ymm6
	vmovdqa 32(%rsp),%ymm12
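# The vpunpck* dword transposes and cross-lane vperm2i128 shuffles above
# rearranged the word-sliced registers (with the saved input state added
# back) into eight consecutive 64-byte keystream blocks, which are XORed
# and stored 128 bytes at a time below.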
	cmpq $512,%rdx
	jb .Ltail8x

	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	leaq 128(%rsi),%rsi
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm12,%ymm12
	vpxor 32(%rsi),%ymm13,%ymm13
	vpxor 64(%rsi),%ymm10,%ymm10
	vpxor 96(%rsi),%ymm15,%ymm15
	leaq 128(%rsi),%rsi
	vmovdqu %ymm12,0(%rdi)
	vmovdqu %ymm13,32(%rdi)
	vmovdqu %ymm10,64(%rdi)
	vmovdqu %ymm15,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm14,%ymm14
	vpxor 32(%rsi),%ymm2,%ymm2
	vpxor 64(%rsi),%ymm3,%ymm3
	vpxor 96(%rsi),%ymm7,%ymm7
	leaq 128(%rsi),%rsi
	vmovdqu %ymm14,0(%rdi)
	vmovdqu %ymm2,32(%rdi)
	vmovdqu %ymm3,64(%rdi)
	vmovdqu %ymm7,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm11,%ymm11
	vpxor 32(%rsi),%ymm9,%ymm9
	vpxor 64(%rsi),%ymm0,%ymm0
	vpxor 96(%rsi),%ymm4,%ymm4
	leaq 128(%rsi),%rsi
	vmovdqu %ymm11,0(%rdi)
	vmovdqu %ymm9,32(%rdi)
	vmovdqu %ymm0,64(%rdi)
	vmovdqu %ymm4,96(%rdi)
	leaq 128(%rdi),%rdi

	subq $512,%rdx
	jnz .Loop_outer8x

	jmp .Ldone8x
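# Tail ladder: each .LNNN_or_more8x rung XORs and stores as many whole
# 64-byte units as remain, then parks the next two unconsumed keystream
# registers at 0/32(%rsp) for the byte-by-byte .Loop_tail8x copy.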
.Ltail8x:
	cmpq $448,%rdx
	jae .L448_or_more8x
	cmpq $384,%rdx
	jae .L384_or_more8x
	cmpq $320,%rdx
	jae .L320_or_more8x
	cmpq $256,%rdx
	jae .L256_or_more8x
	cmpq $192,%rdx
	jae .L192_or_more8x
	cmpq $128,%rdx
	jae .L128_or_more8x
	cmpq $64,%rdx
	jae .L64_or_more8x

	xorq %r10,%r10
	vmovdqa %ymm6,0(%rsp)
	vmovdqa %ymm8,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L64_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	je .Ldone8x

	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm1,0(%rsp)
	leaq 64(%rdi),%rdi
	subq $64,%rdx
	vmovdqa %ymm5,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L128_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	je .Ldone8x

	leaq 128(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm12,0(%rsp)
	leaq 128(%rdi),%rdi
	subq $128,%rdx
	vmovdqa %ymm13,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L192_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	je .Ldone8x

	leaq 192(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm10,0(%rsp)
	leaq 192(%rdi),%rdi
	subq $192,%rdx
	vmovdqa %ymm15,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L256_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	je .Ldone8x

	leaq 256(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm14,0(%rsp)
	leaq 256(%rdi),%rdi
	subq $256,%rdx
	vmovdqa %ymm2,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L320_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	je .Ldone8x

	leaq 320(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm3,0(%rsp)
	leaq 320(%rdi),%rdi
	subq $320,%rdx
	vmovdqa %ymm7,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L384_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vpxor 320(%rsi),%ymm3,%ymm3
	vpxor 352(%rsi),%ymm7,%ymm7
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	vmovdqu %ymm3,320(%rdi)
	vmovdqu %ymm7,352(%rdi)
	je .Ldone8x

	leaq 384(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm11,0(%rsp)
	leaq 384(%rdi),%rdi
	subq $384,%rdx
	vmovdqa %ymm9,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L448_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vpxor 320(%rsi),%ymm3,%ymm3
	vpxor 352(%rsi),%ymm7,%ymm7
	vpxor 384(%rsi),%ymm11,%ymm11
	vpxor 416(%rsi),%ymm9,%ymm9
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	vmovdqu %ymm3,320(%rdi)
	vmovdqu %ymm7,352(%rdi)
	vmovdqu %ymm11,384(%rdi)
	vmovdqu %ymm9,416(%rdi)
	je .Ldone8x

	leaq 448(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm0,0(%rsp)
	leaq 448(%rdi),%rdi
	subq $448,%rdx
	vmovdqa %ymm4,32(%rsp)

.Loop_tail8x:
	movzbl (%rsi,%r10,1),%eax
	movzbl (%rsp,%r10,1),%ecx
	leaq 1(%r10),%r10
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r10,1)
	decq %rdx
	jnz .Loop_tail8x
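# vzeroall on the way out avoids AVX-to-SSE transition penalties and,
# presumably by design, also scrubs keystream and key material from the
# vector register file.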
.Ldone8x:
	vzeroall
	leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.L8x_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_8x,.-ChaCha20_8x
#endif
.section .note.GNU-stack,"",@progbits