# This file is generated from a similarly-named Perl script in the BoringSSL 
# source tree. Do not edit by hand.

/*
 * ChaCha20 stream cipher, x86-64, AT&T/GAS syntax, SysV AMD64 ABI.
 * Perlasm (CRYPTOGAMS) output; runtime-dispatches between a scalar path,
 * a 1-block SSSE3 path, a 4-block SSE path and an 8-block AVX2 path based
 * on the OPENSSL_ia32cap_P capability vector.
 *
 * NOTE(review): all comments below were added during review of the generated
 * output; the authoritative source is the Perl script named above.
 */

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#include "ring_core_generated/prefix_symbols_asm.h"
.text

.extern OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

/* Constant tables (live in .text, read via RIP-relative loads). */
.align 64
.Lzero:
.long 0,0,0,0
.Lone:                          /* +1 in lane 0: per-block counter increment */
.long 1,0,0,0
.Linc:                          /* lane counters 0..3 for the 4x path */
.long 0,1,2,3
.Lfour:                         /* +4 per iteration for the 4x path */
.long 4,4,4,4
.Lincy:                         /* lane counters for the 8x (AVX2) path */
.long 0,2,4,6,1,3,5,7
.Leight:                        /* +8 per iteration for the 8x path */
.long 8,8,8,8,8,8,8,8
.Lrot16:                        /* pshufb mask: rotate each dword left 16 */
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:                        /* pshufb mask: rotate each dword left 8 (i.e. rol 8) */
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:                        /* "expand 32-byte k" + NUL: ChaCha constant row */
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.align 64
.Lzeroz:
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
/* "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>" credit string */
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0

/*
 * ChaCha20_ctr32(out=%rdi, in=%rsi, len=%rdx, key=%rcx, counter=%r8)
 * NOTE(review): argument roles inferred from register usage below (rdx is
 * compared against 0/64 as a byte count, 32 bytes are loaded from rcx and
 * 16 from r8) and from the conventional ChaCha20_ctr32 prototype
 * (void f(u8 *out, const u8 *in, size_t len, const u32 key[8],
 * const u32 counter[4])) — confirm against the Perl source.
 *
 * Scalar path: one 64-byte block per .Loop_outer iteration.  Register map
 * inside .Loop:
 *   row 0 (sigma)    : eax ebx ecx edx        (reloaded as immediates)
 *   row 1 (key 0..3) : r8d r9d r10d r11d
 *   row 2 (key 4..7) : esi/edi hold two words at a time; the other two sit
 *                      in 32-47(%rsp) and are swapped in mid-round
 *   row 3 (ctr/nonce): r12d r13d r14d r15d
 * Stack: 0-63(%rsp) state/keystream scratch, 64(%rsp)=len, 72(%rsp)=in,
 * 80(%rsp)=out.  xmm3 carries the counter row, xmm4 = .Lone increment.
 */
.globl ChaCha20_ctr32
.hidden ChaCha20_ctr32
.type ChaCha20_ctr32,@function
.align 64
ChaCha20_ctr32:
.cfi_startproc
        cmpq $0,%rdx
        je .Lno_data
        /* Second dword of OPENSSL_ia32cap_P; bit 9 here is the SSSE3
         * feature flag (NOTE(review): bit meaning per OpenSSL's ia32cap
         * layout — confirm). */
        movq OPENSSL_ia32cap_P+4(%rip),%r10
        testl $512,%r10d
        jnz .LChaCha20_ssse3

        pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset rbx,-16
        pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset rbp,-24
        pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset r12,-32
        pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset r13,-40
        pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset r14,-48
        pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset r15,-56
        subq $64+24,%rsp        /* 64B state scratch + 3 spill slots */
.cfi_adjust_cfa_offset 88
.Lctr32_body:

        /* Load key (32B) and counter/nonce (16B); stash them on the stack
         * as the canonical input state. */
        movdqu (%rcx),%xmm1
        movdqu 16(%rcx),%xmm2
        movdqu (%r8),%xmm3
        movdqa .Lone(%rip),%xmm4

        movdqa %xmm1,16(%rsp)
        movdqa %xmm2,32(%rsp)
        movdqa %xmm3,48(%rsp)
        movq %rdx,%rbp          /* rbp = remaining length */
        jmp .Loop_outer

.align 32
.Loop_outer:
        /* Reload working state: sigma as immediates, key/counter from stack. */
        movl $0x61707865,%eax   /* "expa" */
        movl $0x3320646e,%ebx   /* "nd 3" */
        movl $0x79622d32,%ecx   /* "2-by" */
        movl $0x6b206574,%edx   /* "te k" */
        movl 16(%rsp),%r8d
        movl 20(%rsp),%r9d
        movl 24(%rsp),%r10d
        movl 28(%rsp),%r11d
        movd %xmm3,%r12d        /* block counter (lane 0 of counter row) */
        movl 52(%rsp),%r13d
        movl 56(%rsp),%r14d
        movl 60(%rsp),%r15d

        movq %rbp,64+0(%rsp)    /* spill len */
        movl $10,%ebp           /* 10 double rounds = 20 rounds */
        movq %rsi,64+8(%rsp)    /* spill in */
.byte 102,72,15,126,214        /* movq %xmm2,%rsi : rsi = key words 4,5 */
        movq %rdi,64+16(%rsp)   /* spill out */
        movq %rsi,%rdi
        shrq $32,%rdi           /* esi = state[8], edi = state[9] */
        jmp .Loop

.align 32
.Loop:
        /* Column round: four quarter-rounds (add; xor; rol 16/12/8/7),
         * interleaved two at a time; esi/edi are swapped with 40/44(%rsp)
         * halfway through so all four third-row words get processed. */
        addl %r8d,%eax
        xorl %eax,%r12d
        roll $16,%r12d
        addl %r9d,%ebx
        xorl %ebx,%r13d
        roll $16,%r13d
        addl %r12d,%esi
        xorl %esi,%r8d
        roll $12,%r8d
        addl %r13d,%edi
        xorl %edi,%r9d
        roll $12,%r9d
        addl %r8d,%eax
        xorl %eax,%r12d
        roll $8,%r12d
        addl %r9d,%ebx
        xorl %ebx,%r13d
        roll $8,%r13d
        addl %r12d,%esi
        xorl %esi,%r8d
        roll $7,%r8d
        addl %r13d,%edi
        xorl %edi,%r9d
        roll $7,%r9d
        movl %esi,32(%rsp)
        movl %edi,36(%rsp)
        movl 40(%rsp),%esi
        movl 44(%rsp),%edi
        addl %r10d,%ecx
        xorl %ecx,%r14d
        roll $16,%r14d
        addl %r11d,%edx
        xorl %edx,%r15d
        roll $16,%r15d
        addl %r14d,%esi
        xorl %esi,%r10d
        roll $12,%r10d
        addl %r15d,%edi
        xorl %edi,%r11d
        roll $12,%r11d
        addl %r10d,%ecx
        xorl %ecx,%r14d
        roll $8,%r14d
        addl %r11d,%edx
        xorl %edx,%r15d
        roll $8,%r15d
        addl %r14d,%esi
        xorl %esi,%r10d
        roll $7,%r10d
        addl %r15d,%edi
        xorl %edi,%r11d
        roll $7,%r11d
        /* Diagonal round. */
        addl %r9d,%eax
        xorl %eax,%r15d
        roll $16,%r15d
        addl %r10d,%ebx
        xorl %ebx,%r12d
        roll $16,%r12d
        addl %r15d,%esi
        xorl %esi,%r9d
        roll $12,%r9d
        addl %r12d,%edi
        xorl %edi,%r10d
        roll $12,%r10d
        addl %r9d,%eax
        xorl %eax,%r15d
        roll $8,%r15d
        addl %r10d,%ebx
        xorl %ebx,%r12d
        roll $8,%r12d
        addl %r15d,%esi
        xorl %esi,%r9d
        roll $7,%r9d
        addl %r12d,%edi
        xorl %edi,%r10d
        roll $7,%r10d
        movl %esi,40(%rsp)
        movl %edi,44(%rsp)
        movl 32(%rsp),%esi
        movl 36(%rsp),%edi
        addl %r11d,%ecx
        xorl %ecx,%r13d
        roll $16,%r13d
        addl %r8d,%edx
        xorl %edx,%r14d
        roll $16,%r14d
        addl %r13d,%esi
        xorl %esi,%r11d
        roll $12,%r11d
        addl %r14d,%edi
        xorl %edi,%r8d
        roll $12,%r8d
        addl %r11d,%ecx
        xorl %ecx,%r13d
        roll $8,%r13d
        addl %r8d,%edx
        xorl %edx,%r14d
        roll $8,%r14d
        addl %r13d,%esi
        xorl %esi,%r11d
        roll $7,%r11d
        addl %r14d,%edi
        xorl %edi,%r8d
        roll $7,%r8d
        decl %ebp
        jnz .Loop
        movl %edi,36(%rsp)
        movl %esi,32(%rsp)
        movq 64(%rsp),%rbp      /* restore len */
        movdqa %xmm2,%xmm1
        movq 64+8(%rsp),%rsi    /* restore in */
        paddd %xmm4,%xmm3       /* bump 32-bit block counter */
        movq 64+16(%rsp),%rdi   /* restore out */

        /* Feed-forward: add the input state back into the permuted state. */
        addl $0x61707865,%eax
        addl $0x3320646e,%ebx
        addl $0x79622d32,%ecx
        addl $0x6b206574,%edx
        addl 16(%rsp),%r8d
        addl 20(%rsp),%r9d
        addl 24(%rsp),%r10d
        addl 28(%rsp),%r11d
        addl 48(%rsp),%r12d
        addl 52(%rsp),%r13d
        addl 56(%rsp),%r14d
        addl 60(%rsp),%r15d
        paddd 32(%rsp),%xmm1    /* third row (key 4..7) done in SIMD */

        cmpq $64,%rbp
        jb .Ltail               /* partial final block */

        /* XOR a full 64-byte block of keystream into the input. */
        xorl 0(%rsi),%eax
        xorl 4(%rsi),%ebx
        xorl 8(%rsi),%ecx
        xorl 12(%rsi),%edx
        xorl 16(%rsi),%r8d
        xorl 20(%rsi),%r9d
        xorl 24(%rsi),%r10d
        xorl 28(%rsi),%r11d
        movdqu 32(%rsi),%xmm0
        xorl 48(%rsi),%r12d
        xorl 52(%rsi),%r13d
        xorl 56(%rsi),%r14d
        xorl 60(%rsi),%r15d
        leaq 64(%rsi),%rsi
        pxor %xmm1,%xmm0

        movdqa %xmm2,32(%rsp)   /* restore row 2 scratch for next block */
        movd %xmm3,48(%rsp)     /* store updated counter word */

        movl %eax,0(%rdi)
        movl %ebx,4(%rdi)
        movl %ecx,8(%rdi)
        movl %edx,12(%rdi)
        movl %r8d,16(%rdi)
        movl %r9d,20(%rdi)
        movl %r10d,24(%rdi)
        movl %r11d,28(%rdi)
        movdqu %xmm0,32(%rdi)
        movl %r12d,48(%rdi)
        movl %r13d,52(%rdi)
        movl %r14d,56(%rdi)
        movl %r15d,60(%rdi)
        leaq 64(%rdi),%rdi

        subq $64,%rbp
        jnz .Loop_outer

        jmp .Ldone

.align 16
.Ltail:
        /* < 64 bytes left: materialize the keystream block on the stack,
         * then XOR it in byte by byte. */
        movl %eax,0(%rsp)
        movl %ebx,4(%rsp)
        xorq %rbx,%rbx          /* rbx = byte index */
        movl %ecx,8(%rsp)
        movl %edx,12(%rsp)
        movl %r8d,16(%rsp)
        movl %r9d,20(%rsp)
        movl %r10d,24(%rsp)
        movl %r11d,28(%rsp)
        movdqa %xmm1,32(%rsp)
        movl %r12d,48(%rsp)
        movl %r13d,52(%rsp)
        movl %r14d,56(%rsp)
        movl %r15d,60(%rsp)

.Loop_tail:
        movzbl (%rsi,%rbx,1),%eax
        movzbl (%rsp,%rbx,1),%edx
        leaq 1(%rbx),%rbx
        xorl %edx,%eax
        movb %al,-1(%rdi,%rbx,1)
        decq %rbp
        jnz .Loop_tail

.Ldone:
        /* Epilogue: rsi -> saved-register area; restore and unwind. */
        leaq 64+24+48(%rsp),%rsi
        movq -48(%rsi),%r15
.cfi_restore r15
        movq -40(%rsi),%r14
.cfi_restore r14
        movq -32(%rsi),%r13
.cfi_restore r13
        movq -24(%rsi),%r12
.cfi_restore r12
        movq -16(%rsi),%rbp
.cfi_restore rbp
        movq -8(%rsi),%rbx
.cfi_restore rbx
        leaq (%rsi),%rsp
.cfi_adjust_cfa_offset -136
.Lno_data:
        .byte 0xf3,0xc3         /* rep ret */
.cfi_endproc
.size ChaCha20_ctr32,.-ChaCha20_ctr32

/*
 * SSSE3 path: one 64-byte block at a time, whole state rows in xmm0-xmm3.
 * rol 16 / rol 8 are done with pshufb masks (.Lrot16/.Lrot24 in xmm6/xmm7);
 * rol 12 / rol 7 with shift+or via the xmm4 temporary.  Diagonalization is
 * done with pshufd row rotations between the two half-rounds.
 * Stack: 0-63(%rsp) holds the input state; r9 preserves the caller rsp.
 */
.type ChaCha20_ssse3,@function
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
.cfi_startproc
        movq %rsp,%r9
.cfi_def_cfa_register r9
        cmpq $128,%rdx
        ja .LChaCha20_4x        /* >128 bytes: go wide */

.Ldo_sse3_after_all:
        subq $64+8,%rsp
        movdqa .Lsigma(%rip),%xmm0
        movdqu (%rcx),%xmm1
        movdqu 16(%rcx),%xmm2
        movdqu (%r8),%xmm3
        movdqa .Lrot16(%rip),%xmm6
        movdqa .Lrot24(%rip),%xmm7

        movdqa %xmm0,0(%rsp)
        movdqa %xmm1,16(%rsp)
        movdqa %xmm2,32(%rsp)
        movdqa %xmm3,48(%rsp)
        movq $10,%r8            /* 10 double rounds */
        jmp .Loop_ssse3

.align 32
.Loop_outer_ssse3:
        movdqa .Lone(%rip),%xmm3
        movdqa 0(%rsp),%xmm0
        movdqa 16(%rsp),%xmm1
        movdqa 32(%rsp),%xmm2
        paddd 48(%rsp),%xmm3    /* counter += 1 */
        movq $10,%r8
        movdqa %xmm3,48(%rsp)
        jmp .Loop_ssse3

.align 32
.Loop_ssse3:
        /* Column half-round. */
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
.byte 102,15,56,0,222          /* pshufb %xmm6,%xmm3 : rol 16 */
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm4
        psrld $20,%xmm1
        pslld $12,%xmm4
        por %xmm4,%xmm1         /* rol 12 */
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
.byte 102,15,56,0,223          /* pshufb %xmm7,%xmm3 : rol 8 */
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm4
        psrld $25,%xmm1
        pslld $7,%xmm4
        por %xmm4,%xmm1         /* rol 7 */
        pshufd $78,%xmm2,%xmm2  /* rotate rows into diagonal position */
        pshufd $57,%xmm1,%xmm1
        pshufd $147,%xmm3,%xmm3
        nop
        /* Diagonal half-round. */
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
.byte 102,15,56,0,222          /* pshufb %xmm6,%xmm3 : rol 16 */
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm4
        psrld $20,%xmm1
        pslld $12,%xmm4
        por %xmm4,%xmm1
        paddd %xmm1,%xmm0
        pxor %xmm0,%xmm3
.byte 102,15,56,0,223          /* pshufb %xmm7,%xmm3 : rol 8 */
        paddd %xmm3,%xmm2
        pxor %xmm2,%xmm1
        movdqa %xmm1,%xmm4
        psrld $25,%xmm1
        pslld $7,%xmm4
        por %xmm4,%xmm1
        pshufd $78,%xmm2,%xmm2  /* un-diagonalize */
        pshufd $147,%xmm1,%xmm1
        pshufd $57,%xmm3,%xmm3
        decq %r8
        jnz .Loop_ssse3
        /* Feed-forward. */
        paddd 0(%rsp),%xmm0
        paddd 16(%rsp),%xmm1
        paddd 32(%rsp),%xmm2
        paddd 48(%rsp),%xmm3

        cmpq $64,%rdx
        jb .Ltail_ssse3

        movdqu 0(%rsi),%xmm4
        movdqu 16(%rsi),%xmm5
        pxor %xmm4,%xmm0
        movdqu 32(%rsi),%xmm4
        pxor %xmm5,%xmm1
        movdqu 48(%rsi),%xmm5
        leaq 64(%rsi),%rsi
        pxor %xmm4,%xmm2
        pxor %xmm5,%xmm3

        movdqu %xmm0,0(%rdi)
        movdqu %xmm1,16(%rdi)
        movdqu %xmm2,32(%rdi)
        movdqu %xmm3,48(%rdi)
        leaq 64(%rdi),%rdi

        subq $64,%rdx
        jnz .Loop_outer_ssse3

        jmp .Ldone_ssse3

.align 16
.Ltail_ssse3:
        /* Partial block: dump keystream to stack, XOR byte-wise. */
        movdqa %xmm0,0(%rsp)
        movdqa %xmm1,16(%rsp)
        movdqa %xmm2,32(%rsp)
        movdqa %xmm3,48(%rsp)
        xorq %r8,%r8

.Loop_tail_ssse3:
        movzbl (%rsi,%r8,1),%eax
        movzbl (%rsp,%r8,1),%ecx
        leaq 1(%r8),%r8
        xorl %ecx,%eax
        movb %al,-1(%rdi,%r8,1)
        decq %rdx
        jnz .Loop_tail_ssse3

.Ldone_ssse3:
        leaq (%r9),%rsp         /* restore caller rsp */
.cfi_def_cfa_register rsp
.Lssse3_epilogue:
        .byte 0xf3,0xc3         /* rep ret */
.cfi_endproc
.size ChaCha20_ssse3,.-ChaCha20_ssse3

/*
 * 4-block SSE path: the 16 state words are transposed so each xmm register
 * holds one state word across 4 parallel blocks.  Broadcast state lives at:
 * sigma rows 64-112(%rsp); key rows 128-240 and counter/nonce rows 256-304
 * relative to rsp (addressed as N-256(%rcx) with rcx = rsp+256).
 * r10/r11 point at the rot16/rot24 pshufb masks.  r10 still holds the
 * capability qword loaded in ChaCha20_ctr32 on entry.
 */
.type ChaCha20_4x,@function
.align 32
ChaCha20_4x:
.LChaCha20_4x:
.cfi_startproc
        movq %rsp,%r9
.cfi_def_cfa_register r9
        movq %r10,%r11
        shrq $32,%r10
        /* Bit 5 of the third ia32cap dword — NOTE(review): presumably the
         * AVX2 flag, matching the jump to the 8x path; confirm against
         * OPENSSL_ia32cap layout. */
        testq $32,%r10
        jnz .LChaCha20_8x
        cmpq $192,%rdx
        ja .Lproceed4x

        /* NOTE(review): CPU-model screen — on CPUs matching this ia32cap
         * mask the 1x SSSE3 path is preferred for <=192 bytes; confirm
         * which family these bits denote. */
        andq $71303168,%r11
        cmpq $4194304,%r11
        je .Ldo_sse3_after_all

.Lproceed4x:
        subq $0x140+8,%rsp
        movdqa .Lsigma(%rip),%xmm11
        movdqu (%rcx),%xmm15
        movdqu 16(%rcx),%xmm7
        movdqu (%r8),%xmm3
        leaq 256(%rsp),%rcx     /* rcx = midpoint of broadcast-state area */
        leaq .Lrot16(%rip),%r10
        leaq .Lrot24(%rip),%r11

        /* Broadcast each sigma word into its own 4-lane register. */
        pshufd $0x00,%xmm11,%xmm8
        pshufd $0x55,%xmm11,%xmm9
        movdqa %xmm8,64(%rsp)
        pshufd $0xaa,%xmm11,%xmm10
        movdqa %xmm9,80(%rsp)
        pshufd $0xff,%xmm11,%xmm11
        movdqa %xmm10,96(%rsp)
        movdqa %xmm11,112(%rsp)

        /* Broadcast key words 0..3. */
        pshufd $0x00,%xmm15,%xmm12
        pshufd $0x55,%xmm15,%xmm13
        movdqa %xmm12,128-256(%rcx)
        pshufd $0xaa,%xmm15,%xmm14
        movdqa %xmm13,144-256(%rcx)
        pshufd $0xff,%xmm15,%xmm15
        movdqa %xmm14,160-256(%rcx)
        movdqa %xmm15,176-256(%rcx)

        /* Broadcast key words 4..7. */
        pshufd $0x00,%xmm7,%xmm4
        pshufd $0x55,%xmm7,%xmm5
        movdqa %xmm4,192-256(%rcx)
        pshufd $0xaa,%xmm7,%xmm6
        movdqa %xmm5,208-256(%rcx)
        pshufd $0xff,%xmm7,%xmm7
        movdqa %xmm6,224-256(%rcx)
        movdqa %xmm7,240-256(%rcx)

        /* Broadcast counter/nonce; counters get per-lane offsets 0..3. */
        pshufd $0x00,%xmm3,%xmm0
        pshufd $0x55,%xmm3,%xmm1
        paddd .Linc(%rip),%xmm0
        pshufd $0xaa,%xmm3,%xmm2
        movdqa %xmm1,272-256(%rcx)
        pshufd $0xff,%xmm3,%xmm3
        movdqa %xmm2,288-256(%rcx)
        movdqa %xmm3,304-256(%rcx)

        jmp .Loop_enter4x

.align 32
.Loop_outer4x:
        /* Reload full broadcast state; counters += 4. */
        movdqa 64(%rsp),%xmm8
        movdqa 80(%rsp),%xmm9
        movdqa 96(%rsp),%xmm10
        movdqa 112(%rsp),%xmm11
        movdqa 128-256(%rcx),%xmm12
        movdqa 144-256(%rcx),%xmm13
        movdqa 160-256(%rcx),%xmm14
        movdqa 176-256(%rcx),%xmm15
        movdqa 192-256(%rcx),%xmm4
        movdqa 208-256(%rcx),%xmm5
        movdqa 224-256(%rcx),%xmm6
        movdqa 240-256(%rcx),%xmm7
        movdqa 256-256(%rcx),%xmm0
        movdqa 272-256(%rcx),%xmm1
        movdqa 288-256(%rcx),%xmm2
        movdqa 304-256(%rcx),%xmm3
        paddd .Lfour(%rip),%xmm0

.Loop_enter4x:
        /* Not enough registers for 16 words + temps: two words live in
         * 32/48(%rsp) and are swapped against xmm4/xmm5 mid-round. */
        movdqa %xmm6,32(%rsp)
        movdqa %xmm7,48(%rsp)
        movdqa (%r10),%xmm7     /* xmm7 = rot16 mask */
        movl $10,%eax           /* 10 double rounds */
        movdqa %xmm0,256-256(%rcx)
        jmp .Loop4x

.align 32
.Loop4x:
        /* Column round, two quarter-round pairs at a time.  Pattern:
         * a+=b; d^=a; d=rol16 (pshufb); c+=d; b^=c; b=rol12 (shift+or);
         * a+=b; d^=a; d=rol8 (pshufb); c+=d; b^=c; b=rol7.
         * xmm7/xmm6 alternate between the rot16 (%r10) and rot24 (%r11)
         * masks and shift temporaries. */
        paddd %xmm12,%xmm8
        paddd %xmm13,%xmm9
        pxor %xmm8,%xmm0
        pxor %xmm9,%xmm1
.byte 102,15,56,0,199          /* pshufb %xmm7,%xmm0 */
.byte 102,15,56,0,207          /* pshufb %xmm7,%xmm1 */
        paddd %xmm0,%xmm4
        paddd %xmm1,%xmm5
        pxor %xmm4,%xmm12
        pxor %xmm5,%xmm13
        movdqa %xmm12,%xmm6
        pslld $12,%xmm12
        psrld $20,%xmm6
        movdqa %xmm13,%xmm7
        pslld $12,%xmm13
        por %xmm6,%xmm12
        psrld $20,%xmm7
        movdqa (%r11),%xmm6     /* rot24 mask */
        por %xmm7,%xmm13
        paddd %xmm12,%xmm8
        paddd %xmm13,%xmm9
        pxor %xmm8,%xmm0
        pxor %xmm9,%xmm1
.byte 102,15,56,0,198          /* pshufb %xmm6,%xmm0 */
.byte 102,15,56,0,206          /* pshufb %xmm6,%xmm1 */
        paddd %xmm0,%xmm4
        paddd %xmm1,%xmm5
        pxor %xmm4,%xmm12
        pxor %xmm5,%xmm13
        movdqa %xmm12,%xmm7
        pslld $7,%xmm12
        psrld $25,%xmm7
        movdqa %xmm13,%xmm6
        pslld $7,%xmm13
        por %xmm7,%xmm12
        psrld $25,%xmm6
        movdqa (%r10),%xmm7     /* rot16 mask */
        por %xmm6,%xmm13
        movdqa %xmm4,0(%rsp)    /* swap third-row register pairs */
        movdqa %xmm5,16(%rsp)
        movdqa 32(%rsp),%xmm4
        movdqa 48(%rsp),%xmm5
        paddd %xmm14,%xmm10
        paddd %xmm15,%xmm11
        pxor %xmm10,%xmm2
        pxor %xmm11,%xmm3
.byte 102,15,56,0,215          /* pshufb %xmm7,%xmm2 */
.byte 102,15,56,0,223          /* pshufb %xmm7,%xmm3 */
        paddd %xmm2,%xmm4
        paddd %xmm3,%xmm5
        pxor %xmm4,%xmm14
        pxor %xmm5,%xmm15
        movdqa %xmm14,%xmm6
        pslld $12,%xmm14
        psrld $20,%xmm6
        movdqa %xmm15,%xmm7
        pslld $12,%xmm15
        por %xmm6,%xmm14
        psrld $20,%xmm7
        movdqa (%r11),%xmm6
        por %xmm7,%xmm15
        paddd %xmm14,%xmm10
        paddd %xmm15,%xmm11
        pxor %xmm10,%xmm2
        pxor %xmm11,%xmm3
.byte 102,15,56,0,214          /* pshufb %xmm6,%xmm2 */
.byte 102,15,56,0,222          /* pshufb %xmm6,%xmm3 */
        paddd %xmm2,%xmm4
        paddd %xmm3,%xmm5
        pxor %xmm4,%xmm14
        pxor %xmm5,%xmm15
        movdqa %xmm14,%xmm7
        pslld $7,%xmm14
        psrld $25,%xmm7
        movdqa %xmm15,%xmm6
        pslld $7,%xmm15
        por %xmm7,%xmm14
        psrld $25,%xmm6
        movdqa (%r10),%xmm7
        por %xmm6,%xmm15
        /* Diagonal round. */
        paddd %xmm13,%xmm8
        paddd %xmm14,%xmm9
        pxor %xmm8,%xmm3
        pxor %xmm9,%xmm0
.byte 102,15,56,0,223          /* pshufb %xmm7,%xmm3 */
.byte 102,15,56,0,199          /* pshufb %xmm7,%xmm0 */
        paddd %xmm3,%xmm4
        paddd %xmm0,%xmm5
        pxor %xmm4,%xmm13
        pxor %xmm5,%xmm14
        movdqa %xmm13,%xmm6
        pslld $12,%xmm13
        psrld $20,%xmm6
        movdqa %xmm14,%xmm7
        pslld $12,%xmm14
        por %xmm6,%xmm13
        psrld $20,%xmm7
        movdqa (%r11),%xmm6
        por %xmm7,%xmm14
        paddd %xmm13,%xmm8
        paddd %xmm14,%xmm9
        pxor %xmm8,%xmm3
        pxor %xmm9,%xmm0
.byte 102,15,56,0,222          /* pshufb %xmm6,%xmm3 */
.byte 102,15,56,0,198          /* pshufb %xmm6,%xmm0 */
        paddd %xmm3,%xmm4
        paddd %xmm0,%xmm5
        pxor %xmm4,%xmm13
        pxor %xmm5,%xmm14
        movdqa %xmm13,%xmm7
        pslld $7,%xmm13
        psrld $25,%xmm7
        movdqa %xmm14,%xmm6
        pslld $7,%xmm14
        por %xmm7,%xmm13
        psrld $25,%xmm6
        movdqa (%r10),%xmm7
        por %xmm6,%xmm14
        movdqa %xmm4,32(%rsp)   /* swap third-row register pairs back */
        movdqa %xmm5,48(%rsp)
        movdqa 0(%rsp),%xmm4
        movdqa 16(%rsp),%xmm5
        paddd %xmm15,%xmm10
        paddd %xmm12,%xmm11
        pxor %xmm10,%xmm1
        pxor %xmm11,%xmm2
.byte 102,15,56,0,207          /* pshufb %xmm7,%xmm1 */
.byte 102,15,56,0,215          /* pshufb %xmm7,%xmm2 */
        paddd %xmm1,%xmm4
        paddd %xmm2,%xmm5
        pxor %xmm4,%xmm15
        pxor %xmm5,%xmm12
        movdqa %xmm15,%xmm6
        pslld $12,%xmm15
        psrld $20,%xmm6
        movdqa %xmm12,%xmm7
        pslld $12,%xmm12
        por %xmm6,%xmm15
        psrld $20,%xmm7
        movdqa (%r11),%xmm6
        por %xmm7,%xmm12
        paddd %xmm15,%xmm10
        paddd %xmm12,%xmm11
        pxor %xmm10,%xmm1
        pxor %xmm11,%xmm2
.byte 102,15,56,0,206          /* pshufb %xmm6,%xmm1 */
.byte 102,15,56,0,214          /* pshufb %xmm6,%xmm2 */
        paddd %xmm1,%xmm4
        paddd %xmm2,%xmm5
        pxor %xmm4,%xmm15
        pxor %xmm5,%xmm12
        movdqa %xmm15,%xmm7
        pslld $7,%xmm15
        psrld $25,%xmm7
        movdqa %xmm12,%xmm6
        pslld $7,%xmm12
        por %xmm7,%xmm15
        psrld $25,%xmm6
        movdqa (%r10),%xmm7
        por %xmm6,%xmm12
        decl %eax
        jnz .Loop4x

        /* Feed-forward, then transpose word-sliced state back into four
         * contiguous 64-byte keystream blocks (punpck + qdq shuffles). */
        paddd 64(%rsp),%xmm8
        paddd 80(%rsp),%xmm9
        paddd 96(%rsp),%xmm10
        paddd 112(%rsp),%xmm11

        movdqa %xmm8,%xmm6
        punpckldq %xmm9,%xmm8
        movdqa %xmm10,%xmm7
        punpckldq %xmm11,%xmm10
        punpckhdq %xmm9,%xmm6
        punpckhdq %xmm11,%xmm7
        movdqa %xmm8,%xmm9
        punpcklqdq %xmm10,%xmm8
        movdqa %xmm6,%xmm11
        punpcklqdq %xmm7,%xmm6
        punpckhqdq %xmm10,%xmm9
        punpckhqdq %xmm7,%xmm11
        paddd 128-256(%rcx),%xmm12
        paddd 144-256(%rcx),%xmm13
        paddd 160-256(%rcx),%xmm14
        paddd 176-256(%rcx),%xmm15

        movdqa %xmm8,0(%rsp)
        movdqa %xmm9,16(%rsp)
        movdqa 32(%rsp),%xmm8
        movdqa 48(%rsp),%xmm9

        movdqa %xmm12,%xmm10
        punpckldq %xmm13,%xmm12
        movdqa %xmm14,%xmm7
        punpckldq %xmm15,%xmm14
        punpckhdq %xmm13,%xmm10
        punpckhdq %xmm15,%xmm7
        movdqa %xmm12,%xmm13
        punpcklqdq %xmm14,%xmm12
        movdqa %xmm10,%xmm15
        punpcklqdq %xmm7,%xmm10
        punpckhqdq %xmm14,%xmm13
        punpckhqdq %xmm7,%xmm15
        paddd 192-256(%rcx),%xmm4
        paddd 208-256(%rcx),%xmm5
        paddd 224-256(%rcx),%xmm8
        paddd 240-256(%rcx),%xmm9

        movdqa %xmm6,32(%rsp)
        movdqa %xmm11,48(%rsp)

        movdqa %xmm4,%xmm14
        punpckldq %xmm5,%xmm4
        movdqa %xmm8,%xmm7
        punpckldq %xmm9,%xmm8
        punpckhdq %xmm5,%xmm14
        punpckhdq %xmm9,%xmm7
        movdqa %xmm4,%xmm5
        punpcklqdq %xmm8,%xmm4
        movdqa %xmm14,%xmm9
        punpcklqdq %xmm7,%xmm14
        punpckhqdq %xmm8,%xmm5
        punpckhqdq %xmm7,%xmm9
        paddd 256-256(%rcx),%xmm0
        paddd 272-256(%rcx),%xmm1
        paddd 288-256(%rcx),%xmm2
        paddd 304-256(%rcx),%xmm3

        movdqa %xmm0,%xmm8
        punpckldq %xmm1,%xmm0
        movdqa %xmm2,%xmm7
        punpckldq %xmm3,%xmm2
        punpckhdq %xmm1,%xmm8
        punpckhdq %xmm3,%xmm7
        movdqa %xmm0,%xmm1
        punpcklqdq %xmm2,%xmm0
        movdqa %xmm8,%xmm3
        punpcklqdq %xmm7,%xmm8
        punpckhqdq %xmm2,%xmm1
        punpckhqdq %xmm7,%xmm3
        cmpq $256,%rdx
        jb .Ltail4x

        /* Full 256 bytes: XOR four blocks, 64 bytes at a time. */
        movdqu 0(%rsi),%xmm6
        movdqu 16(%rsi),%xmm11
        movdqu 32(%rsi),%xmm2
        movdqu 48(%rsi),%xmm7
        pxor 0(%rsp),%xmm6
        pxor %xmm12,%xmm11
        pxor %xmm4,%xmm2
        pxor %xmm0,%xmm7

        movdqu %xmm6,0(%rdi)
        movdqu 64(%rsi),%xmm6
        movdqu %xmm11,16(%rdi)
        movdqu 80(%rsi),%xmm11
        movdqu %xmm2,32(%rdi)
        movdqu 96(%rsi),%xmm2
        movdqu %xmm7,48(%rdi)
        movdqu 112(%rsi),%xmm7
        leaq 128(%rsi),%rsi
        pxor 16(%rsp),%xmm6
        pxor %xmm13,%xmm11
        pxor %xmm5,%xmm2
        pxor %xmm1,%xmm7

        movdqu %xmm6,64(%rdi)
        movdqu 0(%rsi),%xmm6
        movdqu %xmm11,80(%rdi)
        movdqu 16(%rsi),%xmm11
        movdqu %xmm2,96(%rdi)
        movdqu 32(%rsi),%xmm2
        movdqu %xmm7,112(%rdi)
        leaq 128(%rdi),%rdi
        movdqu 48(%rsi),%xmm7
        pxor 32(%rsp),%xmm6
        pxor %xmm10,%xmm11
        pxor %xmm14,%xmm2
        pxor %xmm8,%xmm7

        movdqu %xmm6,0(%rdi)
        movdqu 64(%rsi),%xmm6
        movdqu %xmm11,16(%rdi)
        movdqu 80(%rsi),%xmm11
        movdqu %xmm2,32(%rdi)
        movdqu 96(%rsi),%xmm2
        movdqu %xmm7,48(%rdi)
        movdqu 112(%rsi),%xmm7
        leaq 128(%rsi),%rsi
        pxor 48(%rsp),%xmm6
        pxor %xmm15,%xmm11
        pxor %xmm9,%xmm2
        pxor %xmm3,%xmm7
        movdqu %xmm6,64(%rdi)
        movdqu %xmm11,80(%rdi)
        movdqu %xmm2,96(%rdi)
        movdqu %xmm7,112(%rdi)
        leaq 128(%rdi),%rdi

        subq $256,%rdx
        jnz .Loop_outer4x

        jmp .Ldone4x

.Ltail4x:
        /* < 256 bytes left: dispatch on how many whole 64-byte blocks
         * remain; each case XORs the whole blocks, then stages the next
         * keystream block at 0-63(%rsp) for the byte-wise tail loop. */
        cmpq $192,%rdx
        jae .L192_or_more4x
        cmpq $128,%rdx
        jae .L128_or_more4x
        cmpq $64,%rdx
        jae .L64_or_more4x

        xorq %r10,%r10          /* r10 = tail byte index */

        movdqa %xmm12,16(%rsp)
        movdqa %xmm4,32(%rsp)
        movdqa %xmm0,48(%rsp)
        jmp .Loop_tail4x

.align 32
.L64_or_more4x:
        movdqu 0(%rsi),%xmm6
        movdqu 16(%rsi),%xmm11
        movdqu 32(%rsi),%xmm2
        movdqu 48(%rsi),%xmm7
        pxor 0(%rsp),%xmm6
        pxor %xmm12,%xmm11
        pxor %xmm4,%xmm2
        pxor %xmm0,%xmm7
        movdqu %xmm6,0(%rdi)
        movdqu %xmm11,16(%rdi)
        movdqu %xmm2,32(%rdi)
        movdqu %xmm7,48(%rdi)
        je .Ldone4x             /* exactly 64 bytes */

        movdqa 16(%rsp),%xmm6
        leaq 64(%rsi),%rsi
        xorq %r10,%r10
        movdqa %xmm6,0(%rsp)
        movdqa %xmm13,16(%rsp)
        leaq 64(%rdi),%rdi
        movdqa %xmm5,32(%rsp)
        subq $64,%rdx
        movdqa %xmm1,48(%rsp)
        jmp .Loop_tail4x

.align 32
.L128_or_more4x:
        movdqu 0(%rsi),%xmm6
        movdqu 16(%rsi),%xmm11
        movdqu 32(%rsi),%xmm2
        movdqu 48(%rsi),%xmm7
        pxor 0(%rsp),%xmm6
        pxor %xmm12,%xmm11
        pxor %xmm4,%xmm2
        pxor %xmm0,%xmm7

        movdqu %xmm6,0(%rdi)
        movdqu 64(%rsi),%xmm6
        movdqu %xmm11,16(%rdi)
        movdqu 80(%rsi),%xmm11
        movdqu %xmm2,32(%rdi)
        movdqu 96(%rsi),%xmm2
        movdqu %xmm7,48(%rdi)
        movdqu 112(%rsi),%xmm7
        pxor 16(%rsp),%xmm6
        pxor %xmm13,%xmm11
        pxor %xmm5,%xmm2
        pxor %xmm1,%xmm7
        movdqu %xmm6,64(%rdi)
        movdqu %xmm11,80(%rdi)
        movdqu %xmm2,96(%rdi)
        movdqu %xmm7,112(%rdi)
        je .Ldone4x             /* exactly 128 bytes */

        movdqa 32(%rsp),%xmm6
        leaq 128(%rsi),%rsi
        xorq %r10,%r10
        movdqa %xmm6,0(%rsp)
        movdqa %xmm10,16(%rsp)
        leaq 128(%rdi),%rdi
        movdqa %xmm14,32(%rsp)
        subq $128,%rdx
        movdqa %xmm8,48(%rsp)
        jmp .Loop_tail4x

.align 32
.L192_or_more4x:
        movdqu 0(%rsi),%xmm6
        movdqu 16(%rsi),%xmm11
        movdqu 32(%rsi),%xmm2
        movdqu 48(%rsi),%xmm7
        pxor 0(%rsp),%xmm6
        pxor %xmm12,%xmm11
        pxor %xmm4,%xmm2
        pxor %xmm0,%xmm7

        movdqu %xmm6,0(%rdi)
        movdqu 64(%rsi),%xmm6
        movdqu %xmm11,16(%rdi)
        movdqu 80(%rsi),%xmm11
        movdqu %xmm2,32(%rdi)
        movdqu 96(%rsi),%xmm2
        movdqu %xmm7,48(%rdi)
        movdqu 112(%rsi),%xmm7
        leaq 128(%rsi),%rsi
        pxor 16(%rsp),%xmm6
        pxor %xmm13,%xmm11
        pxor %xmm5,%xmm2
        pxor %xmm1,%xmm7

        movdqu %xmm6,64(%rdi)
        movdqu 0(%rsi),%xmm6
        movdqu %xmm11,80(%rdi)
        movdqu 16(%rsi),%xmm11
        movdqu %xmm2,96(%rdi)
        movdqu 32(%rsi),%xmm2
        movdqu %xmm7,112(%rdi)
        leaq 128(%rdi),%rdi
        movdqu 48(%rsi),%xmm7
        pxor 32(%rsp),%xmm6
        pxor %xmm10,%xmm11
        pxor %xmm14,%xmm2
        pxor %xmm8,%xmm7
        movdqu %xmm6,0(%rdi)
        movdqu %xmm11,16(%rdi)
        movdqu %xmm2,32(%rdi)
        movdqu %xmm7,48(%rdi)
        je .Ldone4x             /* exactly 192 bytes */

        movdqa 48(%rsp),%xmm6
        leaq 64(%rsi),%rsi
        xorq %r10,%r10
        movdqa %xmm6,0(%rsp)
        movdqa %xmm15,16(%rsp)
        leaq 64(%rdi),%rdi
        movdqa %xmm9,32(%rsp)
        subq $192,%rdx
        movdqa %xmm3,48(%rsp)

.Loop_tail4x:
        movzbl (%rsi,%r10,1),%eax
        movzbl (%rsp,%r10,1),%ecx
        leaq 1(%r10),%r10
        xorl %ecx,%eax
        movb %al,-1(%rdi,%r10,1)
        decq %rdx
        jnz .Loop_tail4x

.Ldone4x:
        leaq (%r9),%rsp         /* restore caller rsp */
.cfi_def_cfa_register rsp
.L4x_epilogue:
        .byte 0xf3,0xc3         /* rep ret */
.cfi_endproc
.size ChaCha20_4x,.-ChaCha20_4x

/*
 * 8-block AVX2 path: same word-sliced layout as ChaCha20_4x but with ymm
 * registers, so each register holds one state word across 8 blocks
 * (512 bytes of keystream per outer iteration).  Broadcast state is kept
 * at rcx-relative (rsp+256) and rax-relative (rsp+512) slots; rot16/rot24
 * pshufb masks are re-broadcast from (%r10)/(%r11) as registers free up.
 * Frame is 32-byte aligned for vmovdqa spills.
 */
.type ChaCha20_8x,@function
.align 32
ChaCha20_8x:
.LChaCha20_8x:
.cfi_startproc
        movq %rsp,%r9
.cfi_def_cfa_register r9
        subq $0x280+8,%rsp
        andq $-32,%rsp          /* align frame for 32-byte vmovdqa */
        vzeroupper

        /* Broadcast sigma/key/counter 128-bit rows to both ymm halves. */
        vbroadcasti128 .Lsigma(%rip),%ymm11
        vbroadcasti128 (%rcx),%ymm3
        vbroadcasti128 16(%rcx),%ymm15
        vbroadcasti128 (%r8),%ymm7
        leaq 256(%rsp),%rcx
        leaq 512(%rsp),%rax
        leaq .Lrot16(%rip),%r10
        leaq .Lrot24(%rip),%r11

        vpshufd $0x00,%ymm11,%ymm8
        vpshufd $0x55,%ymm11,%ymm9
        vmovdqa %ymm8,128-256(%rcx)
        vpshufd $0xaa,%ymm11,%ymm10
        vmovdqa %ymm9,160-256(%rcx)
        vpshufd $0xff,%ymm11,%ymm11
        vmovdqa %ymm10,192-256(%rcx)
        vmovdqa %ymm11,224-256(%rcx)

        vpshufd $0x00,%ymm3,%ymm0
        vpshufd $0x55,%ymm3,%ymm1
        vmovdqa %ymm0,256-256(%rcx)
        vpshufd $0xaa,%ymm3,%ymm2
        vmovdqa %ymm1,288-256(%rcx)
        vpshufd $0xff,%ymm3,%ymm3
        vmovdqa %ymm2,320-256(%rcx)
        vmovdqa %ymm3,352-256(%rcx)

        vpshufd $0x00,%ymm15,%ymm12
        vpshufd $0x55,%ymm15,%ymm13
        vmovdqa %ymm12,384-512(%rax)
        vpshufd $0xaa,%ymm15,%ymm14
        vmovdqa %ymm13,416-512(%rax)
        vpshufd $0xff,%ymm15,%ymm15
        vmovdqa %ymm14,448-512(%rax)
        vmovdqa %ymm15,480-512(%rax)

        /* Counter lane offsets 0..7 via .Lincy. */
        vpshufd $0x00,%ymm7,%ymm4
        vpshufd $0x55,%ymm7,%ymm5
        vpaddd .Lincy(%rip),%ymm4,%ymm4
        vpshufd $0xaa,%ymm7,%ymm6
        vmovdqa %ymm5,544-512(%rax)
        vpshufd $0xff,%ymm7,%ymm7
        vmovdqa %ymm6,576-512(%rax)
        vmovdqa %ymm7,608-512(%rax)

        jmp .Loop_enter8x

.align 32
.Loop_outer8x:
        /* Reload full broadcast state; counters += 8. */
        vmovdqa 128-256(%rcx),%ymm8
        vmovdqa 160-256(%rcx),%ymm9
        vmovdqa 192-256(%rcx),%ymm10
        vmovdqa 224-256(%rcx),%ymm11
        vmovdqa 256-256(%rcx),%ymm0
        vmovdqa 288-256(%rcx),%ymm1
        vmovdqa 320-256(%rcx),%ymm2
        vmovdqa 352-256(%rcx),%ymm3
        vmovdqa 384-512(%rax),%ymm12
        vmovdqa 416-512(%rax),%ymm13
        vmovdqa 448-512(%rax),%ymm14
        vmovdqa 480-512(%rax),%ymm15
        vmovdqa 512-512(%rax),%ymm4
        vmovdqa 544-512(%rax),%ymm5
        vmovdqa 576-512(%rax),%ymm6
        vmovdqa 608-512(%rax),%ymm7
        vpaddd .Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
        /* Two words spill to 64/96(%rsp); ymm14/ymm15 double as rotate
         * masks/temporaries inside the round loop. */
        vmovdqa %ymm14,64(%rsp)
        vmovdqa %ymm15,96(%rsp)
        vbroadcasti128 (%r10),%ymm15    /* rot16 mask */
        vmovdqa %ymm4,512-512(%rax)
        movl $10,%eax           /* 10 double rounds */
        jmp .Loop8x

.align 32
.Loop8x:
        /* Column round: a+=b; d^=a; d=rol16 (vpshufb); c+=d; b^=c;
         * b=rol12 (vpslld/vpsrld/vpor); then the rol8/rol7 half. */
        vpaddd %ymm0,%ymm8,%ymm8
        vpxor %ymm4,%ymm8,%ymm4
        vpshufb %ymm15,%ymm4,%ymm4
        vpaddd %ymm1,%ymm9,%ymm9
        vpxor %ymm5,%ymm9,%ymm5
        vpshufb %ymm15,%ymm5,%ymm5
        vpaddd %ymm4,%ymm12,%ymm12
        vpxor %ymm0,%ymm12,%ymm0
        vpslld $12,%ymm0,%ymm14
        vpsrld $20,%ymm0,%ymm0
        vpor %ymm0,%ymm14,%ymm0
        vbroadcasti128 (%r11),%ymm14    /* rot24 mask */
        vpaddd %ymm5,%ymm13,%ymm13
        vpxor %ymm1,%ymm13,%ymm1
        vpslld $12,%ymm1,%ymm15
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm1,%ymm15,%ymm1
        vpaddd %ymm0,%ymm8,%ymm8
        vpxor %ymm4,%ymm8,%ymm4
        vpshufb %ymm14,%ymm4,%ymm4
        vpaddd %ymm1,%ymm9,%ymm9
        vpxor %ymm5,%ymm9,%ymm5
        vpshufb %ymm14,%ymm5,%ymm5
        vpaddd %ymm4,%ymm12,%ymm12
        vpxor %ymm0,%ymm12,%ymm0
        vpslld $7,%ymm0,%ymm15
        vpsrld $25,%ymm0,%ymm0
        vpor %ymm0,%ymm15,%ymm0
        vbroadcasti128 (%r10),%ymm15    /* rot16 mask */
        vpaddd %ymm5,%ymm13,%ymm13
        vpxor %ymm1,%ymm13,%ymm1
        vpslld $7,%ymm1,%ymm14
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm1,%ymm14,%ymm1
        vmovdqa %ymm12,0(%rsp)  /* swap third-row register pairs */
        vmovdqa %ymm13,32(%rsp)
        vmovdqa 64(%rsp),%ymm12
        vmovdqa 96(%rsp),%ymm13
        vpaddd %ymm2,%ymm10,%ymm10
        vpxor %ymm6,%ymm10,%ymm6
        vpshufb %ymm15,%ymm6,%ymm6
        vpaddd %ymm3,%ymm11,%ymm11
        vpxor %ymm7,%ymm11,%ymm7
        vpshufb %ymm15,%ymm7,%ymm7
        vpaddd %ymm6,%ymm12,%ymm12
        vpxor %ymm2,%ymm12,%ymm2
        vpslld $12,%ymm2,%ymm14
        vpsrld $20,%ymm2,%ymm2
        vpor %ymm2,%ymm14,%ymm2
        vbroadcasti128 (%r11),%ymm14
        vpaddd %ymm7,%ymm13,%ymm13
        vpxor %ymm3,%ymm13,%ymm3
        vpslld $12,%ymm3,%ymm15
        vpsrld $20,%ymm3,%ymm3
        vpor %ymm3,%ymm15,%ymm3
        vpaddd %ymm2,%ymm10,%ymm10
        vpxor %ymm6,%ymm10,%ymm6
        vpshufb %ymm14,%ymm6,%ymm6
        vpaddd %ymm3,%ymm11,%ymm11
        vpxor %ymm7,%ymm11,%ymm7
        vpshufb %ymm14,%ymm7,%ymm7
        vpaddd %ymm6,%ymm12,%ymm12
        vpxor %ymm2,%ymm12,%ymm2
        vpslld $7,%ymm2,%ymm15
        vpsrld $25,%ymm2,%ymm2
        vpor %ymm2,%ymm15,%ymm2
        vbroadcasti128 (%r10),%ymm15
        vpaddd %ymm7,%ymm13,%ymm13
        vpxor %ymm3,%ymm13,%ymm3
        vpslld $7,%ymm3,%ymm14
        vpsrld $25,%ymm3,%ymm3
        vpor %ymm3,%ymm14,%ymm3
        /* Diagonal round. */
        vpaddd %ymm1,%ymm8,%ymm8
        vpxor %ymm7,%ymm8,%ymm7
        vpshufb %ymm15,%ymm7,%ymm7
        vpaddd %ymm2,%ymm9,%ymm9
        vpxor %ymm4,%ymm9,%ymm4
        vpshufb %ymm15,%ymm4,%ymm4
        vpaddd %ymm7,%ymm12,%ymm12
        vpxor %ymm1,%ymm12,%ymm1
        vpslld $12,%ymm1,%ymm14
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm1,%ymm14,%ymm1
        vbroadcasti128 (%r11),%ymm14
        vpaddd %ymm4,%ymm13,%ymm13
        vpxor %ymm2,%ymm13,%ymm2
        vpslld $12,%ymm2,%ymm15
        vpsrld $20,%ymm2,%ymm2
        vpor %ymm2,%ymm15,%ymm2
        vpaddd %ymm1,%ymm8,%ymm8
        vpxor %ymm7,%ymm8,%ymm7
        vpshufb %ymm14,%ymm7,%ymm7
        vpaddd %ymm2,%ymm9,%ymm9
        vpxor %ymm4,%ymm9,%ymm4
        vpshufb %ymm14,%ymm4,%ymm4
        vpaddd %ymm7,%ymm12,%ymm12
        vpxor %ymm1,%ymm12,%ymm1
        vpslld $7,%ymm1,%ymm15
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm1,%ymm15,%ymm1
        vbroadcasti128 (%r10),%ymm15
        vpaddd %ymm4,%ymm13,%ymm13
        vpxor %ymm2,%ymm13,%ymm2
        vpslld $7,%ymm2,%ymm14
        vpsrld $25,%ymm2,%ymm2
        vpor %ymm2,%ymm14,%ymm2
        vmovdqa %ymm12,64(%rsp) /* swap third-row register pairs back */
        vmovdqa %ymm13,96(%rsp)
        vmovdqa 0(%rsp),%ymm12
        vmovdqa 32(%rsp),%ymm13
        vpaddd %ymm3,%ymm10,%ymm10
        vpxor %ymm5,%ymm10,%ymm5
        vpshufb %ymm15,%ymm5,%ymm5
        vpaddd %ymm0,%ymm11,%ymm11
        vpxor %ymm6,%ymm11,%ymm6
        vpshufb %ymm15,%ymm6,%ymm6
        vpaddd %ymm5,%ymm12,%ymm12
        vpxor %ymm3,%ymm12,%ymm3
        vpslld $12,%ymm3,%ymm14
        vpsrld $20,%ymm3,%ymm3
        vpor %ymm3,%ymm14,%ymm3
        vbroadcasti128 (%r11),%ymm14
        vpaddd %ymm6,%ymm13,%ymm13
        vpxor %ymm0,%ymm13,%ymm0
        vpslld $12,%ymm0,%ymm15
        vpsrld $20,%ymm0,%ymm0
        vpor %ymm0,%ymm15,%ymm0
        vpaddd %ymm3,%ymm10,%ymm10
        vpxor %ymm5,%ymm10,%ymm5
        vpshufb %ymm14,%ymm5,%ymm5
        vpaddd %ymm0,%ymm11,%ymm11
        vpxor %ymm6,%ymm11,%ymm6
        vpshufb %ymm14,%ymm6,%ymm6
        vpaddd %ymm5,%ymm12,%ymm12
        vpxor %ymm3,%ymm12,%ymm3
        vpslld $7,%ymm3,%ymm15
        vpsrld $25,%ymm3,%ymm3
        vpor %ymm3,%ymm15,%ymm3
        vbroadcasti128 (%r10),%ymm15
        vpaddd %ymm6,%ymm13,%ymm13
        vpxor %ymm0,%ymm13,%ymm0
        vpslld $7,%ymm0,%ymm14
        vpsrld $25,%ymm0,%ymm0
        vpor %ymm0,%ymm14,%ymm0
        decl %eax
        jnz .Loop8x

        /* Feed-forward, then de-slice: dword/qword unpacks followed by
         * vperm2i128 lane swaps turn the word-sliced registers back into
         * eight contiguous 64-byte keystream blocks. */
        leaq 512(%rsp),%rax
        vpaddd 128-256(%rcx),%ymm8,%ymm8
        vpaddd 160-256(%rcx),%ymm9,%ymm9
        vpaddd 192-256(%rcx),%ymm10,%ymm10
        vpaddd 224-256(%rcx),%ymm11,%ymm11

        vpunpckldq %ymm9,%ymm8,%ymm14
        vpunpckldq %ymm11,%ymm10,%ymm15
        vpunpckhdq %ymm9,%ymm8,%ymm8
        vpunpckhdq %ymm11,%ymm10,%ymm10
        vpunpcklqdq %ymm15,%ymm14,%ymm9
        vpunpckhqdq %ymm15,%ymm14,%ymm14
        vpunpcklqdq %ymm10,%ymm8,%ymm11
        vpunpckhqdq %ymm10,%ymm8,%ymm8
        vpaddd 256-256(%rcx),%ymm0,%ymm0
        vpaddd 288-256(%rcx),%ymm1,%ymm1
        vpaddd 320-256(%rcx),%ymm2,%ymm2
        vpaddd 352-256(%rcx),%ymm3,%ymm3

        vpunpckldq %ymm1,%ymm0,%ymm10
        vpunpckldq %ymm3,%ymm2,%ymm15
        vpunpckhdq %ymm1,%ymm0,%ymm0
        vpunpckhdq %ymm3,%ymm2,%ymm2
        vpunpcklqdq %ymm15,%ymm10,%ymm1
        vpunpckhqdq %ymm15,%ymm10,%ymm10
        vpunpcklqdq %ymm2,%ymm0,%ymm3
        vpunpckhqdq %ymm2,%ymm0,%ymm0
        vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
        vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
        vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
        vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
        vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
        vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
        vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
        vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
        vmovdqa %ymm15,0(%rsp)
        vmovdqa %ymm9,32(%rsp)
        vmovdqa 64(%rsp),%ymm15
        vmovdqa 96(%rsp),%ymm9

        vpaddd 384-512(%rax),%ymm12,%ymm12
        vpaddd 416-512(%rax),%ymm13,%ymm13
        vpaddd 448-512(%rax),%ymm15,%ymm15
        vpaddd 480-512(%rax),%ymm9,%ymm9

        vpunpckldq %ymm13,%ymm12,%ymm2
        vpunpckldq %ymm9,%ymm15,%ymm8
        vpunpckhdq %ymm13,%ymm12,%ymm12
        vpunpckhdq %ymm9,%ymm15,%ymm15
        vpunpcklqdq %ymm8,%ymm2,%ymm13
        vpunpckhqdq %ymm8,%ymm2,%ymm2
        vpunpcklqdq %ymm15,%ymm12,%ymm9
        vpunpckhqdq %ymm15,%ymm12,%ymm12
        vpaddd 512-512(%rax),%ymm4,%ymm4
        vpaddd 544-512(%rax),%ymm5,%ymm5
        vpaddd 576-512(%rax),%ymm6,%ymm6
        vpaddd 608-512(%rax),%ymm7,%ymm7

        vpunpckldq %ymm5,%ymm4,%ymm15
        vpunpckldq %ymm7,%ymm6,%ymm8
        vpunpckhdq %ymm5,%ymm4,%ymm4
        vpunpckhdq %ymm7,%ymm6,%ymm6
        vpunpcklqdq %ymm8,%ymm15,%ymm5
        vpunpckhqdq %ymm8,%ymm15,%ymm15
        vpunpcklqdq %ymm6,%ymm4,%ymm7
        vpunpckhqdq %ymm6,%ymm4,%ymm4
        vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
        vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
        vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
        vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
        vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
        vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
        vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
        vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
        vmovdqa 0(%rsp),%ymm6
        vmovdqa 32(%rsp),%ymm12

        cmpq $512,%rdx
        jb .Ltail8x

        /* Full 512 bytes: XOR eight blocks, 128 bytes per group. */
        vpxor 0(%rsi),%ymm6,%ymm6
        vpxor 32(%rsi),%ymm8,%ymm8
        vpxor 64(%rsi),%ymm1,%ymm1
        vpxor 96(%rsi),%ymm5,%ymm5
        leaq 128(%rsi),%rsi
        vmovdqu %ymm6,0(%rdi)
        vmovdqu %ymm8,32(%rdi)
        vmovdqu %ymm1,64(%rdi)
        vmovdqu %ymm5,96(%rdi)
        leaq 128(%rdi),%rdi

        vpxor 0(%rsi),%ymm12,%ymm12
        vpxor 32(%rsi),%ymm13,%ymm13
        vpxor 64(%rsi),%ymm10,%ymm10
        vpxor 96(%rsi),%ymm15,%ymm15
        leaq 128(%rsi),%rsi
        vmovdqu %ymm12,0(%rdi)
        vmovdqu %ymm13,32(%rdi)
        vmovdqu %ymm10,64(%rdi)
        vmovdqu %ymm15,96(%rdi)
        leaq 128(%rdi),%rdi

        vpxor 0(%rsi),%ymm14,%ymm14
        vpxor 32(%rsi),%ymm2,%ymm2
        vpxor 64(%rsi),%ymm3,%ymm3
        vpxor 96(%rsi),%ymm7,%ymm7
        leaq 128(%rsi),%rsi
        vmovdqu %ymm14,0(%rdi)
        vmovdqu %ymm2,32(%rdi)
        vmovdqu %ymm3,64(%rdi)
        vmovdqu %ymm7,96(%rdi)
        leaq 128(%rdi),%rdi

        vpxor 0(%rsi),%ymm11,%ymm11
        vpxor 32(%rsi),%ymm9,%ymm9
        vpxor 64(%rsi),%ymm0,%ymm0
        vpxor 96(%rsi),%ymm4,%ymm4
        leaq 128(%rsi),%rsi
        vmovdqu %ymm11,0(%rdi)
        vmovdqu %ymm9,32(%rdi)
        vmovdqu %ymm0,64(%rdi)
        vmovdqu %ymm4,96(%rdi)
        leaq 128(%rdi),%rdi

        subq $512,%rdx
        jnz .Loop_outer8x

        jmp .Ldone8x

.Ltail8x:
        /* < 512 bytes left: dispatch on remaining whole-64-byte multiples;
         * each case XORs what it can, then stages the next 64 bytes of
         * keystream at 0-63(%rsp) for the byte-wise tail loop. */
        cmpq $448,%rdx
        jae .L448_or_more8x
        cmpq $384,%rdx
        jae .L384_or_more8x
        cmpq $320,%rdx
        jae .L320_or_more8x
        cmpq $256,%rdx
        jae .L256_or_more8x
        cmpq $192,%rdx
        jae .L192_or_more8x
        cmpq $128,%rdx
        jae .L128_or_more8x
        cmpq $64,%rdx
        jae .L64_or_more8x

        xorq %r10,%r10          /* r10 = tail byte index */
        vmovdqa %ymm6,0(%rsp)
        vmovdqa %ymm8,32(%rsp)
        jmp .Loop_tail8x

.align 32
.L64_or_more8x:
        vpxor 0(%rsi),%ymm6,%ymm6
        vpxor 32(%rsi),%ymm8,%ymm8
        vmovdqu %ymm6,0(%rdi)
        vmovdqu %ymm8,32(%rdi)
        je .Ldone8x             /* exactly 64 bytes */

        leaq 64(%rsi),%rsi
        xorq %r10,%r10
        vmovdqa %ymm1,0(%rsp)
        leaq 64(%rdi),%rdi
        subq $64,%rdx
        vmovdqa %ymm5,32(%rsp)
        jmp .Loop_tail8x

.align 32
.L128_or_more8x:
        vpxor 0(%rsi),%ymm6,%ymm6
        vpxor 32(%rsi),%ymm8,%ymm8
        vpxor 64(%rsi),%ymm1,%ymm1
        vpxor 96(%rsi),%ymm5,%ymm5
        vmovdqu %ymm6,0(%rdi)
        vmovdqu %ymm8,32(%rdi)
        vmovdqu %ymm1,64(%rdi)
        vmovdqu %ymm5,96(%rdi)
        je .Ldone8x             /* exactly 128 bytes */

        leaq 128(%rsi),%rsi
        xorq %r10,%r10
        vmovdqa %ymm12,0(%rsp)
        leaq 128(%rdi),%rdi
        subq $128,%rdx
        vmovdqa %ymm13,32(%rsp)
        jmp .Loop_tail8x

.align 32
.L192_or_more8x:
        vpxor 0(%rsi),%ymm6,%ymm6
        vpxor 32(%rsi),%ymm8,%ymm8
        vpxor 64(%rsi),%ymm1,%ymm1
        vpxor 96(%rsi),%ymm5,%ymm5
        vpxor 128(%rsi),%ymm12,%ymm12
        vpxor 160(%rsi),%ymm13,%ymm13
        vmovdqu %ymm6,0(%rdi)
        vmovdqu %ymm8,32(%rdi)
        vmovdqu %ymm1,64(%rdi)
        vmovdqu %ymm5,96(%rdi)
        vmovdqu %ymm12,128(%rdi)
        vmovdqu %ymm13,160(%rdi)
        je .Ldone8x             /* exactly 192 bytes */

        leaq 192(%rsi),%rsi
        xorq %r10,%r10
        vmovdqa %ymm10,0(%rsp)
        leaq 192(%rdi),%rdi
        subq $192,%rdx
        vmovdqa %ymm15,32(%rsp)
        jmp .Loop_tail8x

.align 32
.L256_or_more8x:
        vpxor 0(%rsi),%ymm6,%ymm6
        vpxor 32(%rsi),%ymm8,%ymm8
        vpxor 64(%rsi),%ymm1,%ymm1
        vpxor 96(%rsi),%ymm5,%ymm5
        vpxor 128(%rsi),%ymm12,%ymm12
        vpxor 160(%rsi),%ymm13,%ymm13
        vpxor 192(%rsi),%ymm10,%ymm10
        vpxor 224(%rsi),%ymm15,%ymm15
        vmovdqu %ymm6,0(%rdi)
        vmovdqu %ymm8,32(%rdi)
        vmovdqu %ymm1,64(%rdi)
        vmovdqu %ymm5,96(%rdi)
        vmovdqu %ymm12,128(%rdi)
        vmovdqu %ymm13,160(%rdi)
        vmovdqu %ymm10,192(%rdi)
        vmovdqu %ymm15,224(%rdi)
        je .Ldone8x             /* exactly 256 bytes */

        leaq 256(%rsi),%rsi
        xorq %r10,%r10
        vmovdqa %ymm14,0(%rsp)
        leaq 256(%rdi),%rdi
        subq $256,%rdx
        vmovdqa %ymm2,32(%rsp)
        jmp .Loop_tail8x

.align 32
.L320_or_more8x:
1508 vpxor 0(%rsi),%ymm6,%ymm6 1509 vpxor 32(%rsi),%ymm8,%ymm8 1510 vpxor 64(%rsi),%ymm1,%ymm1 1511 vpxor 96(%rsi),%ymm5,%ymm5 1512 vpxor 128(%rsi),%ymm12,%ymm12 1513 vpxor 160(%rsi),%ymm13,%ymm13 1514 vpxor 192(%rsi),%ymm10,%ymm10 1515 vpxor 224(%rsi),%ymm15,%ymm15 1516 vpxor 256(%rsi),%ymm14,%ymm14 1517 vpxor 288(%rsi),%ymm2,%ymm2 1518 vmovdqu %ymm6,0(%rdi) 1519 vmovdqu %ymm8,32(%rdi) 1520 vmovdqu %ymm1,64(%rdi) 1521 vmovdqu %ymm5,96(%rdi) 1522 vmovdqu %ymm12,128(%rdi) 1523 vmovdqu %ymm13,160(%rdi) 1524 vmovdqu %ymm10,192(%rdi) 1525 vmovdqu %ymm15,224(%rdi) 1526 vmovdqu %ymm14,256(%rdi) 1527 vmovdqu %ymm2,288(%rdi) 1528 je .Ldone8x 1529 1530 leaq 320(%rsi),%rsi 1531 xorq %r10,%r10 1532 vmovdqa %ymm3,0(%rsp) 1533 leaq 320(%rdi),%rdi 1534 subq $320,%rdx 1535 vmovdqa %ymm7,32(%rsp) 1536 jmp .Loop_tail8x 1537 1538.align 32 1539.L384_or_more8x: 1540 vpxor 0(%rsi),%ymm6,%ymm6 1541 vpxor 32(%rsi),%ymm8,%ymm8 1542 vpxor 64(%rsi),%ymm1,%ymm1 1543 vpxor 96(%rsi),%ymm5,%ymm5 1544 vpxor 128(%rsi),%ymm12,%ymm12 1545 vpxor 160(%rsi),%ymm13,%ymm13 1546 vpxor 192(%rsi),%ymm10,%ymm10 1547 vpxor 224(%rsi),%ymm15,%ymm15 1548 vpxor 256(%rsi),%ymm14,%ymm14 1549 vpxor 288(%rsi),%ymm2,%ymm2 1550 vpxor 320(%rsi),%ymm3,%ymm3 1551 vpxor 352(%rsi),%ymm7,%ymm7 1552 vmovdqu %ymm6,0(%rdi) 1553 vmovdqu %ymm8,32(%rdi) 1554 vmovdqu %ymm1,64(%rdi) 1555 vmovdqu %ymm5,96(%rdi) 1556 vmovdqu %ymm12,128(%rdi) 1557 vmovdqu %ymm13,160(%rdi) 1558 vmovdqu %ymm10,192(%rdi) 1559 vmovdqu %ymm15,224(%rdi) 1560 vmovdqu %ymm14,256(%rdi) 1561 vmovdqu %ymm2,288(%rdi) 1562 vmovdqu %ymm3,320(%rdi) 1563 vmovdqu %ymm7,352(%rdi) 1564 je .Ldone8x 1565 1566 leaq 384(%rsi),%rsi 1567 xorq %r10,%r10 1568 vmovdqa %ymm11,0(%rsp) 1569 leaq 384(%rdi),%rdi 1570 subq $384,%rdx 1571 vmovdqa %ymm9,32(%rsp) 1572 jmp .Loop_tail8x 1573 1574.align 32 1575.L448_or_more8x: 1576 vpxor 0(%rsi),%ymm6,%ymm6 1577 vpxor 32(%rsi),%ymm8,%ymm8 1578 vpxor 64(%rsi),%ymm1,%ymm1 1579 vpxor 96(%rsi),%ymm5,%ymm5 1580 vpxor 128(%rsi),%ymm12,%ymm12 1581 
vpxor 160(%rsi),%ymm13,%ymm13 1582 vpxor 192(%rsi),%ymm10,%ymm10 1583 vpxor 224(%rsi),%ymm15,%ymm15 1584 vpxor 256(%rsi),%ymm14,%ymm14 1585 vpxor 288(%rsi),%ymm2,%ymm2 1586 vpxor 320(%rsi),%ymm3,%ymm3 1587 vpxor 352(%rsi),%ymm7,%ymm7 1588 vpxor 384(%rsi),%ymm11,%ymm11 1589 vpxor 416(%rsi),%ymm9,%ymm9 1590 vmovdqu %ymm6,0(%rdi) 1591 vmovdqu %ymm8,32(%rdi) 1592 vmovdqu %ymm1,64(%rdi) 1593 vmovdqu %ymm5,96(%rdi) 1594 vmovdqu %ymm12,128(%rdi) 1595 vmovdqu %ymm13,160(%rdi) 1596 vmovdqu %ymm10,192(%rdi) 1597 vmovdqu %ymm15,224(%rdi) 1598 vmovdqu %ymm14,256(%rdi) 1599 vmovdqu %ymm2,288(%rdi) 1600 vmovdqu %ymm3,320(%rdi) 1601 vmovdqu %ymm7,352(%rdi) 1602 vmovdqu %ymm11,384(%rdi) 1603 vmovdqu %ymm9,416(%rdi) 1604 je .Ldone8x 1605 1606 leaq 448(%rsi),%rsi 1607 xorq %r10,%r10 1608 vmovdqa %ymm0,0(%rsp) 1609 leaq 448(%rdi),%rdi 1610 subq $448,%rdx 1611 vmovdqa %ymm4,32(%rsp) 1612 1613.Loop_tail8x: 1614 movzbl (%rsi,%r10,1),%eax 1615 movzbl (%rsp,%r10,1),%ecx 1616 leaq 1(%r10),%r10 1617 xorl %ecx,%eax 1618 movb %al,-1(%rdi,%r10,1) 1619 decq %rdx 1620 jnz .Loop_tail8x 1621 1622.Ldone8x: 1623 vzeroall 1624 leaq (%r9),%rsp 1625.cfi_def_cfa_register rsp 1626.L8x_epilogue: 1627 .byte 0xf3,0xc3 1628.cfi_endproc 1629.size ChaCha20_8x,.-ChaCha20_8x 1630#endif 1631.section .note.GNU-stack,"",@progbits 1632