# This file is generated from a similarly-named Perl script in the BoringSSL
# source tree. Do not edit by hand.

#if defined(__has_feature)
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
#endif

#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
.text

.extern OPENSSL_ia32cap_P
.hidden OPENSSL_ia32cap_P

# Constant pool: counter increments for the 1x/4x/8x code paths and the
# byte-permutation masks used with pshufb for the 16- and 8-bit rotates.
.align 64
.Lzero:
.long 0,0,0,0
.Lone:
.long 1,0,0,0
.Linc:
.long 0,1,2,3
.Lfour:
.long 4,4,4,4
.Lincy:
.long 0,2,4,6,1,3,5,7
.Leight:
.long 8,8,8,8,8,8,8,8
.Lrot16:
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Lsigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 # "expand 32-byte k"
.align 64
.Lzeroz:
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 # "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"

# Scalar (integer) code path. Presumed C prototype, per BoringSSL's C
# sources (an annotation, not authoritative):
#   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
#                       const uint32_t key[8], const uint32_t counter[4]);
# SysV AMD64 arguments: %rdi = out, %rsi = in, %rdx = in_len,
# %rcx = key, %r8 = counter.
.globl ChaCha20_ctr32
.hidden ChaCha20_ctr32
.type ChaCha20_ctr32,@function
.align 64
ChaCha20_ctr32:
.cfi_startproc
	cmpq $0,%rdx
	je .Lno_data
	movq OPENSSL_ia32cap_P+4(%rip),%r10
	testl $512,%r10d # SSSE3 (CPUID.1:ECX bit 9)
	jnz .LChaCha20_ssse3

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset r15,-56
	subq $64+24,%rsp
.cfi_adjust_cfa_offset 88
.Lctr32_body:

	movdqu (%rcx),%xmm1
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa .Lone(%rip),%xmm4

	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	movq %rdx,%rbp
	jmp .Loop_outer

.align 32
.Loop_outer:
	movl $0x61707865,%eax
	movl $0x3320646e,%ebx
	movl $0x79622d32,%ecx
	movl $0x6b206574,%edx
	movl 16(%rsp),%r8d
	movl 20(%rsp),%r9d
	movl 24(%rsp),%r10d
	movl 28(%rsp),%r11d
	movd %xmm3,%r12d
	movl 52(%rsp),%r13d
	movl 56(%rsp),%r14d
	movl 60(%rsp),%r15d

	movq %rbp,64+0(%rsp)
	movl $10,%ebp
	movq %rsi,64+8(%rsp)
.byte 102,72,15,126,214 # byte-encoded movq %xmm2,%rsi
	movq %rdi,64+16(%rsp)
	movq %rsi,%rdi
	shrq $32,%rdi
	jmp .Loop

# Ten iterations, each doing one column round and one diagonal round
# (20 ChaCha rounds total); two of the sixteen state words live in
# %esi/%edi and two more are swapped through the stack between halves.
.align 32
.Loop:
	addl %r8d,%eax
	xorl %eax,%r12d
	roll $16,%r12d
	addl %r9d,%ebx
	xorl %ebx,%r13d
	roll $16,%r13d
	addl %r12d,%esi
	xorl %esi,%r8d
	roll $12,%r8d
	addl %r13d,%edi
	xorl %edi,%r9d
	roll $12,%r9d
	addl %r8d,%eax
	xorl %eax,%r12d
	roll $8,%r12d
	addl %r9d,%ebx
	xorl %ebx,%r13d
	roll $8,%r13d
	addl %r12d,%esi
	xorl %esi,%r8d
	roll $7,%r8d
	addl %r13d,%edi
	xorl %edi,%r9d
	roll $7,%r9d
	movl %esi,32(%rsp)
	movl %edi,36(%rsp)
	movl 40(%rsp),%esi
	movl 44(%rsp),%edi
	addl %r10d,%ecx
	xorl %ecx,%r14d
	roll $16,%r14d
	addl %r11d,%edx
	xorl %edx,%r15d
	roll $16,%r15d
	addl %r14d,%esi
	xorl %esi,%r10d
	roll $12,%r10d
	addl %r15d,%edi
	xorl %edi,%r11d
	roll $12,%r11d
	addl %r10d,%ecx
	xorl %ecx,%r14d
	roll $8,%r14d
	addl %r11d,%edx
	xorl %edx,%r15d
	roll $8,%r15d
	addl %r14d,%esi
	xorl %esi,%r10d
	roll $7,%r10d
	addl %r15d,%edi
	xorl %edi,%r11d
	roll $7,%r11d
	addl %r9d,%eax
	xorl %eax,%r15d
	roll $16,%r15d
	addl %r10d,%ebx
	xorl %ebx,%r12d
	roll $16,%r12d
	addl %r15d,%esi
	xorl %esi,%r9d
	roll $12,%r9d
	addl %r12d,%edi
	xorl %edi,%r10d
	roll $12,%r10d
	addl %r9d,%eax
	xorl %eax,%r15d
	roll $8,%r15d
	addl %r10d,%ebx
	xorl %ebx,%r12d
	roll $8,%r12d
	addl %r15d,%esi
	xorl %esi,%r9d
	roll $7,%r9d
	addl %r12d,%edi
	xorl %edi,%r10d
	roll $7,%r10d
	movl %esi,40(%rsp)
	movl %edi,44(%rsp)
	movl 32(%rsp),%esi
	movl 36(%rsp),%edi
	addl %r11d,%ecx
	xorl %ecx,%r13d
	roll $16,%r13d
	addl %r8d,%edx
	xorl %edx,%r14d
	roll $16,%r14d
	addl %r13d,%esi
	xorl %esi,%r11d
	roll $12,%r11d
	addl %r14d,%edi
	xorl %edi,%r8d
	roll $12,%r8d
	addl %r11d,%ecx
	xorl %ecx,%r13d
	roll $8,%r13d
	addl %r8d,%edx
	xorl %edx,%r14d
	roll $8,%r14d
	addl %r13d,%esi
	xorl %esi,%r11d
	roll $7,%r11d
	addl %r14d,%edi
	xorl %edi,%r8d
	roll $7,%r8d
	decl %ebp
	jnz .Loop
	movl %edi,36(%rsp)
	movl %esi,32(%rsp)
	movq 64(%rsp),%rbp
	movdqa %xmm2,%xmm1
	movq 64+8(%rsp),%rsi
	paddd %xmm4,%xmm3
	movq 64+16(%rsp),%rdi

	addl $0x61707865,%eax
	addl $0x3320646e,%ebx
	addl $0x79622d32,%ecx
	addl $0x6b206574,%edx
	addl 16(%rsp),%r8d
	addl 20(%rsp),%r9d
	addl 24(%rsp),%r10d
	addl 28(%rsp),%r11d
	addl 48(%rsp),%r12d
	addl 52(%rsp),%r13d
	addl 56(%rsp),%r14d
	addl 60(%rsp),%r15d
	paddd 32(%rsp),%xmm1

	cmpq $64,%rbp
	jb .Ltail

	xorl 0(%rsi),%eax
	xorl 4(%rsi),%ebx
	xorl 8(%rsi),%ecx
	xorl 12(%rsi),%edx
	xorl 16(%rsi),%r8d
	xorl 20(%rsi),%r9d
	xorl 24(%rsi),%r10d
	xorl 28(%rsi),%r11d
	movdqu 32(%rsi),%xmm0
	xorl 48(%rsi),%r12d
	xorl 52(%rsi),%r13d
	xorl 56(%rsi),%r14d
	xorl 60(%rsi),%r15d
	leaq 64(%rsi),%rsi
	pxor %xmm1,%xmm0

	movdqa %xmm2,32(%rsp)
	movd %xmm3,48(%rsp)

	movl %eax,0(%rdi)
	movl %ebx,4(%rdi)
	movl %ecx,8(%rdi)
	movl %edx,12(%rdi)
	movl %r8d,16(%rdi)
	movl %r9d,20(%rdi)
	movl %r10d,24(%rdi)
	movl %r11d,28(%rdi)
	movdqu %xmm0,32(%rdi)
	movl %r12d,48(%rdi)
	movl %r13d,52(%rdi)
	movl %r14d,56(%rdi)
	movl %r15d,60(%rdi)
	leaq 64(%rdi),%rdi

	subq $64,%rbp
	jnz .Loop_outer

	jmp .Ldone

# Fewer than 64 bytes remain: park the keystream block on the stack and
# XOR it into the input byte by byte.
.align 16
.Ltail:
	movl %eax,0(%rsp)
	movl %ebx,4(%rsp)
	xorq %rbx,%rbx
	movl %ecx,8(%rsp)
	movl %edx,12(%rsp)
	movl %r8d,16(%rsp)
	movl %r9d,20(%rsp)
	movl %r10d,24(%rsp)
	movl %r11d,28(%rsp)
	movdqa %xmm1,32(%rsp)
	movl %r12d,48(%rsp)
	movl %r13d,52(%rsp)
	movl %r14d,56(%rsp)
	movl %r15d,60(%rsp)

.Loop_tail:
	movzbl (%rsi,%rbx,1),%eax
	movzbl (%rsp,%rbx,1),%edx
	leaq 1(%rbx),%rbx
	xorl %edx,%eax
	movb %al,-1(%rdi,%rbx,1)
	decq %rbp
	jnz .Loop_tail

.Ldone:
	leaq 64+24+48(%rsp),%rsi
	movq -48(%rsi),%r15
.cfi_restore r15
	movq -40(%rsi),%r14
.cfi_restore r14
	movq -32(%rsi),%r13
.cfi_restore r13
	movq -24(%rsi),%r12
.cfi_restore r12
	movq -16(%rsi),%rbp
.cfi_restore rbp
	movq -8(%rsi),%rbx
.cfi_restore rbx
	leaq (%rsi),%rsp
.cfi_adjust_cfa_offset -136
.Lno_data:
	.byte 0xf3,0xc3 # repz ret
.cfi_endproc
.size ChaCha20_ctr32,.-ChaCha20_ctr32
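# SSSE3 code path: one 64-byte block per iteration, with the 16- and
# 8-bit rotates done via pshufb against the .Lrot16/.Lrot24 masks.
# Reached from ChaCha20_ctr32 when SSSE3 is available; requests larger
# than 128 bytes are forwarded to the 4x path below. (Annotation added
# for readability; the Perl source this file is generated from is the
# authoritative reference.)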
.type ChaCha20_ssse3,@function
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
.cfi_startproc
	movq %rsp,%r9
.cfi_def_cfa_register r9
	cmpq $128,%rdx
	ja .LChaCha20_4x

.Ldo_sse3_after_all:
	subq $64+8,%rsp
	movdqa .Lsigma(%rip),%xmm0
	movdqu (%rcx),%xmm1
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa .Lrot16(%rip),%xmm6
	movdqa .Lrot24(%rip),%xmm7

	movdqa %xmm0,0(%rsp)
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	movq $10,%r8
	jmp .Loop_ssse3

.align 32
.Loop_outer_ssse3:
	movdqa .Lone(%rip),%xmm3
	movdqa 0(%rsp),%xmm0
	movdqa 16(%rsp),%xmm1
	movdqa 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3
	movq $10,%r8
	movdqa %xmm3,48(%rsp)
	jmp .Loop_ssse3

.align 32
.Loop_ssse3:
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,222 # byte-encoded pshufb %xmm6,%xmm3 (rotate left 16)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $20,%xmm1
	pslld $12,%xmm4
	por %xmm4,%xmm1
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,223 # byte-encoded pshufb %xmm7,%xmm3 (rotate left 8)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $25,%xmm1
	pslld $7,%xmm4
	por %xmm4,%xmm1
# rotate lanes so the next quarter-rounds act on the diagonals
	pshufd $78,%xmm2,%xmm2
	pshufd $57,%xmm1,%xmm1
	pshufd $147,%xmm3,%xmm3
	nop
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,222
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $20,%xmm1
	pslld $12,%xmm4
	por %xmm4,%xmm1
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,223
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $25,%xmm1
	pslld $7,%xmm4
	por %xmm4,%xmm1
# undo the lane rotation
	pshufd $78,%xmm2,%xmm2
	pshufd $147,%xmm1,%xmm1
	pshufd $57,%xmm3,%xmm3
	decq %r8
	jnz .Loop_ssse3
	paddd 0(%rsp),%xmm0
	paddd 16(%rsp),%xmm1
	paddd 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3

	cmpq $64,%rdx
	jb .Ltail_ssse3

	movdqu 0(%rsi),%xmm4
	movdqu 16(%rsi),%xmm5
	pxor %xmm4,%xmm0
	movdqu 32(%rsi),%xmm4
	pxor %xmm5,%xmm1
	movdqu 48(%rsi),%xmm5
	leaq 64(%rsi),%rsi
	pxor %xmm4,%xmm2
	pxor %xmm5,%xmm3

	movdqu %xmm0,0(%rdi)
	movdqu %xmm1,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm3,48(%rdi)
	leaq 64(%rdi),%rdi

	subq $64,%rdx
	jnz .Loop_outer_ssse3

	jmp .Ldone_ssse3

.align 16
.Ltail_ssse3:
	movdqa %xmm0,0(%rsp)
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	xorq %r8,%r8

.Loop_tail_ssse3:
	movzbl (%rsi,%r8,1),%eax
	movzbl (%rsp,%r8,1),%ecx
	leaq 1(%r8),%r8
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r8,1)
	decq %rdx
	jnz .Loop_tail_ssse3

.Ldone_ssse3:
	leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.Lssse3_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_ssse3,.-ChaCha20_ssse3
.type ChaCha20_4x,@function
.align 32
ChaCha20_4x:
.LChaCha20_4x:
.cfi_startproc
	movq %rsp,%r9
.cfi_def_cfa_register r9
	movq %r10,%r11
	shrq $32,%r10
	testq $32,%r10 # AVX2 (CPUID.7:EBX bit 5)
	jnz .LChaCha20_8x
	cmpq $192,%rdx
	ja .Lproceed4x

# Presumably detecting Atom-class cores (MOVBE set, XSAVE clear), where
# the single-block SSSE3 loop wins for short inputs (annotation).
	andq $71303168,%r11 # isolate 1<<26|1<<22 (XSAVE|MOVBE)
	cmpq $4194304,%r11 # 1<<22: MOVBE without XSAVE
	je .Ldo_sse3_after_all

.Lproceed4x:
	subq $0x140+8,%rsp
	movdqa .Lsigma(%rip),%xmm11
	movdqu (%rcx),%xmm15
	movdqu 16(%rcx),%xmm7
	movdqu (%r8),%xmm3
	leaq 256(%rsp),%rcx
	leaq .Lrot16(%rip),%r10
	leaq .Lrot24(%rip),%r11

	pshufd $0x00,%xmm11,%xmm8
	pshufd $0x55,%xmm11,%xmm9
	movdqa %xmm8,64(%rsp)
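# (annotation) Each pshufd $0x00/$0x55/$0xaa/$0xff above and below splats
# one 32-bit state word across all four lanes of a register, so lane i of
# every vector belongs to interleaved block i; .Linc staggers the four
# block counters by 0,1,2,3.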
	pshufd $0xaa,%xmm11,%xmm10
	movdqa %xmm9,80(%rsp)
	pshufd $0xff,%xmm11,%xmm11
	movdqa %xmm10,96(%rsp)
	movdqa %xmm11,112(%rsp)

	pshufd $0x00,%xmm15,%xmm12
	pshufd $0x55,%xmm15,%xmm13
	movdqa %xmm12,128-256(%rcx)
	pshufd $0xaa,%xmm15,%xmm14
	movdqa %xmm13,144-256(%rcx)
	pshufd $0xff,%xmm15,%xmm15
	movdqa %xmm14,160-256(%rcx)
	movdqa %xmm15,176-256(%rcx)

	pshufd $0x00,%xmm7,%xmm4
	pshufd $0x55,%xmm7,%xmm5
	movdqa %xmm4,192-256(%rcx)
	pshufd $0xaa,%xmm7,%xmm6
	movdqa %xmm5,208-256(%rcx)
	pshufd $0xff,%xmm7,%xmm7
	movdqa %xmm6,224-256(%rcx)
	movdqa %xmm7,240-256(%rcx)

	pshufd $0x00,%xmm3,%xmm0
	pshufd $0x55,%xmm3,%xmm1
	paddd .Linc(%rip),%xmm0
	pshufd $0xaa,%xmm3,%xmm2
	movdqa %xmm1,272-256(%rcx)
	pshufd $0xff,%xmm3,%xmm3
	movdqa %xmm2,288-256(%rcx)
	movdqa %xmm3,304-256(%rcx)

	jmp .Loop_enter4x

.align 32
.Loop_outer4x:
	movdqa 64(%rsp),%xmm8
	movdqa 80(%rsp),%xmm9
	movdqa 96(%rsp),%xmm10
	movdqa 112(%rsp),%xmm11
	movdqa 128-256(%rcx),%xmm12
	movdqa 144-256(%rcx),%xmm13
	movdqa 160-256(%rcx),%xmm14
	movdqa 176-256(%rcx),%xmm15
	movdqa 192-256(%rcx),%xmm4
	movdqa 208-256(%rcx),%xmm5
	movdqa 224-256(%rcx),%xmm6
	movdqa 240-256(%rcx),%xmm7
	movdqa 256-256(%rcx),%xmm0
	movdqa 272-256(%rcx),%xmm1
	movdqa 288-256(%rcx),%xmm2
	movdqa 304-256(%rcx),%xmm3
	paddd .Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa %xmm6,32(%rsp)
	movdqa %xmm7,48(%rsp)
	movdqa (%r10),%xmm7
	movl $10,%eax
	movdqa %xmm0,256-256(%rcx)
	jmp .Loop4x

.align 32
.Loop4x:
	paddd %xmm12,%xmm8
	paddd %xmm13,%xmm9
	pxor %xmm8,%xmm0
	pxor %xmm9,%xmm1
.byte 102,15,56,0,199
.byte 102,15,56,0,207
	paddd %xmm0,%xmm4
	paddd %xmm1,%xmm5
	pxor %xmm4,%xmm12
	pxor %xmm5,%xmm13
	movdqa %xmm12,%xmm6
	pslld $12,%xmm12
	psrld $20,%xmm6
	movdqa %xmm13,%xmm7
	pslld $12,%xmm13
	por %xmm6,%xmm12
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm13
	paddd %xmm12,%xmm8
	paddd %xmm13,%xmm9
	pxor %xmm8,%xmm0
	pxor %xmm9,%xmm1
.byte 102,15,56,0,198
.byte 102,15,56,0,206
	paddd %xmm0,%xmm4
	paddd %xmm1,%xmm5
	pxor %xmm4,%xmm12
	pxor %xmm5,%xmm13
	movdqa %xmm12,%xmm7
	pslld $7,%xmm12
	psrld $25,%xmm7
	movdqa %xmm13,%xmm6
	pslld $7,%xmm13
	por %xmm7,%xmm12
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm13
	movdqa %xmm4,0(%rsp)
	movdqa %xmm5,16(%rsp)
	movdqa 32(%rsp),%xmm4
	movdqa 48(%rsp),%xmm5
	paddd %xmm14,%xmm10
	paddd %xmm15,%xmm11
	pxor %xmm10,%xmm2
	pxor %xmm11,%xmm3
.byte 102,15,56,0,215
.byte 102,15,56,0,223
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	pxor %xmm4,%xmm14
	pxor %xmm5,%xmm15
	movdqa %xmm14,%xmm6
	pslld $12,%xmm14
	psrld $20,%xmm6
	movdqa %xmm15,%xmm7
	pslld $12,%xmm15
	por %xmm6,%xmm14
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm15
	paddd %xmm14,%xmm10
	paddd %xmm15,%xmm11
	pxor %xmm10,%xmm2
	pxor %xmm11,%xmm3
.byte 102,15,56,0,214
.byte 102,15,56,0,222
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	pxor %xmm4,%xmm14
	pxor %xmm5,%xmm15
	movdqa %xmm14,%xmm7
	pslld $7,%xmm14
	psrld $25,%xmm7
	movdqa %xmm15,%xmm6
	pslld $7,%xmm15
	por %xmm7,%xmm14
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm15
	paddd %xmm13,%xmm8
	paddd %xmm14,%xmm9
	pxor %xmm8,%xmm3
	pxor %xmm9,%xmm0
.byte 102,15,56,0,223
.byte 102,15,56,0,199
	paddd %xmm3,%xmm4
	paddd %xmm0,%xmm5
	pxor %xmm4,%xmm13
	pxor %xmm5,%xmm14
	movdqa %xmm13,%xmm6
	pslld $12,%xmm13
	psrld $20,%xmm6
	movdqa %xmm14,%xmm7
	pslld $12,%xmm14
	por %xmm6,%xmm13
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm14
	paddd %xmm13,%xmm8
	paddd %xmm14,%xmm9
	pxor %xmm8,%xmm3
	pxor %xmm9,%xmm0
.byte 102,15,56,0,222
.byte 102,15,56,0,198
	paddd %xmm3,%xmm4
	paddd %xmm0,%xmm5
	pxor %xmm4,%xmm13
	pxor %xmm5,%xmm14
	movdqa %xmm13,%xmm7
	pslld $7,%xmm13
	psrld $25,%xmm7
	movdqa %xmm14,%xmm6
	pslld $7,%xmm14
	por %xmm7,%xmm13
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm14
	movdqa %xmm4,32(%rsp)
	movdqa %xmm5,48(%rsp)
	movdqa 0(%rsp),%xmm4
	movdqa 16(%rsp),%xmm5
	paddd %xmm15,%xmm10
	paddd %xmm12,%xmm11
	pxor %xmm10,%xmm1
	pxor %xmm11,%xmm2
.byte 102,15,56,0,207
.byte 102,15,56,0,215
	paddd %xmm1,%xmm4
	paddd %xmm2,%xmm5
	pxor %xmm4,%xmm15
	pxor %xmm5,%xmm12
	movdqa %xmm15,%xmm6
	pslld $12,%xmm15
	psrld $20,%xmm6
	movdqa %xmm12,%xmm7
	pslld $12,%xmm12
	por %xmm6,%xmm15
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm12
	paddd %xmm15,%xmm10
	paddd %xmm12,%xmm11
	pxor %xmm10,%xmm1
	pxor %xmm11,%xmm2
.byte 102,15,56,0,206
.byte 102,15,56,0,214
	paddd %xmm1,%xmm4
	paddd %xmm2,%xmm5
	pxor %xmm4,%xmm15
	pxor %xmm5,%xmm12
	movdqa %xmm15,%xmm7
	pslld $7,%xmm15
	psrld $25,%xmm7
	movdqa %xmm12,%xmm6
	pslld $7,%xmm12
	por %xmm7,%xmm15
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm12
	decl %eax
	jnz .Loop4x

# Add the saved input state back in, then transpose the four 4x4
# matrices of 32-bit words so each 64-byte keystream block becomes
# contiguous (annotation).
	paddd 64(%rsp),%xmm8
	paddd 80(%rsp),%xmm9
	paddd 96(%rsp),%xmm10
	paddd 112(%rsp),%xmm11

	movdqa %xmm8,%xmm6
	punpckldq %xmm9,%xmm8
	movdqa %xmm10,%xmm7
	punpckldq %xmm11,%xmm10
	punpckhdq %xmm9,%xmm6
	punpckhdq %xmm11,%xmm7
	movdqa %xmm8,%xmm9
	punpcklqdq %xmm10,%xmm8
	movdqa %xmm6,%xmm11
	punpcklqdq %xmm7,%xmm6
	punpckhqdq %xmm10,%xmm9
	punpckhqdq %xmm7,%xmm11
	paddd 128-256(%rcx),%xmm12
	paddd 144-256(%rcx),%xmm13
	paddd 160-256(%rcx),%xmm14
	paddd 176-256(%rcx),%xmm15

	movdqa %xmm8,0(%rsp)
	movdqa %xmm9,16(%rsp)
	movdqa 32(%rsp),%xmm8
	movdqa 48(%rsp),%xmm9

	movdqa %xmm12,%xmm10
	punpckldq %xmm13,%xmm12
	movdqa %xmm14,%xmm7
	punpckldq %xmm15,%xmm14
	punpckhdq %xmm13,%xmm10
	punpckhdq %xmm15,%xmm7
	movdqa %xmm12,%xmm13
	punpcklqdq %xmm14,%xmm12
	movdqa %xmm10,%xmm15
	punpcklqdq %xmm7,%xmm10
	punpckhqdq %xmm14,%xmm13
	punpckhqdq %xmm7,%xmm15
	paddd 192-256(%rcx),%xmm4
	paddd 208-256(%rcx),%xmm5
	paddd 224-256(%rcx),%xmm8
	paddd 240-256(%rcx),%xmm9

	movdqa %xmm6,32(%rsp)
	movdqa %xmm11,48(%rsp)

	movdqa %xmm4,%xmm14
	punpckldq %xmm5,%xmm4
	movdqa %xmm8,%xmm7
	punpckldq %xmm9,%xmm8
	punpckhdq %xmm5,%xmm14
	punpckhdq %xmm9,%xmm7
	movdqa %xmm4,%xmm5
	punpcklqdq %xmm8,%xmm4
	movdqa %xmm14,%xmm9
	punpcklqdq %xmm7,%xmm14
	punpckhqdq %xmm8,%xmm5
	punpckhqdq %xmm7,%xmm9
	paddd 256-256(%rcx),%xmm0
	paddd 272-256(%rcx),%xmm1
	paddd 288-256(%rcx),%xmm2
	paddd 304-256(%rcx),%xmm3

	movdqa %xmm0,%xmm8
	punpckldq %xmm1,%xmm0
	movdqa %xmm2,%xmm7
	punpckldq %xmm3,%xmm2
	punpckhdq %xmm1,%xmm8
	punpckhdq %xmm3,%xmm7
	movdqa %xmm0,%xmm1
	punpcklqdq %xmm2,%xmm0
	movdqa %xmm8,%xmm3
	punpcklqdq %xmm7,%xmm8
	punpckhqdq %xmm2,%xmm1
	punpckhqdq %xmm7,%xmm3
	cmpq $256,%rdx
	jb .Ltail4x

	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7

	movdqu %xmm6,64(%rdi)
	movdqu 0(%rsi),%xmm6
	movdqu %xmm11,80(%rdi)
	movdqu 16(%rsi),%xmm11
	movdqu %xmm2,96(%rdi)
	movdqu 32(%rsi),%xmm2
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi
	movdqu 48(%rsi),%xmm7
	pxor 32(%rsp),%xmm6
	pxor %xmm10,%xmm11
	pxor %xmm14,%xmm2
	pxor %xmm8,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 48(%rsp),%xmm6
	pxor %xmm15,%xmm11
	pxor %xmm9,%xmm2
	pxor %xmm3,%xmm7
	movdqu %xmm6,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm2,96(%rdi)
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi

	subq $256,%rdx
	jnz .Loop_outer4x

	jmp .Ldone4x

.Ltail4x:
	cmpq $192,%rdx
	jae .L192_or_more4x
	cmpq $128,%rdx
	jae .L128_or_more4x
	cmpq $64,%rdx
	jae .L64_or_more4x

	xorq %r10,%r10

	movdqa %xmm12,16(%rsp)
	movdqa %xmm4,32(%rsp)
	movdqa %xmm0,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L64_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7
	movdqu %xmm6,0(%rdi)
	movdqu %xmm11,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm7,48(%rdi)
	je .Ldone4x

	movdqa 16(%rsp),%xmm6
	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm13,16(%rsp)
	leaq 64(%rdi),%rdi
	movdqa %xmm5,32(%rsp)
	subq $64,%rdx
	movdqa %xmm1,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L128_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7
	movdqu %xmm6,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm2,96(%rdi)
	movdqu %xmm7,112(%rdi)
	je .Ldone4x

	movdqa 32(%rsp),%xmm6
	leaq 128(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm10,16(%rsp)
	leaq 128(%rdi),%rdi
	movdqa %xmm14,32(%rsp)
	subq $128,%rdx
	movdqa %xmm8,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L192_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7

	movdqu %xmm6,64(%rdi)
	movdqu 0(%rsi),%xmm6
	movdqu %xmm11,80(%rdi)
	movdqu 16(%rsi),%xmm11
	movdqu %xmm2,96(%rdi)
	movdqu 32(%rsi),%xmm2
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi
	movdqu 48(%rsi),%xmm7
	pxor 32(%rsp),%xmm6
	pxor %xmm10,%xmm11
	pxor %xmm14,%xmm2
	pxor %xmm8,%xmm7
	movdqu %xmm6,0(%rdi)
	movdqu %xmm11,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm7,48(%rdi)
	je .Ldone4x

	movdqa 48(%rsp),%xmm6
	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm15,16(%rsp)
	leaq 64(%rdi),%rdi
	movdqa %xmm9,32(%rsp)
	subq $192,%rdx
	movdqa %xmm3,48(%rsp)

.Loop_tail4x:
	movzbl (%rsi,%r10,1),%eax
	movzbl (%rsp,%r10,1),%ecx
	leaq 1(%r10),%r10
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r10,1)
	decq %rdx
	jnz .Loop_tail4x

.Ldone4x:
	leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.L4x_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_4x,.-ChaCha20_4x
# AVX2 code path: eight interleaved 64-byte blocks per outer iteration.
.type ChaCha20_8x,@function
.align 32
ChaCha20_8x:
.LChaCha20_8x:
.cfi_startproc
	movq %rsp,%r9
.cfi_def_cfa_register r9
	subq $0x280+8,%rsp
	andq $-32,%rsp # 32-byte-align the stack for vmovdqa spills
	vzeroupper

	vbroadcasti128 .Lsigma(%rip),%ymm11
	vbroadcasti128 (%rcx),%ymm3
	vbroadcasti128 16(%rcx),%ymm15
	vbroadcasti128 (%r8),%ymm7
	leaq 256(%rsp),%rcx
	leaq 512(%rsp),%rax
	leaq .Lrot16(%rip),%r10
	leaq .Lrot24(%rip),%r11

	vpshufd $0x00,%ymm11,%ymm8
	vpshufd $0x55,%ymm11,%ymm9
	vmovdqa %ymm8,128-256(%rcx)
	vpshufd $0xaa,%ymm11,%ymm10
	vmovdqa %ymm9,160-256(%rcx)
	vpshufd $0xff,%ymm11,%ymm11
	vmovdqa %ymm10,192-256(%rcx)
	vmovdqa %ymm11,224-256(%rcx)

	vpshufd $0x00,%ymm3,%ymm0
	vpshufd $0x55,%ymm3,%ymm1
	vmovdqa %ymm0,256-256(%rcx)
	vpshufd $0xaa,%ymm3,%ymm2
	vmovdqa %ymm1,288-256(%rcx)
	vpshufd $0xff,%ymm3,%ymm3
	vmovdqa %ymm2,320-256(%rcx)
	vmovdqa %ymm3,352-256(%rcx)

	vpshufd $0x00,%ymm15,%ymm12
	vpshufd $0x55,%ymm15,%ymm13
	vmovdqa %ymm12,384-512(%rax)
	vpshufd $0xaa,%ymm15,%ymm14
	vmovdqa %ymm13,416-512(%rax)
	vpshufd $0xff,%ymm15,%ymm15
	vmovdqa %ymm14,448-512(%rax)
	vmovdqa %ymm15,480-512(%rax)

	vpshufd $0x00,%ymm7,%ymm4
	vpshufd $0x55,%ymm7,%ymm5
	vpaddd .Lincy(%rip),%ymm4,%ymm4
	vpshufd $0xaa,%ymm7,%ymm6
	vmovdqa %ymm5,544-512(%rax)
	vpshufd $0xff,%ymm7,%ymm7
	vmovdqa %ymm6,576-512(%rax)
	vmovdqa %ymm7,608-512(%rax)

	jmp .Loop_enter8x

.align 32
.Loop_outer8x:
	vmovdqa 128-256(%rcx),%ymm8
	vmovdqa 160-256(%rcx),%ymm9
	vmovdqa 192-256(%rcx),%ymm10
	vmovdqa 224-256(%rcx),%ymm11
	vmovdqa 256-256(%rcx),%ymm0
	vmovdqa 288-256(%rcx),%ymm1
	vmovdqa 320-256(%rcx),%ymm2
	vmovdqa 352-256(%rcx),%ymm3
	vmovdqa 384-512(%rax),%ymm12
	vmovdqa 416-512(%rax),%ymm13
	vmovdqa 448-512(%rax),%ymm14
	vmovdqa 480-512(%rax),%ymm15
	vmovdqa 512-512(%rax),%ymm4
	vmovdqa 544-512(%rax),%ymm5
	vmovdqa 576-512(%rax),%ymm6
	vmovdqa 608-512(%rax),%ymm7
	vpaddd .Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa %ymm14,64(%rsp)
	vmovdqa %ymm15,96(%rsp)
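# (annotation) Two state rows are parked in 64(%rsp)/96(%rsp) because
# %ymm14/%ymm15 double as scratch for the rotate masks, re-broadcast
# from .Lrot16 (%r10) and .Lrot24 (%r11) as the round loop proceeds.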
	vbroadcasti128 (%r10),%ymm15
	vmovdqa %ymm4,512-512(%rax)
	movl $10,%eax
	jmp .Loop8x

.align 32
.Loop8x:
	vpaddd %ymm0,%ymm8,%ymm8
	vpxor %ymm4,%ymm8,%ymm4
	vpshufb %ymm15,%ymm4,%ymm4
	vpaddd %ymm1,%ymm9,%ymm9
	vpxor %ymm5,%ymm9,%ymm5
	vpshufb %ymm15,%ymm5,%ymm5
	vpaddd %ymm4,%ymm12,%ymm12
	vpxor %ymm0,%ymm12,%ymm0
	vpslld $12,%ymm0,%ymm14
	vpsrld $20,%ymm0,%ymm0
	vpor %ymm0,%ymm14,%ymm0
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm5,%ymm13,%ymm13
	vpxor %ymm1,%ymm13,%ymm1
	vpslld $12,%ymm1,%ymm15
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm1,%ymm15,%ymm1
	vpaddd %ymm0,%ymm8,%ymm8
	vpxor %ymm4,%ymm8,%ymm4
	vpshufb %ymm14,%ymm4,%ymm4
	vpaddd %ymm1,%ymm9,%ymm9
	vpxor %ymm5,%ymm9,%ymm5
	vpshufb %ymm14,%ymm5,%ymm5
	vpaddd %ymm4,%ymm12,%ymm12
	vpxor %ymm0,%ymm12,%ymm0
	vpslld $7,%ymm0,%ymm15
	vpsrld $25,%ymm0,%ymm0
	vpor %ymm0,%ymm15,%ymm0
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm5,%ymm13,%ymm13
	vpxor %ymm1,%ymm13,%ymm1
	vpslld $7,%ymm1,%ymm14
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm1,%ymm14,%ymm1
	vmovdqa %ymm12,0(%rsp)
	vmovdqa %ymm13,32(%rsp)
	vmovdqa 64(%rsp),%ymm12
	vmovdqa 96(%rsp),%ymm13
	vpaddd %ymm2,%ymm10,%ymm10
	vpxor %ymm6,%ymm10,%ymm6
	vpshufb %ymm15,%ymm6,%ymm6
	vpaddd %ymm3,%ymm11,%ymm11
	vpxor %ymm7,%ymm11,%ymm7
	vpshufb %ymm15,%ymm7,%ymm7
	vpaddd %ymm6,%ymm12,%ymm12
	vpxor %ymm2,%ymm12,%ymm2
	vpslld $12,%ymm2,%ymm14
	vpsrld $20,%ymm2,%ymm2
	vpor %ymm2,%ymm14,%ymm2
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm7,%ymm13,%ymm13
	vpxor %ymm3,%ymm13,%ymm3
	vpslld $12,%ymm3,%ymm15
	vpsrld $20,%ymm3,%ymm3
	vpor %ymm3,%ymm15,%ymm3
	vpaddd %ymm2,%ymm10,%ymm10
	vpxor %ymm6,%ymm10,%ymm6
	vpshufb %ymm14,%ymm6,%ymm6
	vpaddd %ymm3,%ymm11,%ymm11
	vpxor %ymm7,%ymm11,%ymm7
	vpshufb %ymm14,%ymm7,%ymm7
	vpaddd %ymm6,%ymm12,%ymm12
	vpxor %ymm2,%ymm12,%ymm2
	vpslld $7,%ymm2,%ymm15
	vpsrld $25,%ymm2,%ymm2
	vpor %ymm2,%ymm15,%ymm2
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm7,%ymm13,%ymm13
	vpxor %ymm3,%ymm13,%ymm3
	vpslld $7,%ymm3,%ymm14
	vpsrld $25,%ymm3,%ymm3
	vpor %ymm3,%ymm14,%ymm3
	vpaddd %ymm1,%ymm8,%ymm8
	vpxor %ymm7,%ymm8,%ymm7
	vpshufb %ymm15,%ymm7,%ymm7
	vpaddd %ymm2,%ymm9,%ymm9
	vpxor %ymm4,%ymm9,%ymm4
	vpshufb %ymm15,%ymm4,%ymm4
	vpaddd %ymm7,%ymm12,%ymm12
	vpxor %ymm1,%ymm12,%ymm1
	vpslld $12,%ymm1,%ymm14
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm1,%ymm14,%ymm1
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm4,%ymm13,%ymm13
	vpxor %ymm2,%ymm13,%ymm2
	vpslld $12,%ymm2,%ymm15
	vpsrld $20,%ymm2,%ymm2
	vpor %ymm2,%ymm15,%ymm2
	vpaddd %ymm1,%ymm8,%ymm8
	vpxor %ymm7,%ymm8,%ymm7
	vpshufb %ymm14,%ymm7,%ymm7
	vpaddd %ymm2,%ymm9,%ymm9
	vpxor %ymm4,%ymm9,%ymm4
	vpshufb %ymm14,%ymm4,%ymm4
	vpaddd %ymm7,%ymm12,%ymm12
	vpxor %ymm1,%ymm12,%ymm1
	vpslld $7,%ymm1,%ymm15
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm1,%ymm15,%ymm1
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm4,%ymm13,%ymm13
	vpxor %ymm2,%ymm13,%ymm2
	vpslld $7,%ymm2,%ymm14
	vpsrld $25,%ymm2,%ymm2
	vpor %ymm2,%ymm14,%ymm2
	vmovdqa %ymm12,64(%rsp)
	vmovdqa %ymm13,96(%rsp)
	vmovdqa 0(%rsp),%ymm12
	vmovdqa 32(%rsp),%ymm13
	vpaddd %ymm3,%ymm10,%ymm10
	vpxor %ymm5,%ymm10,%ymm5
	vpshufb %ymm15,%ymm5,%ymm5
	vpaddd %ymm0,%ymm11,%ymm11
	vpxor %ymm6,%ymm11,%ymm6
	vpshufb %ymm15,%ymm6,%ymm6
	vpaddd %ymm5,%ymm12,%ymm12
	vpxor %ymm3,%ymm12,%ymm3
	vpslld $12,%ymm3,%ymm14
	vpsrld $20,%ymm3,%ymm3
	vpor %ymm3,%ymm14,%ymm3
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm6,%ymm13,%ymm13
	vpxor %ymm0,%ymm13,%ymm0
	vpslld $12,%ymm0,%ymm15
	vpsrld $20,%ymm0,%ymm0
	vpor %ymm0,%ymm15,%ymm0
	vpaddd %ymm3,%ymm10,%ymm10
	vpxor %ymm5,%ymm10,%ymm5
	vpshufb %ymm14,%ymm5,%ymm5
	vpaddd %ymm0,%ymm11,%ymm11
	vpxor %ymm6,%ymm11,%ymm6
	vpshufb %ymm14,%ymm6,%ymm6
	vpaddd %ymm5,%ymm12,%ymm12
	vpxor %ymm3,%ymm12,%ymm3
	vpslld $7,%ymm3,%ymm15
	vpsrld $25,%ymm3,%ymm3
	vpor %ymm3,%ymm15,%ymm3
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm6,%ymm13,%ymm13
	vpxor %ymm0,%ymm13,%ymm0
	vpslld $7,%ymm0,%ymm14
	vpsrld $25,%ymm0,%ymm0
	vpor %ymm0,%ymm14,%ymm0
	decl %eax
	jnz .Loop8x

	leaq 512(%rsp),%rax
	vpaddd 128-256(%rcx),%ymm8,%ymm8
	vpaddd 160-256(%rcx),%ymm9,%ymm9
	vpaddd 192-256(%rcx),%ymm10,%ymm10
	vpaddd 224-256(%rcx),%ymm11,%ymm11

	vpunpckldq %ymm9,%ymm8,%ymm14
	vpunpckldq %ymm11,%ymm10,%ymm15
	vpunpckhdq %ymm9,%ymm8,%ymm8
	vpunpckhdq %ymm11,%ymm10,%ymm10
	vpunpcklqdq %ymm15,%ymm14,%ymm9
	vpunpckhqdq %ymm15,%ymm14,%ymm14
	vpunpcklqdq %ymm10,%ymm8,%ymm11
	vpunpckhqdq %ymm10,%ymm8,%ymm8
	vpaddd 256-256(%rcx),%ymm0,%ymm0
	vpaddd 288-256(%rcx),%ymm1,%ymm1
	vpaddd 320-256(%rcx),%ymm2,%ymm2
	vpaddd 352-256(%rcx),%ymm3,%ymm3

	vpunpckldq %ymm1,%ymm0,%ymm10
	vpunpckldq %ymm3,%ymm2,%ymm15
	vpunpckhdq %ymm1,%ymm0,%ymm0
	vpunpckhdq %ymm3,%ymm2,%ymm2
	vpunpcklqdq %ymm15,%ymm10,%ymm1
	vpunpckhqdq %ymm15,%ymm10,%ymm10
	vpunpcklqdq %ymm2,%ymm0,%ymm3
	vpunpckhqdq %ymm2,%ymm0,%ymm0
	vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
	vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
	vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
	vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
	vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
	vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
	vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
	vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
	vmovdqa %ymm15,0(%rsp)
	vmovdqa %ymm9,32(%rsp)
	vmovdqa 64(%rsp),%ymm15
	vmovdqa 96(%rsp),%ymm9

	vpaddd 384-512(%rax),%ymm12,%ymm12
	vpaddd 416-512(%rax),%ymm13,%ymm13
	vpaddd 448-512(%rax),%ymm15,%ymm15
	vpaddd 480-512(%rax),%ymm9,%ymm9

	vpunpckldq %ymm13,%ymm12,%ymm2
	vpunpckldq %ymm9,%ymm15,%ymm8
	vpunpckhdq %ymm13,%ymm12,%ymm12
	vpunpckhdq %ymm9,%ymm15,%ymm15
	vpunpcklqdq %ymm8,%ymm2,%ymm13
	vpunpckhqdq %ymm8,%ymm2,%ymm2
	vpunpcklqdq %ymm15,%ymm12,%ymm9
	vpunpckhqdq %ymm15,%ymm12,%ymm12
	vpaddd 512-512(%rax),%ymm4,%ymm4
	vpaddd 544-512(%rax),%ymm5,%ymm5
	vpaddd 576-512(%rax),%ymm6,%ymm6
	vpaddd 608-512(%rax),%ymm7,%ymm7

	vpunpckldq %ymm5,%ymm4,%ymm15
	vpunpckldq %ymm7,%ymm6,%ymm8
	vpunpckhdq %ymm5,%ymm4,%ymm4
	vpunpckhdq %ymm7,%ymm6,%ymm6
	vpunpcklqdq %ymm8,%ymm15,%ymm5
	vpunpckhqdq %ymm8,%ymm15,%ymm15
	vpunpcklqdq %ymm6,%ymm4,%ymm7
	vpunpckhqdq %ymm6,%ymm4,%ymm4
	vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
	vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
	vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
	vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
	vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
	vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
	vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
	vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
	vmovdqa 0(%rsp),%ymm6
	vmovdqa 32(%rsp),%ymm12

	cmpq $512,%rdx
	jb .Ltail8x

	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	leaq 128(%rsi),%rsi
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm12,%ymm12
	vpxor 32(%rsi),%ymm13,%ymm13
	vpxor 64(%rsi),%ymm10,%ymm10
	vpxor 96(%rsi),%ymm15,%ymm15
	leaq 128(%rsi),%rsi
	vmovdqu %ymm12,0(%rdi)
	vmovdqu %ymm13,32(%rdi)
	vmovdqu %ymm10,64(%rdi)
	vmovdqu %ymm15,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm14,%ymm14
	vpxor 32(%rsi),%ymm2,%ymm2
	vpxor 64(%rsi),%ymm3,%ymm3
	vpxor 96(%rsi),%ymm7,%ymm7
	leaq 128(%rsi),%rsi
	vmovdqu %ymm14,0(%rdi)
	vmovdqu %ymm2,32(%rdi)
	vmovdqu %ymm3,64(%rdi)
	vmovdqu %ymm7,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm11,%ymm11
	vpxor 32(%rsi),%ymm9,%ymm9
	vpxor 64(%rsi),%ymm0,%ymm0
	vpxor 96(%rsi),%ymm4,%ymm4
	leaq 128(%rsi),%rsi
	vmovdqu %ymm11,0(%rdi)
	vmovdqu %ymm9,32(%rdi)
	vmovdqu %ymm0,64(%rdi)
	vmovdqu %ymm4,96(%rdi)
	leaq 128(%rdi),%rdi

	subq $512,%rdx
	jnz .Loop_outer8x

	jmp .Ldone8x

.Ltail8x:
	cmpq $448,%rdx
	jae .L448_or_more8x
	cmpq $384,%rdx
	jae .L384_or_more8x
	cmpq $320,%rdx
	jae .L320_or_more8x
	cmpq $256,%rdx
	jae .L256_or_more8x
	cmpq $192,%rdx
	jae .L192_or_more8x
	cmpq $128,%rdx
	jae .L128_or_more8x
	cmpq $64,%rdx
	jae .L64_or_more8x

	xorq %r10,%r10
	vmovdqa %ymm6,0(%rsp)
	vmovdqa %ymm8,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L64_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	je .Ldone8x

	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm1,0(%rsp)
	leaq 64(%rdi),%rdi
	subq $64,%rdx
	vmovdqa %ymm5,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L128_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	je .Ldone8x

	leaq 128(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm12,0(%rsp)
	leaq 128(%rdi),%rdi
	subq $128,%rdx
	vmovdqa %ymm13,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L192_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	je .Ldone8x

	leaq 192(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm10,0(%rsp)
	leaq 192(%rdi),%rdi
	subq $192,%rdx
	vmovdqa %ymm15,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L256_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	je .Ldone8x

	leaq 256(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm14,0(%rsp)
	leaq 256(%rdi),%rdi
	subq $256,%rdx
	vmovdqa %ymm2,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L320_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	je .Ldone8x

	leaq 320(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm3,0(%rsp)
	leaq 320(%rdi),%rdi
	subq $320,%rdx
	vmovdqa %ymm7,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L384_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vpxor 320(%rsi),%ymm3,%ymm3
	vpxor 352(%rsi),%ymm7,%ymm7
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	vmovdqu %ymm3,320(%rdi)
	vmovdqu %ymm7,352(%rdi)
	je .Ldone8x

	leaq 384(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm11,0(%rsp)
	leaq 384(%rdi),%rdi
	subq $384,%rdx
	vmovdqa %ymm9,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L448_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vpxor 320(%rsi),%ymm3,%ymm3
	vpxor 352(%rsi),%ymm7,%ymm7
	vpxor 384(%rsi),%ymm11,%ymm11
	vpxor 416(%rsi),%ymm9,%ymm9
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	vmovdqu %ymm3,320(%rdi)
	vmovdqu %ymm7,352(%rdi)
	vmovdqu %ymm11,384(%rdi)
	vmovdqu %ymm9,416(%rdi)
	je .Ldone8x

	leaq 448(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm0,0(%rsp)
	leaq 448(%rdi),%rdi
	subq $448,%rdx
	vmovdqa %ymm4,32(%rsp)

.Loop_tail8x:
	movzbl (%rsi,%r10,1),%eax
	movzbl (%rsp,%r10,1),%ecx
	leaq 1(%r10),%r10
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r10,1)
	decq %rdx
	jnz .Loop_tail8x

.Ldone8x:
	vzeroall
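# (annotation) vzeroall above clears every YMM register on the way out,
# which both avoids AVX/SSE transition penalties and scrubs key-derived
# data from the vector state.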
	leaq (%r9),%rsp
.cfi_def_cfa_register rsp
.L8x_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_8x,.-ChaCha20_8x
#endif
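# ---------------------------------------------------------------------
# Usage sketch (annotation, not part of the generated output), matching
# the presumed prototype documented above ChaCha20_ctr32:
#
#   uint32_t ctr[4] = {0, nonce0, nonce1, nonce2}; /* block counter, nonce */
#   ChaCha20_ctr32(out, in, in_len, key_words, ctr);
#
# Names here are illustrative; BoringSSL's crypto/chacha sources carry
# the authoritative declaration.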