.text

.align 64
.Lzero:
.long 0,0,0,0
.Lone:
.long 1,0,0,0
.Linc:
.long 0,1,2,3
.Lfour:
.long 4,4,4,4
.Lincy:
.long 0,2,4,6,1,3,5,7
.Leight:
.long 8,8,8,8,8,8,8,8
.Lrot16:
.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long 2,0,0,0, 2,0,0,0
.align 64
.Lzeroz:
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0	# "expand 32-byte k"
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	# "ChaCha20 for x86_64, CRYPTOGAMS by <appro@openssl.org>"

# ChaCha20_ctr32(out %rdi, inp %rsi, len %rdx, key %rcx, counter %r8);
# takes the SSSE3 path when bit 9 (SSSE3) of OPENSSL_ia32cap_P+4 is set.
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,@function
.align 64
ChaCha20_ctr32:
.cfi_startproc
	cmpq $0,%rdx
	je .Lno_data
	movq OPENSSL_ia32cap_P+4(%rip),%r10
	testl $512,%r10d
	jnz .LChaCha20_ssse3

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
	subq $64+24,%rsp
.cfi_adjust_cfa_offset 64+24
.Lctr32_body:

	movdqu (%rcx),%xmm1
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa .Lone(%rip),%xmm4

	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	movq %rdx,%rbp
	jmp .Loop_outer

.align 32
.Loop_outer:
	movl $0x61707865,%eax
	movl $0x3320646e,%ebx
	movl $0x79622d32,%ecx
	movl $0x6b206574,%edx
	movl 16(%rsp),%r8d
	movl 20(%rsp),%r9d
	movl 24(%rsp),%r10d
	movl 28(%rsp),%r11d
	movd %xmm3,%r12d
	movl 52(%rsp),%r13d
	movl 56(%rsp),%r14d
	movl 60(%rsp),%r15d

	movq %rbp,64+0(%rsp)
	movl $10,%ebp
	movq %rsi,64+8(%rsp)
.byte 102,72,15,126,214	# movq %xmm2,%rsi
	movq %rdi,64+16(%rsp)
	movq %rsi,%rdi
	shrq $32,%rdi
	jmp .Loop

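# Integer-register round loop: one 64-byte block per .Loop_outer pass.
# State words 0-7 live in %eax..%edx/%r8d..%r11d, words 12-15 in
# %r12d..%r15d; words 8-11 take turns in %esi/%edi and 32-44(%rsp).
# Each .Loop iteration is one double round; %ebp counts ten of them,
# giving ChaCha20's 20 rounds.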
.align 32
.Loop:
	addl %r8d,%eax
	xorl %eax,%r12d
	roll $16,%r12d
	addl %r9d,%ebx
	xorl %ebx,%r13d
	roll $16,%r13d
	addl %r12d,%esi
	xorl %esi,%r8d
	roll $12,%r8d
	addl %r13d,%edi
	xorl %edi,%r9d
	roll $12,%r9d
	addl %r8d,%eax
	xorl %eax,%r12d
	roll $8,%r12d
	addl %r9d,%ebx
	xorl %ebx,%r13d
	roll $8,%r13d
	addl %r12d,%esi
	xorl %esi,%r8d
	roll $7,%r8d
	addl %r13d,%edi
	xorl %edi,%r9d
	roll $7,%r9d
	movl %esi,32(%rsp)
	movl %edi,36(%rsp)
	movl 40(%rsp),%esi
	movl 44(%rsp),%edi
	addl %r10d,%ecx
	xorl %ecx,%r14d
	roll $16,%r14d
	addl %r11d,%edx
	xorl %edx,%r15d
	roll $16,%r15d
	addl %r14d,%esi
	xorl %esi,%r10d
	roll $12,%r10d
	addl %r15d,%edi
	xorl %edi,%r11d
	roll $12,%r11d
	addl %r10d,%ecx
	xorl %ecx,%r14d
	roll $8,%r14d
	addl %r11d,%edx
	xorl %edx,%r15d
	roll $8,%r15d
	addl %r14d,%esi
	xorl %esi,%r10d
	roll $7,%r10d
	addl %r15d,%edi
	xorl %edi,%r11d
	roll $7,%r11d
	addl %r9d,%eax
	xorl %eax,%r15d
	roll $16,%r15d
	addl %r10d,%ebx
	xorl %ebx,%r12d
	roll $16,%r12d
	addl %r15d,%esi
	xorl %esi,%r9d
	roll $12,%r9d
	addl %r12d,%edi
	xorl %edi,%r10d
	roll $12,%r10d
	addl %r9d,%eax
	xorl %eax,%r15d
	roll $8,%r15d
	addl %r10d,%ebx
	xorl %ebx,%r12d
	roll $8,%r12d
	addl %r15d,%esi
	xorl %esi,%r9d
	roll $7,%r9d
	addl %r12d,%edi
	xorl %edi,%r10d
	roll $7,%r10d
	movl %esi,40(%rsp)
	movl %edi,44(%rsp)
	movl 32(%rsp),%esi
	movl 36(%rsp),%edi
	addl %r11d,%ecx
	xorl %ecx,%r13d
	roll $16,%r13d
	addl %r8d,%edx
	xorl %edx,%r14d
	roll $16,%r14d
	addl %r13d,%esi
	xorl %esi,%r11d
	roll $12,%r11d
	addl %r14d,%edi
	xorl %edi,%r8d
	roll $12,%r8d
	addl %r11d,%ecx
	xorl %ecx,%r13d
	roll $8,%r13d
	addl %r8d,%edx
	xorl %edx,%r14d
	roll $8,%r14d
	addl %r13d,%esi
	xorl %esi,%r11d
	roll $7,%r11d
	addl %r14d,%edi
	xorl %edi,%r8d
	roll $7,%r8d
	decl %ebp
	jnz .Loop
	movl %edi,36(%rsp)
	movl %esi,32(%rsp)
	movq 64(%rsp),%rbp
	movdqa %xmm2,%xmm1
	movq 64+8(%rsp),%rsi
	paddd %xmm4,%xmm3
	movq 64+16(%rsp),%rdi

	addl $0x61707865,%eax
	addl $0x3320646e,%ebx
	addl $0x79622d32,%ecx
	addl $0x6b206574,%edx
	addl 16(%rsp),%r8d
	addl 20(%rsp),%r9d
	addl 24(%rsp),%r10d
	addl 28(%rsp),%r11d
	addl 48(%rsp),%r12d
	addl 52(%rsp),%r13d
	addl 56(%rsp),%r14d
	addl 60(%rsp),%r15d
	paddd 32(%rsp),%xmm1

	cmpq $64,%rbp
	jb .Ltail

	xorl 0(%rsi),%eax
	xorl 4(%rsi),%ebx
	xorl 8(%rsi),%ecx
	xorl 12(%rsi),%edx
	xorl 16(%rsi),%r8d
	xorl 20(%rsi),%r9d
	xorl 24(%rsi),%r10d
	xorl 28(%rsi),%r11d
	movdqu 32(%rsi),%xmm0
	xorl 48(%rsi),%r12d
	xorl 52(%rsi),%r13d
	xorl 56(%rsi),%r14d
	xorl 60(%rsi),%r15d
	leaq 64(%rsi),%rsi
	pxor %xmm1,%xmm0

	movdqa %xmm2,32(%rsp)
	movd %xmm3,48(%rsp)

	movl %eax,0(%rdi)
	movl %ebx,4(%rdi)
	movl %ecx,8(%rdi)
	movl %edx,12(%rdi)
	movl %r8d,16(%rdi)
	movl %r9d,20(%rdi)
	movl %r10d,24(%rdi)
	movl %r11d,28(%rdi)
	movdqu %xmm0,32(%rdi)
	movl %r12d,48(%rdi)
	movl %r13d,52(%rdi)
	movl %r14d,56(%rdi)
	movl %r15d,60(%rdi)
	leaq 64(%rdi),%rdi

	subq $64,%rbp
	jnz .Loop_outer

	jmp .Ldone

.align 16
.Ltail:
	movl %eax,0(%rsp)
	movl %ebx,4(%rsp)
	xorq %rbx,%rbx
	movl %ecx,8(%rsp)
	movl %edx,12(%rsp)
	movl %r8d,16(%rsp)
	movl %r9d,20(%rsp)
	movl %r10d,24(%rsp)
	movl %r11d,28(%rsp)
	movdqa %xmm1,32(%rsp)
	movl %r12d,48(%rsp)
	movl %r13d,52(%rsp)
	movl %r14d,56(%rsp)
	movl %r15d,60(%rsp)

.Loop_tail:
	movzbl (%rsi,%rbx,1),%eax
	movzbl (%rsp,%rbx,1),%edx
	leaq 1(%rbx),%rbx
	xorl %edx,%eax
	movb %al,-1(%rdi,%rbx,1)
	decq %rbp
	jnz .Loop_tail

.Ldone:
	leaq 64+24+48(%rsp),%rsi
.cfi_def_cfa %rsi,8
	movq -48(%rsi),%r15
.cfi_restore %r15
	movq -40(%rsi),%r14
.cfi_restore %r14
	movq -32(%rsi),%r13
.cfi_restore %r13
	movq -24(%rsi),%r12
.cfi_restore %r12
	movq -16(%rsi),%rbp
.cfi_restore %rbp
	movq -8(%rsi),%rbx
.cfi_restore %rbx
	leaq (%rsi),%rsp
.cfi_def_cfa_register %rsp
.Lno_data:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_ctr32,.-ChaCha20_ctr32
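# SSSE3 entry point doubles as the dispatcher: XOP hardware (bit 11 of the
# second capability dword) goes to ChaCha20_4xop, exactly 128 bytes to
# ChaCha20_128, more than 128 bytes to ChaCha20_4x.  On its own it handles
# one block at a time, one ChaCha state row per register in %xmm0-%xmm3,
# with pshufb masks .Lrot16/.Lrot24 implementing the 16- and 8-bit rotates.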
.type ChaCha20_ssse3,@function
.align 32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	movq %rsp,%r9
.cfi_def_cfa_register %r9
	testl $2048,%r10d
	jnz .LChaCha20_4xop
	cmpq $128,%rdx
	je .LChaCha20_128
	ja .LChaCha20_4x

.Ldo_sse3_after_all:
	subq $64+8,%rsp
	movdqa .Lsigma(%rip),%xmm0
	movdqu (%rcx),%xmm1
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa .Lrot16(%rip),%xmm6
	movdqa .Lrot24(%rip),%xmm7

	movdqa %xmm0,0(%rsp)
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	movq $10,%r8
	jmp .Loop_ssse3

.align 32
.Loop_outer_ssse3:
	movdqa .Lone(%rip),%xmm3
	movdqa 0(%rsp),%xmm0
	movdqa 16(%rsp),%xmm1
	movdqa 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3
	movq $10,%r8
	movdqa %xmm3,48(%rsp)
	jmp .Loop_ssse3

.align 32
.Loop_ssse3:
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,222	# pshufb %xmm6,%xmm3 (rotate left 16)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $20,%xmm1
	pslld $12,%xmm4
	por %xmm4,%xmm1
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,223	# pshufb %xmm7,%xmm3 (rotate left 8)
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $25,%xmm1
	pslld $7,%xmm4
	por %xmm4,%xmm1
	pshufd $78,%xmm2,%xmm2
	pshufd $57,%xmm1,%xmm1
	pshufd $147,%xmm3,%xmm3
	nop
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,222
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $20,%xmm1
	pslld $12,%xmm4
	por %xmm4,%xmm1
	paddd %xmm1,%xmm0
	pxor %xmm0,%xmm3
.byte 102,15,56,0,223
	paddd %xmm3,%xmm2
	pxor %xmm2,%xmm1
	movdqa %xmm1,%xmm4
	psrld $25,%xmm1
	pslld $7,%xmm4
	por %xmm4,%xmm1
	pshufd $78,%xmm2,%xmm2
	pshufd $147,%xmm1,%xmm1
	pshufd $57,%xmm3,%xmm3
	decq %r8
	jnz .Loop_ssse3
	paddd 0(%rsp),%xmm0
	paddd 16(%rsp),%xmm1
	paddd 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3

	cmpq $64,%rdx
	jb .Ltail_ssse3

	movdqu 0(%rsi),%xmm4
	movdqu 16(%rsi),%xmm5
	pxor %xmm4,%xmm0
	movdqu 32(%rsi),%xmm4
	pxor %xmm5,%xmm1
	movdqu 48(%rsi),%xmm5
	leaq 64(%rsi),%rsi
	pxor %xmm4,%xmm2
	pxor %xmm5,%xmm3

	movdqu %xmm0,0(%rdi)
	movdqu %xmm1,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm3,48(%rdi)
	leaq 64(%rdi),%rdi

	subq $64,%rdx
	jnz .Loop_outer_ssse3

	jmp .Ldone_ssse3

.align 16
.Ltail_ssse3:
	movdqa %xmm0,0(%rsp)
	movdqa %xmm1,16(%rsp)
	movdqa %xmm2,32(%rsp)
	movdqa %xmm3,48(%rsp)
	xorq %r8,%r8

.Loop_tail_ssse3:
	movzbl (%rsi,%r8,1),%eax
	movzbl (%rsp,%r8,1),%ecx
	leaq 1(%r8),%r8
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r8,1)
	decq %rdx
	jnz .Loop_tail_ssse3

.Ldone_ssse3:
	leaq (%r9),%rsp
.cfi_def_cfa_register %rsp
.Lssse3_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_ssse3,.-ChaCha20_ssse3
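# Fixed-length path for exactly two blocks (128 bytes): block A runs in
# %xmm8/%xmm9/%xmm2/%xmm3 and block B in %xmm10/%xmm11/%xmm0/%xmm1 with its
# counter bumped by .Lone, so both share one round loop with no reloads.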
.type ChaCha20_128,@function
.align 32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	movq %rsp,%r9
.cfi_def_cfa_register %r9
	subq $64+8,%rsp
	movdqa .Lsigma(%rip),%xmm8
	movdqu (%rcx),%xmm9
	movdqu 16(%rcx),%xmm2
	movdqu (%r8),%xmm3
	movdqa .Lone(%rip),%xmm1
	movdqa .Lrot16(%rip),%xmm6
	movdqa .Lrot24(%rip),%xmm7

	movdqa %xmm8,%xmm10
	movdqa %xmm8,0(%rsp)
	movdqa %xmm9,%xmm11
	movdqa %xmm9,16(%rsp)
	movdqa %xmm2,%xmm0
	movdqa %xmm2,32(%rsp)
	paddd %xmm3,%xmm1
	movdqa %xmm3,48(%rsp)
	movq $10,%r8
	jmp .Loop_128

.align 32
.Loop_128:
	paddd %xmm9,%xmm8
	pxor %xmm8,%xmm3
	paddd %xmm11,%xmm10
	pxor %xmm10,%xmm1
.byte 102,15,56,0,222
.byte 102,15,56,0,206
	paddd %xmm3,%xmm2
	paddd %xmm1,%xmm0
	pxor %xmm2,%xmm9
	pxor %xmm0,%xmm11
	movdqa %xmm9,%xmm4
	psrld $20,%xmm9
	movdqa %xmm11,%xmm5
	pslld $12,%xmm4
	psrld $20,%xmm11
	por %xmm4,%xmm9
	pslld $12,%xmm5
	por %xmm5,%xmm11
	paddd %xmm9,%xmm8
	pxor %xmm8,%xmm3
	paddd %xmm11,%xmm10
	pxor %xmm10,%xmm1
.byte 102,15,56,0,223
.byte 102,15,56,0,207
	paddd %xmm3,%xmm2
	paddd %xmm1,%xmm0
	pxor %xmm2,%xmm9
	pxor %xmm0,%xmm11
	movdqa %xmm9,%xmm4
	psrld $25,%xmm9
	movdqa %xmm11,%xmm5
	pslld $7,%xmm4
	psrld $25,%xmm11
	por %xmm4,%xmm9
	pslld $7,%xmm5
	por %xmm5,%xmm11
	pshufd $78,%xmm2,%xmm2
	pshufd $57,%xmm9,%xmm9
	pshufd $147,%xmm3,%xmm3
	pshufd $78,%xmm0,%xmm0
	pshufd $57,%xmm11,%xmm11
	pshufd $147,%xmm1,%xmm1
	paddd %xmm9,%xmm8
	pxor %xmm8,%xmm3
	paddd %xmm11,%xmm10
	pxor %xmm10,%xmm1
.byte 102,15,56,0,222
.byte 102,15,56,0,206
	paddd %xmm3,%xmm2
	paddd %xmm1,%xmm0
	pxor %xmm2,%xmm9
	pxor %xmm0,%xmm11
	movdqa %xmm9,%xmm4
	psrld $20,%xmm9
	movdqa %xmm11,%xmm5
	pslld $12,%xmm4
	psrld $20,%xmm11
	por %xmm4,%xmm9
	pslld $12,%xmm5
	por %xmm5,%xmm11
	paddd %xmm9,%xmm8
	pxor %xmm8,%xmm3
	paddd %xmm11,%xmm10
	pxor %xmm10,%xmm1
.byte 102,15,56,0,223
.byte 102,15,56,0,207
	paddd %xmm3,%xmm2
	paddd %xmm1,%xmm0
	pxor %xmm2,%xmm9
	pxor %xmm0,%xmm11
	movdqa %xmm9,%xmm4
	psrld $25,%xmm9
	movdqa %xmm11,%xmm5
	pslld $7,%xmm4
	psrld $25,%xmm11
	por %xmm4,%xmm9
	pslld $7,%xmm5
	por %xmm5,%xmm11
	pshufd $78,%xmm2,%xmm2
	pshufd $147,%xmm9,%xmm9
	pshufd $57,%xmm3,%xmm3
	pshufd $78,%xmm0,%xmm0
	pshufd $147,%xmm11,%xmm11
	pshufd $57,%xmm1,%xmm1
	decq %r8
	jnz .Loop_128
	paddd 0(%rsp),%xmm8
	paddd 16(%rsp),%xmm9
	paddd 32(%rsp),%xmm2
	paddd 48(%rsp),%xmm3
	paddd .Lone(%rip),%xmm1
	paddd 0(%rsp),%xmm10
	paddd 16(%rsp),%xmm11
	paddd 32(%rsp),%xmm0
	paddd 48(%rsp),%xmm1

	movdqu 0(%rsi),%xmm4
	movdqu 16(%rsi),%xmm5
	pxor %xmm4,%xmm8
	movdqu 32(%rsi),%xmm4
	pxor %xmm5,%xmm9
	movdqu 48(%rsi),%xmm5
	pxor %xmm4,%xmm2
	movdqu 64(%rsi),%xmm4
	pxor %xmm5,%xmm3
	movdqu 80(%rsi),%xmm5
	pxor %xmm4,%xmm10
	movdqu 96(%rsi),%xmm4
	pxor %xmm5,%xmm11
	movdqu 112(%rsi),%xmm5
	pxor %xmm4,%xmm0
	pxor %xmm5,%xmm1

	movdqu %xmm8,0(%rdi)
	movdqu %xmm9,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm3,48(%rdi)
	movdqu %xmm10,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm0,96(%rdi)
	movdqu %xmm1,112(%rdi)
	leaq (%r9),%rsp
.cfi_def_cfa_register %rsp
.L128_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_128,.-ChaCha20_128
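# 4-way SSSE3 path: four blocks in transposed layout, i.e. each register
# carries the same state word of all four blocks (pshufd broadcasts below;
# .Linc staggers the counters).  AVX2-capable parts branch on to
# ChaCha20_8x; short inputs (<=192 bytes) on certain cores drop back to the
# one-block .Ldo_sse3_after_all.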
.type ChaCha20_4x,@function
.align 32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	movq %rsp,%r9
.cfi_def_cfa_register %r9
	movq %r10,%r11
	shrq $32,%r10
	testq $32,%r10
	jnz .LChaCha20_8x
	cmpq $192,%rdx
	ja .Lproceed4x

	andq $71303168,%r11
	cmpq $4194304,%r11
	je .Ldo_sse3_after_all

.Lproceed4x:
	subq $0x140+8,%rsp
	movdqa .Lsigma(%rip),%xmm11
	movdqu (%rcx),%xmm15
	movdqu 16(%rcx),%xmm7
	movdqu (%r8),%xmm3
	leaq 256(%rsp),%rcx
	leaq .Lrot16(%rip),%r10
	leaq .Lrot24(%rip),%r11

	pshufd $0x00,%xmm11,%xmm8
	pshufd $0x55,%xmm11,%xmm9
	movdqa %xmm8,64(%rsp)
	pshufd $0xaa,%xmm11,%xmm10
	movdqa %xmm9,80(%rsp)
	pshufd $0xff,%xmm11,%xmm11
	movdqa %xmm10,96(%rsp)
	movdqa %xmm11,112(%rsp)

	pshufd $0x00,%xmm15,%xmm12
	pshufd $0x55,%xmm15,%xmm13
	movdqa %xmm12,128-256(%rcx)
	pshufd $0xaa,%xmm15,%xmm14
	movdqa %xmm13,144-256(%rcx)
	pshufd $0xff,%xmm15,%xmm15
	movdqa %xmm14,160-256(%rcx)
	movdqa %xmm15,176-256(%rcx)

	pshufd $0x00,%xmm7,%xmm4
	pshufd $0x55,%xmm7,%xmm5
	movdqa %xmm4,192-256(%rcx)
	pshufd $0xaa,%xmm7,%xmm6
	movdqa %xmm5,208-256(%rcx)
	pshufd $0xff,%xmm7,%xmm7
	movdqa %xmm6,224-256(%rcx)
	movdqa %xmm7,240-256(%rcx)

	pshufd $0x00,%xmm3,%xmm0
	pshufd $0x55,%xmm3,%xmm1
	paddd .Linc(%rip),%xmm0
	pshufd $0xaa,%xmm3,%xmm2
	movdqa %xmm1,272-256(%rcx)
	pshufd $0xff,%xmm3,%xmm3
	movdqa %xmm2,288-256(%rcx)
	movdqa %xmm3,304-256(%rcx)

	jmp .Loop_enter4x

.align 32
.Loop_outer4x:
	movdqa 64(%rsp),%xmm8
	movdqa 80(%rsp),%xmm9
	movdqa 96(%rsp),%xmm10
	movdqa 112(%rsp),%xmm11
	movdqa 128-256(%rcx),%xmm12
	movdqa 144-256(%rcx),%xmm13
	movdqa 160-256(%rcx),%xmm14
	movdqa 176-256(%rcx),%xmm15
	movdqa 192-256(%rcx),%xmm4
	movdqa 208-256(%rcx),%xmm5
	movdqa 224-256(%rcx),%xmm6
	movdqa 240-256(%rcx),%xmm7
	movdqa 256-256(%rcx),%xmm0
	movdqa 272-256(%rcx),%xmm1
	movdqa 288-256(%rcx),%xmm2
	movdqa 304-256(%rcx),%xmm3
	paddd .Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa %xmm6,32(%rsp)
	movdqa %xmm7,48(%rsp)
	movdqa (%r10),%xmm7
	movl $10,%eax
	movdqa %xmm0,256-256(%rcx)
	jmp .Loop4x

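# Each .Loop4x pass is one double round across all four blocks.  Not all
# sixteen state vectors fit in registers, so two at a time sit in
# 0-48(%rsp), and %xmm6/%xmm7 alternate between scratch and the
# .Lrot16/.Lrot24 masks reloaded from (%r10)/(%r11).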
.align 32
.Loop4x:
	paddd %xmm12,%xmm8
	paddd %xmm13,%xmm9
	pxor %xmm8,%xmm0
	pxor %xmm9,%xmm1
.byte 102,15,56,0,199
.byte 102,15,56,0,207
	paddd %xmm0,%xmm4
	paddd %xmm1,%xmm5
	pxor %xmm4,%xmm12
	pxor %xmm5,%xmm13
	movdqa %xmm12,%xmm6
	pslld $12,%xmm12
	psrld $20,%xmm6
	movdqa %xmm13,%xmm7
	pslld $12,%xmm13
	por %xmm6,%xmm12
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm13
	paddd %xmm12,%xmm8
	paddd %xmm13,%xmm9
	pxor %xmm8,%xmm0
	pxor %xmm9,%xmm1
.byte 102,15,56,0,198
.byte 102,15,56,0,206
	paddd %xmm0,%xmm4
	paddd %xmm1,%xmm5
	pxor %xmm4,%xmm12
	pxor %xmm5,%xmm13
	movdqa %xmm12,%xmm7
	pslld $7,%xmm12
	psrld $25,%xmm7
	movdqa %xmm13,%xmm6
	pslld $7,%xmm13
	por %xmm7,%xmm12
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm13
	movdqa %xmm4,0(%rsp)
	movdqa %xmm5,16(%rsp)
	movdqa 32(%rsp),%xmm4
	movdqa 48(%rsp),%xmm5
	paddd %xmm14,%xmm10
	paddd %xmm15,%xmm11
	pxor %xmm10,%xmm2
	pxor %xmm11,%xmm3
.byte 102,15,56,0,215
.byte 102,15,56,0,223
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	pxor %xmm4,%xmm14
	pxor %xmm5,%xmm15
	movdqa %xmm14,%xmm6
	pslld $12,%xmm14
	psrld $20,%xmm6
	movdqa %xmm15,%xmm7
	pslld $12,%xmm15
	por %xmm6,%xmm14
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm15
	paddd %xmm14,%xmm10
	paddd %xmm15,%xmm11
	pxor %xmm10,%xmm2
	pxor %xmm11,%xmm3
.byte 102,15,56,0,214
.byte 102,15,56,0,222
	paddd %xmm2,%xmm4
	paddd %xmm3,%xmm5
	pxor %xmm4,%xmm14
	pxor %xmm5,%xmm15
	movdqa %xmm14,%xmm7
	pslld $7,%xmm14
	psrld $25,%xmm7
	movdqa %xmm15,%xmm6
	pslld $7,%xmm15
	por %xmm7,%xmm14
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm15
	paddd %xmm13,%xmm8
	paddd %xmm14,%xmm9
	pxor %xmm8,%xmm3
	pxor %xmm9,%xmm0
.byte 102,15,56,0,223
.byte 102,15,56,0,199
	paddd %xmm3,%xmm4
	paddd %xmm0,%xmm5
	pxor %xmm4,%xmm13
	pxor %xmm5,%xmm14
	movdqa %xmm13,%xmm6
	pslld $12,%xmm13
	psrld $20,%xmm6
	movdqa %xmm14,%xmm7
	pslld $12,%xmm14
	por %xmm6,%xmm13
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm14
	paddd %xmm13,%xmm8
	paddd %xmm14,%xmm9
	pxor %xmm8,%xmm3
	pxor %xmm9,%xmm0
.byte 102,15,56,0,222
.byte 102,15,56,0,198
	paddd %xmm3,%xmm4
	paddd %xmm0,%xmm5
	pxor %xmm4,%xmm13
	pxor %xmm5,%xmm14
	movdqa %xmm13,%xmm7
	pslld $7,%xmm13
	psrld $25,%xmm7
	movdqa %xmm14,%xmm6
	pslld $7,%xmm14
	por %xmm7,%xmm13
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm14
	movdqa %xmm4,32(%rsp)
	movdqa %xmm5,48(%rsp)
	movdqa 0(%rsp),%xmm4
	movdqa 16(%rsp),%xmm5
	paddd %xmm15,%xmm10
	paddd %xmm12,%xmm11
	pxor %xmm10,%xmm1
	pxor %xmm11,%xmm2
.byte 102,15,56,0,207
.byte 102,15,56,0,215
	paddd %xmm1,%xmm4
	paddd %xmm2,%xmm5
	pxor %xmm4,%xmm15
	pxor %xmm5,%xmm12
	movdqa %xmm15,%xmm6
	pslld $12,%xmm15
	psrld $20,%xmm6
	movdqa %xmm12,%xmm7
	pslld $12,%xmm12
	por %xmm6,%xmm15
	psrld $20,%xmm7
	movdqa (%r11),%xmm6
	por %xmm7,%xmm12
	paddd %xmm15,%xmm10
	paddd %xmm12,%xmm11
	pxor %xmm10,%xmm1
	pxor %xmm11,%xmm2
.byte 102,15,56,0,206
.byte 102,15,56,0,214
	paddd %xmm1,%xmm4
	paddd %xmm2,%xmm5
	pxor %xmm4,%xmm15
	pxor %xmm5,%xmm12
	movdqa %xmm15,%xmm7
	pslld $7,%xmm15
	psrld $25,%xmm7
	movdqa %xmm12,%xmm6
	pslld $7,%xmm12
	por %xmm7,%xmm15
	psrld $25,%xmm6
	movdqa (%r10),%xmm7
	por %xmm6,%xmm12
	decl %eax
	jnz .Loop4x

	paddd 64(%rsp),%xmm8
	paddd 80(%rsp),%xmm9
	paddd 96(%rsp),%xmm10
	paddd 112(%rsp),%xmm11

	movdqa %xmm8,%xmm6
	punpckldq %xmm9,%xmm8
	movdqa %xmm10,%xmm7
	punpckldq %xmm11,%xmm10
	punpckhdq %xmm9,%xmm6
	punpckhdq %xmm11,%xmm7
	movdqa %xmm8,%xmm9
	punpcklqdq %xmm10,%xmm8
	movdqa %xmm6,%xmm11
	punpcklqdq %xmm7,%xmm6
	punpckhqdq %xmm10,%xmm9
	punpckhqdq %xmm7,%xmm11
	paddd 128-256(%rcx),%xmm12
	paddd 144-256(%rcx),%xmm13
	paddd 160-256(%rcx),%xmm14
	paddd 176-256(%rcx),%xmm15

	movdqa %xmm8,0(%rsp)
	movdqa %xmm9,16(%rsp)
	movdqa 32(%rsp),%xmm8
	movdqa 48(%rsp),%xmm9

	movdqa %xmm12,%xmm10
	punpckldq %xmm13,%xmm12
	movdqa %xmm14,%xmm7
	punpckldq %xmm15,%xmm14
	punpckhdq %xmm13,%xmm10
	punpckhdq %xmm15,%xmm7
	movdqa %xmm12,%xmm13
	punpcklqdq %xmm14,%xmm12
	movdqa %xmm10,%xmm15
	punpcklqdq %xmm7,%xmm10
	punpckhqdq %xmm14,%xmm13
	punpckhqdq %xmm7,%xmm15
	paddd 192-256(%rcx),%xmm4
	paddd 208-256(%rcx),%xmm5
	paddd 224-256(%rcx),%xmm8
	paddd 240-256(%rcx),%xmm9

	movdqa %xmm6,32(%rsp)
	movdqa %xmm11,48(%rsp)

	movdqa %xmm4,%xmm14
	punpckldq %xmm5,%xmm4
	movdqa %xmm8,%xmm7
	punpckldq %xmm9,%xmm8
	punpckhdq %xmm5,%xmm14
	punpckhdq %xmm9,%xmm7
	movdqa %xmm4,%xmm5
	punpcklqdq %xmm8,%xmm4
	movdqa %xmm14,%xmm9
	punpcklqdq %xmm7,%xmm14
	punpckhqdq %xmm8,%xmm5
	punpckhqdq %xmm7,%xmm9
	paddd 256-256(%rcx),%xmm0
	paddd 272-256(%rcx),%xmm1
	paddd 288-256(%rcx),%xmm2
	paddd 304-256(%rcx),%xmm3

	movdqa %xmm0,%xmm8
	punpckldq %xmm1,%xmm0
	movdqa %xmm2,%xmm7
	punpckldq %xmm3,%xmm2
	punpckhdq %xmm1,%xmm8
	punpckhdq %xmm3,%xmm7
	movdqa %xmm0,%xmm1
	punpcklqdq %xmm2,%xmm0
	movdqa %xmm8,%xmm3
	punpcklqdq %xmm7,%xmm8
	punpckhqdq %xmm2,%xmm1
	punpckhqdq %xmm7,%xmm3
	cmpq $256,%rdx
	jb .Ltail4x

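# At least 256 bytes remain: the keystream, transposed back to block order
# above (with 0-48(%rsp) holding the vectors that no longer fit in
# registers), is XORed against the input 64 bytes at a time.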
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7

	movdqu %xmm6,64(%rdi)
	movdqu 0(%rsi),%xmm6
	movdqu %xmm11,80(%rdi)
	movdqu 16(%rsi),%xmm11
	movdqu %xmm2,96(%rdi)
	movdqu 32(%rsi),%xmm2
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi
	movdqu 48(%rsi),%xmm7
	pxor 32(%rsp),%xmm6
	pxor %xmm10,%xmm11
	pxor %xmm14,%xmm2
	pxor %xmm8,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 48(%rsp),%xmm6
	pxor %xmm15,%xmm11
	pxor %xmm9,%xmm2
	pxor %xmm3,%xmm7
	movdqu %xmm6,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm2,96(%rdi)
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi

	subq $256,%rdx
	jnz .Loop_outer4x

	jmp .Ldone4x

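# Fewer than 256 bytes remain: ladder down through 64-byte chunks, then
# spill the last keystream block to the stack and finish byte by byte in
# .Loop_tail4x.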
.Ltail4x:
	cmpq $192,%rdx
	jae .L192_or_more4x
	cmpq $128,%rdx
	jae .L128_or_more4x
	cmpq $64,%rdx
	jae .L64_or_more4x

	xorq %r10,%r10

	movdqa %xmm12,16(%rsp)
	movdqa %xmm4,32(%rsp)
	movdqa %xmm0,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L64_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7
	movdqu %xmm6,0(%rdi)
	movdqu %xmm11,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm7,48(%rdi)
	je .Ldone4x

	movdqa 16(%rsp),%xmm6
	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm13,16(%rsp)
	leaq 64(%rdi),%rdi
	movdqa %xmm5,32(%rsp)
	subq $64,%rdx
	movdqa %xmm1,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L128_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7
	movdqu %xmm6,64(%rdi)
	movdqu %xmm11,80(%rdi)
	movdqu %xmm2,96(%rdi)
	movdqu %xmm7,112(%rdi)
	je .Ldone4x

	movdqa 32(%rsp),%xmm6
	leaq 128(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm10,16(%rsp)
	leaq 128(%rdi),%rdi
	movdqa %xmm14,32(%rsp)
	subq $128,%rdx
	movdqa %xmm8,48(%rsp)
	jmp .Loop_tail4x

.align 32
.L192_or_more4x:
	movdqu 0(%rsi),%xmm6
	movdqu 16(%rsi),%xmm11
	movdqu 32(%rsi),%xmm2
	movdqu 48(%rsi),%xmm7
	pxor 0(%rsp),%xmm6
	pxor %xmm12,%xmm11
	pxor %xmm4,%xmm2
	pxor %xmm0,%xmm7

	movdqu %xmm6,0(%rdi)
	movdqu 64(%rsi),%xmm6
	movdqu %xmm11,16(%rdi)
	movdqu 80(%rsi),%xmm11
	movdqu %xmm2,32(%rdi)
	movdqu 96(%rsi),%xmm2
	movdqu %xmm7,48(%rdi)
	movdqu 112(%rsi),%xmm7
	leaq 128(%rsi),%rsi
	pxor 16(%rsp),%xmm6
	pxor %xmm13,%xmm11
	pxor %xmm5,%xmm2
	pxor %xmm1,%xmm7

	movdqu %xmm6,64(%rdi)
	movdqu 0(%rsi),%xmm6
	movdqu %xmm11,80(%rdi)
	movdqu 16(%rsi),%xmm11
	movdqu %xmm2,96(%rdi)
	movdqu 32(%rsi),%xmm2
	movdqu %xmm7,112(%rdi)
	leaq 128(%rdi),%rdi
	movdqu 48(%rsi),%xmm7
	pxor 32(%rsp),%xmm6
	pxor %xmm10,%xmm11
	pxor %xmm14,%xmm2
	pxor %xmm8,%xmm7
	movdqu %xmm6,0(%rdi)
	movdqu %xmm11,16(%rdi)
	movdqu %xmm2,32(%rdi)
	movdqu %xmm7,48(%rdi)
	je .Ldone4x

	movdqa 48(%rsp),%xmm6
	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	movdqa %xmm6,0(%rsp)
	movdqa %xmm15,16(%rsp)
	leaq 64(%rdi),%rdi
	movdqa %xmm9,32(%rsp)
	subq $192,%rdx
	movdqa %xmm3,48(%rsp)

.Loop_tail4x:
	movzbl (%rsi,%r10,1),%eax
	movzbl (%rsp,%r10,1),%ecx
	leaq 1(%r10),%r10
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r10,1)
	decq %rdx
	jnz .Loop_tail4x

.Ldone4x:
	leaq (%r9),%rsp
.cfi_def_cfa_register %rsp
.L4x_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_4x,.-ChaCha20_4x
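# XOP path (AMD): same four-block layout, but the XOP vprotd instruction
# (the .byte 143,... sequences) does each rotate in one shot, so the whole
# state stays in %xmm0-%xmm15 with no spills inside the round loop.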
.type ChaCha20_4xop,@function
.align 32
ChaCha20_4xop:
.cfi_startproc
.LChaCha20_4xop:
	movq %rsp,%r9
.cfi_def_cfa_register %r9
	subq $0x140+8,%rsp
	vzeroupper

	vmovdqa .Lsigma(%rip),%xmm11
	vmovdqu (%rcx),%xmm3
	vmovdqu 16(%rcx),%xmm15
	vmovdqu (%r8),%xmm7
	leaq 256(%rsp),%rcx

	vpshufd $0x00,%xmm11,%xmm8
	vpshufd $0x55,%xmm11,%xmm9
	vmovdqa %xmm8,64(%rsp)
	vpshufd $0xaa,%xmm11,%xmm10
	vmovdqa %xmm9,80(%rsp)
	vpshufd $0xff,%xmm11,%xmm11
	vmovdqa %xmm10,96(%rsp)
	vmovdqa %xmm11,112(%rsp)

	vpshufd $0x00,%xmm3,%xmm0
	vpshufd $0x55,%xmm3,%xmm1
	vmovdqa %xmm0,128-256(%rcx)
	vpshufd $0xaa,%xmm3,%xmm2
	vmovdqa %xmm1,144-256(%rcx)
	vpshufd $0xff,%xmm3,%xmm3
	vmovdqa %xmm2,160-256(%rcx)
	vmovdqa %xmm3,176-256(%rcx)

	vpshufd $0x00,%xmm15,%xmm12
	vpshufd $0x55,%xmm15,%xmm13
	vmovdqa %xmm12,192-256(%rcx)
	vpshufd $0xaa,%xmm15,%xmm14
	vmovdqa %xmm13,208-256(%rcx)
	vpshufd $0xff,%xmm15,%xmm15
	vmovdqa %xmm14,224-256(%rcx)
	vmovdqa %xmm15,240-256(%rcx)

	vpshufd $0x00,%xmm7,%xmm4
	vpshufd $0x55,%xmm7,%xmm5
	vpaddd .Linc(%rip),%xmm4,%xmm4
	vpshufd $0xaa,%xmm7,%xmm6
	vmovdqa %xmm5,272-256(%rcx)
	vpshufd $0xff,%xmm7,%xmm7
	vmovdqa %xmm6,288-256(%rcx)
	vmovdqa %xmm7,304-256(%rcx)

	jmp .Loop_enter4xop

.align 32
.Loop_outer4xop:
	vmovdqa 64(%rsp),%xmm8
	vmovdqa 80(%rsp),%xmm9
	vmovdqa 96(%rsp),%xmm10
	vmovdqa 112(%rsp),%xmm11
	vmovdqa 128-256(%rcx),%xmm0
	vmovdqa 144-256(%rcx),%xmm1
	vmovdqa 160-256(%rcx),%xmm2
	vmovdqa 176-256(%rcx),%xmm3
	vmovdqa 192-256(%rcx),%xmm12
	vmovdqa 208-256(%rcx),%xmm13
	vmovdqa 224-256(%rcx),%xmm14
	vmovdqa 240-256(%rcx),%xmm15
	vmovdqa 256-256(%rcx),%xmm4
	vmovdqa 272-256(%rcx),%xmm5
	vmovdqa 288-256(%rcx),%xmm6
	vmovdqa 304-256(%rcx),%xmm7
	vpaddd .Lfour(%rip),%xmm4,%xmm4

.Loop_enter4xop:
	movl $10,%eax
	vmovdqa %xmm4,256-256(%rcx)
	jmp .Loop4xop

.align 32
.Loop4xop:
	vpaddd %xmm0,%xmm8,%xmm8
	vpaddd %xmm1,%xmm9,%xmm9
	vpaddd %xmm2,%xmm10,%xmm10
	vpaddd %xmm3,%xmm11,%xmm11
	vpxor %xmm4,%xmm8,%xmm4
	vpxor %xmm5,%xmm9,%xmm5
	vpxor %xmm6,%xmm10,%xmm6
	vpxor %xmm7,%xmm11,%xmm7
# vprotd $16 (XOP rotate) on %xmm4..%xmm7:
.byte 143,232,120,194,228,16
.byte 143,232,120,194,237,16
.byte 143,232,120,194,246,16
.byte 143,232,120,194,255,16
	vpaddd %xmm4,%xmm12,%xmm12
	vpaddd %xmm5,%xmm13,%xmm13
	vpaddd %xmm6,%xmm14,%xmm14
	vpaddd %xmm7,%xmm15,%xmm15
	vpxor %xmm0,%xmm12,%xmm0
	vpxor %xmm1,%xmm13,%xmm1
	vpxor %xmm14,%xmm2,%xmm2
	vpxor %xmm15,%xmm3,%xmm3
.byte 143,232,120,194,192,12
.byte 143,232,120,194,201,12
.byte 143,232,120,194,210,12
.byte 143,232,120,194,219,12
	vpaddd %xmm8,%xmm0,%xmm8
	vpaddd %xmm9,%xmm1,%xmm9
	vpaddd %xmm2,%xmm10,%xmm10
	vpaddd %xmm3,%xmm11,%xmm11
	vpxor %xmm4,%xmm8,%xmm4
	vpxor %xmm5,%xmm9,%xmm5
	vpxor %xmm6,%xmm10,%xmm6
	vpxor %xmm7,%xmm11,%xmm7
.byte 143,232,120,194,228,8
.byte 143,232,120,194,237,8
.byte 143,232,120,194,246,8
.byte 143,232,120,194,255,8
	vpaddd %xmm4,%xmm12,%xmm12
	vpaddd %xmm5,%xmm13,%xmm13
	vpaddd %xmm6,%xmm14,%xmm14
	vpaddd %xmm7,%xmm15,%xmm15
	vpxor %xmm0,%xmm12,%xmm0
	vpxor %xmm1,%xmm13,%xmm1
	vpxor %xmm14,%xmm2,%xmm2
	vpxor %xmm15,%xmm3,%xmm3
.byte 143,232,120,194,192,7
.byte 143,232,120,194,201,7
.byte 143,232,120,194,210,7
.byte 143,232,120,194,219,7
	vpaddd %xmm1,%xmm8,%xmm8
	vpaddd %xmm2,%xmm9,%xmm9
	vpaddd %xmm3,%xmm10,%xmm10
	vpaddd %xmm0,%xmm11,%xmm11
	vpxor %xmm7,%xmm8,%xmm7
	vpxor %xmm4,%xmm9,%xmm4
	vpxor %xmm5,%xmm10,%xmm5
	vpxor %xmm6,%xmm11,%xmm6
.byte 143,232,120,194,255,16
.byte 143,232,120,194,228,16
.byte 143,232,120,194,237,16
.byte 143,232,120,194,246,16
	vpaddd %xmm7,%xmm14,%xmm14
	vpaddd %xmm4,%xmm15,%xmm15
	vpaddd %xmm5,%xmm12,%xmm12
	vpaddd %xmm6,%xmm13,%xmm13
	vpxor %xmm1,%xmm14,%xmm1
	vpxor %xmm2,%xmm15,%xmm2
	vpxor %xmm12,%xmm3,%xmm3
	vpxor %xmm13,%xmm0,%xmm0
.byte 143,232,120,194,201,12
.byte 143,232,120,194,210,12
.byte 143,232,120,194,219,12
.byte 143,232,120,194,192,12
	vpaddd %xmm8,%xmm1,%xmm8
	vpaddd %xmm9,%xmm2,%xmm9
	vpaddd %xmm3,%xmm10,%xmm10
	vpaddd %xmm0,%xmm11,%xmm11
	vpxor %xmm7,%xmm8,%xmm7
	vpxor %xmm4,%xmm9,%xmm4
	vpxor %xmm5,%xmm10,%xmm5
	vpxor %xmm6,%xmm11,%xmm6
.byte 143,232,120,194,255,8
.byte 143,232,120,194,228,8
.byte 143,232,120,194,237,8
.byte 143,232,120,194,246,8
	vpaddd %xmm7,%xmm14,%xmm14
	vpaddd %xmm4,%xmm15,%xmm15
	vpaddd %xmm5,%xmm12,%xmm12
	vpaddd %xmm6,%xmm13,%xmm13
	vpxor %xmm1,%xmm14,%xmm1
	vpxor %xmm2,%xmm15,%xmm2
	vpxor %xmm12,%xmm3,%xmm3
	vpxor %xmm13,%xmm0,%xmm0
.byte 143,232,120,194,201,7
.byte 143,232,120,194,210,7
.byte 143,232,120,194,219,7
.byte 143,232,120,194,192,7
	decl %eax
	jnz .Loop4xop

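# Rounds done: add the saved input state back in and transpose the four
# lanes into consecutive 64-byte output blocks with the vpunpck* sequences
# below.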
	vpaddd 64(%rsp),%xmm8,%xmm8
	vpaddd 80(%rsp),%xmm9,%xmm9
	vpaddd 96(%rsp),%xmm10,%xmm10
	vpaddd 112(%rsp),%xmm11,%xmm11

	vmovdqa %xmm14,32(%rsp)
	vmovdqa %xmm15,48(%rsp)

	vpunpckldq %xmm9,%xmm8,%xmm14
	vpunpckldq %xmm11,%xmm10,%xmm15
	vpunpckhdq %xmm9,%xmm8,%xmm8
	vpunpckhdq %xmm11,%xmm10,%xmm10
	vpunpcklqdq %xmm15,%xmm14,%xmm9
	vpunpckhqdq %xmm15,%xmm14,%xmm14
	vpunpcklqdq %xmm10,%xmm8,%xmm11
	vpunpckhqdq %xmm10,%xmm8,%xmm8
	vpaddd 128-256(%rcx),%xmm0,%xmm0
	vpaddd 144-256(%rcx),%xmm1,%xmm1
	vpaddd 160-256(%rcx),%xmm2,%xmm2
	vpaddd 176-256(%rcx),%xmm3,%xmm3

	vmovdqa %xmm9,0(%rsp)
	vmovdqa %xmm14,16(%rsp)
	vmovdqa 32(%rsp),%xmm9
	vmovdqa 48(%rsp),%xmm14

	vpunpckldq %xmm1,%xmm0,%xmm10
	vpunpckldq %xmm3,%xmm2,%xmm15
	vpunpckhdq %xmm1,%xmm0,%xmm0
	vpunpckhdq %xmm3,%xmm2,%xmm2
	vpunpcklqdq %xmm15,%xmm10,%xmm1
	vpunpckhqdq %xmm15,%xmm10,%xmm10
	vpunpcklqdq %xmm2,%xmm0,%xmm3
	vpunpckhqdq %xmm2,%xmm0,%xmm0
	vpaddd 192-256(%rcx),%xmm12,%xmm12
	vpaddd 208-256(%rcx),%xmm13,%xmm13
	vpaddd 224-256(%rcx),%xmm9,%xmm9
	vpaddd 240-256(%rcx),%xmm14,%xmm14

	vpunpckldq %xmm13,%xmm12,%xmm2
	vpunpckldq %xmm14,%xmm9,%xmm15
	vpunpckhdq %xmm13,%xmm12,%xmm12
	vpunpckhdq %xmm14,%xmm9,%xmm9
	vpunpcklqdq %xmm15,%xmm2,%xmm13
	vpunpckhqdq %xmm15,%xmm2,%xmm2
	vpunpcklqdq %xmm9,%xmm12,%xmm14
	vpunpckhqdq %xmm9,%xmm12,%xmm12
	vpaddd 256-256(%rcx),%xmm4,%xmm4
	vpaddd 272-256(%rcx),%xmm5,%xmm5
	vpaddd 288-256(%rcx),%xmm6,%xmm6
	vpaddd 304-256(%rcx),%xmm7,%xmm7

	vpunpckldq %xmm5,%xmm4,%xmm9
	vpunpckldq %xmm7,%xmm6,%xmm15
	vpunpckhdq %xmm5,%xmm4,%xmm4
	vpunpckhdq %xmm7,%xmm6,%xmm6
	vpunpcklqdq %xmm15,%xmm9,%xmm5
	vpunpckhqdq %xmm15,%xmm9,%xmm9
	vpunpcklqdq %xmm6,%xmm4,%xmm7
	vpunpckhqdq %xmm6,%xmm4,%xmm4
	vmovdqa 0(%rsp),%xmm6
	vmovdqa 16(%rsp),%xmm15

	cmpq $256,%rdx
	jb .Ltail4xop

	vpxor 0(%rsi),%xmm6,%xmm6
	vpxor 16(%rsi),%xmm1,%xmm1
	vpxor 32(%rsi),%xmm13,%xmm13
	vpxor 48(%rsi),%xmm5,%xmm5
	vpxor 64(%rsi),%xmm15,%xmm15
	vpxor 80(%rsi),%xmm10,%xmm10
	vpxor 96(%rsi),%xmm2,%xmm2
	vpxor 112(%rsi),%xmm9,%xmm9
	leaq 128(%rsi),%rsi
	vpxor 0(%rsi),%xmm11,%xmm11
	vpxor 16(%rsi),%xmm3,%xmm3
	vpxor 32(%rsi),%xmm14,%xmm14
	vpxor 48(%rsi),%xmm7,%xmm7
	vpxor 64(%rsi),%xmm8,%xmm8
	vpxor 80(%rsi),%xmm0,%xmm0
	vpxor 96(%rsi),%xmm12,%xmm12
	vpxor 112(%rsi),%xmm4,%xmm4
	leaq 128(%rsi),%rsi

	vmovdqu %xmm6,0(%rdi)
	vmovdqu %xmm1,16(%rdi)
	vmovdqu %xmm13,32(%rdi)
	vmovdqu %xmm5,48(%rdi)
	vmovdqu %xmm15,64(%rdi)
	vmovdqu %xmm10,80(%rdi)
	vmovdqu %xmm2,96(%rdi)
	vmovdqu %xmm9,112(%rdi)
	leaq 128(%rdi),%rdi
	vmovdqu %xmm11,0(%rdi)
	vmovdqu %xmm3,16(%rdi)
	vmovdqu %xmm14,32(%rdi)
	vmovdqu %xmm7,48(%rdi)
	vmovdqu %xmm8,64(%rdi)
	vmovdqu %xmm0,80(%rdi)
	vmovdqu %xmm12,96(%rdi)
	vmovdqu %xmm4,112(%rdi)
	leaq 128(%rdi),%rdi

	subq $256,%rdx
	jnz .Loop_outer4xop

	jmp .Ldone4xop

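# Tail of the XOP path: same laddering as .Ltail4x, full 64-byte chunks
# first, stack-buffered byte loop last.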
.align 32
.Ltail4xop:
	cmpq $192,%rdx
	jae .L192_or_more4xop
	cmpq $128,%rdx
	jae .L128_or_more4xop
	cmpq $64,%rdx
	jae .L64_or_more4xop

	xorq %r10,%r10
	vmovdqa %xmm6,0(%rsp)
	vmovdqa %xmm1,16(%rsp)
	vmovdqa %xmm13,32(%rsp)
	vmovdqa %xmm5,48(%rsp)
	jmp .Loop_tail4xop

.align 32
.L64_or_more4xop:
	vpxor 0(%rsi),%xmm6,%xmm6
	vpxor 16(%rsi),%xmm1,%xmm1
	vpxor 32(%rsi),%xmm13,%xmm13
	vpxor 48(%rsi),%xmm5,%xmm5
	vmovdqu %xmm6,0(%rdi)
	vmovdqu %xmm1,16(%rdi)
	vmovdqu %xmm13,32(%rdi)
	vmovdqu %xmm5,48(%rdi)
	je .Ldone4xop

	leaq 64(%rsi),%rsi
	vmovdqa %xmm15,0(%rsp)
	xorq %r10,%r10
	vmovdqa %xmm10,16(%rsp)
	leaq 64(%rdi),%rdi
	vmovdqa %xmm2,32(%rsp)
	subq $64,%rdx
	vmovdqa %xmm9,48(%rsp)
	jmp .Loop_tail4xop

.align 32
.L128_or_more4xop:
	vpxor 0(%rsi),%xmm6,%xmm6
	vpxor 16(%rsi),%xmm1,%xmm1
	vpxor 32(%rsi),%xmm13,%xmm13
	vpxor 48(%rsi),%xmm5,%xmm5
	vpxor 64(%rsi),%xmm15,%xmm15
	vpxor 80(%rsi),%xmm10,%xmm10
	vpxor 96(%rsi),%xmm2,%xmm2
	vpxor 112(%rsi),%xmm9,%xmm9

	vmovdqu %xmm6,0(%rdi)
	vmovdqu %xmm1,16(%rdi)
	vmovdqu %xmm13,32(%rdi)
	vmovdqu %xmm5,48(%rdi)
	vmovdqu %xmm15,64(%rdi)
	vmovdqu %xmm10,80(%rdi)
	vmovdqu %xmm2,96(%rdi)
	vmovdqu %xmm9,112(%rdi)
	je .Ldone4xop

	leaq 128(%rsi),%rsi
	vmovdqa %xmm11,0(%rsp)
	xorq %r10,%r10
	vmovdqa %xmm3,16(%rsp)
	leaq 128(%rdi),%rdi
	vmovdqa %xmm14,32(%rsp)
	subq $128,%rdx
	vmovdqa %xmm7,48(%rsp)
	jmp .Loop_tail4xop

.align 32
.L192_or_more4xop:
	vpxor 0(%rsi),%xmm6,%xmm6
	vpxor 16(%rsi),%xmm1,%xmm1
	vpxor 32(%rsi),%xmm13,%xmm13
	vpxor 48(%rsi),%xmm5,%xmm5
	vpxor 64(%rsi),%xmm15,%xmm15
	vpxor 80(%rsi),%xmm10,%xmm10
	vpxor 96(%rsi),%xmm2,%xmm2
	vpxor 112(%rsi),%xmm9,%xmm9
	leaq 128(%rsi),%rsi
	vpxor 0(%rsi),%xmm11,%xmm11
	vpxor 16(%rsi),%xmm3,%xmm3
	vpxor 32(%rsi),%xmm14,%xmm14
	vpxor 48(%rsi),%xmm7,%xmm7

	vmovdqu %xmm6,0(%rdi)
	vmovdqu %xmm1,16(%rdi)
	vmovdqu %xmm13,32(%rdi)
	vmovdqu %xmm5,48(%rdi)
	vmovdqu %xmm15,64(%rdi)
	vmovdqu %xmm10,80(%rdi)
	vmovdqu %xmm2,96(%rdi)
	vmovdqu %xmm9,112(%rdi)
	leaq 128(%rdi),%rdi
	vmovdqu %xmm11,0(%rdi)
	vmovdqu %xmm3,16(%rdi)
	vmovdqu %xmm14,32(%rdi)
	vmovdqu %xmm7,48(%rdi)
	je .Ldone4xop

	leaq 64(%rsi),%rsi
	vmovdqa %xmm8,0(%rsp)
	xorq %r10,%r10
	vmovdqa %xmm0,16(%rsp)
	leaq 64(%rdi),%rdi
	vmovdqa %xmm12,32(%rsp)
	subq $192,%rdx
	vmovdqa %xmm4,48(%rsp)

.Loop_tail4xop:
	movzbl (%rsi,%r10,1),%eax
	movzbl (%rsp,%r10,1),%ecx
	leaq 1(%r10),%r10
	xorl %ecx,%eax
	movb %al,-1(%rdi,%r10,1)
	decq %rdx
	jnz .Loop_tail4xop

.Ldone4xop:
	vzeroupper
	leaq (%r9),%rsp
.cfi_def_cfa_register %rsp
.L4xop_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size ChaCha20_4xop,.-ChaCha20_4xop
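# AVX2 path: eight blocks in parallel in %ymm registers, transposed layout
# again.  The frame is 32-byte aligned for the vmovdqa spills; broadcast
# state is parked in two windows, addressed off 256(%rsp) (%rcx) and
# 512(%rsp) (%rax).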
.type ChaCha20_8x,@function
.align 32
ChaCha20_8x:
.cfi_startproc
.LChaCha20_8x:
	movq %rsp,%r9
.cfi_def_cfa_register %r9
	subq $0x280+8,%rsp
	andq $-32,%rsp
	vzeroupper

	vbroadcasti128 .Lsigma(%rip),%ymm11
	vbroadcasti128 (%rcx),%ymm3
	vbroadcasti128 16(%rcx),%ymm15
	vbroadcasti128 (%r8),%ymm7
	leaq 256(%rsp),%rcx
	leaq 512(%rsp),%rax
	leaq .Lrot16(%rip),%r10
	leaq .Lrot24(%rip),%r11

	vpshufd $0x00,%ymm11,%ymm8
	vpshufd $0x55,%ymm11,%ymm9
	vmovdqa %ymm8,128-256(%rcx)
	vpshufd $0xaa,%ymm11,%ymm10
	vmovdqa %ymm9,160-256(%rcx)
	vpshufd $0xff,%ymm11,%ymm11
	vmovdqa %ymm10,192-256(%rcx)
	vmovdqa %ymm11,224-256(%rcx)

	vpshufd $0x00,%ymm3,%ymm0
	vpshufd $0x55,%ymm3,%ymm1
	vmovdqa %ymm0,256-256(%rcx)
	vpshufd $0xaa,%ymm3,%ymm2
	vmovdqa %ymm1,288-256(%rcx)
	vpshufd $0xff,%ymm3,%ymm3
	vmovdqa %ymm2,320-256(%rcx)
	vmovdqa %ymm3,352-256(%rcx)

	vpshufd $0x00,%ymm15,%ymm12
	vpshufd $0x55,%ymm15,%ymm13
	vmovdqa %ymm12,384-512(%rax)
	vpshufd $0xaa,%ymm15,%ymm14
	vmovdqa %ymm13,416-512(%rax)
	vpshufd $0xff,%ymm15,%ymm15
	vmovdqa %ymm14,448-512(%rax)
	vmovdqa %ymm15,480-512(%rax)

	vpshufd $0x00,%ymm7,%ymm4
	vpshufd $0x55,%ymm7,%ymm5
	vpaddd .Lincy(%rip),%ymm4,%ymm4
	vpshufd $0xaa,%ymm7,%ymm6
	vmovdqa %ymm5,544-512(%rax)
	vpshufd $0xff,%ymm7,%ymm7
	vmovdqa %ymm6,576-512(%rax)
	vmovdqa %ymm7,608-512(%rax)

	jmp .Loop_enter8x

.align 32
.Loop_outer8x:
	vmovdqa 128-256(%rcx),%ymm8
	vmovdqa 160-256(%rcx),%ymm9
	vmovdqa 192-256(%rcx),%ymm10
	vmovdqa 224-256(%rcx),%ymm11
	vmovdqa 256-256(%rcx),%ymm0
	vmovdqa 288-256(%rcx),%ymm1
	vmovdqa 320-256(%rcx),%ymm2
	vmovdqa 352-256(%rcx),%ymm3
	vmovdqa 384-512(%rax),%ymm12
	vmovdqa 416-512(%rax),%ymm13
	vmovdqa 448-512(%rax),%ymm14
	vmovdqa 480-512(%rax),%ymm15
	vmovdqa 512-512(%rax),%ymm4
	vmovdqa 544-512(%rax),%ymm5
	vmovdqa 576-512(%rax),%ymm6
	vmovdqa 608-512(%rax),%ymm7
	vpaddd .Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa %ymm14,64(%rsp)
	vmovdqa %ymm15,96(%rsp)
	vbroadcasti128 (%r10),%ymm15
	vmovdqa %ymm4,512-512(%rax)
	movl $10,%eax
	jmp .Loop8x

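# One .Loop8x pass is a double round over all eight blocks.  Four of the
# sixteen %ymm state vectors take turns in 0-96(%rsp), while %ymm14/%ymm15
# alternate between scratch and the broadcast .Lrot16/.Lrot24 masks.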
.align 32
.Loop8x:
	vpaddd %ymm0,%ymm8,%ymm8
	vpxor %ymm4,%ymm8,%ymm4
	vpshufb %ymm15,%ymm4,%ymm4
	vpaddd %ymm1,%ymm9,%ymm9
	vpxor %ymm5,%ymm9,%ymm5
	vpshufb %ymm15,%ymm5,%ymm5
	vpaddd %ymm4,%ymm12,%ymm12
	vpxor %ymm0,%ymm12,%ymm0
	vpslld $12,%ymm0,%ymm14
	vpsrld $20,%ymm0,%ymm0
	vpor %ymm0,%ymm14,%ymm0
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm5,%ymm13,%ymm13
	vpxor %ymm1,%ymm13,%ymm1
	vpslld $12,%ymm1,%ymm15
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm1,%ymm15,%ymm1
	vpaddd %ymm0,%ymm8,%ymm8
	vpxor %ymm4,%ymm8,%ymm4
	vpshufb %ymm14,%ymm4,%ymm4
	vpaddd %ymm1,%ymm9,%ymm9
	vpxor %ymm5,%ymm9,%ymm5
	vpshufb %ymm14,%ymm5,%ymm5
	vpaddd %ymm4,%ymm12,%ymm12
	vpxor %ymm0,%ymm12,%ymm0
	vpslld $7,%ymm0,%ymm15
	vpsrld $25,%ymm0,%ymm0
	vpor %ymm0,%ymm15,%ymm0
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm5,%ymm13,%ymm13
	vpxor %ymm1,%ymm13,%ymm1
	vpslld $7,%ymm1,%ymm14
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm1,%ymm14,%ymm1
	vmovdqa %ymm12,0(%rsp)
	vmovdqa %ymm13,32(%rsp)
	vmovdqa 64(%rsp),%ymm12
	vmovdqa 96(%rsp),%ymm13
	vpaddd %ymm2,%ymm10,%ymm10
	vpxor %ymm6,%ymm10,%ymm6
	vpshufb %ymm15,%ymm6,%ymm6
	vpaddd %ymm3,%ymm11,%ymm11
	vpxor %ymm7,%ymm11,%ymm7
	vpshufb %ymm15,%ymm7,%ymm7
	vpaddd %ymm6,%ymm12,%ymm12
	vpxor %ymm2,%ymm12,%ymm2
	vpslld $12,%ymm2,%ymm14
	vpsrld $20,%ymm2,%ymm2
	vpor %ymm2,%ymm14,%ymm2
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm7,%ymm13,%ymm13
	vpxor %ymm3,%ymm13,%ymm3
	vpslld $12,%ymm3,%ymm15
	vpsrld $20,%ymm3,%ymm3
	vpor %ymm3,%ymm15,%ymm3
	vpaddd %ymm2,%ymm10,%ymm10
	vpxor %ymm6,%ymm10,%ymm6
	vpshufb %ymm14,%ymm6,%ymm6
	vpaddd %ymm3,%ymm11,%ymm11
	vpxor %ymm7,%ymm11,%ymm7
	vpshufb %ymm14,%ymm7,%ymm7
	vpaddd %ymm6,%ymm12,%ymm12
	vpxor %ymm2,%ymm12,%ymm2
	vpslld $7,%ymm2,%ymm15
	vpsrld $25,%ymm2,%ymm2
	vpor %ymm2,%ymm15,%ymm2
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm7,%ymm13,%ymm13
	vpxor %ymm3,%ymm13,%ymm3
	vpslld $7,%ymm3,%ymm14
	vpsrld $25,%ymm3,%ymm3
	vpor %ymm3,%ymm14,%ymm3
	vpaddd %ymm1,%ymm8,%ymm8
	vpxor %ymm7,%ymm8,%ymm7
	vpshufb %ymm15,%ymm7,%ymm7
	vpaddd %ymm2,%ymm9,%ymm9
	vpxor %ymm4,%ymm9,%ymm4
	vpshufb %ymm15,%ymm4,%ymm4
	vpaddd %ymm7,%ymm12,%ymm12
	vpxor %ymm1,%ymm12,%ymm1
	vpslld $12,%ymm1,%ymm14
	vpsrld $20,%ymm1,%ymm1
	vpor %ymm1,%ymm14,%ymm1
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm4,%ymm13,%ymm13
	vpxor %ymm2,%ymm13,%ymm2
	vpslld $12,%ymm2,%ymm15
	vpsrld $20,%ymm2,%ymm2
	vpor %ymm2,%ymm15,%ymm2
	vpaddd %ymm1,%ymm8,%ymm8
	vpxor %ymm7,%ymm8,%ymm7
	vpshufb %ymm14,%ymm7,%ymm7
	vpaddd %ymm2,%ymm9,%ymm9
	vpxor %ymm4,%ymm9,%ymm4
	vpshufb %ymm14,%ymm4,%ymm4
	vpaddd %ymm7,%ymm12,%ymm12
	vpxor %ymm1,%ymm12,%ymm1
	vpslld $7,%ymm1,%ymm15
	vpsrld $25,%ymm1,%ymm1
	vpor %ymm1,%ymm15,%ymm1
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm4,%ymm13,%ymm13
	vpxor %ymm2,%ymm13,%ymm2
	vpslld $7,%ymm2,%ymm14
	vpsrld $25,%ymm2,%ymm2
	vpor %ymm2,%ymm14,%ymm2
	vmovdqa %ymm12,64(%rsp)
	vmovdqa %ymm13,96(%rsp)
	vmovdqa 0(%rsp),%ymm12
	vmovdqa 32(%rsp),%ymm13
	vpaddd %ymm3,%ymm10,%ymm10
	vpxor %ymm5,%ymm10,%ymm5
	vpshufb %ymm15,%ymm5,%ymm5
	vpaddd %ymm0,%ymm11,%ymm11
	vpxor %ymm6,%ymm11,%ymm6
	vpshufb %ymm15,%ymm6,%ymm6
	vpaddd %ymm5,%ymm12,%ymm12
	vpxor %ymm3,%ymm12,%ymm3
	vpslld $12,%ymm3,%ymm14
	vpsrld $20,%ymm3,%ymm3
	vpor %ymm3,%ymm14,%ymm3
	vbroadcasti128 (%r11),%ymm14
	vpaddd %ymm6,%ymm13,%ymm13
	vpxor %ymm0,%ymm13,%ymm0
	vpslld $12,%ymm0,%ymm15
	vpsrld $20,%ymm0,%ymm0
	vpor %ymm0,%ymm15,%ymm0
	vpaddd %ymm3,%ymm10,%ymm10
	vpxor %ymm5,%ymm10,%ymm5
	vpshufb %ymm14,%ymm5,%ymm5
	vpaddd %ymm0,%ymm11,%ymm11
	vpxor %ymm6,%ymm11,%ymm6
	vpshufb %ymm14,%ymm6,%ymm6
	vpaddd %ymm5,%ymm12,%ymm12
	vpxor %ymm3,%ymm12,%ymm3
	vpslld $7,%ymm3,%ymm15
	vpsrld $25,%ymm3,%ymm3
	vpor %ymm3,%ymm15,%ymm3
	vbroadcasti128 (%r10),%ymm15
	vpaddd %ymm6,%ymm13,%ymm13
	vpxor %ymm0,%ymm13,%ymm0
	vpslld $7,%ymm0,%ymm14
	vpsrld $25,%ymm0,%ymm0
	vpor %ymm0,%ymm14,%ymm0
	decl %eax
	jnz .Loop8x

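# Add the input state back in and transpose: vpunpck* interleaves 32- and
# 64-bit lanes within each 128-bit half, then vperm2i128 stitches the
# halves so every %ymm register ends up holding 32 consecutive output
# bytes.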
	leaq 512(%rsp),%rax
	vpaddd 128-256(%rcx),%ymm8,%ymm8
	vpaddd 160-256(%rcx),%ymm9,%ymm9
	vpaddd 192-256(%rcx),%ymm10,%ymm10
	vpaddd 224-256(%rcx),%ymm11,%ymm11

	vpunpckldq %ymm9,%ymm8,%ymm14
	vpunpckldq %ymm11,%ymm10,%ymm15
	vpunpckhdq %ymm9,%ymm8,%ymm8
	vpunpckhdq %ymm11,%ymm10,%ymm10
	vpunpcklqdq %ymm15,%ymm14,%ymm9
	vpunpckhqdq %ymm15,%ymm14,%ymm14
	vpunpcklqdq %ymm10,%ymm8,%ymm11
	vpunpckhqdq %ymm10,%ymm8,%ymm8
	vpaddd 256-256(%rcx),%ymm0,%ymm0
	vpaddd 288-256(%rcx),%ymm1,%ymm1
	vpaddd 320-256(%rcx),%ymm2,%ymm2
	vpaddd 352-256(%rcx),%ymm3,%ymm3

	vpunpckldq %ymm1,%ymm0,%ymm10
	vpunpckldq %ymm3,%ymm2,%ymm15
	vpunpckhdq %ymm1,%ymm0,%ymm0
	vpunpckhdq %ymm3,%ymm2,%ymm2
	vpunpcklqdq %ymm15,%ymm10,%ymm1
	vpunpckhqdq %ymm15,%ymm10,%ymm10
	vpunpcklqdq %ymm2,%ymm0,%ymm3
	vpunpckhqdq %ymm2,%ymm0,%ymm0
	vperm2i128 $0x20,%ymm1,%ymm9,%ymm15
	vperm2i128 $0x31,%ymm1,%ymm9,%ymm1
	vperm2i128 $0x20,%ymm10,%ymm14,%ymm9
	vperm2i128 $0x31,%ymm10,%ymm14,%ymm10
	vperm2i128 $0x20,%ymm3,%ymm11,%ymm14
	vperm2i128 $0x31,%ymm3,%ymm11,%ymm3
	vperm2i128 $0x20,%ymm0,%ymm8,%ymm11
	vperm2i128 $0x31,%ymm0,%ymm8,%ymm0
	vmovdqa %ymm15,0(%rsp)
	vmovdqa %ymm9,32(%rsp)
	vmovdqa 64(%rsp),%ymm15
	vmovdqa 96(%rsp),%ymm9

	vpaddd 384-512(%rax),%ymm12,%ymm12
	vpaddd 416-512(%rax),%ymm13,%ymm13
	vpaddd 448-512(%rax),%ymm15,%ymm15
	vpaddd 480-512(%rax),%ymm9,%ymm9

	vpunpckldq %ymm13,%ymm12,%ymm2
	vpunpckldq %ymm9,%ymm15,%ymm8
	vpunpckhdq %ymm13,%ymm12,%ymm12
	vpunpckhdq %ymm9,%ymm15,%ymm15
	vpunpcklqdq %ymm8,%ymm2,%ymm13
	vpunpckhqdq %ymm8,%ymm2,%ymm2
	vpunpcklqdq %ymm15,%ymm12,%ymm9
	vpunpckhqdq %ymm15,%ymm12,%ymm12
	vpaddd 512-512(%rax),%ymm4,%ymm4
	vpaddd 544-512(%rax),%ymm5,%ymm5
	vpaddd 576-512(%rax),%ymm6,%ymm6
	vpaddd 608-512(%rax),%ymm7,%ymm7

	vpunpckldq %ymm5,%ymm4,%ymm15
	vpunpckldq %ymm7,%ymm6,%ymm8
	vpunpckhdq %ymm5,%ymm4,%ymm4
	vpunpckhdq %ymm7,%ymm6,%ymm6
	vpunpcklqdq %ymm8,%ymm15,%ymm5
	vpunpckhqdq %ymm8,%ymm15,%ymm15
	vpunpcklqdq %ymm6,%ymm4,%ymm7
	vpunpckhqdq %ymm6,%ymm4,%ymm4
	vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
	vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
	vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
	vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
	vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
	vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
	vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
	vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
	vmovdqa 0(%rsp),%ymm6
	vmovdqa 32(%rsp),%ymm12

	cmpq $512,%rdx
	jb .Ltail8x

	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	leaq 128(%rsi),%rsi
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm12,%ymm12
	vpxor 32(%rsi),%ymm13,%ymm13
	vpxor 64(%rsi),%ymm10,%ymm10
	vpxor 96(%rsi),%ymm15,%ymm15
	leaq 128(%rsi),%rsi
	vmovdqu %ymm12,0(%rdi)
	vmovdqu %ymm13,32(%rdi)
	vmovdqu %ymm10,64(%rdi)
	vmovdqu %ymm15,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm14,%ymm14
	vpxor 32(%rsi),%ymm2,%ymm2
	vpxor 64(%rsi),%ymm3,%ymm3
	vpxor 96(%rsi),%ymm7,%ymm7
	leaq 128(%rsi),%rsi
	vmovdqu %ymm14,0(%rdi)
	vmovdqu %ymm2,32(%rdi)
	vmovdqu %ymm3,64(%rdi)
	vmovdqu %ymm7,96(%rdi)
	leaq 128(%rdi),%rdi

	vpxor 0(%rsi),%ymm11,%ymm11
	vpxor 32(%rsi),%ymm9,%ymm9
	vpxor 64(%rsi),%ymm0,%ymm0
	vpxor 96(%rsi),%ymm4,%ymm4
	leaq 128(%rsi),%rsi
	vmovdqu %ymm11,0(%rdi)
	vmovdqu %ymm9,32(%rdi)
	vmovdqu %ymm0,64(%rdi)
	vmovdqu %ymm4,96(%rdi)
	leaq 128(%rdi),%rdi

	subq $512,%rdx
	jnz .Loop_outer8x

	jmp .Ldone8x

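# Sub-512-byte tails ladder down in 64-byte steps; whatever is left past
# the last full chunk goes through the byte-wise .Loop_tail8x.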
.Ltail8x:
	cmpq $448,%rdx
	jae .L448_or_more8x
	cmpq $384,%rdx
	jae .L384_or_more8x
	cmpq $320,%rdx
	jae .L320_or_more8x
	cmpq $256,%rdx
	jae .L256_or_more8x
	cmpq $192,%rdx
	jae .L192_or_more8x
	cmpq $128,%rdx
	jae .L128_or_more8x
	cmpq $64,%rdx
	jae .L64_or_more8x

	xorq %r10,%r10
	vmovdqa %ymm6,0(%rsp)
	vmovdqa %ymm8,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L64_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	je .Ldone8x

	leaq 64(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm1,0(%rsp)
	leaq 64(%rdi),%rdi
	subq $64,%rdx
	vmovdqa %ymm5,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L128_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	je .Ldone8x

	leaq 128(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm12,0(%rsp)
	leaq 128(%rdi),%rdi
	subq $128,%rdx
	vmovdqa %ymm13,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L192_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	je .Ldone8x

	leaq 192(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm10,0(%rsp)
	leaq 192(%rdi),%rdi
	subq $192,%rdx
	vmovdqa %ymm15,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L256_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	je .Ldone8x

	leaq 256(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm14,0(%rsp)
	leaq 256(%rdi),%rdi
	subq $256,%rdx
	vmovdqa %ymm2,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L320_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	je .Ldone8x

	leaq 320(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm3,0(%rsp)
	leaq 320(%rdi),%rdi
	subq $320,%rdx
	vmovdqa %ymm7,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L384_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vpxor 320(%rsi),%ymm3,%ymm3
	vpxor 352(%rsi),%ymm7,%ymm7
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
	vmovdqu %ymm15,224(%rdi)
	vmovdqu %ymm14,256(%rdi)
	vmovdqu %ymm2,288(%rdi)
	vmovdqu %ymm3,320(%rdi)
	vmovdqu %ymm7,352(%rdi)
	je .Ldone8x

	leaq 384(%rsi),%rsi
	xorq %r10,%r10
	vmovdqa %ymm11,0(%rsp)
	leaq 384(%rdi),%rdi
	subq $384,%rdx
	vmovdqa %ymm9,32(%rsp)
	jmp .Loop_tail8x

.align 32
.L448_or_more8x:
	vpxor 0(%rsi),%ymm6,%ymm6
	vpxor 32(%rsi),%ymm8,%ymm8
	vpxor 64(%rsi),%ymm1,%ymm1
	vpxor 96(%rsi),%ymm5,%ymm5
	vpxor 128(%rsi),%ymm12,%ymm12
	vpxor 160(%rsi),%ymm13,%ymm13
	vpxor 192(%rsi),%ymm10,%ymm10
	vpxor 224(%rsi),%ymm15,%ymm15
	vpxor 256(%rsi),%ymm14,%ymm14
	vpxor 288(%rsi),%ymm2,%ymm2
	vpxor 320(%rsi),%ymm3,%ymm3
	vpxor 352(%rsi),%ymm7,%ymm7
	vpxor 384(%rsi),%ymm11,%ymm11
	vpxor 416(%rsi),%ymm9,%ymm9
	vmovdqu %ymm6,0(%rdi)
	vmovdqu %ymm8,32(%rdi)
	vmovdqu %ymm1,64(%rdi)
	vmovdqu %ymm5,96(%rdi)
	vmovdqu %ymm12,128(%rdi)
	vmovdqu %ymm13,160(%rdi)
	vmovdqu %ymm10,192(%rdi)
%ymm10,192(%rdi) 2161 vmovdqu %ymm15,224(%rdi) 2162 vmovdqu %ymm14,256(%rdi) 2163 vmovdqu %ymm2,288(%rdi) 2164 vmovdqu %ymm3,320(%rdi) 2165 vmovdqu %ymm7,352(%rdi) 2166 vmovdqu %ymm11,384(%rdi) 2167 vmovdqu %ymm9,416(%rdi) 2168 je .Ldone8x 2169 2170 leaq 448(%rsi),%rsi 2171 xorq %r10,%r10 2172 vmovdqa %ymm0,0(%rsp) 2173 leaq 448(%rdi),%rdi 2174 subq $448,%rdx 2175 vmovdqa %ymm4,32(%rsp) 2176 2177.Loop_tail8x: 2178 movzbl (%rsi,%r10,1),%eax 2179 movzbl (%rsp,%r10,1),%ecx 2180 leaq 1(%r10),%r10 2181 xorl %ecx,%eax 2182 movb %al,-1(%rdi,%r10,1) 2183 decq %rdx 2184 jnz .Loop_tail8x 2185 2186.Ldone8x: 2187 vzeroall 2188 leaq (%r9),%rsp 2189.cfi_def_cfa_register %rsp 2190.L8x_epilogue: 2191 .byte 0xf3,0xc3 2192.cfi_endproc 2193.size ChaCha20_8x,.-ChaCha20_8x 2194