1# This file is generated from a similarly-named Perl script in the BoringSSL 2# source tree. Do not edit by hand. 3 4#if defined(__has_feature) 5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) 6#define OPENSSL_NO_ASM 7#endif 8#endif 9 10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) 11#include "ring_core_generated/prefix_symbols_asm.h" 12.text 13 14 15 16.p2align 6 17L$zero: 18.long 0,0,0,0 19L$one: 20.long 1,0,0,0 21L$inc: 22.long 0,1,2,3 23L$four: 24.long 4,4,4,4 25L$incy: 26.long 0,2,4,6,1,3,5,7 27L$eight: 28.long 8,8,8,8,8,8,8,8 29L$rot16: 30.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd 31L$rot24: 32.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe 33L$sigma: 34.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 35.p2align 6 36L$zeroz: 37.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 38L$fourz: 39.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 40L$incz: 41.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 42L$sixteen: 43.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 44.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 45.globl _ChaCha20_ctr32 46.private_extern _ChaCha20_ctr32 47 48.p2align 6 49_ChaCha20_ctr32: 50 51 cmpq $0,%rdx 52 je L$no_data 53 movq _OPENSSL_ia32cap_P+4(%rip),%r10 54 testl $512,%r10d 55 jnz L$ChaCha20_ssse3 56 57 pushq %rbx 58 59 pushq %rbp 60 61 pushq %r12 62 63 pushq %r13 64 65 pushq %r14 66 67 pushq %r15 68 69 subq $64+24,%rsp 70 71L$ctr32_body: 72 73 74 movdqu (%rcx),%xmm1 75 movdqu 16(%rcx),%xmm2 76 movdqu (%r8),%xmm3 77 movdqa L$one(%rip),%xmm4 78 79 80 movdqa %xmm1,16(%rsp) 81 movdqa %xmm2,32(%rsp) 82 movdqa %xmm3,48(%rsp) 83 movq %rdx,%rbp 84 jmp L$oop_outer 85 86.p2align 5 87L$oop_outer: 88 movl $0x61707865,%eax 89 movl $0x3320646e,%ebx 90 movl $0x79622d32,%ecx 91 movl $0x6b206574,%edx 92 movl 16(%rsp),%r8d 93 movl 20(%rsp),%r9d 94 movl 24(%rsp),%r10d 95 movl 28(%rsp),%r11d 96 movd %xmm3,%r12d 97 movl 52(%rsp),%r13d 98 movl 56(%rsp),%r14d 99 movl 60(%rsp),%r15d 100 101 movq %rbp,64+0(%rsp) 102 movl $10,%ebp 103 movq %rsi,64+8(%rsp) 104.byte 102,72,15,126,214 105 movq %rdi,64+16(%rsp) 106 movq %rsi,%rdi 107 shrq $32,%rdi 108 jmp L$oop 109 110.p2align 5 111L$oop: 112 addl %r8d,%eax 113 xorl %eax,%r12d 114 roll $16,%r12d 115 addl %r9d,%ebx 116 xorl %ebx,%r13d 117 roll $16,%r13d 118 addl %r12d,%esi 119 xorl %esi,%r8d 120 roll $12,%r8d 121 addl %r13d,%edi 122 xorl %edi,%r9d 123 roll $12,%r9d 124 addl %r8d,%eax 125 xorl %eax,%r12d 126 roll $8,%r12d 127 addl %r9d,%ebx 128 xorl %ebx,%r13d 129 roll $8,%r13d 130 addl %r12d,%esi 131 xorl %esi,%r8d 132 roll $7,%r8d 133 addl %r13d,%edi 134 xorl %edi,%r9d 135 roll $7,%r9d 136 movl %esi,32(%rsp) 137 movl %edi,36(%rsp) 138 movl 40(%rsp),%esi 139 movl 44(%rsp),%edi 140 addl %r10d,%ecx 141 xorl %ecx,%r14d 142 roll $16,%r14d 143 addl %r11d,%edx 144 xorl %edx,%r15d 145 roll $16,%r15d 146 addl %r14d,%esi 147 xorl %esi,%r10d 148 roll $12,%r10d 149 addl %r15d,%edi 150 xorl %edi,%r11d 151 roll $12,%r11d 152 addl %r10d,%ecx 153 xorl %ecx,%r14d 154 roll $8,%r14d 155 addl %r11d,%edx 156 xorl %edx,%r15d 157 roll $8,%r15d 158 addl %r14d,%esi 159 xorl %esi,%r10d 160 roll $7,%r10d 161 addl %r15d,%edi 162 xorl %edi,%r11d 163 roll $7,%r11d 164 addl %r9d,%eax 165 xorl %eax,%r15d 166 roll $16,%r15d 167 addl %r10d,%ebx 168 xorl %ebx,%r12d 169 roll $16,%r12d 170 addl %r15d,%esi 171 xorl %esi,%r9d 172 roll $12,%r9d 173 addl %r12d,%edi 174 xorl %edi,%r10d 175 roll $12,%r10d 176 addl %r9d,%eax 177 xorl %eax,%r15d 178 roll $8,%r15d 179 addl %r10d,%ebx 180 xorl %ebx,%r12d 181 roll $8,%r12d 182 addl %r15d,%esi 183 xorl %esi,%r9d 184 roll $7,%r9d 185 addl %r12d,%edi 186 xorl %edi,%r10d 187 roll $7,%r10d 188 movl %esi,40(%rsp) 189 movl %edi,44(%rsp) 190 movl 32(%rsp),%esi 191 movl 36(%rsp),%edi 192 addl %r11d,%ecx 193 xorl %ecx,%r13d 194 roll $16,%r13d 195 addl %r8d,%edx 196 xorl %edx,%r14d 197 roll $16,%r14d 198 addl %r13d,%esi 199 xorl %esi,%r11d 200 roll $12,%r11d 201 addl %r14d,%edi 202 xorl %edi,%r8d 203 roll $12,%r8d 204 addl %r11d,%ecx 205 xorl %ecx,%r13d 206 roll $8,%r13d 207 addl %r8d,%edx 208 xorl %edx,%r14d 209 roll $8,%r14d 210 addl %r13d,%esi 211 xorl %esi,%r11d 212 roll $7,%r11d 213 addl %r14d,%edi 214 xorl %edi,%r8d 215 roll $7,%r8d 216 decl %ebp 217 jnz L$oop 218 movl %edi,36(%rsp) 219 movl %esi,32(%rsp) 220 movq 64(%rsp),%rbp 221 movdqa %xmm2,%xmm1 222 movq 64+8(%rsp),%rsi 223 paddd %xmm4,%xmm3 224 movq 64+16(%rsp),%rdi 225 226 addl $0x61707865,%eax 227 addl $0x3320646e,%ebx 228 addl $0x79622d32,%ecx 229 addl $0x6b206574,%edx 230 addl 16(%rsp),%r8d 231 addl 20(%rsp),%r9d 232 addl 24(%rsp),%r10d 233 addl 28(%rsp),%r11d 234 addl 48(%rsp),%r12d 235 addl 52(%rsp),%r13d 236 addl 56(%rsp),%r14d 237 addl 60(%rsp),%r15d 238 paddd 32(%rsp),%xmm1 239 240 cmpq $64,%rbp 241 jb L$tail 242 243 xorl 0(%rsi),%eax 244 xorl 4(%rsi),%ebx 245 xorl 8(%rsi),%ecx 246 xorl 12(%rsi),%edx 247 xorl 16(%rsi),%r8d 248 xorl 20(%rsi),%r9d 249 xorl 24(%rsi),%r10d 250 xorl 28(%rsi),%r11d 251 movdqu 32(%rsi),%xmm0 252 xorl 48(%rsi),%r12d 253 xorl 52(%rsi),%r13d 254 xorl 56(%rsi),%r14d 255 xorl 60(%rsi),%r15d 256 leaq 64(%rsi),%rsi 257 pxor %xmm1,%xmm0 258 259 movdqa %xmm2,32(%rsp) 260 movd %xmm3,48(%rsp) 261 262 movl %eax,0(%rdi) 263 movl %ebx,4(%rdi) 264 movl %ecx,8(%rdi) 265 movl %edx,12(%rdi) 266 movl %r8d,16(%rdi) 267 movl %r9d,20(%rdi) 268 movl %r10d,24(%rdi) 269 movl %r11d,28(%rdi) 270 movdqu %xmm0,32(%rdi) 271 movl %r12d,48(%rdi) 272 movl %r13d,52(%rdi) 273 movl %r14d,56(%rdi) 274 movl %r15d,60(%rdi) 275 leaq 64(%rdi),%rdi 276 277 subq $64,%rbp 278 jnz L$oop_outer 279 280 jmp L$done 281 282.p2align 4 283L$tail: 284 movl %eax,0(%rsp) 285 movl %ebx,4(%rsp) 286 xorq %rbx,%rbx 287 movl %ecx,8(%rsp) 288 movl %edx,12(%rsp) 289 movl %r8d,16(%rsp) 290 movl %r9d,20(%rsp) 291 movl %r10d,24(%rsp) 292 movl %r11d,28(%rsp) 293 movdqa %xmm1,32(%rsp) 294 movl %r12d,48(%rsp) 295 movl %r13d,52(%rsp) 296 movl %r14d,56(%rsp) 297 movl %r15d,60(%rsp) 298 299L$oop_tail: 300 movzbl (%rsi,%rbx,1),%eax 301 movzbl (%rsp,%rbx,1),%edx 302 leaq 1(%rbx),%rbx 303 xorl %edx,%eax 304 movb %al,-1(%rdi,%rbx,1) 305 decq %rbp 306 jnz L$oop_tail 307 308L$done: 309 leaq 64+24+48(%rsp),%rsi 310 movq -48(%rsi),%r15 311 312 movq -40(%rsi),%r14 313 314 movq -32(%rsi),%r13 315 316 movq -24(%rsi),%r12 317 318 movq -16(%rsi),%rbp 319 320 movq -8(%rsi),%rbx 321 322 leaq (%rsi),%rsp 323 324L$no_data: 325 .byte 0xf3,0xc3 326 327 328 329.p2align 5 330ChaCha20_ssse3: 331L$ChaCha20_ssse3: 332 333 movq %rsp,%r9 334 335 cmpq $128,%rdx 336 ja L$ChaCha20_4x 337 338L$do_sse3_after_all: 339 subq $64+8,%rsp 340 movdqa L$sigma(%rip),%xmm0 341 movdqu (%rcx),%xmm1 342 movdqu 16(%rcx),%xmm2 343 movdqu (%r8),%xmm3 344 movdqa L$rot16(%rip),%xmm6 345 movdqa L$rot24(%rip),%xmm7 346 347 movdqa %xmm0,0(%rsp) 348 movdqa %xmm1,16(%rsp) 349 movdqa %xmm2,32(%rsp) 350 movdqa %xmm3,48(%rsp) 351 movq $10,%r8 352 jmp L$oop_ssse3 353 354.p2align 5 355L$oop_outer_ssse3: 356 movdqa L$one(%rip),%xmm3 357 movdqa 0(%rsp),%xmm0 358 movdqa 16(%rsp),%xmm1 359 movdqa 32(%rsp),%xmm2 360 paddd 48(%rsp),%xmm3 361 movq $10,%r8 362 movdqa %xmm3,48(%rsp) 363 jmp L$oop_ssse3 364 365.p2align 5 366L$oop_ssse3: 367 paddd %xmm1,%xmm0 368 pxor %xmm0,%xmm3 369.byte 102,15,56,0,222 370 paddd %xmm3,%xmm2 371 pxor %xmm2,%xmm1 372 movdqa %xmm1,%xmm4 373 psrld $20,%xmm1 374 pslld $12,%xmm4 375 por %xmm4,%xmm1 376 paddd %xmm1,%xmm0 377 pxor %xmm0,%xmm3 378.byte 102,15,56,0,223 379 paddd %xmm3,%xmm2 380 pxor %xmm2,%xmm1 381 movdqa %xmm1,%xmm4 382 psrld $25,%xmm1 383 pslld $7,%xmm4 384 por %xmm4,%xmm1 385 pshufd $78,%xmm2,%xmm2 386 pshufd $57,%xmm1,%xmm1 387 pshufd $147,%xmm3,%xmm3 388 nop 389 paddd %xmm1,%xmm0 390 pxor %xmm0,%xmm3 391.byte 102,15,56,0,222 392 paddd %xmm3,%xmm2 393 pxor %xmm2,%xmm1 394 movdqa %xmm1,%xmm4 395 psrld $20,%xmm1 396 pslld $12,%xmm4 397 por %xmm4,%xmm1 398 paddd %xmm1,%xmm0 399 pxor %xmm0,%xmm3 400.byte 102,15,56,0,223 401 paddd %xmm3,%xmm2 402 pxor %xmm2,%xmm1 403 movdqa %xmm1,%xmm4 404 psrld $25,%xmm1 405 pslld $7,%xmm4 406 por %xmm4,%xmm1 407 pshufd $78,%xmm2,%xmm2 408 pshufd $147,%xmm1,%xmm1 409 pshufd $57,%xmm3,%xmm3 410 decq %r8 411 jnz L$oop_ssse3 412 paddd 0(%rsp),%xmm0 413 paddd 16(%rsp),%xmm1 414 paddd 32(%rsp),%xmm2 415 paddd 48(%rsp),%xmm3 416 417 cmpq $64,%rdx 418 jb L$tail_ssse3 419 420 movdqu 0(%rsi),%xmm4 421 movdqu 16(%rsi),%xmm5 422 pxor %xmm4,%xmm0 423 movdqu 32(%rsi),%xmm4 424 pxor %xmm5,%xmm1 425 movdqu 48(%rsi),%xmm5 426 leaq 64(%rsi),%rsi 427 pxor %xmm4,%xmm2 428 pxor %xmm5,%xmm3 429 430 movdqu %xmm0,0(%rdi) 431 movdqu %xmm1,16(%rdi) 432 movdqu %xmm2,32(%rdi) 433 movdqu %xmm3,48(%rdi) 434 leaq 64(%rdi),%rdi 435 436 subq $64,%rdx 437 jnz L$oop_outer_ssse3 438 439 jmp L$done_ssse3 440 441.p2align 4 442L$tail_ssse3: 443 movdqa %xmm0,0(%rsp) 444 movdqa %xmm1,16(%rsp) 445 movdqa %xmm2,32(%rsp) 446 movdqa %xmm3,48(%rsp) 447 xorq %r8,%r8 448 449L$oop_tail_ssse3: 450 movzbl (%rsi,%r8,1),%eax 451 movzbl (%rsp,%r8,1),%ecx 452 leaq 1(%r8),%r8 453 xorl %ecx,%eax 454 movb %al,-1(%rdi,%r8,1) 455 decq %rdx 456 jnz L$oop_tail_ssse3 457 458L$done_ssse3: 459 leaq (%r9),%rsp 460 461L$ssse3_epilogue: 462 .byte 0xf3,0xc3 463 464 465 466.p2align 5 467ChaCha20_4x: 468L$ChaCha20_4x: 469 470 movq %rsp,%r9 471 472 movq %r10,%r11 473 shrq $32,%r10 474 testq $32,%r10 475 jnz L$ChaCha20_8x 476 cmpq $192,%rdx 477 ja L$proceed4x 478 479 andq $71303168,%r11 480 cmpq $4194304,%r11 481 je L$do_sse3_after_all 482 483L$proceed4x: 484 subq $0x140+8,%rsp 485 movdqa L$sigma(%rip),%xmm11 486 movdqu (%rcx),%xmm15 487 movdqu 16(%rcx),%xmm7 488 movdqu (%r8),%xmm3 489 leaq 256(%rsp),%rcx 490 leaq L$rot16(%rip),%r10 491 leaq L$rot24(%rip),%r11 492 493 pshufd $0x00,%xmm11,%xmm8 494 pshufd $0x55,%xmm11,%xmm9 495 movdqa %xmm8,64(%rsp) 496 pshufd $0xaa,%xmm11,%xmm10 497 movdqa %xmm9,80(%rsp) 498 pshufd $0xff,%xmm11,%xmm11 499 movdqa %xmm10,96(%rsp) 500 movdqa %xmm11,112(%rsp) 501 502 pshufd $0x00,%xmm15,%xmm12 503 pshufd $0x55,%xmm15,%xmm13 504 movdqa %xmm12,128-256(%rcx) 505 pshufd $0xaa,%xmm15,%xmm14 506 movdqa %xmm13,144-256(%rcx) 507 pshufd $0xff,%xmm15,%xmm15 508 movdqa %xmm14,160-256(%rcx) 509 movdqa %xmm15,176-256(%rcx) 510 511 pshufd $0x00,%xmm7,%xmm4 512 pshufd $0x55,%xmm7,%xmm5 513 movdqa %xmm4,192-256(%rcx) 514 pshufd $0xaa,%xmm7,%xmm6 515 movdqa %xmm5,208-256(%rcx) 516 pshufd $0xff,%xmm7,%xmm7 517 movdqa %xmm6,224-256(%rcx) 518 movdqa %xmm7,240-256(%rcx) 519 520 pshufd $0x00,%xmm3,%xmm0 521 pshufd $0x55,%xmm3,%xmm1 522 paddd L$inc(%rip),%xmm0 523 pshufd $0xaa,%xmm3,%xmm2 524 movdqa %xmm1,272-256(%rcx) 525 pshufd $0xff,%xmm3,%xmm3 526 movdqa %xmm2,288-256(%rcx) 527 movdqa %xmm3,304-256(%rcx) 528 529 jmp L$oop_enter4x 530 531.p2align 5 532L$oop_outer4x: 533 movdqa 64(%rsp),%xmm8 534 movdqa 80(%rsp),%xmm9 535 movdqa 96(%rsp),%xmm10 536 movdqa 112(%rsp),%xmm11 537 movdqa 128-256(%rcx),%xmm12 538 movdqa 144-256(%rcx),%xmm13 539 movdqa 160-256(%rcx),%xmm14 540 movdqa 176-256(%rcx),%xmm15 541 movdqa 192-256(%rcx),%xmm4 542 movdqa 208-256(%rcx),%xmm5 543 movdqa 224-256(%rcx),%xmm6 544 movdqa 240-256(%rcx),%xmm7 545 movdqa 256-256(%rcx),%xmm0 546 movdqa 272-256(%rcx),%xmm1 547 movdqa 288-256(%rcx),%xmm2 548 movdqa 304-256(%rcx),%xmm3 549 paddd L$four(%rip),%xmm0 550 551L$oop_enter4x: 552 movdqa %xmm6,32(%rsp) 553 movdqa %xmm7,48(%rsp) 554 movdqa (%r10),%xmm7 555 movl $10,%eax 556 movdqa %xmm0,256-256(%rcx) 557 jmp L$oop4x 558 559.p2align 5 560L$oop4x: 561 paddd %xmm12,%xmm8 562 paddd %xmm13,%xmm9 563 pxor %xmm8,%xmm0 564 pxor %xmm9,%xmm1 565.byte 102,15,56,0,199 566.byte 102,15,56,0,207 567 paddd %xmm0,%xmm4 568 paddd %xmm1,%xmm5 569 pxor %xmm4,%xmm12 570 pxor %xmm5,%xmm13 571 movdqa %xmm12,%xmm6 572 pslld $12,%xmm12 573 psrld $20,%xmm6 574 movdqa %xmm13,%xmm7 575 pslld $12,%xmm13 576 por %xmm6,%xmm12 577 psrld $20,%xmm7 578 movdqa (%r11),%xmm6 579 por %xmm7,%xmm13 580 paddd %xmm12,%xmm8 581 paddd %xmm13,%xmm9 582 pxor %xmm8,%xmm0 583 pxor %xmm9,%xmm1 584.byte 102,15,56,0,198 585.byte 102,15,56,0,206 586 paddd %xmm0,%xmm4 587 paddd %xmm1,%xmm5 588 pxor %xmm4,%xmm12 589 pxor %xmm5,%xmm13 590 movdqa %xmm12,%xmm7 591 pslld $7,%xmm12 592 psrld $25,%xmm7 593 movdqa %xmm13,%xmm6 594 pslld $7,%xmm13 595 por %xmm7,%xmm12 596 psrld $25,%xmm6 597 movdqa (%r10),%xmm7 598 por %xmm6,%xmm13 599 movdqa %xmm4,0(%rsp) 600 movdqa %xmm5,16(%rsp) 601 movdqa 32(%rsp),%xmm4 602 movdqa 48(%rsp),%xmm5 603 paddd %xmm14,%xmm10 604 paddd %xmm15,%xmm11 605 pxor %xmm10,%xmm2 606 pxor %xmm11,%xmm3 607.byte 102,15,56,0,215 608.byte 102,15,56,0,223 609 paddd %xmm2,%xmm4 610 paddd %xmm3,%xmm5 611 pxor %xmm4,%xmm14 612 pxor %xmm5,%xmm15 613 movdqa %xmm14,%xmm6 614 pslld $12,%xmm14 615 psrld $20,%xmm6 616 movdqa %xmm15,%xmm7 617 pslld $12,%xmm15 618 por %xmm6,%xmm14 619 psrld $20,%xmm7 620 movdqa (%r11),%xmm6 621 por %xmm7,%xmm15 622 paddd %xmm14,%xmm10 623 paddd %xmm15,%xmm11 624 pxor %xmm10,%xmm2 625 pxor %xmm11,%xmm3 626.byte 102,15,56,0,214 627.byte 102,15,56,0,222 628 paddd %xmm2,%xmm4 629 paddd %xmm3,%xmm5 630 pxor %xmm4,%xmm14 631 pxor %xmm5,%xmm15 632 movdqa %xmm14,%xmm7 633 pslld $7,%xmm14 634 psrld $25,%xmm7 635 movdqa %xmm15,%xmm6 636 pslld $7,%xmm15 637 por %xmm7,%xmm14 638 psrld $25,%xmm6 639 movdqa (%r10),%xmm7 640 por %xmm6,%xmm15 641 paddd %xmm13,%xmm8 642 paddd %xmm14,%xmm9 643 pxor %xmm8,%xmm3 644 pxor %xmm9,%xmm0 645.byte 102,15,56,0,223 646.byte 102,15,56,0,199 647 paddd %xmm3,%xmm4 648 paddd %xmm0,%xmm5 649 pxor %xmm4,%xmm13 650 pxor %xmm5,%xmm14 651 movdqa %xmm13,%xmm6 652 pslld $12,%xmm13 653 psrld $20,%xmm6 654 movdqa %xmm14,%xmm7 655 pslld $12,%xmm14 656 por %xmm6,%xmm13 657 psrld $20,%xmm7 658 movdqa (%r11),%xmm6 659 por %xmm7,%xmm14 660 paddd %xmm13,%xmm8 661 paddd %xmm14,%xmm9 662 pxor %xmm8,%xmm3 663 pxor %xmm9,%xmm0 664.byte 102,15,56,0,222 665.byte 102,15,56,0,198 666 paddd %xmm3,%xmm4 667 paddd %xmm0,%xmm5 668 pxor %xmm4,%xmm13 669 pxor %xmm5,%xmm14 670 movdqa %xmm13,%xmm7 671 pslld $7,%xmm13 672 psrld $25,%xmm7 673 movdqa %xmm14,%xmm6 674 pslld $7,%xmm14 675 por %xmm7,%xmm13 676 psrld $25,%xmm6 677 movdqa (%r10),%xmm7 678 por %xmm6,%xmm14 679 movdqa %xmm4,32(%rsp) 680 movdqa %xmm5,48(%rsp) 681 movdqa 0(%rsp),%xmm4 682 movdqa 16(%rsp),%xmm5 683 paddd %xmm15,%xmm10 684 paddd %xmm12,%xmm11 685 pxor %xmm10,%xmm1 686 pxor %xmm11,%xmm2 687.byte 102,15,56,0,207 688.byte 102,15,56,0,215 689 paddd %xmm1,%xmm4 690 paddd %xmm2,%xmm5 691 pxor %xmm4,%xmm15 692 pxor %xmm5,%xmm12 693 movdqa %xmm15,%xmm6 694 pslld $12,%xmm15 695 psrld $20,%xmm6 696 movdqa %xmm12,%xmm7 697 pslld $12,%xmm12 698 por %xmm6,%xmm15 699 psrld $20,%xmm7 700 movdqa (%r11),%xmm6 701 por %xmm7,%xmm12 702 paddd %xmm15,%xmm10 703 paddd %xmm12,%xmm11 704 pxor %xmm10,%xmm1 705 pxor %xmm11,%xmm2 706.byte 102,15,56,0,206 707.byte 102,15,56,0,214 708 paddd %xmm1,%xmm4 709 paddd %xmm2,%xmm5 710 pxor %xmm4,%xmm15 711 pxor %xmm5,%xmm12 712 movdqa %xmm15,%xmm7 713 pslld $7,%xmm15 714 psrld $25,%xmm7 715 movdqa %xmm12,%xmm6 716 pslld $7,%xmm12 717 por %xmm7,%xmm15 718 psrld $25,%xmm6 719 movdqa (%r10),%xmm7 720 por %xmm6,%xmm12 721 decl %eax 722 jnz L$oop4x 723 724 paddd 64(%rsp),%xmm8 725 paddd 80(%rsp),%xmm9 726 paddd 96(%rsp),%xmm10 727 paddd 112(%rsp),%xmm11 728 729 movdqa %xmm8,%xmm6 730 punpckldq %xmm9,%xmm8 731 movdqa %xmm10,%xmm7 732 punpckldq %xmm11,%xmm10 733 punpckhdq %xmm9,%xmm6 734 punpckhdq %xmm11,%xmm7 735 movdqa %xmm8,%xmm9 736 punpcklqdq %xmm10,%xmm8 737 movdqa %xmm6,%xmm11 738 punpcklqdq %xmm7,%xmm6 739 punpckhqdq %xmm10,%xmm9 740 punpckhqdq %xmm7,%xmm11 741 paddd 128-256(%rcx),%xmm12 742 paddd 144-256(%rcx),%xmm13 743 paddd 160-256(%rcx),%xmm14 744 paddd 176-256(%rcx),%xmm15 745 746 movdqa %xmm8,0(%rsp) 747 movdqa %xmm9,16(%rsp) 748 movdqa 32(%rsp),%xmm8 749 movdqa 48(%rsp),%xmm9 750 751 movdqa %xmm12,%xmm10 752 punpckldq %xmm13,%xmm12 753 movdqa %xmm14,%xmm7 754 punpckldq %xmm15,%xmm14 755 punpckhdq %xmm13,%xmm10 756 punpckhdq %xmm15,%xmm7 757 movdqa %xmm12,%xmm13 758 punpcklqdq %xmm14,%xmm12 759 movdqa %xmm10,%xmm15 760 punpcklqdq %xmm7,%xmm10 761 punpckhqdq %xmm14,%xmm13 762 punpckhqdq %xmm7,%xmm15 763 paddd 192-256(%rcx),%xmm4 764 paddd 208-256(%rcx),%xmm5 765 paddd 224-256(%rcx),%xmm8 766 paddd 240-256(%rcx),%xmm9 767 768 movdqa %xmm6,32(%rsp) 769 movdqa %xmm11,48(%rsp) 770 771 movdqa %xmm4,%xmm14 772 punpckldq %xmm5,%xmm4 773 movdqa %xmm8,%xmm7 774 punpckldq %xmm9,%xmm8 775 punpckhdq %xmm5,%xmm14 776 punpckhdq %xmm9,%xmm7 777 movdqa %xmm4,%xmm5 778 punpcklqdq %xmm8,%xmm4 779 movdqa %xmm14,%xmm9 780 punpcklqdq %xmm7,%xmm14 781 punpckhqdq %xmm8,%xmm5 782 punpckhqdq %xmm7,%xmm9 783 paddd 256-256(%rcx),%xmm0 784 paddd 272-256(%rcx),%xmm1 785 paddd 288-256(%rcx),%xmm2 786 paddd 304-256(%rcx),%xmm3 787 788 movdqa %xmm0,%xmm8 789 punpckldq %xmm1,%xmm0 790 movdqa %xmm2,%xmm7 791 punpckldq %xmm3,%xmm2 792 punpckhdq %xmm1,%xmm8 793 punpckhdq %xmm3,%xmm7 794 movdqa %xmm0,%xmm1 795 punpcklqdq %xmm2,%xmm0 796 movdqa %xmm8,%xmm3 797 punpcklqdq %xmm7,%xmm8 798 punpckhqdq %xmm2,%xmm1 799 punpckhqdq %xmm7,%xmm3 800 cmpq $256,%rdx 801 jb L$tail4x 802 803 movdqu 0(%rsi),%xmm6 804 movdqu 16(%rsi),%xmm11 805 movdqu 32(%rsi),%xmm2 806 movdqu 48(%rsi),%xmm7 807 pxor 0(%rsp),%xmm6 808 pxor %xmm12,%xmm11 809 pxor %xmm4,%xmm2 810 pxor %xmm0,%xmm7 811 812 movdqu %xmm6,0(%rdi) 813 movdqu 64(%rsi),%xmm6 814 movdqu %xmm11,16(%rdi) 815 movdqu 80(%rsi),%xmm11 816 movdqu %xmm2,32(%rdi) 817 movdqu 96(%rsi),%xmm2 818 movdqu %xmm7,48(%rdi) 819 movdqu 112(%rsi),%xmm7 820 leaq 128(%rsi),%rsi 821 pxor 16(%rsp),%xmm6 822 pxor %xmm13,%xmm11 823 pxor %xmm5,%xmm2 824 pxor %xmm1,%xmm7 825 826 movdqu %xmm6,64(%rdi) 827 movdqu 0(%rsi),%xmm6 828 movdqu %xmm11,80(%rdi) 829 movdqu 16(%rsi),%xmm11 830 movdqu %xmm2,96(%rdi) 831 movdqu 32(%rsi),%xmm2 832 movdqu %xmm7,112(%rdi) 833 leaq 128(%rdi),%rdi 834 movdqu 48(%rsi),%xmm7 835 pxor 32(%rsp),%xmm6 836 pxor %xmm10,%xmm11 837 pxor %xmm14,%xmm2 838 pxor %xmm8,%xmm7 839 840 movdqu %xmm6,0(%rdi) 841 movdqu 64(%rsi),%xmm6 842 movdqu %xmm11,16(%rdi) 843 movdqu 80(%rsi),%xmm11 844 movdqu %xmm2,32(%rdi) 845 movdqu 96(%rsi),%xmm2 846 movdqu %xmm7,48(%rdi) 847 movdqu 112(%rsi),%xmm7 848 leaq 128(%rsi),%rsi 849 pxor 48(%rsp),%xmm6 850 pxor %xmm15,%xmm11 851 pxor %xmm9,%xmm2 852 pxor %xmm3,%xmm7 853 movdqu %xmm6,64(%rdi) 854 movdqu %xmm11,80(%rdi) 855 movdqu %xmm2,96(%rdi) 856 movdqu %xmm7,112(%rdi) 857 leaq 128(%rdi),%rdi 858 859 subq $256,%rdx 860 jnz L$oop_outer4x 861 862 jmp L$done4x 863 864L$tail4x: 865 cmpq $192,%rdx 866 jae L$192_or_more4x 867 cmpq $128,%rdx 868 jae L$128_or_more4x 869 cmpq $64,%rdx 870 jae L$64_or_more4x 871 872 873 xorq %r10,%r10 874 875 movdqa %xmm12,16(%rsp) 876 movdqa %xmm4,32(%rsp) 877 movdqa %xmm0,48(%rsp) 878 jmp L$oop_tail4x 879 880.p2align 5 881L$64_or_more4x: 882 movdqu 0(%rsi),%xmm6 883 movdqu 16(%rsi),%xmm11 884 movdqu 32(%rsi),%xmm2 885 movdqu 48(%rsi),%xmm7 886 pxor 0(%rsp),%xmm6 887 pxor %xmm12,%xmm11 888 pxor %xmm4,%xmm2 889 pxor %xmm0,%xmm7 890 movdqu %xmm6,0(%rdi) 891 movdqu %xmm11,16(%rdi) 892 movdqu %xmm2,32(%rdi) 893 movdqu %xmm7,48(%rdi) 894 je L$done4x 895 896 movdqa 16(%rsp),%xmm6 897 leaq 64(%rsi),%rsi 898 xorq %r10,%r10 899 movdqa %xmm6,0(%rsp) 900 movdqa %xmm13,16(%rsp) 901 leaq 64(%rdi),%rdi 902 movdqa %xmm5,32(%rsp) 903 subq $64,%rdx 904 movdqa %xmm1,48(%rsp) 905 jmp L$oop_tail4x 906 907.p2align 5 908L$128_or_more4x: 909 movdqu 0(%rsi),%xmm6 910 movdqu 16(%rsi),%xmm11 911 movdqu 32(%rsi),%xmm2 912 movdqu 48(%rsi),%xmm7 913 pxor 0(%rsp),%xmm6 914 pxor %xmm12,%xmm11 915 pxor %xmm4,%xmm2 916 pxor %xmm0,%xmm7 917 918 movdqu %xmm6,0(%rdi) 919 movdqu 64(%rsi),%xmm6 920 movdqu %xmm11,16(%rdi) 921 movdqu 80(%rsi),%xmm11 922 movdqu %xmm2,32(%rdi) 923 movdqu 96(%rsi),%xmm2 924 movdqu %xmm7,48(%rdi) 925 movdqu 112(%rsi),%xmm7 926 pxor 16(%rsp),%xmm6 927 pxor %xmm13,%xmm11 928 pxor %xmm5,%xmm2 929 pxor %xmm1,%xmm7 930 movdqu %xmm6,64(%rdi) 931 movdqu %xmm11,80(%rdi) 932 movdqu %xmm2,96(%rdi) 933 movdqu %xmm7,112(%rdi) 934 je L$done4x 935 936 movdqa 32(%rsp),%xmm6 937 leaq 128(%rsi),%rsi 938 xorq %r10,%r10 939 movdqa %xmm6,0(%rsp) 940 movdqa %xmm10,16(%rsp) 941 leaq 128(%rdi),%rdi 942 movdqa %xmm14,32(%rsp) 943 subq $128,%rdx 944 movdqa %xmm8,48(%rsp) 945 jmp L$oop_tail4x 946 947.p2align 5 948L$192_or_more4x: 949 movdqu 0(%rsi),%xmm6 950 movdqu 16(%rsi),%xmm11 951 movdqu 32(%rsi),%xmm2 952 movdqu 48(%rsi),%xmm7 953 pxor 0(%rsp),%xmm6 954 pxor %xmm12,%xmm11 955 pxor %xmm4,%xmm2 956 pxor %xmm0,%xmm7 957 958 movdqu %xmm6,0(%rdi) 959 movdqu 64(%rsi),%xmm6 960 movdqu %xmm11,16(%rdi) 961 movdqu 80(%rsi),%xmm11 962 movdqu %xmm2,32(%rdi) 963 movdqu 96(%rsi),%xmm2 964 movdqu %xmm7,48(%rdi) 965 movdqu 112(%rsi),%xmm7 966 leaq 128(%rsi),%rsi 967 pxor 16(%rsp),%xmm6 968 pxor %xmm13,%xmm11 969 pxor %xmm5,%xmm2 970 pxor %xmm1,%xmm7 971 972 movdqu %xmm6,64(%rdi) 973 movdqu 0(%rsi),%xmm6 974 movdqu %xmm11,80(%rdi) 975 movdqu 16(%rsi),%xmm11 976 movdqu %xmm2,96(%rdi) 977 movdqu 32(%rsi),%xmm2 978 movdqu %xmm7,112(%rdi) 979 leaq 128(%rdi),%rdi 980 movdqu 48(%rsi),%xmm7 981 pxor 32(%rsp),%xmm6 982 pxor %xmm10,%xmm11 983 pxor %xmm14,%xmm2 984 pxor %xmm8,%xmm7 985 movdqu %xmm6,0(%rdi) 986 movdqu %xmm11,16(%rdi) 987 movdqu %xmm2,32(%rdi) 988 movdqu %xmm7,48(%rdi) 989 je L$done4x 990 991 movdqa 48(%rsp),%xmm6 992 leaq 64(%rsi),%rsi 993 xorq %r10,%r10 994 movdqa %xmm6,0(%rsp) 995 movdqa %xmm15,16(%rsp) 996 leaq 64(%rdi),%rdi 997 movdqa %xmm9,32(%rsp) 998 subq $192,%rdx 999 movdqa %xmm3,48(%rsp) 1000 1001L$oop_tail4x: 1002 movzbl (%rsi,%r10,1),%eax 1003 movzbl (%rsp,%r10,1),%ecx 1004 leaq 1(%r10),%r10 1005 xorl %ecx,%eax 1006 movb %al,-1(%rdi,%r10,1) 1007 decq %rdx 1008 jnz L$oop_tail4x 1009 1010L$done4x: 1011 leaq (%r9),%rsp 1012 1013L$4x_epilogue: 1014 .byte 0xf3,0xc3 1015 1016 1017 1018.p2align 5 1019ChaCha20_8x: 1020L$ChaCha20_8x: 1021 1022 movq %rsp,%r9 1023 1024 subq $0x280+8,%rsp 1025 andq $-32,%rsp 1026 vzeroupper 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 vbroadcasti128 L$sigma(%rip),%ymm11 1038 vbroadcasti128 (%rcx),%ymm3 1039 vbroadcasti128 16(%rcx),%ymm15 1040 vbroadcasti128 (%r8),%ymm7 1041 leaq 256(%rsp),%rcx 1042 leaq 512(%rsp),%rax 1043 leaq L$rot16(%rip),%r10 1044 leaq L$rot24(%rip),%r11 1045 1046 vpshufd $0x00,%ymm11,%ymm8 1047 vpshufd $0x55,%ymm11,%ymm9 1048 vmovdqa %ymm8,128-256(%rcx) 1049 vpshufd $0xaa,%ymm11,%ymm10 1050 vmovdqa %ymm9,160-256(%rcx) 1051 vpshufd $0xff,%ymm11,%ymm11 1052 vmovdqa %ymm10,192-256(%rcx) 1053 vmovdqa %ymm11,224-256(%rcx) 1054 1055 vpshufd $0x00,%ymm3,%ymm0 1056 vpshufd $0x55,%ymm3,%ymm1 1057 vmovdqa %ymm0,256-256(%rcx) 1058 vpshufd $0xaa,%ymm3,%ymm2 1059 vmovdqa %ymm1,288-256(%rcx) 1060 vpshufd $0xff,%ymm3,%ymm3 1061 vmovdqa %ymm2,320-256(%rcx) 1062 vmovdqa %ymm3,352-256(%rcx) 1063 1064 vpshufd $0x00,%ymm15,%ymm12 1065 vpshufd $0x55,%ymm15,%ymm13 1066 vmovdqa %ymm12,384-512(%rax) 1067 vpshufd $0xaa,%ymm15,%ymm14 1068 vmovdqa %ymm13,416-512(%rax) 1069 vpshufd $0xff,%ymm15,%ymm15 1070 vmovdqa %ymm14,448-512(%rax) 1071 vmovdqa %ymm15,480-512(%rax) 1072 1073 vpshufd $0x00,%ymm7,%ymm4 1074 vpshufd $0x55,%ymm7,%ymm5 1075 vpaddd L$incy(%rip),%ymm4,%ymm4 1076 vpshufd $0xaa,%ymm7,%ymm6 1077 vmovdqa %ymm5,544-512(%rax) 1078 vpshufd $0xff,%ymm7,%ymm7 1079 vmovdqa %ymm6,576-512(%rax) 1080 vmovdqa %ymm7,608-512(%rax) 1081 1082 jmp L$oop_enter8x 1083 1084.p2align 5 1085L$oop_outer8x: 1086 vmovdqa 128-256(%rcx),%ymm8 1087 vmovdqa 160-256(%rcx),%ymm9 1088 vmovdqa 192-256(%rcx),%ymm10 1089 vmovdqa 224-256(%rcx),%ymm11 1090 vmovdqa 256-256(%rcx),%ymm0 1091 vmovdqa 288-256(%rcx),%ymm1 1092 vmovdqa 320-256(%rcx),%ymm2 1093 vmovdqa 352-256(%rcx),%ymm3 1094 vmovdqa 384-512(%rax),%ymm12 1095 vmovdqa 416-512(%rax),%ymm13 1096 vmovdqa 448-512(%rax),%ymm14 1097 vmovdqa 480-512(%rax),%ymm15 1098 vmovdqa 512-512(%rax),%ymm4 1099 vmovdqa 544-512(%rax),%ymm5 1100 vmovdqa 576-512(%rax),%ymm6 1101 vmovdqa 608-512(%rax),%ymm7 1102 vpaddd L$eight(%rip),%ymm4,%ymm4 1103 1104L$oop_enter8x: 1105 vmovdqa %ymm14,64(%rsp) 1106 vmovdqa %ymm15,96(%rsp) 1107 vbroadcasti128 (%r10),%ymm15 1108 vmovdqa %ymm4,512-512(%rax) 1109 movl $10,%eax 1110 jmp L$oop8x 1111 1112.p2align 5 1113L$oop8x: 1114 vpaddd %ymm0,%ymm8,%ymm8 1115 vpxor %ymm4,%ymm8,%ymm4 1116 vpshufb %ymm15,%ymm4,%ymm4 1117 vpaddd %ymm1,%ymm9,%ymm9 1118 vpxor %ymm5,%ymm9,%ymm5 1119 vpshufb %ymm15,%ymm5,%ymm5 1120 vpaddd %ymm4,%ymm12,%ymm12 1121 vpxor %ymm0,%ymm12,%ymm0 1122 vpslld $12,%ymm0,%ymm14 1123 vpsrld $20,%ymm0,%ymm0 1124 vpor %ymm0,%ymm14,%ymm0 1125 vbroadcasti128 (%r11),%ymm14 1126 vpaddd %ymm5,%ymm13,%ymm13 1127 vpxor %ymm1,%ymm13,%ymm1 1128 vpslld $12,%ymm1,%ymm15 1129 vpsrld $20,%ymm1,%ymm1 1130 vpor %ymm1,%ymm15,%ymm1 1131 vpaddd %ymm0,%ymm8,%ymm8 1132 vpxor %ymm4,%ymm8,%ymm4 1133 vpshufb %ymm14,%ymm4,%ymm4 1134 vpaddd %ymm1,%ymm9,%ymm9 1135 vpxor %ymm5,%ymm9,%ymm5 1136 vpshufb %ymm14,%ymm5,%ymm5 1137 vpaddd %ymm4,%ymm12,%ymm12 1138 vpxor %ymm0,%ymm12,%ymm0 1139 vpslld $7,%ymm0,%ymm15 1140 vpsrld $25,%ymm0,%ymm0 1141 vpor %ymm0,%ymm15,%ymm0 1142 vbroadcasti128 (%r10),%ymm15 1143 vpaddd %ymm5,%ymm13,%ymm13 1144 vpxor %ymm1,%ymm13,%ymm1 1145 vpslld $7,%ymm1,%ymm14 1146 vpsrld $25,%ymm1,%ymm1 1147 vpor %ymm1,%ymm14,%ymm1 1148 vmovdqa %ymm12,0(%rsp) 1149 vmovdqa %ymm13,32(%rsp) 1150 vmovdqa 64(%rsp),%ymm12 1151 vmovdqa 96(%rsp),%ymm13 1152 vpaddd %ymm2,%ymm10,%ymm10 1153 vpxor %ymm6,%ymm10,%ymm6 1154 vpshufb %ymm15,%ymm6,%ymm6 1155 vpaddd %ymm3,%ymm11,%ymm11 1156 vpxor %ymm7,%ymm11,%ymm7 1157 vpshufb %ymm15,%ymm7,%ymm7 1158 vpaddd %ymm6,%ymm12,%ymm12 1159 vpxor %ymm2,%ymm12,%ymm2 1160 vpslld $12,%ymm2,%ymm14 1161 vpsrld $20,%ymm2,%ymm2 1162 vpor %ymm2,%ymm14,%ymm2 1163 vbroadcasti128 (%r11),%ymm14 1164 vpaddd %ymm7,%ymm13,%ymm13 1165 vpxor %ymm3,%ymm13,%ymm3 1166 vpslld $12,%ymm3,%ymm15 1167 vpsrld $20,%ymm3,%ymm3 1168 vpor %ymm3,%ymm15,%ymm3 1169 vpaddd %ymm2,%ymm10,%ymm10 1170 vpxor %ymm6,%ymm10,%ymm6 1171 vpshufb %ymm14,%ymm6,%ymm6 1172 vpaddd %ymm3,%ymm11,%ymm11 1173 vpxor %ymm7,%ymm11,%ymm7 1174 vpshufb %ymm14,%ymm7,%ymm7 1175 vpaddd %ymm6,%ymm12,%ymm12 1176 vpxor %ymm2,%ymm12,%ymm2 1177 vpslld $7,%ymm2,%ymm15 1178 vpsrld $25,%ymm2,%ymm2 1179 vpor %ymm2,%ymm15,%ymm2 1180 vbroadcasti128 (%r10),%ymm15 1181 vpaddd %ymm7,%ymm13,%ymm13 1182 vpxor %ymm3,%ymm13,%ymm3 1183 vpslld $7,%ymm3,%ymm14 1184 vpsrld $25,%ymm3,%ymm3 1185 vpor %ymm3,%ymm14,%ymm3 1186 vpaddd %ymm1,%ymm8,%ymm8 1187 vpxor %ymm7,%ymm8,%ymm7 1188 vpshufb %ymm15,%ymm7,%ymm7 1189 vpaddd %ymm2,%ymm9,%ymm9 1190 vpxor %ymm4,%ymm9,%ymm4 1191 vpshufb %ymm15,%ymm4,%ymm4 1192 vpaddd %ymm7,%ymm12,%ymm12 1193 vpxor %ymm1,%ymm12,%ymm1 1194 vpslld $12,%ymm1,%ymm14 1195 vpsrld $20,%ymm1,%ymm1 1196 vpor %ymm1,%ymm14,%ymm1 1197 vbroadcasti128 (%r11),%ymm14 1198 vpaddd %ymm4,%ymm13,%ymm13 1199 vpxor %ymm2,%ymm13,%ymm2 1200 vpslld $12,%ymm2,%ymm15 1201 vpsrld $20,%ymm2,%ymm2 1202 vpor %ymm2,%ymm15,%ymm2 1203 vpaddd %ymm1,%ymm8,%ymm8 1204 vpxor %ymm7,%ymm8,%ymm7 1205 vpshufb %ymm14,%ymm7,%ymm7 1206 vpaddd %ymm2,%ymm9,%ymm9 1207 vpxor %ymm4,%ymm9,%ymm4 1208 vpshufb %ymm14,%ymm4,%ymm4 1209 vpaddd %ymm7,%ymm12,%ymm12 1210 vpxor %ymm1,%ymm12,%ymm1 1211 vpslld $7,%ymm1,%ymm15 1212 vpsrld $25,%ymm1,%ymm1 1213 vpor %ymm1,%ymm15,%ymm1 1214 vbroadcasti128 (%r10),%ymm15 1215 vpaddd %ymm4,%ymm13,%ymm13 1216 vpxor %ymm2,%ymm13,%ymm2 1217 vpslld $7,%ymm2,%ymm14 1218 vpsrld $25,%ymm2,%ymm2 1219 vpor %ymm2,%ymm14,%ymm2 1220 vmovdqa %ymm12,64(%rsp) 1221 vmovdqa %ymm13,96(%rsp) 1222 vmovdqa 0(%rsp),%ymm12 1223 vmovdqa 32(%rsp),%ymm13 1224 vpaddd %ymm3,%ymm10,%ymm10 1225 vpxor %ymm5,%ymm10,%ymm5 1226 vpshufb %ymm15,%ymm5,%ymm5 1227 vpaddd %ymm0,%ymm11,%ymm11 1228 vpxor %ymm6,%ymm11,%ymm6 1229 vpshufb %ymm15,%ymm6,%ymm6 1230 vpaddd %ymm5,%ymm12,%ymm12 1231 vpxor %ymm3,%ymm12,%ymm3 1232 vpslld $12,%ymm3,%ymm14 1233 vpsrld $20,%ymm3,%ymm3 1234 vpor %ymm3,%ymm14,%ymm3 1235 vbroadcasti128 (%r11),%ymm14 1236 vpaddd %ymm6,%ymm13,%ymm13 1237 vpxor %ymm0,%ymm13,%ymm0 1238 vpslld $12,%ymm0,%ymm15 1239 vpsrld $20,%ymm0,%ymm0 1240 vpor %ymm0,%ymm15,%ymm0 1241 vpaddd %ymm3,%ymm10,%ymm10 1242 vpxor %ymm5,%ymm10,%ymm5 1243 vpshufb %ymm14,%ymm5,%ymm5 1244 vpaddd %ymm0,%ymm11,%ymm11 1245 vpxor %ymm6,%ymm11,%ymm6 1246 vpshufb %ymm14,%ymm6,%ymm6 1247 vpaddd %ymm5,%ymm12,%ymm12 1248 vpxor %ymm3,%ymm12,%ymm3 1249 vpslld $7,%ymm3,%ymm15 1250 vpsrld $25,%ymm3,%ymm3 1251 vpor %ymm3,%ymm15,%ymm3 1252 vbroadcasti128 (%r10),%ymm15 1253 vpaddd %ymm6,%ymm13,%ymm13 1254 vpxor %ymm0,%ymm13,%ymm0 1255 vpslld $7,%ymm0,%ymm14 1256 vpsrld $25,%ymm0,%ymm0 1257 vpor %ymm0,%ymm14,%ymm0 1258 decl %eax 1259 jnz L$oop8x 1260 1261 leaq 512(%rsp),%rax 1262 vpaddd 128-256(%rcx),%ymm8,%ymm8 1263 vpaddd 160-256(%rcx),%ymm9,%ymm9 1264 vpaddd 192-256(%rcx),%ymm10,%ymm10 1265 vpaddd 224-256(%rcx),%ymm11,%ymm11 1266 1267 vpunpckldq %ymm9,%ymm8,%ymm14 1268 vpunpckldq %ymm11,%ymm10,%ymm15 1269 vpunpckhdq %ymm9,%ymm8,%ymm8 1270 vpunpckhdq %ymm11,%ymm10,%ymm10 1271 vpunpcklqdq %ymm15,%ymm14,%ymm9 1272 vpunpckhqdq %ymm15,%ymm14,%ymm14 1273 vpunpcklqdq %ymm10,%ymm8,%ymm11 1274 vpunpckhqdq %ymm10,%ymm8,%ymm8 1275 vpaddd 256-256(%rcx),%ymm0,%ymm0 1276 vpaddd 288-256(%rcx),%ymm1,%ymm1 1277 vpaddd 320-256(%rcx),%ymm2,%ymm2 1278 vpaddd 352-256(%rcx),%ymm3,%ymm3 1279 1280 vpunpckldq %ymm1,%ymm0,%ymm10 1281 vpunpckldq %ymm3,%ymm2,%ymm15 1282 vpunpckhdq %ymm1,%ymm0,%ymm0 1283 vpunpckhdq %ymm3,%ymm2,%ymm2 1284 vpunpcklqdq %ymm15,%ymm10,%ymm1 1285 vpunpckhqdq %ymm15,%ymm10,%ymm10 1286 vpunpcklqdq %ymm2,%ymm0,%ymm3 1287 vpunpckhqdq %ymm2,%ymm0,%ymm0 1288 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 1289 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 1290 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 1291 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 1292 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 1293 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 1294 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 1295 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 1296 vmovdqa %ymm15,0(%rsp) 1297 vmovdqa %ymm9,32(%rsp) 1298 vmovdqa 64(%rsp),%ymm15 1299 vmovdqa 96(%rsp),%ymm9 1300 1301 vpaddd 384-512(%rax),%ymm12,%ymm12 1302 vpaddd 416-512(%rax),%ymm13,%ymm13 1303 vpaddd 448-512(%rax),%ymm15,%ymm15 1304 vpaddd 480-512(%rax),%ymm9,%ymm9 1305 1306 vpunpckldq %ymm13,%ymm12,%ymm2 1307 vpunpckldq %ymm9,%ymm15,%ymm8 1308 vpunpckhdq %ymm13,%ymm12,%ymm12 1309 vpunpckhdq %ymm9,%ymm15,%ymm15 1310 vpunpcklqdq %ymm8,%ymm2,%ymm13 1311 vpunpckhqdq %ymm8,%ymm2,%ymm2 1312 vpunpcklqdq %ymm15,%ymm12,%ymm9 1313 vpunpckhqdq %ymm15,%ymm12,%ymm12 1314 vpaddd 512-512(%rax),%ymm4,%ymm4 1315 vpaddd 544-512(%rax),%ymm5,%ymm5 1316 vpaddd 576-512(%rax),%ymm6,%ymm6 1317 vpaddd 608-512(%rax),%ymm7,%ymm7 1318 1319 vpunpckldq %ymm5,%ymm4,%ymm15 1320 vpunpckldq %ymm7,%ymm6,%ymm8 1321 vpunpckhdq %ymm5,%ymm4,%ymm4 1322 vpunpckhdq %ymm7,%ymm6,%ymm6 1323 vpunpcklqdq %ymm8,%ymm15,%ymm5 1324 vpunpckhqdq %ymm8,%ymm15,%ymm15 1325 vpunpcklqdq %ymm6,%ymm4,%ymm7 1326 vpunpckhqdq %ymm6,%ymm4,%ymm4 1327 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 1328 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 1329 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 1330 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 1331 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 1332 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 1333 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 1334 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 1335 vmovdqa 0(%rsp),%ymm6 1336 vmovdqa 32(%rsp),%ymm12 1337 1338 cmpq $512,%rdx 1339 jb L$tail8x 1340 1341 vpxor 0(%rsi),%ymm6,%ymm6 1342 vpxor 32(%rsi),%ymm8,%ymm8 1343 vpxor 64(%rsi),%ymm1,%ymm1 1344 vpxor 96(%rsi),%ymm5,%ymm5 1345 leaq 128(%rsi),%rsi 1346 vmovdqu %ymm6,0(%rdi) 1347 vmovdqu %ymm8,32(%rdi) 1348 vmovdqu %ymm1,64(%rdi) 1349 vmovdqu %ymm5,96(%rdi) 1350 leaq 128(%rdi),%rdi 1351 1352 vpxor 0(%rsi),%ymm12,%ymm12 1353 vpxor 32(%rsi),%ymm13,%ymm13 1354 vpxor 64(%rsi),%ymm10,%ymm10 1355 vpxor 96(%rsi),%ymm15,%ymm15 1356 leaq 128(%rsi),%rsi 1357 vmovdqu %ymm12,0(%rdi) 1358 vmovdqu %ymm13,32(%rdi) 1359 vmovdqu %ymm10,64(%rdi) 1360 vmovdqu %ymm15,96(%rdi) 1361 leaq 128(%rdi),%rdi 1362 1363 vpxor 0(%rsi),%ymm14,%ymm14 1364 vpxor 32(%rsi),%ymm2,%ymm2 1365 vpxor 64(%rsi),%ymm3,%ymm3 1366 vpxor 96(%rsi),%ymm7,%ymm7 1367 leaq 128(%rsi),%rsi 1368 vmovdqu %ymm14,0(%rdi) 1369 vmovdqu %ymm2,32(%rdi) 1370 vmovdqu %ymm3,64(%rdi) 1371 vmovdqu %ymm7,96(%rdi) 1372 leaq 128(%rdi),%rdi 1373 1374 vpxor 0(%rsi),%ymm11,%ymm11 1375 vpxor 32(%rsi),%ymm9,%ymm9 1376 vpxor 64(%rsi),%ymm0,%ymm0 1377 vpxor 96(%rsi),%ymm4,%ymm4 1378 leaq 128(%rsi),%rsi 1379 vmovdqu %ymm11,0(%rdi) 1380 vmovdqu %ymm9,32(%rdi) 1381 vmovdqu %ymm0,64(%rdi) 1382 vmovdqu %ymm4,96(%rdi) 1383 leaq 128(%rdi),%rdi 1384 1385 subq $512,%rdx 1386 jnz L$oop_outer8x 1387 1388 jmp L$done8x 1389 1390L$tail8x: 1391 cmpq $448,%rdx 1392 jae L$448_or_more8x 1393 cmpq $384,%rdx 1394 jae L$384_or_more8x 1395 cmpq $320,%rdx 1396 jae L$320_or_more8x 1397 cmpq $256,%rdx 1398 jae L$256_or_more8x 1399 cmpq $192,%rdx 1400 jae L$192_or_more8x 1401 cmpq $128,%rdx 1402 jae L$128_or_more8x 1403 cmpq $64,%rdx 1404 jae L$64_or_more8x 1405 1406 xorq %r10,%r10 1407 vmovdqa %ymm6,0(%rsp) 1408 vmovdqa %ymm8,32(%rsp) 1409 jmp L$oop_tail8x 1410 1411.p2align 5 1412L$64_or_more8x: 1413 vpxor 0(%rsi),%ymm6,%ymm6 1414 vpxor 32(%rsi),%ymm8,%ymm8 1415 vmovdqu %ymm6,0(%rdi) 1416 vmovdqu %ymm8,32(%rdi) 1417 je L$done8x 1418 1419 leaq 64(%rsi),%rsi 1420 xorq %r10,%r10 1421 vmovdqa %ymm1,0(%rsp) 1422 leaq 64(%rdi),%rdi 1423 subq $64,%rdx 1424 vmovdqa %ymm5,32(%rsp) 1425 jmp L$oop_tail8x 1426 1427.p2align 5 1428L$128_or_more8x: 1429 vpxor 0(%rsi),%ymm6,%ymm6 1430 vpxor 32(%rsi),%ymm8,%ymm8 1431 vpxor 64(%rsi),%ymm1,%ymm1 1432 vpxor 96(%rsi),%ymm5,%ymm5 1433 vmovdqu %ymm6,0(%rdi) 1434 vmovdqu %ymm8,32(%rdi) 1435 vmovdqu %ymm1,64(%rdi) 1436 vmovdqu %ymm5,96(%rdi) 1437 je L$done8x 1438 1439 leaq 128(%rsi),%rsi 1440 xorq %r10,%r10 1441 vmovdqa %ymm12,0(%rsp) 1442 leaq 128(%rdi),%rdi 1443 subq $128,%rdx 1444 vmovdqa %ymm13,32(%rsp) 1445 jmp L$oop_tail8x 1446 1447.p2align 5 1448L$192_or_more8x: 1449 vpxor 0(%rsi),%ymm6,%ymm6 1450 vpxor 32(%rsi),%ymm8,%ymm8 1451 vpxor 64(%rsi),%ymm1,%ymm1 1452 vpxor 96(%rsi),%ymm5,%ymm5 1453 vpxor 128(%rsi),%ymm12,%ymm12 1454 vpxor 160(%rsi),%ymm13,%ymm13 1455 vmovdqu %ymm6,0(%rdi) 1456 vmovdqu %ymm8,32(%rdi) 1457 vmovdqu %ymm1,64(%rdi) 1458 vmovdqu %ymm5,96(%rdi) 1459 vmovdqu %ymm12,128(%rdi) 1460 vmovdqu %ymm13,160(%rdi) 1461 je L$done8x 1462 1463 leaq 192(%rsi),%rsi 1464 xorq %r10,%r10 1465 vmovdqa %ymm10,0(%rsp) 1466 leaq 192(%rdi),%rdi 1467 subq $192,%rdx 1468 vmovdqa %ymm15,32(%rsp) 1469 jmp L$oop_tail8x 1470 1471.p2align 5 1472L$256_or_more8x: 1473 vpxor 0(%rsi),%ymm6,%ymm6 1474 vpxor 32(%rsi),%ymm8,%ymm8 1475 vpxor 64(%rsi),%ymm1,%ymm1 1476 vpxor 96(%rsi),%ymm5,%ymm5 1477 vpxor 128(%rsi),%ymm12,%ymm12 1478 vpxor 160(%rsi),%ymm13,%ymm13 1479 vpxor 192(%rsi),%ymm10,%ymm10 1480 vpxor 224(%rsi),%ymm15,%ymm15 1481 vmovdqu %ymm6,0(%rdi) 1482 vmovdqu %ymm8,32(%rdi) 1483 vmovdqu %ymm1,64(%rdi) 1484 vmovdqu %ymm5,96(%rdi) 1485 vmovdqu %ymm12,128(%rdi) 1486 vmovdqu %ymm13,160(%rdi) 1487 vmovdqu %ymm10,192(%rdi) 1488 vmovdqu %ymm15,224(%rdi) 1489 je L$done8x 1490 1491 leaq 256(%rsi),%rsi 1492 xorq %r10,%r10 1493 vmovdqa %ymm14,0(%rsp) 1494 leaq 256(%rdi),%rdi 1495 subq $256,%rdx 1496 vmovdqa %ymm2,32(%rsp) 1497 jmp L$oop_tail8x 1498 1499.p2align 5 1500L$320_or_more8x: 1501 vpxor 0(%rsi),%ymm6,%ymm6 1502 vpxor 32(%rsi),%ymm8,%ymm8 1503 vpxor 64(%rsi),%ymm1,%ymm1 1504 vpxor 96(%rsi),%ymm5,%ymm5 1505 vpxor 128(%rsi),%ymm12,%ymm12 1506 vpxor 160(%rsi),%ymm13,%ymm13 1507 vpxor 192(%rsi),%ymm10,%ymm10 1508 vpxor 224(%rsi),%ymm15,%ymm15 1509 vpxor 256(%rsi),%ymm14,%ymm14 1510 vpxor 288(%rsi),%ymm2,%ymm2 1511 vmovdqu %ymm6,0(%rdi) 1512 vmovdqu %ymm8,32(%rdi) 1513 vmovdqu %ymm1,64(%rdi) 1514 vmovdqu %ymm5,96(%rdi) 1515 vmovdqu %ymm12,128(%rdi) 1516 vmovdqu %ymm13,160(%rdi) 1517 vmovdqu %ymm10,192(%rdi) 1518 vmovdqu %ymm15,224(%rdi) 1519 vmovdqu %ymm14,256(%rdi) 1520 vmovdqu %ymm2,288(%rdi) 1521 je L$done8x 1522 1523 leaq 320(%rsi),%rsi 1524 xorq %r10,%r10 1525 vmovdqa %ymm3,0(%rsp) 1526 leaq 320(%rdi),%rdi 1527 subq $320,%rdx 1528 vmovdqa %ymm7,32(%rsp) 1529 jmp L$oop_tail8x 1530 1531.p2align 5 1532L$384_or_more8x: 1533 vpxor 0(%rsi),%ymm6,%ymm6 1534 vpxor 32(%rsi),%ymm8,%ymm8 1535 vpxor 64(%rsi),%ymm1,%ymm1 1536 vpxor 96(%rsi),%ymm5,%ymm5 1537 vpxor 128(%rsi),%ymm12,%ymm12 1538 vpxor 160(%rsi),%ymm13,%ymm13 1539 vpxor 192(%rsi),%ymm10,%ymm10 1540 vpxor 224(%rsi),%ymm15,%ymm15 1541 vpxor 256(%rsi),%ymm14,%ymm14 1542 vpxor 288(%rsi),%ymm2,%ymm2 1543 vpxor 320(%rsi),%ymm3,%ymm3 1544 vpxor 352(%rsi),%ymm7,%ymm7 1545 vmovdqu %ymm6,0(%rdi) 1546 vmovdqu %ymm8,32(%rdi) 1547 vmovdqu %ymm1,64(%rdi) 1548 vmovdqu %ymm5,96(%rdi) 1549 vmovdqu %ymm12,128(%rdi) 1550 vmovdqu %ymm13,160(%rdi) 1551 vmovdqu %ymm10,192(%rdi) 1552 vmovdqu %ymm15,224(%rdi) 1553 vmovdqu %ymm14,256(%rdi) 1554 vmovdqu %ymm2,288(%rdi) 1555 vmovdqu %ymm3,320(%rdi) 1556 vmovdqu %ymm7,352(%rdi) 1557 je L$done8x 1558 1559 leaq 384(%rsi),%rsi 1560 xorq %r10,%r10 1561 vmovdqa %ymm11,0(%rsp) 1562 leaq 384(%rdi),%rdi 1563 subq $384,%rdx 1564 vmovdqa %ymm9,32(%rsp) 1565 jmp L$oop_tail8x 1566 1567.p2align 5 1568L$448_or_more8x: 1569 vpxor 0(%rsi),%ymm6,%ymm6 1570 vpxor 32(%rsi),%ymm8,%ymm8 1571 vpxor 64(%rsi),%ymm1,%ymm1 1572 vpxor 96(%rsi),%ymm5,%ymm5 1573 vpxor 128(%rsi),%ymm12,%ymm12 1574 vpxor 160(%rsi),%ymm13,%ymm13 1575 vpxor 192(%rsi),%ymm10,%ymm10 1576 vpxor 224(%rsi),%ymm15,%ymm15 1577 vpxor 256(%rsi),%ymm14,%ymm14 1578 vpxor 288(%rsi),%ymm2,%ymm2 1579 vpxor 320(%rsi),%ymm3,%ymm3 1580 vpxor 352(%rsi),%ymm7,%ymm7 1581 vpxor 384(%rsi),%ymm11,%ymm11 1582 vpxor 416(%rsi),%ymm9,%ymm9 1583 vmovdqu %ymm6,0(%rdi) 1584 vmovdqu %ymm8,32(%rdi) 1585 vmovdqu %ymm1,64(%rdi) 1586 vmovdqu %ymm5,96(%rdi) 1587 vmovdqu %ymm12,128(%rdi) 1588 vmovdqu %ymm13,160(%rdi) 1589 vmovdqu %ymm10,192(%rdi) 1590 vmovdqu %ymm15,224(%rdi) 1591 vmovdqu %ymm14,256(%rdi) 1592 vmovdqu %ymm2,288(%rdi) 1593 vmovdqu %ymm3,320(%rdi) 1594 vmovdqu %ymm7,352(%rdi) 1595 vmovdqu %ymm11,384(%rdi) 1596 vmovdqu %ymm9,416(%rdi) 1597 je L$done8x 1598 1599 leaq 448(%rsi),%rsi 1600 xorq %r10,%r10 1601 vmovdqa %ymm0,0(%rsp) 1602 leaq 448(%rdi),%rdi 1603 subq $448,%rdx 1604 vmovdqa %ymm4,32(%rsp) 1605 1606L$oop_tail8x: 1607 movzbl (%rsi,%r10,1),%eax 1608 movzbl (%rsp,%r10,1),%ecx 1609 leaq 1(%r10),%r10 1610 xorl %ecx,%eax 1611 movb %al,-1(%rdi,%r10,1) 1612 decq %rdx 1613 jnz L$oop_tail8x 1614 1615L$done8x: 1616 vzeroall 1617 leaq (%r9),%rsp 1618 1619L$8x_epilogue: 1620 .byte 0xf3,0xc3 1621 1622 1623#endif 1624