1# This file is generated from a similarly-named Perl script in the BoringSSL 2# source tree. Do not edit by hand. 3 4#if defined(__has_feature) 5#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) 6#define OPENSSL_NO_ASM 7#endif 8#endif 9 10#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) 11#if defined(BORINGSSL_PREFIX) 12#include <boringssl_prefix_symbols_asm.h> 13#endif 14.text 15 16 17 18.p2align 6 19L$zero: 20.long 0,0,0,0 21L$one: 22.long 1,0,0,0 23L$inc: 24.long 0,1,2,3 25L$four: 26.long 4,4,4,4 27L$incy: 28.long 0,2,4,6,1,3,5,7 29L$eight: 30.long 8,8,8,8,8,8,8,8 31L$rot16: 32.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd 33L$rot24: 34.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe 35L$sigma: 36.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 37.p2align 6 38L$zeroz: 39.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 40L$fourz: 41.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 42L$incz: 43.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 44L$sixteen: 45.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 46.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 47.globl _ChaCha20_ctr32 48.private_extern _ChaCha20_ctr32 49 50.p2align 6 51_ChaCha20_ctr32: 52 53 cmpq $0,%rdx 54 je L$no_data 55 movq _OPENSSL_ia32cap_P+4(%rip),%r10 56 testl $512,%r10d 57 jnz L$ChaCha20_ssse3 58 59 pushq %rbx 60 61 pushq %rbp 62 63 pushq %r12 64 65 pushq %r13 66 67 pushq %r14 68 69 pushq %r15 70 71 subq $64+24,%rsp 72 73L$ctr32_body: 74 75 76 movdqu (%rcx),%xmm1 77 movdqu 16(%rcx),%xmm2 78 movdqu (%r8),%xmm3 79 movdqa L$one(%rip),%xmm4 80 81 82 movdqa %xmm1,16(%rsp) 83 movdqa %xmm2,32(%rsp) 84 movdqa %xmm3,48(%rsp) 85 movq %rdx,%rbp 86 jmp L$oop_outer 87 88.p2align 5 89L$oop_outer: 90 movl $0x61707865,%eax 91 movl $0x3320646e,%ebx 92 movl $0x79622d32,%ecx 93 movl $0x6b206574,%edx 94 movl 
16(%rsp),%r8d 95 movl 20(%rsp),%r9d 96 movl 24(%rsp),%r10d 97 movl 28(%rsp),%r11d 98 movd %xmm3,%r12d 99 movl 52(%rsp),%r13d 100 movl 56(%rsp),%r14d 101 movl 60(%rsp),%r15d 102 103 movq %rbp,64+0(%rsp) 104 movl $10,%ebp 105 movq %rsi,64+8(%rsp) 106.byte 102,72,15,126,214 107 movq %rdi,64+16(%rsp) 108 movq %rsi,%rdi 109 shrq $32,%rdi 110 jmp L$oop 111 112.p2align 5 113L$oop: 114 addl %r8d,%eax 115 xorl %eax,%r12d 116 roll $16,%r12d 117 addl %r9d,%ebx 118 xorl %ebx,%r13d 119 roll $16,%r13d 120 addl %r12d,%esi 121 xorl %esi,%r8d 122 roll $12,%r8d 123 addl %r13d,%edi 124 xorl %edi,%r9d 125 roll $12,%r9d 126 addl %r8d,%eax 127 xorl %eax,%r12d 128 roll $8,%r12d 129 addl %r9d,%ebx 130 xorl %ebx,%r13d 131 roll $8,%r13d 132 addl %r12d,%esi 133 xorl %esi,%r8d 134 roll $7,%r8d 135 addl %r13d,%edi 136 xorl %edi,%r9d 137 roll $7,%r9d 138 movl %esi,32(%rsp) 139 movl %edi,36(%rsp) 140 movl 40(%rsp),%esi 141 movl 44(%rsp),%edi 142 addl %r10d,%ecx 143 xorl %ecx,%r14d 144 roll $16,%r14d 145 addl %r11d,%edx 146 xorl %edx,%r15d 147 roll $16,%r15d 148 addl %r14d,%esi 149 xorl %esi,%r10d 150 roll $12,%r10d 151 addl %r15d,%edi 152 xorl %edi,%r11d 153 roll $12,%r11d 154 addl %r10d,%ecx 155 xorl %ecx,%r14d 156 roll $8,%r14d 157 addl %r11d,%edx 158 xorl %edx,%r15d 159 roll $8,%r15d 160 addl %r14d,%esi 161 xorl %esi,%r10d 162 roll $7,%r10d 163 addl %r15d,%edi 164 xorl %edi,%r11d 165 roll $7,%r11d 166 addl %r9d,%eax 167 xorl %eax,%r15d 168 roll $16,%r15d 169 addl %r10d,%ebx 170 xorl %ebx,%r12d 171 roll $16,%r12d 172 addl %r15d,%esi 173 xorl %esi,%r9d 174 roll $12,%r9d 175 addl %r12d,%edi 176 xorl %edi,%r10d 177 roll $12,%r10d 178 addl %r9d,%eax 179 xorl %eax,%r15d 180 roll $8,%r15d 181 addl %r10d,%ebx 182 xorl %ebx,%r12d 183 roll $8,%r12d 184 addl %r15d,%esi 185 xorl %esi,%r9d 186 roll $7,%r9d 187 addl %r12d,%edi 188 xorl %edi,%r10d 189 roll $7,%r10d 190 movl %esi,40(%rsp) 191 movl %edi,44(%rsp) 192 movl 32(%rsp),%esi 193 movl 36(%rsp),%edi 194 addl %r11d,%ecx 195 xorl %ecx,%r13d 196 roll 
$16,%r13d 197 addl %r8d,%edx 198 xorl %edx,%r14d 199 roll $16,%r14d 200 addl %r13d,%esi 201 xorl %esi,%r11d 202 roll $12,%r11d 203 addl %r14d,%edi 204 xorl %edi,%r8d 205 roll $12,%r8d 206 addl %r11d,%ecx 207 xorl %ecx,%r13d 208 roll $8,%r13d 209 addl %r8d,%edx 210 xorl %edx,%r14d 211 roll $8,%r14d 212 addl %r13d,%esi 213 xorl %esi,%r11d 214 roll $7,%r11d 215 addl %r14d,%edi 216 xorl %edi,%r8d 217 roll $7,%r8d 218 decl %ebp 219 jnz L$oop 220 movl %edi,36(%rsp) 221 movl %esi,32(%rsp) 222 movq 64(%rsp),%rbp 223 movdqa %xmm2,%xmm1 224 movq 64+8(%rsp),%rsi 225 paddd %xmm4,%xmm3 226 movq 64+16(%rsp),%rdi 227 228 addl $0x61707865,%eax 229 addl $0x3320646e,%ebx 230 addl $0x79622d32,%ecx 231 addl $0x6b206574,%edx 232 addl 16(%rsp),%r8d 233 addl 20(%rsp),%r9d 234 addl 24(%rsp),%r10d 235 addl 28(%rsp),%r11d 236 addl 48(%rsp),%r12d 237 addl 52(%rsp),%r13d 238 addl 56(%rsp),%r14d 239 addl 60(%rsp),%r15d 240 paddd 32(%rsp),%xmm1 241 242 cmpq $64,%rbp 243 jb L$tail 244 245 xorl 0(%rsi),%eax 246 xorl 4(%rsi),%ebx 247 xorl 8(%rsi),%ecx 248 xorl 12(%rsi),%edx 249 xorl 16(%rsi),%r8d 250 xorl 20(%rsi),%r9d 251 xorl 24(%rsi),%r10d 252 xorl 28(%rsi),%r11d 253 movdqu 32(%rsi),%xmm0 254 xorl 48(%rsi),%r12d 255 xorl 52(%rsi),%r13d 256 xorl 56(%rsi),%r14d 257 xorl 60(%rsi),%r15d 258 leaq 64(%rsi),%rsi 259 pxor %xmm1,%xmm0 260 261 movdqa %xmm2,32(%rsp) 262 movd %xmm3,48(%rsp) 263 264 movl %eax,0(%rdi) 265 movl %ebx,4(%rdi) 266 movl %ecx,8(%rdi) 267 movl %edx,12(%rdi) 268 movl %r8d,16(%rdi) 269 movl %r9d,20(%rdi) 270 movl %r10d,24(%rdi) 271 movl %r11d,28(%rdi) 272 movdqu %xmm0,32(%rdi) 273 movl %r12d,48(%rdi) 274 movl %r13d,52(%rdi) 275 movl %r14d,56(%rdi) 276 movl %r15d,60(%rdi) 277 leaq 64(%rdi),%rdi 278 279 subq $64,%rbp 280 jnz L$oop_outer 281 282 jmp L$done 283 284.p2align 4 285L$tail: 286 movl %eax,0(%rsp) 287 movl %ebx,4(%rsp) 288 xorq %rbx,%rbx 289 movl %ecx,8(%rsp) 290 movl %edx,12(%rsp) 291 movl %r8d,16(%rsp) 292 movl %r9d,20(%rsp) 293 movl %r10d,24(%rsp) 294 movl %r11d,28(%rsp) 295 
movdqa %xmm1,32(%rsp) 296 movl %r12d,48(%rsp) 297 movl %r13d,52(%rsp) 298 movl %r14d,56(%rsp) 299 movl %r15d,60(%rsp) 300 301L$oop_tail: 302 movzbl (%rsi,%rbx,1),%eax 303 movzbl (%rsp,%rbx,1),%edx 304 leaq 1(%rbx),%rbx 305 xorl %edx,%eax 306 movb %al,-1(%rdi,%rbx,1) 307 decq %rbp 308 jnz L$oop_tail 309 310L$done: 311 leaq 64+24+48(%rsp),%rsi 312 movq -48(%rsi),%r15 313 314 movq -40(%rsi),%r14 315 316 movq -32(%rsi),%r13 317 318 movq -24(%rsi),%r12 319 320 movq -16(%rsi),%rbp 321 322 movq -8(%rsi),%rbx 323 324 leaq (%rsi),%rsp 325 326L$no_data: 327 .byte 0xf3,0xc3 328 329 330 331.p2align 5 332ChaCha20_ssse3: 333L$ChaCha20_ssse3: 334 335 movq %rsp,%r9 336 337 cmpq $128,%rdx 338 ja L$ChaCha20_4x 339 340L$do_sse3_after_all: 341 subq $64+8,%rsp 342 movdqa L$sigma(%rip),%xmm0 343 movdqu (%rcx),%xmm1 344 movdqu 16(%rcx),%xmm2 345 movdqu (%r8),%xmm3 346 movdqa L$rot16(%rip),%xmm6 347 movdqa L$rot24(%rip),%xmm7 348 349 movdqa %xmm0,0(%rsp) 350 movdqa %xmm1,16(%rsp) 351 movdqa %xmm2,32(%rsp) 352 movdqa %xmm3,48(%rsp) 353 movq $10,%r8 354 jmp L$oop_ssse3 355 356.p2align 5 357L$oop_outer_ssse3: 358 movdqa L$one(%rip),%xmm3 359 movdqa 0(%rsp),%xmm0 360 movdqa 16(%rsp),%xmm1 361 movdqa 32(%rsp),%xmm2 362 paddd 48(%rsp),%xmm3 363 movq $10,%r8 364 movdqa %xmm3,48(%rsp) 365 jmp L$oop_ssse3 366 367.p2align 5 368L$oop_ssse3: 369 paddd %xmm1,%xmm0 370 pxor %xmm0,%xmm3 371.byte 102,15,56,0,222 372 paddd %xmm3,%xmm2 373 pxor %xmm2,%xmm1 374 movdqa %xmm1,%xmm4 375 psrld $20,%xmm1 376 pslld $12,%xmm4 377 por %xmm4,%xmm1 378 paddd %xmm1,%xmm0 379 pxor %xmm0,%xmm3 380.byte 102,15,56,0,223 381 paddd %xmm3,%xmm2 382 pxor %xmm2,%xmm1 383 movdqa %xmm1,%xmm4 384 psrld $25,%xmm1 385 pslld $7,%xmm4 386 por %xmm4,%xmm1 387 pshufd $78,%xmm2,%xmm2 388 pshufd $57,%xmm1,%xmm1 389 pshufd $147,%xmm3,%xmm3 390 nop 391 paddd %xmm1,%xmm0 392 pxor %xmm0,%xmm3 393.byte 102,15,56,0,222 394 paddd %xmm3,%xmm2 395 pxor %xmm2,%xmm1 396 movdqa %xmm1,%xmm4 397 psrld $20,%xmm1 398 pslld $12,%xmm4 399 por %xmm4,%xmm1 
400 paddd %xmm1,%xmm0 401 pxor %xmm0,%xmm3 402.byte 102,15,56,0,223 403 paddd %xmm3,%xmm2 404 pxor %xmm2,%xmm1 405 movdqa %xmm1,%xmm4 406 psrld $25,%xmm1 407 pslld $7,%xmm4 408 por %xmm4,%xmm1 409 pshufd $78,%xmm2,%xmm2 410 pshufd $147,%xmm1,%xmm1 411 pshufd $57,%xmm3,%xmm3 412 decq %r8 413 jnz L$oop_ssse3 414 paddd 0(%rsp),%xmm0 415 paddd 16(%rsp),%xmm1 416 paddd 32(%rsp),%xmm2 417 paddd 48(%rsp),%xmm3 418 419 cmpq $64,%rdx 420 jb L$tail_ssse3 421 422 movdqu 0(%rsi),%xmm4 423 movdqu 16(%rsi),%xmm5 424 pxor %xmm4,%xmm0 425 movdqu 32(%rsi),%xmm4 426 pxor %xmm5,%xmm1 427 movdqu 48(%rsi),%xmm5 428 leaq 64(%rsi),%rsi 429 pxor %xmm4,%xmm2 430 pxor %xmm5,%xmm3 431 432 movdqu %xmm0,0(%rdi) 433 movdqu %xmm1,16(%rdi) 434 movdqu %xmm2,32(%rdi) 435 movdqu %xmm3,48(%rdi) 436 leaq 64(%rdi),%rdi 437 438 subq $64,%rdx 439 jnz L$oop_outer_ssse3 440 441 jmp L$done_ssse3 442 443.p2align 4 444L$tail_ssse3: 445 movdqa %xmm0,0(%rsp) 446 movdqa %xmm1,16(%rsp) 447 movdqa %xmm2,32(%rsp) 448 movdqa %xmm3,48(%rsp) 449 xorq %r8,%r8 450 451L$oop_tail_ssse3: 452 movzbl (%rsi,%r8,1),%eax 453 movzbl (%rsp,%r8,1),%ecx 454 leaq 1(%r8),%r8 455 xorl %ecx,%eax 456 movb %al,-1(%rdi,%r8,1) 457 decq %rdx 458 jnz L$oop_tail_ssse3 459 460L$done_ssse3: 461 leaq (%r9),%rsp 462 463L$ssse3_epilogue: 464 .byte 0xf3,0xc3 465 466 467 468.p2align 5 469ChaCha20_4x: 470L$ChaCha20_4x: 471 472 movq %rsp,%r9 473 474 movq %r10,%r11 475 shrq $32,%r10 476 testq $32,%r10 477 jnz L$ChaCha20_8x 478 cmpq $192,%rdx 479 ja L$proceed4x 480 481 andq $71303168,%r11 482 cmpq $4194304,%r11 483 je L$do_sse3_after_all 484 485L$proceed4x: 486 subq $0x140+8,%rsp 487 movdqa L$sigma(%rip),%xmm11 488 movdqu (%rcx),%xmm15 489 movdqu 16(%rcx),%xmm7 490 movdqu (%r8),%xmm3 491 leaq 256(%rsp),%rcx 492 leaq L$rot16(%rip),%r10 493 leaq L$rot24(%rip),%r11 494 495 pshufd $0x00,%xmm11,%xmm8 496 pshufd $0x55,%xmm11,%xmm9 497 movdqa %xmm8,64(%rsp) 498 pshufd $0xaa,%xmm11,%xmm10 499 movdqa %xmm9,80(%rsp) 500 pshufd $0xff,%xmm11,%xmm11 501 movdqa 
%xmm10,96(%rsp) 502 movdqa %xmm11,112(%rsp) 503 504 pshufd $0x00,%xmm15,%xmm12 505 pshufd $0x55,%xmm15,%xmm13 506 movdqa %xmm12,128-256(%rcx) 507 pshufd $0xaa,%xmm15,%xmm14 508 movdqa %xmm13,144-256(%rcx) 509 pshufd $0xff,%xmm15,%xmm15 510 movdqa %xmm14,160-256(%rcx) 511 movdqa %xmm15,176-256(%rcx) 512 513 pshufd $0x00,%xmm7,%xmm4 514 pshufd $0x55,%xmm7,%xmm5 515 movdqa %xmm4,192-256(%rcx) 516 pshufd $0xaa,%xmm7,%xmm6 517 movdqa %xmm5,208-256(%rcx) 518 pshufd $0xff,%xmm7,%xmm7 519 movdqa %xmm6,224-256(%rcx) 520 movdqa %xmm7,240-256(%rcx) 521 522 pshufd $0x00,%xmm3,%xmm0 523 pshufd $0x55,%xmm3,%xmm1 524 paddd L$inc(%rip),%xmm0 525 pshufd $0xaa,%xmm3,%xmm2 526 movdqa %xmm1,272-256(%rcx) 527 pshufd $0xff,%xmm3,%xmm3 528 movdqa %xmm2,288-256(%rcx) 529 movdqa %xmm3,304-256(%rcx) 530 531 jmp L$oop_enter4x 532 533.p2align 5 534L$oop_outer4x: 535 movdqa 64(%rsp),%xmm8 536 movdqa 80(%rsp),%xmm9 537 movdqa 96(%rsp),%xmm10 538 movdqa 112(%rsp),%xmm11 539 movdqa 128-256(%rcx),%xmm12 540 movdqa 144-256(%rcx),%xmm13 541 movdqa 160-256(%rcx),%xmm14 542 movdqa 176-256(%rcx),%xmm15 543 movdqa 192-256(%rcx),%xmm4 544 movdqa 208-256(%rcx),%xmm5 545 movdqa 224-256(%rcx),%xmm6 546 movdqa 240-256(%rcx),%xmm7 547 movdqa 256-256(%rcx),%xmm0 548 movdqa 272-256(%rcx),%xmm1 549 movdqa 288-256(%rcx),%xmm2 550 movdqa 304-256(%rcx),%xmm3 551 paddd L$four(%rip),%xmm0 552 553L$oop_enter4x: 554 movdqa %xmm6,32(%rsp) 555 movdqa %xmm7,48(%rsp) 556 movdqa (%r10),%xmm7 557 movl $10,%eax 558 movdqa %xmm0,256-256(%rcx) 559 jmp L$oop4x 560 561.p2align 5 562L$oop4x: 563 paddd %xmm12,%xmm8 564 paddd %xmm13,%xmm9 565 pxor %xmm8,%xmm0 566 pxor %xmm9,%xmm1 567.byte 102,15,56,0,199 568.byte 102,15,56,0,207 569 paddd %xmm0,%xmm4 570 paddd %xmm1,%xmm5 571 pxor %xmm4,%xmm12 572 pxor %xmm5,%xmm13 573 movdqa %xmm12,%xmm6 574 pslld $12,%xmm12 575 psrld $20,%xmm6 576 movdqa %xmm13,%xmm7 577 pslld $12,%xmm13 578 por %xmm6,%xmm12 579 psrld $20,%xmm7 580 movdqa (%r11),%xmm6 581 por %xmm7,%xmm13 582 paddd %xmm12,%xmm8 
583 paddd %xmm13,%xmm9 584 pxor %xmm8,%xmm0 585 pxor %xmm9,%xmm1 586.byte 102,15,56,0,198 587.byte 102,15,56,0,206 588 paddd %xmm0,%xmm4 589 paddd %xmm1,%xmm5 590 pxor %xmm4,%xmm12 591 pxor %xmm5,%xmm13 592 movdqa %xmm12,%xmm7 593 pslld $7,%xmm12 594 psrld $25,%xmm7 595 movdqa %xmm13,%xmm6 596 pslld $7,%xmm13 597 por %xmm7,%xmm12 598 psrld $25,%xmm6 599 movdqa (%r10),%xmm7 600 por %xmm6,%xmm13 601 movdqa %xmm4,0(%rsp) 602 movdqa %xmm5,16(%rsp) 603 movdqa 32(%rsp),%xmm4 604 movdqa 48(%rsp),%xmm5 605 paddd %xmm14,%xmm10 606 paddd %xmm15,%xmm11 607 pxor %xmm10,%xmm2 608 pxor %xmm11,%xmm3 609.byte 102,15,56,0,215 610.byte 102,15,56,0,223 611 paddd %xmm2,%xmm4 612 paddd %xmm3,%xmm5 613 pxor %xmm4,%xmm14 614 pxor %xmm5,%xmm15 615 movdqa %xmm14,%xmm6 616 pslld $12,%xmm14 617 psrld $20,%xmm6 618 movdqa %xmm15,%xmm7 619 pslld $12,%xmm15 620 por %xmm6,%xmm14 621 psrld $20,%xmm7 622 movdqa (%r11),%xmm6 623 por %xmm7,%xmm15 624 paddd %xmm14,%xmm10 625 paddd %xmm15,%xmm11 626 pxor %xmm10,%xmm2 627 pxor %xmm11,%xmm3 628.byte 102,15,56,0,214 629.byte 102,15,56,0,222 630 paddd %xmm2,%xmm4 631 paddd %xmm3,%xmm5 632 pxor %xmm4,%xmm14 633 pxor %xmm5,%xmm15 634 movdqa %xmm14,%xmm7 635 pslld $7,%xmm14 636 psrld $25,%xmm7 637 movdqa %xmm15,%xmm6 638 pslld $7,%xmm15 639 por %xmm7,%xmm14 640 psrld $25,%xmm6 641 movdqa (%r10),%xmm7 642 por %xmm6,%xmm15 643 paddd %xmm13,%xmm8 644 paddd %xmm14,%xmm9 645 pxor %xmm8,%xmm3 646 pxor %xmm9,%xmm0 647.byte 102,15,56,0,223 648.byte 102,15,56,0,199 649 paddd %xmm3,%xmm4 650 paddd %xmm0,%xmm5 651 pxor %xmm4,%xmm13 652 pxor %xmm5,%xmm14 653 movdqa %xmm13,%xmm6 654 pslld $12,%xmm13 655 psrld $20,%xmm6 656 movdqa %xmm14,%xmm7 657 pslld $12,%xmm14 658 por %xmm6,%xmm13 659 psrld $20,%xmm7 660 movdqa (%r11),%xmm6 661 por %xmm7,%xmm14 662 paddd %xmm13,%xmm8 663 paddd %xmm14,%xmm9 664 pxor %xmm8,%xmm3 665 pxor %xmm9,%xmm0 666.byte 102,15,56,0,222 667.byte 102,15,56,0,198 668 paddd %xmm3,%xmm4 669 paddd %xmm0,%xmm5 670 pxor %xmm4,%xmm13 671 pxor %xmm5,%xmm14 
672 movdqa %xmm13,%xmm7 673 pslld $7,%xmm13 674 psrld $25,%xmm7 675 movdqa %xmm14,%xmm6 676 pslld $7,%xmm14 677 por %xmm7,%xmm13 678 psrld $25,%xmm6 679 movdqa (%r10),%xmm7 680 por %xmm6,%xmm14 681 movdqa %xmm4,32(%rsp) 682 movdqa %xmm5,48(%rsp) 683 movdqa 0(%rsp),%xmm4 684 movdqa 16(%rsp),%xmm5 685 paddd %xmm15,%xmm10 686 paddd %xmm12,%xmm11 687 pxor %xmm10,%xmm1 688 pxor %xmm11,%xmm2 689.byte 102,15,56,0,207 690.byte 102,15,56,0,215 691 paddd %xmm1,%xmm4 692 paddd %xmm2,%xmm5 693 pxor %xmm4,%xmm15 694 pxor %xmm5,%xmm12 695 movdqa %xmm15,%xmm6 696 pslld $12,%xmm15 697 psrld $20,%xmm6 698 movdqa %xmm12,%xmm7 699 pslld $12,%xmm12 700 por %xmm6,%xmm15 701 psrld $20,%xmm7 702 movdqa (%r11),%xmm6 703 por %xmm7,%xmm12 704 paddd %xmm15,%xmm10 705 paddd %xmm12,%xmm11 706 pxor %xmm10,%xmm1 707 pxor %xmm11,%xmm2 708.byte 102,15,56,0,206 709.byte 102,15,56,0,214 710 paddd %xmm1,%xmm4 711 paddd %xmm2,%xmm5 712 pxor %xmm4,%xmm15 713 pxor %xmm5,%xmm12 714 movdqa %xmm15,%xmm7 715 pslld $7,%xmm15 716 psrld $25,%xmm7 717 movdqa %xmm12,%xmm6 718 pslld $7,%xmm12 719 por %xmm7,%xmm15 720 psrld $25,%xmm6 721 movdqa (%r10),%xmm7 722 por %xmm6,%xmm12 723 decl %eax 724 jnz L$oop4x 725 726 paddd 64(%rsp),%xmm8 727 paddd 80(%rsp),%xmm9 728 paddd 96(%rsp),%xmm10 729 paddd 112(%rsp),%xmm11 730 731 movdqa %xmm8,%xmm6 732 punpckldq %xmm9,%xmm8 733 movdqa %xmm10,%xmm7 734 punpckldq %xmm11,%xmm10 735 punpckhdq %xmm9,%xmm6 736 punpckhdq %xmm11,%xmm7 737 movdqa %xmm8,%xmm9 738 punpcklqdq %xmm10,%xmm8 739 movdqa %xmm6,%xmm11 740 punpcklqdq %xmm7,%xmm6 741 punpckhqdq %xmm10,%xmm9 742 punpckhqdq %xmm7,%xmm11 743 paddd 128-256(%rcx),%xmm12 744 paddd 144-256(%rcx),%xmm13 745 paddd 160-256(%rcx),%xmm14 746 paddd 176-256(%rcx),%xmm15 747 748 movdqa %xmm8,0(%rsp) 749 movdqa %xmm9,16(%rsp) 750 movdqa 32(%rsp),%xmm8 751 movdqa 48(%rsp),%xmm9 752 753 movdqa %xmm12,%xmm10 754 punpckldq %xmm13,%xmm12 755 movdqa %xmm14,%xmm7 756 punpckldq %xmm15,%xmm14 757 punpckhdq %xmm13,%xmm10 758 punpckhdq %xmm15,%xmm7 759 
movdqa %xmm12,%xmm13 760 punpcklqdq %xmm14,%xmm12 761 movdqa %xmm10,%xmm15 762 punpcklqdq %xmm7,%xmm10 763 punpckhqdq %xmm14,%xmm13 764 punpckhqdq %xmm7,%xmm15 765 paddd 192-256(%rcx),%xmm4 766 paddd 208-256(%rcx),%xmm5 767 paddd 224-256(%rcx),%xmm8 768 paddd 240-256(%rcx),%xmm9 769 770 movdqa %xmm6,32(%rsp) 771 movdqa %xmm11,48(%rsp) 772 773 movdqa %xmm4,%xmm14 774 punpckldq %xmm5,%xmm4 775 movdqa %xmm8,%xmm7 776 punpckldq %xmm9,%xmm8 777 punpckhdq %xmm5,%xmm14 778 punpckhdq %xmm9,%xmm7 779 movdqa %xmm4,%xmm5 780 punpcklqdq %xmm8,%xmm4 781 movdqa %xmm14,%xmm9 782 punpcklqdq %xmm7,%xmm14 783 punpckhqdq %xmm8,%xmm5 784 punpckhqdq %xmm7,%xmm9 785 paddd 256-256(%rcx),%xmm0 786 paddd 272-256(%rcx),%xmm1 787 paddd 288-256(%rcx),%xmm2 788 paddd 304-256(%rcx),%xmm3 789 790 movdqa %xmm0,%xmm8 791 punpckldq %xmm1,%xmm0 792 movdqa %xmm2,%xmm7 793 punpckldq %xmm3,%xmm2 794 punpckhdq %xmm1,%xmm8 795 punpckhdq %xmm3,%xmm7 796 movdqa %xmm0,%xmm1 797 punpcklqdq %xmm2,%xmm0 798 movdqa %xmm8,%xmm3 799 punpcklqdq %xmm7,%xmm8 800 punpckhqdq %xmm2,%xmm1 801 punpckhqdq %xmm7,%xmm3 802 cmpq $256,%rdx 803 jb L$tail4x 804 805 movdqu 0(%rsi),%xmm6 806 movdqu 16(%rsi),%xmm11 807 movdqu 32(%rsi),%xmm2 808 movdqu 48(%rsi),%xmm7 809 pxor 0(%rsp),%xmm6 810 pxor %xmm12,%xmm11 811 pxor %xmm4,%xmm2 812 pxor %xmm0,%xmm7 813 814 movdqu %xmm6,0(%rdi) 815 movdqu 64(%rsi),%xmm6 816 movdqu %xmm11,16(%rdi) 817 movdqu 80(%rsi),%xmm11 818 movdqu %xmm2,32(%rdi) 819 movdqu 96(%rsi),%xmm2 820 movdqu %xmm7,48(%rdi) 821 movdqu 112(%rsi),%xmm7 822 leaq 128(%rsi),%rsi 823 pxor 16(%rsp),%xmm6 824 pxor %xmm13,%xmm11 825 pxor %xmm5,%xmm2 826 pxor %xmm1,%xmm7 827 828 movdqu %xmm6,64(%rdi) 829 movdqu 0(%rsi),%xmm6 830 movdqu %xmm11,80(%rdi) 831 movdqu 16(%rsi),%xmm11 832 movdqu %xmm2,96(%rdi) 833 movdqu 32(%rsi),%xmm2 834 movdqu %xmm7,112(%rdi) 835 leaq 128(%rdi),%rdi 836 movdqu 48(%rsi),%xmm7 837 pxor 32(%rsp),%xmm6 838 pxor %xmm10,%xmm11 839 pxor %xmm14,%xmm2 840 pxor %xmm8,%xmm7 841 842 movdqu %xmm6,0(%rdi) 843 
movdqu 64(%rsi),%xmm6 844 movdqu %xmm11,16(%rdi) 845 movdqu 80(%rsi),%xmm11 846 movdqu %xmm2,32(%rdi) 847 movdqu 96(%rsi),%xmm2 848 movdqu %xmm7,48(%rdi) 849 movdqu 112(%rsi),%xmm7 850 leaq 128(%rsi),%rsi 851 pxor 48(%rsp),%xmm6 852 pxor %xmm15,%xmm11 853 pxor %xmm9,%xmm2 854 pxor %xmm3,%xmm7 855 movdqu %xmm6,64(%rdi) 856 movdqu %xmm11,80(%rdi) 857 movdqu %xmm2,96(%rdi) 858 movdqu %xmm7,112(%rdi) 859 leaq 128(%rdi),%rdi 860 861 subq $256,%rdx 862 jnz L$oop_outer4x 863 864 jmp L$done4x 865 866L$tail4x: 867 cmpq $192,%rdx 868 jae L$192_or_more4x 869 cmpq $128,%rdx 870 jae L$128_or_more4x 871 cmpq $64,%rdx 872 jae L$64_or_more4x 873 874 875 xorq %r10,%r10 876 877 movdqa %xmm12,16(%rsp) 878 movdqa %xmm4,32(%rsp) 879 movdqa %xmm0,48(%rsp) 880 jmp L$oop_tail4x 881 882.p2align 5 883L$64_or_more4x: 884 movdqu 0(%rsi),%xmm6 885 movdqu 16(%rsi),%xmm11 886 movdqu 32(%rsi),%xmm2 887 movdqu 48(%rsi),%xmm7 888 pxor 0(%rsp),%xmm6 889 pxor %xmm12,%xmm11 890 pxor %xmm4,%xmm2 891 pxor %xmm0,%xmm7 892 movdqu %xmm6,0(%rdi) 893 movdqu %xmm11,16(%rdi) 894 movdqu %xmm2,32(%rdi) 895 movdqu %xmm7,48(%rdi) 896 je L$done4x 897 898 movdqa 16(%rsp),%xmm6 899 leaq 64(%rsi),%rsi 900 xorq %r10,%r10 901 movdqa %xmm6,0(%rsp) 902 movdqa %xmm13,16(%rsp) 903 leaq 64(%rdi),%rdi 904 movdqa %xmm5,32(%rsp) 905 subq $64,%rdx 906 movdqa %xmm1,48(%rsp) 907 jmp L$oop_tail4x 908 909.p2align 5 910L$128_or_more4x: 911 movdqu 0(%rsi),%xmm6 912 movdqu 16(%rsi),%xmm11 913 movdqu 32(%rsi),%xmm2 914 movdqu 48(%rsi),%xmm7 915 pxor 0(%rsp),%xmm6 916 pxor %xmm12,%xmm11 917 pxor %xmm4,%xmm2 918 pxor %xmm0,%xmm7 919 920 movdqu %xmm6,0(%rdi) 921 movdqu 64(%rsi),%xmm6 922 movdqu %xmm11,16(%rdi) 923 movdqu 80(%rsi),%xmm11 924 movdqu %xmm2,32(%rdi) 925 movdqu 96(%rsi),%xmm2 926 movdqu %xmm7,48(%rdi) 927 movdqu 112(%rsi),%xmm7 928 pxor 16(%rsp),%xmm6 929 pxor %xmm13,%xmm11 930 pxor %xmm5,%xmm2 931 pxor %xmm1,%xmm7 932 movdqu %xmm6,64(%rdi) 933 movdqu %xmm11,80(%rdi) 934 movdqu %xmm2,96(%rdi) 935 movdqu %xmm7,112(%rdi) 936 je 
L$done4x 937 938 movdqa 32(%rsp),%xmm6 939 leaq 128(%rsi),%rsi 940 xorq %r10,%r10 941 movdqa %xmm6,0(%rsp) 942 movdqa %xmm10,16(%rsp) 943 leaq 128(%rdi),%rdi 944 movdqa %xmm14,32(%rsp) 945 subq $128,%rdx 946 movdqa %xmm8,48(%rsp) 947 jmp L$oop_tail4x 948 949.p2align 5 950L$192_or_more4x: 951 movdqu 0(%rsi),%xmm6 952 movdqu 16(%rsi),%xmm11 953 movdqu 32(%rsi),%xmm2 954 movdqu 48(%rsi),%xmm7 955 pxor 0(%rsp),%xmm6 956 pxor %xmm12,%xmm11 957 pxor %xmm4,%xmm2 958 pxor %xmm0,%xmm7 959 960 movdqu %xmm6,0(%rdi) 961 movdqu 64(%rsi),%xmm6 962 movdqu %xmm11,16(%rdi) 963 movdqu 80(%rsi),%xmm11 964 movdqu %xmm2,32(%rdi) 965 movdqu 96(%rsi),%xmm2 966 movdqu %xmm7,48(%rdi) 967 movdqu 112(%rsi),%xmm7 968 leaq 128(%rsi),%rsi 969 pxor 16(%rsp),%xmm6 970 pxor %xmm13,%xmm11 971 pxor %xmm5,%xmm2 972 pxor %xmm1,%xmm7 973 974 movdqu %xmm6,64(%rdi) 975 movdqu 0(%rsi),%xmm6 976 movdqu %xmm11,80(%rdi) 977 movdqu 16(%rsi),%xmm11 978 movdqu %xmm2,96(%rdi) 979 movdqu 32(%rsi),%xmm2 980 movdqu %xmm7,112(%rdi) 981 leaq 128(%rdi),%rdi 982 movdqu 48(%rsi),%xmm7 983 pxor 32(%rsp),%xmm6 984 pxor %xmm10,%xmm11 985 pxor %xmm14,%xmm2 986 pxor %xmm8,%xmm7 987 movdqu %xmm6,0(%rdi) 988 movdqu %xmm11,16(%rdi) 989 movdqu %xmm2,32(%rdi) 990 movdqu %xmm7,48(%rdi) 991 je L$done4x 992 993 movdqa 48(%rsp),%xmm6 994 leaq 64(%rsi),%rsi 995 xorq %r10,%r10 996 movdqa %xmm6,0(%rsp) 997 movdqa %xmm15,16(%rsp) 998 leaq 64(%rdi),%rdi 999 movdqa %xmm9,32(%rsp) 1000 subq $192,%rdx 1001 movdqa %xmm3,48(%rsp) 1002 1003L$oop_tail4x: 1004 movzbl (%rsi,%r10,1),%eax 1005 movzbl (%rsp,%r10,1),%ecx 1006 leaq 1(%r10),%r10 1007 xorl %ecx,%eax 1008 movb %al,-1(%rdi,%r10,1) 1009 decq %rdx 1010 jnz L$oop_tail4x 1011 1012L$done4x: 1013 leaq (%r9),%rsp 1014 1015L$4x_epilogue: 1016 .byte 0xf3,0xc3 1017 1018 1019 1020.p2align 5 1021ChaCha20_8x: 1022L$ChaCha20_8x: 1023 1024 movq %rsp,%r9 1025 1026 subq $0x280+8,%rsp 1027 andq $-32,%rsp 1028 vzeroupper 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 vbroadcasti128 
L$sigma(%rip),%ymm11 1040 vbroadcasti128 (%rcx),%ymm3 1041 vbroadcasti128 16(%rcx),%ymm15 1042 vbroadcasti128 (%r8),%ymm7 1043 leaq 256(%rsp),%rcx 1044 leaq 512(%rsp),%rax 1045 leaq L$rot16(%rip),%r10 1046 leaq L$rot24(%rip),%r11 1047 1048 vpshufd $0x00,%ymm11,%ymm8 1049 vpshufd $0x55,%ymm11,%ymm9 1050 vmovdqa %ymm8,128-256(%rcx) 1051 vpshufd $0xaa,%ymm11,%ymm10 1052 vmovdqa %ymm9,160-256(%rcx) 1053 vpshufd $0xff,%ymm11,%ymm11 1054 vmovdqa %ymm10,192-256(%rcx) 1055 vmovdqa %ymm11,224-256(%rcx) 1056 1057 vpshufd $0x00,%ymm3,%ymm0 1058 vpshufd $0x55,%ymm3,%ymm1 1059 vmovdqa %ymm0,256-256(%rcx) 1060 vpshufd $0xaa,%ymm3,%ymm2 1061 vmovdqa %ymm1,288-256(%rcx) 1062 vpshufd $0xff,%ymm3,%ymm3 1063 vmovdqa %ymm2,320-256(%rcx) 1064 vmovdqa %ymm3,352-256(%rcx) 1065 1066 vpshufd $0x00,%ymm15,%ymm12 1067 vpshufd $0x55,%ymm15,%ymm13 1068 vmovdqa %ymm12,384-512(%rax) 1069 vpshufd $0xaa,%ymm15,%ymm14 1070 vmovdqa %ymm13,416-512(%rax) 1071 vpshufd $0xff,%ymm15,%ymm15 1072 vmovdqa %ymm14,448-512(%rax) 1073 vmovdqa %ymm15,480-512(%rax) 1074 1075 vpshufd $0x00,%ymm7,%ymm4 1076 vpshufd $0x55,%ymm7,%ymm5 1077 vpaddd L$incy(%rip),%ymm4,%ymm4 1078 vpshufd $0xaa,%ymm7,%ymm6 1079 vmovdqa %ymm5,544-512(%rax) 1080 vpshufd $0xff,%ymm7,%ymm7 1081 vmovdqa %ymm6,576-512(%rax) 1082 vmovdqa %ymm7,608-512(%rax) 1083 1084 jmp L$oop_enter8x 1085 1086.p2align 5 1087L$oop_outer8x: 1088 vmovdqa 128-256(%rcx),%ymm8 1089 vmovdqa 160-256(%rcx),%ymm9 1090 vmovdqa 192-256(%rcx),%ymm10 1091 vmovdqa 224-256(%rcx),%ymm11 1092 vmovdqa 256-256(%rcx),%ymm0 1093 vmovdqa 288-256(%rcx),%ymm1 1094 vmovdqa 320-256(%rcx),%ymm2 1095 vmovdqa 352-256(%rcx),%ymm3 1096 vmovdqa 384-512(%rax),%ymm12 1097 vmovdqa 416-512(%rax),%ymm13 1098 vmovdqa 448-512(%rax),%ymm14 1099 vmovdqa 480-512(%rax),%ymm15 1100 vmovdqa 512-512(%rax),%ymm4 1101 vmovdqa 544-512(%rax),%ymm5 1102 vmovdqa 576-512(%rax),%ymm6 1103 vmovdqa 608-512(%rax),%ymm7 1104 vpaddd L$eight(%rip),%ymm4,%ymm4 1105 1106L$oop_enter8x: 1107 vmovdqa %ymm14,64(%rsp) 1108 
vmovdqa %ymm15,96(%rsp) 1109 vbroadcasti128 (%r10),%ymm15 1110 vmovdqa %ymm4,512-512(%rax) 1111 movl $10,%eax 1112 jmp L$oop8x 1113 1114.p2align 5 1115L$oop8x: 1116 vpaddd %ymm0,%ymm8,%ymm8 1117 vpxor %ymm4,%ymm8,%ymm4 1118 vpshufb %ymm15,%ymm4,%ymm4 1119 vpaddd %ymm1,%ymm9,%ymm9 1120 vpxor %ymm5,%ymm9,%ymm5 1121 vpshufb %ymm15,%ymm5,%ymm5 1122 vpaddd %ymm4,%ymm12,%ymm12 1123 vpxor %ymm0,%ymm12,%ymm0 1124 vpslld $12,%ymm0,%ymm14 1125 vpsrld $20,%ymm0,%ymm0 1126 vpor %ymm0,%ymm14,%ymm0 1127 vbroadcasti128 (%r11),%ymm14 1128 vpaddd %ymm5,%ymm13,%ymm13 1129 vpxor %ymm1,%ymm13,%ymm1 1130 vpslld $12,%ymm1,%ymm15 1131 vpsrld $20,%ymm1,%ymm1 1132 vpor %ymm1,%ymm15,%ymm1 1133 vpaddd %ymm0,%ymm8,%ymm8 1134 vpxor %ymm4,%ymm8,%ymm4 1135 vpshufb %ymm14,%ymm4,%ymm4 1136 vpaddd %ymm1,%ymm9,%ymm9 1137 vpxor %ymm5,%ymm9,%ymm5 1138 vpshufb %ymm14,%ymm5,%ymm5 1139 vpaddd %ymm4,%ymm12,%ymm12 1140 vpxor %ymm0,%ymm12,%ymm0 1141 vpslld $7,%ymm0,%ymm15 1142 vpsrld $25,%ymm0,%ymm0 1143 vpor %ymm0,%ymm15,%ymm0 1144 vbroadcasti128 (%r10),%ymm15 1145 vpaddd %ymm5,%ymm13,%ymm13 1146 vpxor %ymm1,%ymm13,%ymm1 1147 vpslld $7,%ymm1,%ymm14 1148 vpsrld $25,%ymm1,%ymm1 1149 vpor %ymm1,%ymm14,%ymm1 1150 vmovdqa %ymm12,0(%rsp) 1151 vmovdqa %ymm13,32(%rsp) 1152 vmovdqa 64(%rsp),%ymm12 1153 vmovdqa 96(%rsp),%ymm13 1154 vpaddd %ymm2,%ymm10,%ymm10 1155 vpxor %ymm6,%ymm10,%ymm6 1156 vpshufb %ymm15,%ymm6,%ymm6 1157 vpaddd %ymm3,%ymm11,%ymm11 1158 vpxor %ymm7,%ymm11,%ymm7 1159 vpshufb %ymm15,%ymm7,%ymm7 1160 vpaddd %ymm6,%ymm12,%ymm12 1161 vpxor %ymm2,%ymm12,%ymm2 1162 vpslld $12,%ymm2,%ymm14 1163 vpsrld $20,%ymm2,%ymm2 1164 vpor %ymm2,%ymm14,%ymm2 1165 vbroadcasti128 (%r11),%ymm14 1166 vpaddd %ymm7,%ymm13,%ymm13 1167 vpxor %ymm3,%ymm13,%ymm3 1168 vpslld $12,%ymm3,%ymm15 1169 vpsrld $20,%ymm3,%ymm3 1170 vpor %ymm3,%ymm15,%ymm3 1171 vpaddd %ymm2,%ymm10,%ymm10 1172 vpxor %ymm6,%ymm10,%ymm6 1173 vpshufb %ymm14,%ymm6,%ymm6 1174 vpaddd %ymm3,%ymm11,%ymm11 1175 vpxor %ymm7,%ymm11,%ymm7 1176 vpshufb 
%ymm14,%ymm7,%ymm7 1177 vpaddd %ymm6,%ymm12,%ymm12 1178 vpxor %ymm2,%ymm12,%ymm2 1179 vpslld $7,%ymm2,%ymm15 1180 vpsrld $25,%ymm2,%ymm2 1181 vpor %ymm2,%ymm15,%ymm2 1182 vbroadcasti128 (%r10),%ymm15 1183 vpaddd %ymm7,%ymm13,%ymm13 1184 vpxor %ymm3,%ymm13,%ymm3 1185 vpslld $7,%ymm3,%ymm14 1186 vpsrld $25,%ymm3,%ymm3 1187 vpor %ymm3,%ymm14,%ymm3 1188 vpaddd %ymm1,%ymm8,%ymm8 1189 vpxor %ymm7,%ymm8,%ymm7 1190 vpshufb %ymm15,%ymm7,%ymm7 1191 vpaddd %ymm2,%ymm9,%ymm9 1192 vpxor %ymm4,%ymm9,%ymm4 1193 vpshufb %ymm15,%ymm4,%ymm4 1194 vpaddd %ymm7,%ymm12,%ymm12 1195 vpxor %ymm1,%ymm12,%ymm1 1196 vpslld $12,%ymm1,%ymm14 1197 vpsrld $20,%ymm1,%ymm1 1198 vpor %ymm1,%ymm14,%ymm1 1199 vbroadcasti128 (%r11),%ymm14 1200 vpaddd %ymm4,%ymm13,%ymm13 1201 vpxor %ymm2,%ymm13,%ymm2 1202 vpslld $12,%ymm2,%ymm15 1203 vpsrld $20,%ymm2,%ymm2 1204 vpor %ymm2,%ymm15,%ymm2 1205 vpaddd %ymm1,%ymm8,%ymm8 1206 vpxor %ymm7,%ymm8,%ymm7 1207 vpshufb %ymm14,%ymm7,%ymm7 1208 vpaddd %ymm2,%ymm9,%ymm9 1209 vpxor %ymm4,%ymm9,%ymm4 1210 vpshufb %ymm14,%ymm4,%ymm4 1211 vpaddd %ymm7,%ymm12,%ymm12 1212 vpxor %ymm1,%ymm12,%ymm1 1213 vpslld $7,%ymm1,%ymm15 1214 vpsrld $25,%ymm1,%ymm1 1215 vpor %ymm1,%ymm15,%ymm1 1216 vbroadcasti128 (%r10),%ymm15 1217 vpaddd %ymm4,%ymm13,%ymm13 1218 vpxor %ymm2,%ymm13,%ymm2 1219 vpslld $7,%ymm2,%ymm14 1220 vpsrld $25,%ymm2,%ymm2 1221 vpor %ymm2,%ymm14,%ymm2 1222 vmovdqa %ymm12,64(%rsp) 1223 vmovdqa %ymm13,96(%rsp) 1224 vmovdqa 0(%rsp),%ymm12 1225 vmovdqa 32(%rsp),%ymm13 1226 vpaddd %ymm3,%ymm10,%ymm10 1227 vpxor %ymm5,%ymm10,%ymm5 1228 vpshufb %ymm15,%ymm5,%ymm5 1229 vpaddd %ymm0,%ymm11,%ymm11 1230 vpxor %ymm6,%ymm11,%ymm6 1231 vpshufb %ymm15,%ymm6,%ymm6 1232 vpaddd %ymm5,%ymm12,%ymm12 1233 vpxor %ymm3,%ymm12,%ymm3 1234 vpslld $12,%ymm3,%ymm14 1235 vpsrld $20,%ymm3,%ymm3 1236 vpor %ymm3,%ymm14,%ymm3 1237 vbroadcasti128 (%r11),%ymm14 1238 vpaddd %ymm6,%ymm13,%ymm13 1239 vpxor %ymm0,%ymm13,%ymm0 1240 vpslld $12,%ymm0,%ymm15 1241 vpsrld $20,%ymm0,%ymm0 1242 vpor 
%ymm0,%ymm15,%ymm0 1243 vpaddd %ymm3,%ymm10,%ymm10 1244 vpxor %ymm5,%ymm10,%ymm5 1245 vpshufb %ymm14,%ymm5,%ymm5 1246 vpaddd %ymm0,%ymm11,%ymm11 1247 vpxor %ymm6,%ymm11,%ymm6 1248 vpshufb %ymm14,%ymm6,%ymm6 1249 vpaddd %ymm5,%ymm12,%ymm12 1250 vpxor %ymm3,%ymm12,%ymm3 1251 vpslld $7,%ymm3,%ymm15 1252 vpsrld $25,%ymm3,%ymm3 1253 vpor %ymm3,%ymm15,%ymm3 1254 vbroadcasti128 (%r10),%ymm15 1255 vpaddd %ymm6,%ymm13,%ymm13 1256 vpxor %ymm0,%ymm13,%ymm0 1257 vpslld $7,%ymm0,%ymm14 1258 vpsrld $25,%ymm0,%ymm0 1259 vpor %ymm0,%ymm14,%ymm0 1260 decl %eax 1261 jnz L$oop8x 1262 1263 leaq 512(%rsp),%rax 1264 vpaddd 128-256(%rcx),%ymm8,%ymm8 1265 vpaddd 160-256(%rcx),%ymm9,%ymm9 1266 vpaddd 192-256(%rcx),%ymm10,%ymm10 1267 vpaddd 224-256(%rcx),%ymm11,%ymm11 1268 1269 vpunpckldq %ymm9,%ymm8,%ymm14 1270 vpunpckldq %ymm11,%ymm10,%ymm15 1271 vpunpckhdq %ymm9,%ymm8,%ymm8 1272 vpunpckhdq %ymm11,%ymm10,%ymm10 1273 vpunpcklqdq %ymm15,%ymm14,%ymm9 1274 vpunpckhqdq %ymm15,%ymm14,%ymm14 1275 vpunpcklqdq %ymm10,%ymm8,%ymm11 1276 vpunpckhqdq %ymm10,%ymm8,%ymm8 1277 vpaddd 256-256(%rcx),%ymm0,%ymm0 1278 vpaddd 288-256(%rcx),%ymm1,%ymm1 1279 vpaddd 320-256(%rcx),%ymm2,%ymm2 1280 vpaddd 352-256(%rcx),%ymm3,%ymm3 1281 1282 vpunpckldq %ymm1,%ymm0,%ymm10 1283 vpunpckldq %ymm3,%ymm2,%ymm15 1284 vpunpckhdq %ymm1,%ymm0,%ymm0 1285 vpunpckhdq %ymm3,%ymm2,%ymm2 1286 vpunpcklqdq %ymm15,%ymm10,%ymm1 1287 vpunpckhqdq %ymm15,%ymm10,%ymm10 1288 vpunpcklqdq %ymm2,%ymm0,%ymm3 1289 vpunpckhqdq %ymm2,%ymm0,%ymm0 1290 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 1291 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 1292 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 1293 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 1294 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 1295 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 1296 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 1297 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 1298 vmovdqa %ymm15,0(%rsp) 1299 vmovdqa %ymm9,32(%rsp) 1300 vmovdqa 64(%rsp),%ymm15 1301 vmovdqa 96(%rsp),%ymm9 1302 1303 vpaddd 384-512(%rax),%ymm12,%ymm12 1304 vpaddd 
416-512(%rax),%ymm13,%ymm13
1305 vpaddd 448-512(%rax),%ymm15,%ymm15
1306 vpaddd 480-512(%rax),%ymm9,%ymm9
1307
# NOTE(review): this span starts in the middle of a routine. The mnemonic
# that belongs to the operand fragment on the first line, and the setup of
# %rax, %rsp and %r9, are outside this view. The vpaddd instructions add
# eight 32-bit lanes read from offsets of %rax into the working registers;
# presumably those are the saved input-state rows - confirm against the
# earlier part of the routine.
#
# Interleave 32-bit then 64-bit elements of ymm12, ymm13, ymm15, ymm9:
# a 4x4 dword transpose inside each 128-bit lane. Results land in
# ymm13, ymm2, ymm9, ymm12; ymm8 is scratch.
1308 vpunpckldq %ymm13,%ymm12,%ymm2
1309 vpunpckldq %ymm9,%ymm15,%ymm8
1310 vpunpckhdq %ymm13,%ymm12,%ymm12
1311 vpunpckhdq %ymm9,%ymm15,%ymm15
1312 vpunpcklqdq %ymm8,%ymm2,%ymm13
1313 vpunpckhqdq %ymm8,%ymm2,%ymm2
1314 vpunpcklqdq %ymm15,%ymm12,%ymm9
1315 vpunpckhqdq %ymm15,%ymm12,%ymm12
1316 vpaddd 512-512(%rax),%ymm4,%ymm4
1317 vpaddd 544-512(%rax),%ymm5,%ymm5
1318 vpaddd 576-512(%rax),%ymm6,%ymm6
1319 vpaddd 608-512(%rax),%ymm7,%ymm7
1320
# Same in-lane 4x4 dword transpose for ymm4..ymm7. Results land in
# ymm5, ymm15, ymm7, ymm4; ymm8 is scratch again.
1321 vpunpckldq %ymm5,%ymm4,%ymm15
1322 vpunpckldq %ymm7,%ymm6,%ymm8
1323 vpunpckhdq %ymm5,%ymm4,%ymm4
1324 vpunpckhdq %ymm7,%ymm6,%ymm6
1325 vpunpcklqdq %ymm8,%ymm15,%ymm5
1326 vpunpckhqdq %ymm8,%ymm15,%ymm15
1327 vpunpcklqdq %ymm6,%ymm4,%ymm7
1328 vpunpckhqdq %ymm6,%ymm4,%ymm4
# Pair up the two transposed groups across the 128-bit lanes: selector
# $0x20 joins the low halves of both sources, $0x31 joins the high halves.
1329 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8
1330 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5
1331 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13
1332 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15
1333 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2
1334 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7
1335 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9
1336 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4
# Reload two vectors from the stack scratch area; presumably they were
# spilled there earlier in the routine (the stores are outside this view).
1337 vmovdqa 0(%rsp),%ymm6
1338 vmovdqa 32(%rsp),%ymm12
1339
# Fewer than 512 bytes remaining (unsigned compare): take the partial path.
1340 cmpq $512,%rdx
1341 jb L$tail8x
1342
# Full 512-byte chunk: XOR sixteen 32-byte keystream vectors with the input
# at (%rsi) and store the result at (%rdi), 128 bytes per group, advancing
# both pointers after each group. Vector order for stream offsets
# 0,32,...,480 is: ymm6 ymm8 ymm1 ymm5 ymm12 ymm13 ymm10 ymm15 ymm14 ymm2
# ymm3 ymm7 ymm11 ymm9 ymm0 ymm4. Within a group all loads precede the
# stores.
1343 vpxor 0(%rsi),%ymm6,%ymm6
1344 vpxor 32(%rsi),%ymm8,%ymm8
1345 vpxor 64(%rsi),%ymm1,%ymm1
1346 vpxor 96(%rsi),%ymm5,%ymm5
1347 leaq 128(%rsi),%rsi
1348 vmovdqu %ymm6,0(%rdi)
1349 vmovdqu %ymm8,32(%rdi)
1350 vmovdqu %ymm1,64(%rdi)
1351 vmovdqu %ymm5,96(%rdi)
1352 leaq 128(%rdi),%rdi
1353
1354 vpxor 0(%rsi),%ymm12,%ymm12
1355 vpxor 32(%rsi),%ymm13,%ymm13
1356 vpxor 64(%rsi),%ymm10,%ymm10
1357 vpxor 96(%rsi),%ymm15,%ymm15
1358 leaq 128(%rsi),%rsi
1359 vmovdqu %ymm12,0(%rdi)
1360 vmovdqu %ymm13,32(%rdi)
1361 vmovdqu %ymm10,64(%rdi)
1362 vmovdqu %ymm15,96(%rdi)
1363 leaq 128(%rdi),%rdi
1364
1365 vpxor 0(%rsi),%ymm14,%ymm14
1366 vpxor 32(%rsi),%ymm2,%ymm2
1367 vpxor 64(%rsi),%ymm3,%ymm3
1368 vpxor 96(%rsi),%ymm7,%ymm7
1369 leaq 128(%rsi),%rsi
1370 vmovdqu %ymm14,0(%rdi)
1371 vmovdqu %ymm2,32(%rdi)
1372 vmovdqu %ymm3,64(%rdi)
1373 vmovdqu %ymm7,96(%rdi)
1374 leaq 128(%rdi),%rdi
1375
1376 vpxor 0(%rsi),%ymm11,%ymm11
1377 vpxor 32(%rsi),%ymm9,%ymm9
1378 vpxor 64(%rsi),%ymm0,%ymm0
1379 vpxor 96(%rsi),%ymm4,%ymm4
1380 leaq 128(%rsi),%rsi
1381 vmovdqu %ymm11,0(%rdi)
1382 vmovdqu %ymm9,32(%rdi)
1383 vmovdqu %ymm0,64(%rdi)
1384 vmovdqu %ymm4,96(%rdi)
1385 leaq 128(%rdi),%rdi
1386
# Account for the 512 bytes just written and loop while bytes remain.
# L$oop_outer8x is defined outside this view.
1387 subq $512,%rdx
1388 jnz L$oop_outer8x
1389
1390 jmp L$done8x
1391
# Partial chunk: in the visible code this is reached from the jb above, so
# rdx is below 512. Thresholds are tested from largest to smallest with an
# unsigned jae, so handler N runs with N <= rdx < N+64. Each handler also
# consumes the flags of the cmpq that selected it: its je is taken when rdx
# equals the threshold exactly. That works because vpxor and vmovdqu do not
# write RFLAGS.
1392L$tail8x:
1393 cmpq $448,%rdx
1394 jae L$448_or_more8x
1395 cmpq $384,%rdx
1396 jae L$384_or_more8x
1397 cmpq $320,%rdx
1398 jae L$320_or_more8x
1399 cmpq $256,%rdx
1400 jae L$256_or_more8x
1401 cmpq $192,%rdx
1402 jae L$192_or_more8x
1403 cmpq $128,%rdx
1404 jae L$128_or_more8x
1405 cmpq $64,%rdx
1406 jae L$64_or_more8x
1407
# Below 64 bytes: park the first 64 keystream bytes (ymm6, ymm8) in the
# stack scratch area and zero the byte index r10 for the byte loop.
# vmovdqa needs 0(%rsp) to be 32-byte aligned; that alignment is set up
# outside this view.
# NOTE(review): the byte loop decrements rdx before testing it, so this
# path assumes rdx is not zero here - confirm against the routine entry.
1408 xorq %r10,%r10
1409 vmovdqa %ymm6,0(%rsp)
1410 vmovdqa %ymm8,32(%rsp)
1411 jmp L$oop_tail8x
1412
1413.p2align 5
# Handler for 64 <= rdx < 128: process the first 64 bytes with full vectors.
1414L$64_or_more8x:
1415 vpxor 0(%rsi),%ymm6,%ymm6
1416 vpxor 32(%rsi),%ymm8,%ymm8
1417 vmovdqu %ymm6,0(%rdi)
1418 vmovdqu %ymm8,32(%rdi)
# ZF still comes from cmpq $64,%rdx: exactly 64 bytes means nothing is left.
# This je has to stay ahead of the xorq below, which overwrites the flags.
1419 je L$done8x
1420
# Advance past the 64 bytes, then park the next 64 keystream bytes
# (ymm1, ymm5) for the byte loop.
1421 leaq 64(%rsi),%rsi
1422 xorq %r10,%r10
1423 vmovdqa %ymm1,0(%rsp)
1424 leaq 64(%rdi),%rdi
1425 subq $64,%rdx
1426 vmovdqa %ymm5,32(%rsp)
1427 jmp L$oop_tail8x
1428
1429.p2align 5
# Handler for 128 <= rdx < 192. Same shape as above; the parked pair is
# ymm12, ymm13.
1430L$128_or_more8x:
1431 vpxor 0(%rsi),%ymm6,%ymm6
1432 vpxor 32(%rsi),%ymm8,%ymm8
1433 vpxor 64(%rsi),%ymm1,%ymm1
1434 vpxor 96(%rsi),%ymm5,%ymm5
1435 vmovdqu %ymm6,0(%rdi)
1436 vmovdqu %ymm8,32(%rdi)
1437 vmovdqu %ymm1,64(%rdi)
1438 vmovdqu %ymm5,96(%rdi)
1439 je L$done8x
1440
1441 leaq 128(%rsi),%rsi
1442 xorq %r10,%r10
1443 vmovdqa %ymm12,0(%rsp)
1444 leaq 128(%rdi),%rdi
1445 subq $128,%rdx
1446 vmovdqa %ymm13,32(%rsp)
1447 jmp L$oop_tail8x
1448
1449.p2align 5
# Handler for 192 <= rdx < 256. The parked pair is ymm10, ymm15.
1450L$192_or_more8x:
1451 vpxor 0(%rsi),%ymm6,%ymm6
1452 vpxor 32(%rsi),%ymm8,%ymm8
1453 vpxor 64(%rsi),%ymm1,%ymm1
1454 vpxor 96(%rsi),%ymm5,%ymm5
1455 vpxor 128(%rsi),%ymm12,%ymm12
1456 vpxor 160(%rsi),%ymm13,%ymm13
1457 vmovdqu %ymm6,0(%rdi)
1458 vmovdqu %ymm8,32(%rdi)
1459 vmovdqu %ymm1,64(%rdi)
1460 vmovdqu %ymm5,96(%rdi)
1461 vmovdqu %ymm12,128(%rdi)
1462 vmovdqu %ymm13,160(%rdi)
1463 je L$done8x
1464
1465 leaq 192(%rsi),%rsi
1466 xorq %r10,%r10
1467 vmovdqa %ymm10,0(%rsp)
1468 leaq 192(%rdi),%rdi
1469 subq $192,%rdx
1470 vmovdqa %ymm15,32(%rsp)
1471 jmp L$oop_tail8x
1472
1473.p2align 5
# Handler for 256 <= rdx < 320. The parked pair is ymm14, ymm2.
1474L$256_or_more8x:
1475 vpxor 0(%rsi),%ymm6,%ymm6
1476 vpxor 32(%rsi),%ymm8,%ymm8
1477 vpxor 64(%rsi),%ymm1,%ymm1
1478 vpxor 96(%rsi),%ymm5,%ymm5
1479 vpxor 128(%rsi),%ymm12,%ymm12
1480 vpxor 160(%rsi),%ymm13,%ymm13
1481 vpxor 192(%rsi),%ymm10,%ymm10
1482 vpxor 224(%rsi),%ymm15,%ymm15
1483 vmovdqu %ymm6,0(%rdi)
1484 vmovdqu %ymm8,32(%rdi)
1485 vmovdqu %ymm1,64(%rdi)
1486 vmovdqu %ymm5,96(%rdi)
1487 vmovdqu %ymm12,128(%rdi)
1488 vmovdqu %ymm13,160(%rdi)
1489 vmovdqu %ymm10,192(%rdi)
1490 vmovdqu %ymm15,224(%rdi)
1491 je L$done8x
1492
1493 leaq 256(%rsi),%rsi
1494 xorq %r10,%r10
1495 vmovdqa %ymm14,0(%rsp)
1496 leaq 256(%rdi),%rdi
1497 subq $256,%rdx
1498 vmovdqa %ymm2,32(%rsp)
1499 jmp L$oop_tail8x
1500
1501.p2align 5
# Handler for 320 <= rdx < 384. The parked pair is ymm3, ymm7.
1502L$320_or_more8x:
1503 vpxor 0(%rsi),%ymm6,%ymm6
1504 vpxor 32(%rsi),%ymm8,%ymm8
1505 vpxor 64(%rsi),%ymm1,%ymm1
1506 vpxor 96(%rsi),%ymm5,%ymm5
1507 vpxor 128(%rsi),%ymm12,%ymm12
1508 vpxor 160(%rsi),%ymm13,%ymm13
1509 vpxor 192(%rsi),%ymm10,%ymm10
1510 vpxor 224(%rsi),%ymm15,%ymm15
1511 vpxor 256(%rsi),%ymm14,%ymm14
1512 vpxor 288(%rsi),%ymm2,%ymm2
1513 vmovdqu %ymm6,0(%rdi)
1514 vmovdqu %ymm8,32(%rdi)
1515 vmovdqu %ymm1,64(%rdi)
1516 vmovdqu %ymm5,96(%rdi)
1517 vmovdqu %ymm12,128(%rdi)
1518 vmovdqu %ymm13,160(%rdi)
1519 vmovdqu %ymm10,192(%rdi)
1520 vmovdqu %ymm15,224(%rdi)
1521 vmovdqu %ymm14,256(%rdi)
1522 vmovdqu %ymm2,288(%rdi)
1523 je L$done8x
1524
1525 leaq 320(%rsi),%rsi
1526 xorq %r10,%r10
1527 vmovdqa %ymm3,0(%rsp)
1528 leaq 320(%rdi),%rdi
1529 subq $320,%rdx
1530 vmovdqa %ymm7,32(%rsp)
1531 jmp L$oop_tail8x
1532
1533.p2align 5
# Tail handlers for 384..511 remaining bytes, the byte-wise finish loop and
# the common exit. Register roles as used by the code below: rsi = input
# pointer, rdi = output pointer, rdx = bytes remaining, r10 = byte index in
# the finish loop, 0(%rsp)..63(%rsp) = scratch holding 64 keystream bytes.
# NOTE(review): the routine entry, the setup of %rsp and %r9, and the code
# that produces the keystream vectors are outside this span.
#
# Handler for 384 <= rdx < 448, selected by cmpq $384,%rdx / jae in
# L$tail8x. The flags of that compare are still live on arrival because
# vpxor and vmovdqu do not write RFLAGS.
1534L$384_or_more8x:
1535 vpxor 0(%rsi),%ymm6,%ymm6
1536 vpxor 32(%rsi),%ymm8,%ymm8
1537 vpxor 64(%rsi),%ymm1,%ymm1
1538 vpxor 96(%rsi),%ymm5,%ymm5
1539 vpxor 128(%rsi),%ymm12,%ymm12
1540 vpxor 160(%rsi),%ymm13,%ymm13
1541 vpxor 192(%rsi),%ymm10,%ymm10
1542 vpxor 224(%rsi),%ymm15,%ymm15
1543 vpxor 256(%rsi),%ymm14,%ymm14
1544 vpxor 288(%rsi),%ymm2,%ymm2
1545 vpxor 320(%rsi),%ymm3,%ymm3
1546 vpxor 352(%rsi),%ymm7,%ymm7
1547 vmovdqu %ymm6,0(%rdi)
1548 vmovdqu %ymm8,32(%rdi)
1549 vmovdqu %ymm1,64(%rdi)
1550 vmovdqu %ymm5,96(%rdi)
1551 vmovdqu %ymm12,128(%rdi)
1552 vmovdqu %ymm13,160(%rdi)
1553 vmovdqu %ymm10,192(%rdi)
1554 vmovdqu %ymm15,224(%rdi)
1555 vmovdqu %ymm14,256(%rdi)
1556 vmovdqu %ymm2,288(%rdi)
1557 vmovdqu %ymm3,320(%rdi)
1558 vmovdqu %ymm7,352(%rdi)
# Exactly 384 bytes: nothing left. This je has to stay ahead of the xorq
# below, which overwrites the flags.
1559 je L$done8x
1560
# Advance past the 384 bytes, zero the byte index, and park the next 64
# keystream bytes (ymm11, ymm9) in the stack scratch area. vmovdqa needs
# 0(%rsp) to be 32-byte aligned; that alignment is set up outside this span.
1561 leaq 384(%rsi),%rsi
1562 xorq %r10,%r10
1563 vmovdqa %ymm11,0(%rsp)
1564 leaq 384(%rdi),%rdi
1565 subq $384,%rdx
1566 vmovdqa %ymm9,32(%rsp)
1567 jmp L$oop_tail8x
1568
1569.p2align 5
# Handler for 448 <= rdx < 512, selected by cmpq $448,%rdx / jae in
# L$tail8x. Same shape as the 384 handler; the parked pair is ymm0, ymm4
# and control falls through into the byte loop instead of jumping.
1570L$448_or_more8x:
1571 vpxor 0(%rsi),%ymm6,%ymm6
1572 vpxor 32(%rsi),%ymm8,%ymm8
1573 vpxor 64(%rsi),%ymm1,%ymm1
1574 vpxor 96(%rsi),%ymm5,%ymm5
1575 vpxor 128(%rsi),%ymm12,%ymm12
1576 vpxor 160(%rsi),%ymm13,%ymm13
1577 vpxor 192(%rsi),%ymm10,%ymm10
1578 vpxor 224(%rsi),%ymm15,%ymm15
1579 vpxor 256(%rsi),%ymm14,%ymm14
1580 vpxor 288(%rsi),%ymm2,%ymm2
1581 vpxor 320(%rsi),%ymm3,%ymm3
1582 vpxor 352(%rsi),%ymm7,%ymm7
1583 vpxor 384(%rsi),%ymm11,%ymm11
1584 vpxor 416(%rsi),%ymm9,%ymm9
1585 vmovdqu %ymm6,0(%rdi)
1586 vmovdqu %ymm8,32(%rdi)
1587 vmovdqu %ymm1,64(%rdi)
1588 vmovdqu %ymm5,96(%rdi)
1589 vmovdqu %ymm12,128(%rdi)
1590 vmovdqu %ymm13,160(%rdi)
1591 vmovdqu %ymm10,192(%rdi)
1592 vmovdqu %ymm15,224(%rdi)
1593 vmovdqu %ymm14,256(%rdi)
1594 vmovdqu %ymm2,288(%rdi)
1595 vmovdqu %ymm3,320(%rdi)
1596 vmovdqu %ymm7,352(%rdi)
1597 vmovdqu %ymm11,384(%rdi)
1598 vmovdqu %ymm9,416(%rdi)
1599 je L$done8x
1600
1601 leaq 448(%rsi),%rsi
1602 xorq %r10,%r10
1603 vmovdqa %ymm0,0(%rsp)
1604 leaq 448(%rdi),%rdi
1605 subq $448,%rdx
1606 vmovdqa %ymm4,32(%rsp)
1607
# Byte-wise finish: out[r10] = in[r10] ^ scratch[r10], repeated rdx times.
# Every handler that subtracts before coming here leaves 1..63 bytes, so
# the 64 parked bytes suffice. r10 is bumped by leaq before the store,
# hence the -1 displacement on the store address. decq sets ZF for the jnz.
1608L$oop_tail8x:
1609 movzbl (%rsi,%r10,1),%eax
1610 movzbl (%rsp,%r10,1),%ecx
1611 leaq 1(%r10),%r10
1612 xorl %ecx,%eax
1613 movb %al,-1(%rdi,%r10,1)
1614 decq %rdx
1615 jnz L$oop_tail8x
1616
# Common exit. vzeroall clears every ymm register (presumably so that no
# keystream or key material stays behind - confirm intent). %rsp is then
# reloaded from %r9; presumably %r9 holds the stack pointer saved at entry
# (the save is outside this span).
1617L$done8x:
1618 vzeroall
1619 leaq (%r9),%rsp
1620
1621L$8x_epilogue:
1622 .byte 0xf3,0xc3 # bytes F3 C3 = REP prefix + near RET (rep ret)
1623
1624
# Closes the __x86_64__ / !OPENSSL_NO_ASM guard opened near the top of the file.
1625#endif
1626