# ChaCha20 for x86 (CRYPTOGAMS-style generated assembly, ELF/AT&T syntax).
# cdecl entry point:
#   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                       size_t len, const unsigned int key[8],
#                       const unsigned int counter[4]);
.text
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	16
ChaCha20_ctr32:
.L_ChaCha20_ctr32_begin:
#ifdef __CET__

.byte	243,15,30,251	# endbr32
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	xorl	%eax,%eax
	cmpl	28(%esp),%eax
	je	.L000no_data
	call	.Lpic_point
.Lpic_point:
	popl	%eax
	leal	OPENSSL_ia32cap_P-.Lpic_point(%eax),%ebp
	testl	$16777216,(%ebp)	# FXSR (EDX word, bit 24)
	jz	.L001x86
	testl	$512,4(%ebp)	# SSSE3 (ECX word, bit 9)
	jz	.L001x86
	jmp	.Lssse3_shortcut
.L001x86:
	movl	32(%esp),%esi
	movl	36(%esp),%edi
	subl	$132,%esp
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	movl	%eax,80(%esp)
	movl	%ebx,84(%esp)
	movl	%ecx,88(%esp)
	movl	%edx,92(%esp)
	movl	16(%esi),%eax
	movl	20(%esi),%ebx
	movl	24(%esi),%ecx
	movl	28(%esi),%edx
	movl	%eax,96(%esp)
	movl	%ebx,100(%esp)
	movl	%ecx,104(%esp)
	movl	%edx,108(%esp)
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%edx
	subl	$1,%eax	# pre-decrement counter; re-incremented per block
	movl	%eax,112(%esp)
	movl	%ebx,116(%esp)
	movl	%ecx,120(%esp)
	movl	%edx,124(%esp)
	jmp	.L002entry
.align	16
.L003outer_loop:
	movl	%ebx,156(%esp)
	movl	%eax,152(%esp)
	movl	%ecx,160(%esp)
.L002entry:
	movl	$1634760805,%eax	# "expa"
	movl	$857760878,4(%esp)	# "nd 3"
	movl	$2036477234,8(%esp)	# "2-by"
	movl	$1797285236,12(%esp)	# "te k"
	movl	84(%esp),%ebx
	movl	88(%esp),%ebp
	movl	104(%esp),%ecx
	movl	108(%esp),%esi
	movl	116(%esp),%edx
	movl	120(%esp),%edi
	movl	%ebx,20(%esp)
	movl	%ebp,24(%esp)
	movl	%ecx,40(%esp)
	movl	%esi,44(%esp)
	movl	%edx,52(%esp)
	movl	%edi,56(%esp)
	movl	92(%esp),%ebx
	movl	124(%esp),%edi
	movl	112(%esp),%edx
	movl	80(%esp),%ebp
	movl	96(%esp),%ecx
	movl	100(%esp),%esi
	addl	$1,%edx
	movl	%ebx,28(%esp)
	movl	%edi,60(%esp)
	movl	%edx,112(%esp)
	movl	$10,%ebx	# 10 double rounds = 20 rounds
	jmp	.L004loop
.align	16
.L004loop:
	addl	%ebp,%eax
	movl	%ebx,128(%esp)
	movl	%ebp,%ebx
	xorl	%eax,%edx
	roll	$16,%edx
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	52(%esp),%edi
	roll	$12,%ebx
	movl	20(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,48(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,32(%esp)
	roll	$16,%edi
	movl	%ebx,16(%esp)
	addl	%edi,%esi
	movl	40(%esp),%ecx
	xorl	%esi,%ebp
	movl	56(%esp),%edx
	roll	$12,%ebp
	movl	24(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,52(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,36(%esp)
	roll	$16,%edx
	movl	%ebp,20(%esp)
	addl	%edx,%ecx
	movl	44(%esp),%esi
	xorl	%ecx,%ebx
	movl	60(%esp),%edi
	roll	$12,%ebx
	movl	28(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,56(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,24(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	roll	$12,%ebp
	movl	20(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,%edx
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	roll	$16,%edx
	movl	%ebp,28(%esp)
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	48(%esp),%edi
	roll	$12,%ebx
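	# the 16 state words cycle between the registers and 0(%esp)..60(%esp)
	# as the add/xor/rotate quarter-round steps continue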
	movl	24(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,60(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,40(%esp)
	roll	$16,%edi
	movl	%ebx,20(%esp)
	addl	%edi,%esi
	movl	32(%esp),%ecx
	xorl	%esi,%ebp
	movl	52(%esp),%edx
	roll	$12,%ebp
	movl	28(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,48(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,44(%esp)
	roll	$16,%edx
	movl	%ebp,24(%esp)
	addl	%edx,%ecx
	movl	36(%esp),%esi
	xorl	%ecx,%ebx
	movl	56(%esp),%edi
	roll	$12,%ebx
	movl	16(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,52(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,28(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	movl	48(%esp),%edx
	roll	$12,%ebp
	movl	128(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,56(%esp)
	xorl	%esi,%ebp
	roll	$7,%ebp
	decl	%ebx
	jnz	.L004loop
	# 20 rounds done: feed-forward the input state and emit the block
	movl	160(%esp),%ebx	# bytes remaining
	addl	$1634760805,%eax
	addl	80(%esp),%ebp
	addl	96(%esp),%ecx
	addl	100(%esp),%esi
	cmpl	$64,%ebx
	jb	.L005tail
	movl	156(%esp),%ebx
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	xorl	(%ebx),%eax
	xorl	16(%ebx),%ebp
	movl	%eax,(%esp)
	movl	152(%esp),%eax
	xorl	32(%ebx),%ecx
	xorl	36(%ebx),%esi
	xorl	48(%ebx),%edx
	xorl	56(%ebx),%edi
	movl	%ebp,16(%eax)
	movl	%ecx,32(%eax)
	movl	%esi,36(%eax)
	movl	%edx,48(%eax)
	movl	%edi,56(%eax)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	xorl	4(%ebx),%ebp
	xorl	8(%ebx),%ecx
	xorl	12(%ebx),%esi
	xorl	20(%ebx),%edx
	xorl	24(%ebx),%edi
	movl	%ebp,4(%eax)
	movl	%ecx,8(%eax)
	movl	%esi,12(%eax)
	movl	%edx,20(%eax)
	movl	%edi,24(%eax)
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	xorl	28(%ebx),%ebp
	xorl	40(%ebx),%ecx
	xorl	44(%ebx),%esi
	xorl	52(%ebx),%edx
	xorl	60(%ebx),%edi
	leal	64(%ebx),%ebx
	movl	%ebp,28(%eax)
	movl	(%esp),%ebp
	movl	%ecx,40(%eax)
	movl	160(%esp),%ecx
	movl	%esi,44(%eax)
	movl	%edx,52(%eax)
	movl	%edi,60(%eax)
	movl	%ebp,(%eax)
	leal	64(%eax),%eax
	subl	$64,%ecx
	jnz	.L003outer_loop
	jmp	.L006done
.L005tail:
	# partial final block: finish the feed-forward into the stack buffer
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	movl	%eax,(%esp)
	movl	%ebp,16(%esp)
	movl	%ecx,32(%esp)
	movl	%esi,36(%esp)
	movl	%edx,48(%esp)
	movl	%edi,56(%esp)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	movl	%ebp,4(%esp)
	movl	%ecx,8(%esp)
	movl	%esi,12(%esp)
	movl	%edx,20(%esp)
	movl	%edi,24(%esp)
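	# finish the feed-forward for the remaining words and stash the whole
	# 64-byte keystream block at 0(%esp)..60(%esp) for the byte-wise tail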
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	movl	%ebp,28(%esp)
	movl	156(%esp),%ebp
	movl	%ecx,40(%esp)
	movl	152(%esp),%ecx
	movl	%esi,44(%esp)
	xorl	%esi,%esi
	movl	%edx,52(%esp)
	movl	%edi,60(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
.L007tail_loop:
	# byte-by-byte XOR of the final partial block
	movb	(%esi,%ebp,1),%al
	movb	(%esp,%esi,1),%dl
	leal	1(%esi),%esi
	xorb	%dl,%al
	movb	%al,-1(%ecx,%esi,1)
	decl	%ebx
	jnz	.L007tail_loop
.L006done:
	addl	$132,%esp
.L000no_data:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin
# SSSE3 path; normally entered via .Lssse3_shortcut from ChaCha20_ctr32,
# which leaves %eax -> .Lpic_point and %ebp -> OPENSSL_ia32cap_P.
.globl	ChaCha20_ssse3
.type	ChaCha20_ssse3,@function
.align	16
ChaCha20_ssse3:
.L_ChaCha20_ssse3_begin:
#ifdef __CET__

.byte	243,15,30,251	# endbr32
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
.Lssse3_shortcut:
	testl	$2048,4(%ebp)	# ECX-word bit 11, repurposed by OpenSSL for AMD XOP
	jnz	.Lxop_shortcut
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	32(%esp),%edx
	movl	36(%esp),%ebx
	movl	%esp,%ebp
	subl	$524,%esp
	andl	$-64,%esp
	movl	%ebp,512(%esp)	# save original %esp
	leal	.Lssse3_data-.Lpic_point(%eax),%eax	# %eax -> .Lssse3_data (PIC)
	movdqu	(%ebx),%xmm3
	cmpl	$256,%ecx	# at least 256 bytes? take the 4-block path
	jb	.L0081x
	movl	%edx,516(%esp)
	movl	%ebx,520(%esp)
	subl	$256,%ecx
	leal	384(%esp),%ebp
	movdqu	(%edx),%xmm7
	pshufd	$0,%xmm3,%xmm0
	pshufd	$85,%xmm3,%xmm1
	pshufd	$170,%xmm3,%xmm2
	pshufd	$255,%xmm3,%xmm3
	paddd	48(%eax),%xmm0	# add per-lane counter offsets 0,1,2,3
	pshufd	$0,%xmm7,%xmm4
	pshufd	$85,%xmm7,%xmm5
	psubd	64(%eax),%xmm0	# bias counters by -4; re-added each outer iteration
	pshufd	$170,%xmm7,%xmm6
	pshufd	$255,%xmm7,%xmm7
	movdqa	%xmm0,64(%ebp)
	movdqa	%xmm1,80(%ebp)
	movdqa	%xmm2,96(%ebp)
	movdqa	%xmm3,112(%ebp)
	movdqu	16(%edx),%xmm3
	movdqa	%xmm4,-64(%ebp)
	movdqa	%xmm5,-48(%ebp)
	movdqa	%xmm6,-32(%ebp)
	movdqa	%xmm7,-16(%ebp)
	movdqa	32(%eax),%xmm7
	leal	128(%esp),%ebx	# %ebx: working lanes; %ebp: saved per-lane input state
	pshufd	$0,%xmm3,%xmm0
	pshufd	$85,%xmm3,%xmm1
	pshufd	$170,%xmm3,%xmm2
	pshufd	$255,%xmm3,%xmm3
	pshufd	$0,%xmm7,%xmm4
	pshufd	$85,%xmm7,%xmm5
	pshufd	$170,%xmm7,%xmm6
	pshufd	$255,%xmm7,%xmm7
	movdqa	%xmm0,(%ebp)
	movdqa	%xmm1,16(%ebp)
	movdqa	%xmm2,32(%ebp)
	movdqa	%xmm3,48(%ebp)
	movdqa	%xmm4,-128(%ebp)
	movdqa	%xmm5,-112(%ebp)
	movdqa	%xmm6,-96(%ebp)
	movdqa	%xmm7,-80(%ebp)
	leal	128(%esi),%esi
	leal	128(%edi),%edi
	jmp	.L009outer_loop
.align	16
.L009outer_loop:
	movdqa	-112(%ebp),%xmm1
	movdqa	-96(%ebp),%xmm2
	movdqa	-80(%ebp),%xmm3
	movdqa	-48(%ebp),%xmm5
	movdqa	-32(%ebp),%xmm6
	movdqa	-16(%ebp),%xmm7
	movdqa	%xmm1,-112(%ebx)
	movdqa	%xmm2,-96(%ebx)
	movdqa	%xmm3,-80(%ebx)
	movdqa	%xmm5,-48(%ebx)
	movdqa	%xmm6,-32(%ebx)
	movdqa	%xmm7,-16(%ebx)
	movdqa	32(%ebp),%xmm2
	movdqa	48(%ebp),%xmm3
	movdqa	64(%ebp),%xmm4
	movdqa	80(%ebp),%xmm5
	movdqa	96(%ebp),%xmm6
	movdqa	112(%ebp),%xmm7
	paddd	64(%eax),%xmm4	# counters += 4
	movdqa	%xmm2,32(%ebx)
	movdqa	%xmm3,48(%ebx)
	movdqa	%xmm4,64(%ebx)
	movdqa	%xmm5,80(%ebx)
	movdqa	%xmm6,96(%ebx)
	movdqa	%xmm7,112(%ebx)
	movdqa	%xmm4,64(%ebp)
	movdqa	-128(%ebp),%xmm0
	movdqa	%xmm4,%xmm6
	movdqa	-64(%ebp),%xmm3
	movdqa	(%ebp),%xmm4
	movdqa	16(%ebp),%xmm5
	movl	$10,%edx
	nop
.align	16
.L010loop:
	paddd	%xmm3,%xmm0
	movdqa	%xmm3,%xmm2
	pxor	%xmm0,%xmm6
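	# SSSE3 quarter-round pattern for four interleaved blocks:
	#   a += b; d ^= a; d <<<= 16  (pshufb with the byte-rotate table at (%eax))
	#   c += d; b ^= c; b <<<= 12  (pslld/psrld/por)
	# then the same again with rotates of 8 (pshufb 16(%eax)) and 7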
	pshufb	(%eax),%xmm6
	paddd	%xmm6,%xmm4
	pxor	%xmm4,%xmm2
	movdqa	-48(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-112(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	80(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-128(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,64(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	movdqa	%xmm4,(%ebx)
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-64(%ebx)
	paddd	%xmm7,%xmm5
	movdqa	32(%ebx),%xmm4
	pxor	%xmm5,%xmm3
	movdqa	-32(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-96(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	96(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-112(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,80(%ebx)
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	movdqa	%xmm5,16(%ebx)
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-48(%ebx)
	paddd	%xmm6,%xmm4
	movdqa	48(%ebx),%xmm5
	pxor	%xmm4,%xmm2
	movdqa	-16(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-80(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	112(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-96(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,96(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-32(%ebx)
	paddd	%xmm7,%xmm5
	pxor	%xmm5,%xmm3
	movdqa	-48(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-128(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-80(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,%xmm6
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-16(%ebx)
	paddd	%xmm6,%xmm4
	pxor	%xmm4,%xmm2
	movdqa	-32(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-112(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	64(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-128(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,112(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	movdqa	%xmm4,32(%ebx)
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-48(%ebx)
	paddd	%xmm7,%xmm5
	movdqa	(%ebx),%xmm4
	pxor	%xmm5,%xmm3
	movdqa	-16(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-96(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	80(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-112(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,64(%ebx)
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	movdqa	%xmm5,48(%ebx)
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-32(%ebx)
	paddd	%xmm6,%xmm4
	movdqa	16(%ebx),%xmm5
	pxor	%xmm4,%xmm2
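	# double round continues: lanes are spilled to and reloaded from the
	# scratch area around %ebx as the quarter-rounds are software-pipelined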
	movdqa	-64(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-80(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	96(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-96(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,80(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-16(%ebx)
	paddd	%xmm7,%xmm5
	pxor	%xmm5,%xmm3
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-128(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	64(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-80(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,96(%ebx)
	pxor	%xmm5,%xmm3
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	por	%xmm1,%xmm3
	decl	%edx
	jnz	.L010loop
	# feed-forward: add the saved input state, transpose the 4x4 dword
	# lanes back into byte order, and XOR with the input stream
	movdqa	%xmm3,-64(%ebx)
	movdqa	%xmm4,(%ebx)
	movdqa	%xmm5,16(%ebx)
	movdqa	%xmm6,64(%ebx)
	movdqa	%xmm7,96(%ebx)
	movdqa	-112(%ebx),%xmm1
	movdqa	-96(%ebx),%xmm2
	movdqa	-80(%ebx),%xmm3
	paddd	-128(%ebp),%xmm0
	paddd	-112(%ebp),%xmm1
	paddd	-96(%ebp),%xmm2
	paddd	-80(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	-64(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	-48(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	-32(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	-16(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	-64(%ebp),%xmm0
	paddd	-48(%ebp),%xmm1
	paddd	-32(%ebp),%xmm2
	paddd	-16(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	16(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	32(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	48(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	(%ebp),%xmm0
	paddd	16(%ebp),%xmm1
	paddd	32(%ebp),%xmm2
	paddd	48(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	64(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	80(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	96(%ebx),%xmm2
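	# keystream blocks are XORed with the input in 16-byte slices, four
	# 64-byte blocks per outer iteration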
	pxor	%xmm3,%xmm7
	movdqa	112(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	64(%ebp),%xmm0
	paddd	80(%ebp),%xmm1
	paddd	96(%ebp),%xmm2
	paddd	112(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	208(%esi),%esi
	pxor	%xmm0,%xmm4
	pxor	%xmm1,%xmm5
	pxor	%xmm2,%xmm6
	pxor	%xmm3,%xmm7
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	208(%edi),%edi
	subl	$256,%ecx
	jnc	.L009outer_loop
	addl	$256,%ecx
	jz	.L011done
	# <256 bytes left: rebuild counter||nonce (counter += 4, keep nonce)
	# and fall into the single-block path
	movl	520(%esp),%ebx
	leal	-128(%esi),%esi
	movl	516(%esp),%edx
	leal	-128(%edi),%edi
	movd	64(%ebp),%xmm2
	movdqu	(%ebx),%xmm3
	paddd	96(%eax),%xmm2
	pand	112(%eax),%xmm3
	por	%xmm2,%xmm3
.L0081x:
	# single-block path
	movdqa	32(%eax),%xmm0
	movdqu	(%edx),%xmm1
	movdqu	16(%edx),%xmm2
	movdqa	(%eax),%xmm6
	movdqa	16(%eax),%xmm7
	movl	%ebp,48(%esp)
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movl	$10,%edx
	jmp	.L012loop1x
.align	16
.L013outer1x:
	movdqa	80(%eax),%xmm3	# counter += 1
	movdqa	(%esp),%xmm0
	movdqa	16(%esp),%xmm1
	movdqa	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	movl	$10,%edx
	movdqa	%xmm3,48(%esp)
	jmp	.L012loop1x
.align	16
.L012loop1x:
	# one column round, re-diagonalize, one diagonal round per iteration
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222	# pshufb %xmm6,%xmm3 = rotate dwords left 16
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223	# pshufb %xmm7,%xmm3 = rotate dwords left 8
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decl	%edx
	jnz	.L012loop1x
	paddd	(%esp),%xmm0
	paddd	16(%esp),%xmm1
	paddd	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	cmpl	$64,%ecx
	jb	.L014tail
	movdqu	(%esi),%xmm4
	movdqu	16(%esi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%esi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%esi),%xmm5
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3
	leal	64(%esi),%esi
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	leal	64(%edi),%edi
	subl	$64,%ecx
	jnz	.L013outer1x
	jmp	.L011done
.L014tail:
	# <64 bytes left: spill the keystream to the stack and XOR byte-wise
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ebp,%ebp
.L015tail_loop:
	movb	(%esp,%ebp,1),%al
	movb	(%esi,%ebp,1),%dl
	leal	1(%ebp),%ebp
	xorb	%dl,%al
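	# out[i] = keystream[i] ^ inp[i], one byte at a time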
	movb	%al,-1(%edi,%ebp,1)
	decl	%ecx
	jnz	.L015tail_loop
.L011done:
	movl	512(%esp),%esp	# restore the %esp saved at entry
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin
.align	64
.Lssse3_data:
.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13	# pshufb mask: rotate dwords left 16
.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14	# pshufb mask: rotate dwords left 8
.long	1634760805,857760878,2036477234,1797285236	# "expand 32-byte k"
.long	0,1,2,3		# per-lane counter offsets
.long	4,4,4,4		# counter stride for the 4-block path
.long	1,0,0,0		# counter increment for the 1-block path
.long	4,0,0,0		# counter fix-up when leaving the 4-block path
.long	0,-1,-1,-1	# mask: keep nonce words, drop the counter word
.align	64
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54	# "ChaCha20 for x86"
.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32	# ", CRYPTOGAMS by "
.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111	# "<appro@openssl.o"
.byte	114,103,62,0	# "rg>"
# XOP path; normally entered via .Lxop_shortcut, which, like the SSSE3
# shortcut, relies on %eax -> .Lpic_point set up in ChaCha20_ctr32.
.globl	ChaCha20_xop
.type	ChaCha20_xop,@function
.align	16
ChaCha20_xop:
.L_ChaCha20_xop_begin:
#ifdef __CET__

.byte	243,15,30,251	# endbr32
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
.Lxop_shortcut:
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	32(%esp),%edx
	movl	36(%esp),%ebx
	vzeroupper
	movl	%esp,%ebp
	subl	$524,%esp
	andl	$-64,%esp
	movl	%ebp,512(%esp)
	leal	.Lssse3_data-.Lpic_point(%eax),%eax
	vmovdqu	(%ebx),%xmm3
	cmpl	$256,%ecx
	jb	.L0161x
	movl	%edx,516(%esp)
	movl	%ebx,520(%esp)
	subl	$256,%ecx
	leal	384(%esp),%ebp
	vmovdqu	(%edx),%xmm7
	vpshufd	$0,%xmm3,%xmm0
	vpshufd	$85,%xmm3,%xmm1
	vpshufd	$170,%xmm3,%xmm2
	vpshufd	$255,%xmm3,%xmm3
	vpaddd	48(%eax),%xmm0,%xmm0
	vpshufd	$0,%xmm7,%xmm4
	vpshufd	$85,%xmm7,%xmm5
	vpsubd	64(%eax),%xmm0,%xmm0
	vpshufd	$170,%xmm7,%xmm6
	vpshufd	$255,%xmm7,%xmm7
	vmovdqa	%xmm0,64(%ebp)
	vmovdqa	%xmm1,80(%ebp)
	vmovdqa	%xmm2,96(%ebp)
	vmovdqa	%xmm3,112(%ebp)
	vmovdqu	16(%edx),%xmm3
	vmovdqa	%xmm4,-64(%ebp)
	vmovdqa	%xmm5,-48(%ebp)
	vmovdqa	%xmm6,-32(%ebp)
	vmovdqa	%xmm7,-16(%ebp)
	vmovdqa	32(%eax),%xmm7
	leal	128(%esp),%ebx
	vpshufd	$0,%xmm3,%xmm0
	vpshufd	$85,%xmm3,%xmm1
	vpshufd	$170,%xmm3,%xmm2
	vpshufd	$255,%xmm3,%xmm3
	vpshufd	$0,%xmm7,%xmm4
	vpshufd	$85,%xmm7,%xmm5
	vpshufd	$170,%xmm7,%xmm6
	vpshufd	$255,%xmm7,%xmm7
	vmovdqa	%xmm0,(%ebp)
	vmovdqa	%xmm1,16(%ebp)
	vmovdqa	%xmm2,32(%ebp)
	vmovdqa	%xmm3,48(%ebp)
	vmovdqa	%xmm4,-128(%ebp)
	vmovdqa	%xmm5,-112(%ebp)
	vmovdqa	%xmm6,-96(%ebp)
	vmovdqa	%xmm7,-80(%ebp)
	leal	128(%esi),%esi
	leal	128(%edi),%edi
	jmp	.L017outer_loop
.align	32
.L017outer_loop:
	vmovdqa	-112(%ebp),%xmm1
	vmovdqa	-96(%ebp),%xmm2
	vmovdqa	-80(%ebp),%xmm3
	vmovdqa	-48(%ebp),%xmm5
	vmovdqa	-32(%ebp),%xmm6
	vmovdqa	-16(%ebp),%xmm7
	vmovdqa	%xmm1,-112(%ebx)
	vmovdqa	%xmm2,-96(%ebx)
	vmovdqa	%xmm3,-80(%ebx)
	vmovdqa	%xmm5,-48(%ebx)
	vmovdqa	%xmm6,-32(%ebx)
	vmovdqa	%xmm7,-16(%ebx)
	vmovdqa	32(%ebp),%xmm2
	vmovdqa	48(%ebp),%xmm3
	vmovdqa	64(%ebp),%xmm4
	vmovdqa	80(%ebp),%xmm5
	vmovdqa	96(%ebp),%xmm6
	vmovdqa	112(%ebp),%xmm7
	vpaddd	64(%eax),%xmm4,%xmm4
	vmovdqa	%xmm2,32(%ebx)
	vmovdqa	%xmm3,48(%ebx)
	vmovdqa	%xmm4,64(%ebx)
	vmovdqa	%xmm5,80(%ebx)
	vmovdqa	%xmm6,96(%ebx)
	vmovdqa	%xmm7,112(%ebx)
	vmovdqa	%xmm4,64(%ebp)
	vmovdqa	-128(%ebp),%xmm0
	vmovdqa	%xmm4,%xmm6
	vmovdqa	-64(%ebp),%xmm3
	vmovdqa	(%ebp),%xmm4
	vmovdqa	16(%ebp),%xmm5
	movl	$10,%edx
	nop
.align	32
.L018loop:
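	# XOP double round. The .byte 143,232,120,194,modrm,imm sequences are
	# hand-encoded XOP vprotd instructions (rotate each dword left by the
	# immediate: 16, 12, 8 or 7), for assemblers without XOP mnemonics.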
	vpaddd	%xmm3,%xmm0,%xmm0
	vpxor	%xmm0,%xmm6,%xmm6
.byte	143,232,120,194,246,16
	vpaddd	%xmm6,%xmm4,%xmm4
	vpxor	%xmm4,%xmm3,%xmm2
	vmovdqa	-112(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-48(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	80(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-128(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,64(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
	vmovdqa	%xmm4,(%ebx)
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-64(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	32(%ebx),%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-96(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vmovdqa	-32(%ebx),%xmm2
	vpaddd	%xmm3,%xmm1,%xmm1
	vmovdqa	96(%ebx),%xmm6
	vpxor	%xmm1,%xmm7,%xmm7
	vpaddd	%xmm2,%xmm0,%xmm0
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-112(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	%xmm7,80(%ebx)
	vpxor	%xmm5,%xmm3,%xmm3
	vpxor	%xmm0,%xmm6,%xmm6
.byte	143,232,120,194,219,7
	vmovdqa	%xmm5,16(%ebx)
.byte	143,232,120,194,246,16
	vmovdqa	%xmm3,-48(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	48(%ebx),%xmm5
	vpxor	%xmm4,%xmm2,%xmm2
	vmovdqa	-80(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-16(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	112(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-96(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,96(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-32(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-128(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vmovdqa	-48(%ebx),%xmm2
	vpaddd	%xmm3,%xmm1,%xmm1
	vpxor	%xmm1,%xmm7,%xmm7
	vpaddd	%xmm2,%xmm0,%xmm0
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-80(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vpxor	%xmm5,%xmm3,%xmm3
	vpxor	%xmm0,%xmm7,%xmm6
.byte	143,232,120,194,219,7
.byte	143,232,120,194,246,16
	vmovdqa	%xmm3,-16(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vpxor	%xmm4,%xmm2,%xmm2
	vmovdqa	-112(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-32(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	64(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-128(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,112(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
	vmovdqa	%xmm4,32(%ebx)
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-48(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	(%ebx),%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-96(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vmovdqa	-16(%ebx),%xmm2
	vpaddd	%xmm3,%xmm1,%xmm1
	vmovdqa	80(%ebx),%xmm6
	vpxor	%xmm1,%xmm7,%xmm7
	vpaddd	%xmm2,%xmm0,%xmm0
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-112(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	%xmm7,64(%ebx)
	vpxor	%xmm5,%xmm3,%xmm3
	vpxor	%xmm0,%xmm6,%xmm6
.byte	143,232,120,194,219,7
	vmovdqa	%xmm5,48(%ebx)
.byte	143,232,120,194,246,16
	vmovdqa	%xmm3,-32(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	16(%ebx),%xmm5
	vpxor	%xmm4,%xmm2,%xmm2
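	# XOP double round continues; the three-operand vpxor forms fold the
	# SSSE3 movdqa+pxor pairs into single instructions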
	vmovdqa	-80(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-64(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	96(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-96(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,80(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-16(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-128(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vpaddd	%xmm3,%xmm1,%xmm1
	vmovdqa	64(%ebx),%xmm6
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-80(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	%xmm7,96(%ebx)
	vpxor	%xmm5,%xmm3,%xmm3
.byte	143,232,120,194,219,7
	decl	%edx
	jnz	.L018loop
	vmovdqa	%xmm3,-64(%ebx)
	vmovdqa	%xmm4,(%ebx)
	vmovdqa	%xmm5,16(%ebx)
	vmovdqa	%xmm6,64(%ebx)
	vmovdqa	%xmm7,96(%ebx)
	vmovdqa	-112(%ebx),%xmm1
	vmovdqa	-96(%ebx),%xmm2
	vmovdqa	-80(%ebx),%xmm3
	vpaddd	-128(%ebp),%xmm0,%xmm0
	vpaddd	-112(%ebp),%xmm1,%xmm1
	vpaddd	-96(%ebp),%xmm2,%xmm2
	vpaddd	-80(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	16(%esi),%esi
	vmovdqa	-64(%ebx),%xmm0
	vmovdqa	-48(%ebx),%xmm1
	vmovdqa	-32(%ebx),%xmm2
	vmovdqa	-16(%ebx),%xmm3
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	vpaddd	-64(%ebp),%xmm0,%xmm0
	vpaddd	-48(%ebp),%xmm1,%xmm1
	vpaddd	-32(%ebp),%xmm2,%xmm2
	vpaddd	-16(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	16(%esi),%esi
	vmovdqa	(%ebx),%xmm0
	vmovdqa	16(%ebx),%xmm1
	vmovdqa	32(%ebx),%xmm2
	vmovdqa	48(%ebx),%xmm3
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	vpaddd	(%ebp),%xmm0,%xmm0
	vpaddd	16(%ebp),%xmm1,%xmm1
	vpaddd	32(%ebp),%xmm2,%xmm2
	vpaddd	48(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	16(%esi),%esi
	vmovdqa	64(%ebx),%xmm0
	vmovdqa	80(%ebx),%xmm1
	vmovdqa	96(%ebx),%xmm2
	vmovdqa	112(%ebx),%xmm3
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
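	# fourth and final 16-byte slice; the pointers then advance by 208,
	# i.e. 3*16+208 = 256 bytes consumed per outer iteration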
	leal	16(%edi),%edi
	vpaddd	64(%ebp),%xmm0,%xmm0
	vpaddd	80(%ebp),%xmm1,%xmm1
	vpaddd	96(%ebp),%xmm2,%xmm2
	vpaddd	112(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	208(%esi),%esi
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	208(%edi),%edi
	subl	$256,%ecx
	jnc	.L017outer_loop
	addl	$256,%ecx
	jz	.L019done
	movl	520(%esp),%ebx
	leal	-128(%esi),%esi
	movl	516(%esp),%edx
	leal	-128(%edi),%edi
	vmovd	64(%ebp),%xmm2
	vmovdqu	(%ebx),%xmm3
	vpaddd	96(%eax),%xmm2,%xmm2
	vpand	112(%eax),%xmm3,%xmm3
	vpor	%xmm2,%xmm3,%xmm3
.L0161x:
	vmovdqa	32(%eax),%xmm0
	vmovdqu	(%edx),%xmm1
	vmovdqu	16(%edx),%xmm2
	vmovdqa	(%eax),%xmm6
	vmovdqa	16(%eax),%xmm7
	movl	%ebp,48(%esp)
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	movl	$10,%edx
	jmp	.L020loop1x
.align	16
.L021outer1x:
	vmovdqa	80(%eax),%xmm3
	vmovdqa	(%esp),%xmm0
	vmovdqa	16(%esp),%xmm1
	vmovdqa	32(%esp),%xmm2
	vpaddd	48(%esp),%xmm3,%xmm3
	movl	$10,%edx
	vmovdqa	%xmm3,48(%esp)
	jmp	.L020loop1x
.align	16
.L020loop1x:
	# single-block XOP double round; vprotd (.byte sequences) does each
	# rotate (16/12/8/7) in one instruction
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,16
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,12
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,8
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,7
	vpshufd	$78,%xmm2,%xmm2
	vpshufd	$57,%xmm1,%xmm1
	vpshufd	$147,%xmm3,%xmm3
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,16
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,12
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,8
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,7
	vpshufd	$78,%xmm2,%xmm2
	vpshufd	$147,%xmm1,%xmm1
	vpshufd	$57,%xmm3,%xmm3
	decl	%edx
	jnz	.L020loop1x
	vpaddd	(%esp),%xmm0,%xmm0
	vpaddd	16(%esp),%xmm1,%xmm1
	vpaddd	32(%esp),%xmm2,%xmm2
	vpaddd	48(%esp),%xmm3,%xmm3
	cmpl	$64,%ecx
	jb	.L022tail
	vpxor	(%esi),%xmm0,%xmm0
	vpxor	16(%esi),%xmm1,%xmm1
	vpxor	32(%esi),%xmm2,%xmm2
	vpxor	48(%esi),%xmm3,%xmm3
	leal	64(%esi),%esi
	vmovdqu	%xmm0,(%edi)
	vmovdqu	%xmm1,16(%edi)
	vmovdqu	%xmm2,32(%edi)
	vmovdqu	%xmm3,48(%edi)
	leal	64(%edi),%edi
	subl	$64,%ecx
	jnz	.L021outer1x
	jmp	.L019done
.L022tail:
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ebp,%ebp
.L023tail_loop:
	movb	(%esp,%ebp,1),%al
	movb	(%esi,%ebp,1),%dl
	leal	1(%ebp),%ebp
	xorb	%dl,%al
	movb	%al,-1(%edi,%ebp,1)
	decl	%ecx
	jnz	.L023tail_loop
.L019done:
	vzeroupper
	movl	512(%esp),%esp
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	ChaCha20_xop,.-.L_ChaCha20_xop_begin
.comm	OPENSSL_ia32cap_P,16,4

	# GNU property note: GNU_PROPERTY_X86_FEATURE_1_AND = IBT|SHSTK
	.section	".note.gnu.property", "a"
	.p2align	2
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.asciz	"GNU"
1:
	.p2align	2
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	3
3:
	.p2align	2
4: