.text
#-----------------------------------------------------------------------
# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
#                     size_t len, const unsigned int key[8],
#                     const unsigned int counter[4])
# Scalar i386 path with run-time dispatch to the SSSE3/XOP code below.
# Stack arguments, as read after the four register pushes:
#   20(%esp)=out  24(%esp)=inp  28(%esp)=len  32(%esp)=key  36(%esp)=counter
# NOTE(review): fixed `%ifdef __CET__`/`%endif` (NASM preprocessor syntax,
# invalid in this GAS-syntax file) to `#ifdef`/`#endif` as OpenSSL emits
# for gas targets; the guarded bytes 243,15,30,251 are endbr32.
#-----------------------------------------------------------------------
.globl	_ChaCha20_ctr32
.type	_ChaCha20_ctr32,@function
.align	4
_ChaCha20_ctr32:
L_ChaCha20_ctr32_begin:
#ifdef __CET__

.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	xorl	%eax,%eax
	cmpl	28(%esp),%eax		# len == 0 -> nothing to do
	je	L000no_data
	call	Lpic_point		# classic i386 PIC: pop return addr
Lpic_point:
	popl	%eax
	leal	__GLOBAL_OFFSET_TABLE_+[.-Lpic_point](%eax),%ebp
	movl	_OPENSSL_ia32cap_P@GOT(%ebp),%ebp
	# Capability dispatch: bit 24 (FXSR) and word1 bit 9 (SSSE3) both set
	# -> take the SSSE3 path (which may further dispatch to XOP).
	# %eax (pic) and %ebp (ia32cap ptr) stay live across the shortcut jump.
	testl	$16777216,(%ebp)
	jz	L001x86
	testl	$512,4(%ebp)
	jz	L001x86
	jmp	Lssse3_shortcut
L001x86:
	# Build the 16-word ChaCha state in the 132-byte stack frame:
	# 0..60(%esp) = working state, 80..124(%esp) = saved key/counter input.
	movl	32(%esp),%esi
	movl	36(%esp),%edi
	subl	$132,%esp
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	movl	%eax,80(%esp)
	movl	%ebx,84(%esp)
	movl	%ecx,88(%esp)
	movl	%edx,92(%esp)
	movl	16(%esi),%eax
	movl	20(%esi),%ebx
	movl	24(%esi),%ecx
	movl	28(%esi),%edx
	movl	%eax,96(%esp)
	movl	%ebx,100(%esp)
	movl	%ecx,104(%esp)
	movl	%edx,108(%esp)
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%edx
	subl	$1,%eax			# pre-decrement counter; re-incremented per block
	movl	%eax,112(%esp)
	movl	%ebx,116(%esp)
	movl	%ecx,120(%esp)
	movl	%edx,124(%esp)
	jmp	L002entry
.align	4,0x90
L003outer_loop:
	# Spill inp/out/len across the per-block state rebuild.
	movl	%ebx,156(%esp)
	movl	%eax,152(%esp)
	movl	%ecx,160(%esp)
L002entry:
	# "expa nd 3 2-by te k" sigma constants head the state.
	movl	$1634760805,%eax
	movl	$857760878,4(%esp)
	movl	$2036477234,8(%esp)
	movl	$1797285236,12(%esp)
	movl	84(%esp),%ebx
	movl	88(%esp),%ebp
	movl	104(%esp),%ecx
	movl	108(%esp),%esi
	movl	116(%esp),%edx
	movl	120(%esp),%edi
	movl	%ebx,20(%esp)
	movl	%ebp,24(%esp)
	movl	%ecx,40(%esp)
	movl	%esi,44(%esp)
	movl	%edx,52(%esp)
	movl	%edi,56(%esp)
	movl	92(%esp),%ebx
	movl	124(%esp),%edi
	movl	112(%esp),%edx
	movl	80(%esp),%ebp
	movl	96(%esp),%ecx
	movl	100(%esp),%esi
	addl	$1,%edx			# advance 32-bit block counter
	movl	%ebx,28(%esp)
	movl	%edi,60(%esp)
	movl	%edx,112(%esp)
	movl	$10,%ebx		# 10 double-rounds = 20 rounds
	jmp	L004loop
.align	4,0x90
L004loop:
	# One double round (4 column + 4 diagonal quarter-rounds), with live
	# state words rotated through registers and the rest parked on stack.
	addl	%ebp,%eax
	movl	%ebx,128(%esp)
	movl	%ebp,%ebx
	xorl	%eax,%edx
	roll	$16,%edx
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	52(%esp),%edi
	roll	$12,%ebx
	movl	20(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,48(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,32(%esp)
	roll	$16,%edi
	movl	%ebx,16(%esp)
	addl	%edi,%esi
	movl	40(%esp),%ecx
	xorl	%esi,%ebp
	movl	56(%esp),%edx
	roll	$12,%ebp
	movl	24(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,52(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,36(%esp)
	roll	$16,%edx
	movl	%ebp,20(%esp)
	addl	%edx,%ecx
	movl	44(%esp),%esi
	xorl	%ecx,%ebx
	movl	60(%esp),%edi
	roll	$12,%ebx
	movl	28(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,56(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,24(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	roll	$12,%ebp
	movl	20(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,%edx
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	roll	$16,%edx
	movl	%ebp,28(%esp)
	addl	%edx,%ecx
	xorl	%ecx,%ebx
	movl	48(%esp),%edi
	roll	$12,%ebx
	movl	24(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,(%esp)
	roll	$8,%edx
	movl	4(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,60(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	movl	%ecx,40(%esp)
	roll	$16,%edi
	movl	%ebx,20(%esp)
	addl	%edi,%esi
	movl	32(%esp),%ecx
	xorl	%esi,%ebp
	movl	52(%esp),%edx
	roll	$12,%ebp
	movl	28(%esp),%ebx
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,4(%esp)
	roll	$8,%edi
	movl	8(%esp),%eax
	addl	%edi,%esi
	movl	%edi,48(%esp)
	xorl	%esi,%ebp
	addl	%ebx,%eax
	roll	$7,%ebp
	xorl	%eax,%edx
	movl	%esi,44(%esp)
	roll	$16,%edx
	movl	%ebp,24(%esp)
	addl	%edx,%ecx
	movl	36(%esp),%esi
	xorl	%ecx,%ebx
	movl	56(%esp),%edi
	roll	$12,%ebx
	movl	16(%esp),%ebp
	addl	%ebx,%eax
	xorl	%eax,%edx
	movl	%eax,8(%esp)
	roll	$8,%edx
	movl	12(%esp),%eax
	addl	%edx,%ecx
	movl	%edx,52(%esp)
	xorl	%ecx,%ebx
	addl	%ebp,%eax
	roll	$7,%ebx
	xorl	%eax,%edi
	roll	$16,%edi
	movl	%ebx,28(%esp)
	addl	%edi,%esi
	xorl	%esi,%ebp
	movl	48(%esp),%edx
	roll	$12,%ebp
	movl	128(%esp),%ebx		# reload round counter
	addl	%ebp,%eax
	xorl	%eax,%edi
	movl	%eax,12(%esp)
	roll	$8,%edi
	movl	(%esp),%eax
	addl	%edi,%esi
	movl	%edi,56(%esp)
	xorl	%esi,%ebp
	roll	$7,%ebp
	decl	%ebx
	jnz	L004loop
	# Add the saved input state back in, then XOR with plaintext.
	movl	160(%esp),%ebx		# ebx = remaining len
	addl	$1634760805,%eax
	addl	80(%esp),%ebp
	addl	96(%esp),%ecx
	addl	100(%esp),%esi
	cmpl	$64,%ebx
	jb	L005tail		# partial final block
	movl	156(%esp),%ebx		# ebx = inp
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	xorl	(%ebx),%eax
	xorl	16(%ebx),%ebp
	movl	%eax,(%esp)
	movl	152(%esp),%eax		# eax = out
	xorl	32(%ebx),%ecx
	xorl	36(%ebx),%esi
	xorl	48(%ebx),%edx
	xorl	56(%ebx),%edi
	movl	%ebp,16(%eax)
	movl	%ecx,32(%eax)
	movl	%esi,36(%eax)
	movl	%edx,48(%eax)
	movl	%edi,56(%eax)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	xorl	4(%ebx),%ebp
	xorl	8(%ebx),%ecx
	xorl	12(%ebx),%esi
	xorl	20(%ebx),%edx
	xorl	24(%ebx),%edi
	movl	%ebp,4(%eax)
	movl	%ecx,8(%eax)
	movl	%esi,12(%eax)
	movl	%edx,20(%eax)
	movl	%edi,24(%eax)
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	xorl	28(%ebx),%ebp
	xorl	40(%ebx),%ecx
	xorl	44(%ebx),%esi
	xorl	52(%ebx),%edx
	xorl	60(%ebx),%edi
	leal	64(%ebx),%ebx		# inp += 64
	movl	%ebp,28(%eax)
	movl	(%esp),%ebp
	movl	%ecx,40(%eax)
	movl	160(%esp),%ecx
	movl	%esi,44(%eax)
	movl	%edx,52(%eax)
	movl	%edi,60(%eax)
	movl	%ebp,(%eax)
	leal	64(%eax),%eax		# out += 64
	subl	$64,%ecx
	jnz	L003outer_loop
	jmp	L006done
L005tail:
	# len < 64: materialize the full keystream block on the stack,
	# then XOR it byte-by-byte against the remaining input.
	addl	112(%esp),%edx
	addl	120(%esp),%edi
	movl	%eax,(%esp)
	movl	%ebp,16(%esp)
	movl	%ecx,32(%esp)
	movl	%esi,36(%esp)
	movl	%edx,48(%esp)
	movl	%edi,56(%esp)
	movl	4(%esp),%ebp
	movl	8(%esp),%ecx
	movl	12(%esp),%esi
	movl	20(%esp),%edx
	movl	24(%esp),%edi
	addl	$857760878,%ebp
	addl	$2036477234,%ecx
	addl	$1797285236,%esi
	addl	84(%esp),%edx
	addl	88(%esp),%edi
	movl	%ebp,4(%esp)
	movl	%ecx,8(%esp)
	movl	%esi,12(%esp)
	movl	%edx,20(%esp)
	movl	%edi,24(%esp)
	movl	28(%esp),%ebp
	movl	40(%esp),%ecx
	movl	44(%esp),%esi
	movl	52(%esp),%edx
	movl	60(%esp),%edi
	addl	92(%esp),%ebp
	addl	104(%esp),%ecx
	addl	108(%esp),%esi
	addl	116(%esp),%edx
	addl	124(%esp),%edi
	movl	%ebp,28(%esp)
	movl	156(%esp),%ebp		# ebp = inp
	movl	%ecx,40(%esp)
	movl	152(%esp),%ecx		# ecx = out
	movl	%esi,44(%esp)
	xorl	%esi,%esi
	movl	%edx,52(%esp)
	movl	%edi,60(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
L007tail_loop:
	movb	(%esi,%ebp,1),%al
	movb	(%esp,%esi,1),%dl
	leal	1(%esi),%esi
	xorb	%dl,%al
	movb	%al,-1(%ecx,%esi,1)
	decl	%ebx
	jnz	L007tail_loop
L006done:
	addl	$132,%esp
L000no_data:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.globl	_ChaCha20_ssse3
.type	_ChaCha20_ssse3,@function
.align	4
#-----------------------------------------------------------------------
# ChaCha20_ssse3 / ChaCha20_xop — SIMD paths, same C signature as
# ChaCha20_ctr32.  Both are normally entered via the Lssse3_shortcut /
# Lxop_shortcut jumps from _ChaCha20_ctr32, which arrive with %eax =
# Lpic_point address and %ebp = &_OPENSSL_ia32cap_P already loaded (the
# capability test and the Lssse3_data-Lpic_point lea below rely on this).
# Layout: 4-way interleaved processing for >=256-byte chunks, with a
# 1-block (64-byte) fallback loop for the remainder.
# NOTE(review): fixed `%ifdef __CET__`/`%endif` (NASM syntax, invalid in
# GAS) to `#ifdef`/`#endif`; guarded bytes 243,15,30,251 are endbr32.
#-----------------------------------------------------------------------
_ChaCha20_ssse3:
L_ChaCha20_ssse3_begin:
#ifdef __CET__

.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
Lssse3_shortcut:
	testl	$2048,4(%ebp)		# XOP bit set? hand off to the XOP path
	jnz	Lxop_shortcut
	movl	20(%esp),%edi		# out
	movl	24(%esp),%esi		# inp
	movl	28(%esp),%ecx		# len
	movl	32(%esp),%edx		# key
	movl	36(%esp),%ebx		# counter
	movl	%esp,%ebp
	subl	$524,%esp
	andl	$-64,%esp		# 64-byte-align frame for movdqa
	movl	%ebp,512(%esp)		# save original %esp
	leal	Lssse3_data-Lpic_point(%eax),%eax	# %eax -> constant tables
	movdqu	(%ebx),%xmm3
	cmpl	$256,%ecx
	jb	L0081x			# short input: single-block loop only
	movl	%edx,516(%esp)
	movl	%ebx,520(%esp)
	subl	$256,%ecx
	leal	384(%esp),%ebp
	# Broadcast each state word across 4 lanes; counter lanes get +0..3.
	movdqu	(%edx),%xmm7
	pshufd	$0,%xmm3,%xmm0
	pshufd	$85,%xmm3,%xmm1
	pshufd	$170,%xmm3,%xmm2
	pshufd	$255,%xmm3,%xmm3
	paddd	48(%eax),%xmm0		# + {0,1,2,3} lane counters
	pshufd	$0,%xmm7,%xmm4
	pshufd	$85,%xmm7,%xmm5
	psubd	64(%eax),%xmm0		# compensate: re-added at loop top
	pshufd	$170,%xmm7,%xmm6
	pshufd	$255,%xmm7,%xmm7
	movdqa	%xmm0,64(%ebp)
	movdqa	%xmm1,80(%ebp)
	movdqa	%xmm2,96(%ebp)
	movdqa	%xmm3,112(%ebp)
	movdqu	16(%edx),%xmm3
	movdqa	%xmm4,-64(%ebp)
	movdqa	%xmm5,-48(%ebp)
	movdqa	%xmm6,-32(%ebp)
	movdqa	%xmm7,-16(%ebp)
	movdqa	32(%eax),%xmm7		# sigma constants
	leal	128(%esp),%ebx
	pshufd	$0,%xmm3,%xmm0
	pshufd	$85,%xmm3,%xmm1
	pshufd	$170,%xmm3,%xmm2
	pshufd	$255,%xmm3,%xmm3
	pshufd	$0,%xmm7,%xmm4
	pshufd	$85,%xmm7,%xmm5
	pshufd	$170,%xmm7,%xmm6
	pshufd	$255,%xmm7,%xmm7
	movdqa	%xmm0,(%ebp)
	movdqa	%xmm1,16(%ebp)
	movdqa	%xmm2,32(%ebp)
	movdqa	%xmm3,48(%ebp)
	movdqa	%xmm4,-128(%ebp)
	movdqa	%xmm5,-112(%ebp)
	movdqa	%xmm6,-96(%ebp)
	movdqa	%xmm7,-80(%ebp)
	leal	128(%esi),%esi
	leal	128(%edi),%edi
	jmp	L009outer_loop
.align	4,0x90
L009outer_loop:
	# Refresh working copy (%ebx frame) from master state (%ebp frame);
	# bump the 4 counter lanes by 4 each iteration.
	movdqa	-112(%ebp),%xmm1
	movdqa	-96(%ebp),%xmm2
	movdqa	-80(%ebp),%xmm3
	movdqa	-48(%ebp),%xmm5
	movdqa	-32(%ebp),%xmm6
	movdqa	-16(%ebp),%xmm7
	movdqa	%xmm1,-112(%ebx)
	movdqa	%xmm2,-96(%ebx)
	movdqa	%xmm3,-80(%ebx)
	movdqa	%xmm5,-48(%ebx)
	movdqa	%xmm6,-32(%ebx)
	movdqa	%xmm7,-16(%ebx)
	movdqa	32(%ebp),%xmm2
	movdqa	48(%ebp),%xmm3
	movdqa	64(%ebp),%xmm4
	movdqa	80(%ebp),%xmm5
	movdqa	96(%ebp),%xmm6
	movdqa	112(%ebp),%xmm7
	paddd	64(%eax),%xmm4		# counters += 4
	movdqa	%xmm2,32(%ebx)
	movdqa	%xmm3,48(%ebx)
	movdqa	%xmm4,64(%ebx)
	movdqa	%xmm5,80(%ebx)
	movdqa	%xmm6,96(%ebx)
	movdqa	%xmm7,112(%ebx)
	movdqa	%xmm4,64(%ebp)
	movdqa	-128(%ebp),%xmm0
	movdqa	%xmm4,%xmm6
	movdqa	-64(%ebp),%xmm3
	movdqa	(%ebp),%xmm4
	movdqa	16(%ebp),%xmm5
	movl	$10,%edx		# 10 double-rounds
	nop
.align	4,0x90
L010loop:
	# 4-way quarter-rounds; (%eax)/16(%eax) are pshufb rot16/rot8 masks,
	# 12- and 7-bit rotates are done as shift/shift/or.
	paddd	%xmm3,%xmm0
	movdqa	%xmm3,%xmm2
	pxor	%xmm0,%xmm6
	pshufb	(%eax),%xmm6
	paddd	%xmm6,%xmm4
	pxor	%xmm4,%xmm2
	movdqa	-48(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-112(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	80(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-128(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,64(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	movdqa	%xmm4,(%ebx)
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-64(%ebx)
	paddd	%xmm7,%xmm5
	movdqa	32(%ebx),%xmm4
	pxor	%xmm5,%xmm3
	movdqa	-32(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-96(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	96(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-112(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,80(%ebx)
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	movdqa	%xmm5,16(%ebx)
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-48(%ebx)
	paddd	%xmm6,%xmm4
	movdqa	48(%ebx),%xmm5
	pxor	%xmm4,%xmm2
	movdqa	-16(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-80(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	112(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-96(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,96(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-32(%ebx)
	paddd	%xmm7,%xmm5
	pxor	%xmm5,%xmm3
	movdqa	-48(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-128(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-80(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,%xmm6
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-16(%ebx)
	paddd	%xmm6,%xmm4
	pxor	%xmm4,%xmm2
	movdqa	-32(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-112(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	64(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-128(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,112(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	movdqa	%xmm4,32(%ebx)
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-48(%ebx)
	paddd	%xmm7,%xmm5
	movdqa	(%ebx),%xmm4
	pxor	%xmm5,%xmm3
	movdqa	-16(%ebx),%xmm2
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-96(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	80(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-112(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,64(%ebx)
	pxor	%xmm5,%xmm3
	paddd	%xmm2,%xmm0
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	pxor	%xmm0,%xmm6
	por	%xmm1,%xmm3
	movdqa	%xmm5,48(%ebx)
	pshufb	(%eax),%xmm6
	movdqa	%xmm3,-32(%ebx)
	paddd	%xmm6,%xmm4
	movdqa	16(%ebx),%xmm5
	pxor	%xmm4,%xmm2
	movdqa	-64(%ebx),%xmm3
	movdqa	%xmm2,%xmm1
	pslld	$12,%xmm2
	psrld	$20,%xmm1
	por	%xmm1,%xmm2
	movdqa	-80(%ebx),%xmm1
	paddd	%xmm2,%xmm0
	movdqa	96(%ebx),%xmm7
	pxor	%xmm0,%xmm6
	movdqa	%xmm0,-96(%ebx)
	pshufb	16(%eax),%xmm6
	paddd	%xmm6,%xmm4
	movdqa	%xmm6,80(%ebx)
	pxor	%xmm4,%xmm2
	paddd	%xmm3,%xmm1
	movdqa	%xmm2,%xmm0
	pslld	$7,%xmm2
	psrld	$25,%xmm0
	pxor	%xmm1,%xmm7
	por	%xmm0,%xmm2
	pshufb	(%eax),%xmm7
	movdqa	%xmm2,-16(%ebx)
	paddd	%xmm7,%xmm5
	pxor	%xmm5,%xmm3
	movdqa	%xmm3,%xmm0
	pslld	$12,%xmm3
	psrld	$20,%xmm0
	por	%xmm0,%xmm3
	movdqa	-128(%ebx),%xmm0
	paddd	%xmm3,%xmm1
	movdqa	64(%ebx),%xmm6
	pxor	%xmm1,%xmm7
	movdqa	%xmm1,-80(%ebx)
	pshufb	16(%eax),%xmm7
	paddd	%xmm7,%xmm5
	movdqa	%xmm7,96(%ebx)
	pxor	%xmm5,%xmm3
	movdqa	%xmm3,%xmm1
	pslld	$7,%xmm3
	psrld	$25,%xmm1
	por	%xmm1,%xmm3
	decl	%edx
	jnz	L010loop
	# Add master state back, de-interleave 4x4 dword blocks, XOR with
	# input, store; repeated for the four 64-byte rows.
	movdqa	%xmm3,-64(%ebx)
	movdqa	%xmm4,(%ebx)
	movdqa	%xmm5,16(%ebx)
	movdqa	%xmm6,64(%ebx)
	movdqa	%xmm7,96(%ebx)
	movdqa	-112(%ebx),%xmm1
	movdqa	-96(%ebx),%xmm2
	movdqa	-80(%ebx),%xmm3
	paddd	-128(%ebp),%xmm0
	paddd	-112(%ebp),%xmm1
	paddd	-96(%ebp),%xmm2
	paddd	-80(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	-64(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	-48(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	-32(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	-16(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	-64(%ebp),%xmm0
	paddd	-48(%ebp),%xmm1
	paddd	-32(%ebp),%xmm2
	paddd	-16(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	16(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	32(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	48(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	(%ebp),%xmm0
	paddd	16(%ebp),%xmm1
	paddd	32(%ebp),%xmm2
	paddd	48(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	16(%esi),%esi
	pxor	%xmm0,%xmm4
	movdqa	64(%ebx),%xmm0
	pxor	%xmm1,%xmm5
	movdqa	80(%ebx),%xmm1
	pxor	%xmm2,%xmm6
	movdqa	96(%ebx),%xmm2
	pxor	%xmm3,%xmm7
	movdqa	112(%ebx),%xmm3
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	paddd	64(%ebp),%xmm0
	paddd	80(%ebp),%xmm1
	paddd	96(%ebp),%xmm2
	paddd	112(%ebp),%xmm3
	movdqa	%xmm0,%xmm6
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm6
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm6,%xmm3
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	movdqu	-128(%esi),%xmm4
	movdqu	-64(%esi),%xmm5
	movdqu	(%esi),%xmm2
	movdqu	64(%esi),%xmm7
	leal	208(%esi),%esi		# net advance: 256 bytes consumed
	pxor	%xmm0,%xmm4
	pxor	%xmm1,%xmm5
	pxor	%xmm2,%xmm6
	pxor	%xmm3,%xmm7
	movdqu	%xmm4,-128(%edi)
	movdqu	%xmm5,-64(%edi)
	movdqu	%xmm6,(%edi)
	movdqu	%xmm7,64(%edi)
	leal	208(%edi),%edi
	subl	$256,%ecx
	jnc	L009outer_loop
	addl	$256,%ecx
	jz	L011done
	# <256 bytes remain: rebuild counter xmm for the 1x loop.
	movl	520(%esp),%ebx
	leal	-128(%esi),%esi
	movl	516(%esp),%edx
	leal	-128(%edi),%edi
	movd	64(%ebp),%xmm2
	movdqu	(%ebx),%xmm3
	paddd	96(%eax),%xmm2		# counter += 4 (scalar lane)
	pand	112(%eax),%xmm3		# keep nonce words, mask counter lane
	por	%xmm2,%xmm3
L0081x:
	# One-block-at-a-time loop (state in xmm0..xmm3, one row each).
	movdqa	32(%eax),%xmm0		# sigma
	movdqu	(%edx),%xmm1
	movdqu	16(%edx),%xmm2
	movdqa	(%eax),%xmm6		# rot16 pshufb mask
	movdqa	16(%eax),%xmm7		# rot8 pshufb mask
	movl	%ebp,48(%esp)
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movl	$10,%edx
	jmp	L012loop1x
.align	4,0x90
L013outer1x:
	movdqa	80(%eax),%xmm3		# counter += 1
	movdqa	(%esp),%xmm0
	movdqa	16(%esp),%xmm1
	movdqa	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	movl	$10,%edx
	movdqa	%xmm3,48(%esp)
	jmp	L012loop1x
.align	4,0x90
L012loop1x:
	# .byte 102,15,56,0,222/223 = pshufb %xmm6/%xmm7,%xmm3 (rot16/rot8).
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2		# diagonalize
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2		# un-diagonalize
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decl	%edx
	jnz	L012loop1x
	paddd	(%esp),%xmm0
	paddd	16(%esp),%xmm1
	paddd	32(%esp),%xmm2
	paddd	48(%esp),%xmm3
	cmpl	$64,%ecx
	jb	L014tail
	movdqu	(%esi),%xmm4
	movdqu	16(%esi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%esi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%esi),%xmm5
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3
	leal	64(%esi),%esi
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	leal	64(%edi),%edi
	subl	$64,%ecx
	jnz	L013outer1x
	jmp	L011done
L014tail:
	# Final partial block: keystream to stack, byte-wise XOR.
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ebp,%ebp
L015tail_loop:
	movb	(%esp,%ebp,1),%al
	movb	(%esi,%ebp,1),%dl
	leal	1(%ebp),%ebp
	xorb	%dl,%al
	movb	%al,-1(%edi,%ebp,1)
	decl	%ecx
	jnz	L015tail_loop
L011done:
	movl	512(%esp),%esp		# restore pre-alignment stack pointer
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.align	6,0x90
Lssse3_data:
.byte	2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13	# pshufb mask: rotl 16
.byte	3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14	# pshufb mask: rotl 8
.long	1634760805,857760878,2036477234,1797285236	# sigma constants
.long	0,1,2,3					# per-lane counter offsets
.long	4,4,4,4					# counter += 4 (all lanes)
.long	1,0,0,0					# counter += 1 (1x loop)
.long	4,0,0,0					# counter += 4 (scalar lane)
.long	0,-1,-1,-1				# mask: clear counter lane
.align	6,0x90
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54
.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte	114,103,62,0
#-----------------------------------------------------------------------
# ChaCha20_xop — AMD XOP variant: same structure as the SSSE3 path, with
# the shift/shift/or rotates replaced by vprotd (emitted as .byte
# 143,232,120,194,... since gas may lack XOP mnemonics).
#-----------------------------------------------------------------------
.globl	_ChaCha20_xop
.type	_ChaCha20_xop,@function
.align	4
_ChaCha20_xop:
L_ChaCha20_xop_begin:
#ifdef __CET__

.byte	243,15,30,251
#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
Lxop_shortcut:
	movl	20(%esp),%edi		# out
	movl	24(%esp),%esi		# inp
	movl	28(%esp),%ecx		# len
	movl	32(%esp),%edx		# key
	movl	36(%esp),%ebx		# counter
	vzeroupper
	movl	%esp,%ebp
	subl	$524,%esp
	andl	$-64,%esp
	movl	%ebp,512(%esp)
	leal	Lssse3_data-Lpic_point(%eax),%eax	# %eax preloaded via shortcut
	vmovdqu	(%ebx),%xmm3
	cmpl	$256,%ecx
	jb	L0161x
	movl	%edx,516(%esp)
	movl	%ebx,520(%esp)
	subl	$256,%ecx
	leal	384(%esp),%ebp
	vmovdqu	(%edx),%xmm7
	vpshufd	$0,%xmm3,%xmm0
	vpshufd	$85,%xmm3,%xmm1
	vpshufd	$170,%xmm3,%xmm2
	vpshufd	$255,%xmm3,%xmm3
	vpaddd	48(%eax),%xmm0,%xmm0
	vpshufd	$0,%xmm7,%xmm4
	vpshufd	$85,%xmm7,%xmm5
	vpsubd	64(%eax),%xmm0,%xmm0
	vpshufd	$170,%xmm7,%xmm6
	vpshufd	$255,%xmm7,%xmm7
	vmovdqa	%xmm0,64(%ebp)
	vmovdqa	%xmm1,80(%ebp)
	vmovdqa	%xmm2,96(%ebp)
	vmovdqa	%xmm3,112(%ebp)
	vmovdqu	16(%edx),%xmm3
	vmovdqa	%xmm4,-64(%ebp)
	vmovdqa	%xmm5,-48(%ebp)
	vmovdqa	%xmm6,-32(%ebp)
	vmovdqa	%xmm7,-16(%ebp)
	vmovdqa	32(%eax),%xmm7
	leal	128(%esp),%ebx
	vpshufd	$0,%xmm3,%xmm0
	vpshufd	$85,%xmm3,%xmm1
	vpshufd	$170,%xmm3,%xmm2
	vpshufd	$255,%xmm3,%xmm3
	vpshufd	$0,%xmm7,%xmm4
	vpshufd	$85,%xmm7,%xmm5
	vpshufd	$170,%xmm7,%xmm6
	vpshufd	$255,%xmm7,%xmm7
	vmovdqa	%xmm0,(%ebp)
	vmovdqa	%xmm1,16(%ebp)
	vmovdqa	%xmm2,32(%ebp)
	vmovdqa	%xmm3,48(%ebp)
	vmovdqa	%xmm4,-128(%ebp)
	vmovdqa	%xmm5,-112(%ebp)
	vmovdqa	%xmm6,-96(%ebp)
	vmovdqa	%xmm7,-80(%ebp)
	leal	128(%esi),%esi
	leal	128(%edi),%edi
	jmp	L017outer_loop
.align	5,0x90
L017outer_loop:
	vmovdqa	-112(%ebp),%xmm1
	vmovdqa	-96(%ebp),%xmm2
	vmovdqa	-80(%ebp),%xmm3
	vmovdqa	-48(%ebp),%xmm5
	vmovdqa	-32(%ebp),%xmm6
	vmovdqa	-16(%ebp),%xmm7
	vmovdqa	%xmm1,-112(%ebx)
	vmovdqa	%xmm2,-96(%ebx)
	vmovdqa	%xmm3,-80(%ebx)
	vmovdqa	%xmm5,-48(%ebx)
	vmovdqa	%xmm6,-32(%ebx)
	vmovdqa	%xmm7,-16(%ebx)
	vmovdqa	32(%ebp),%xmm2
	vmovdqa	48(%ebp),%xmm3
	vmovdqa	64(%ebp),%xmm4
	vmovdqa	80(%ebp),%xmm5
	vmovdqa	96(%ebp),%xmm6
	vmovdqa	112(%ebp),%xmm7
	vpaddd	64(%eax),%xmm4,%xmm4	# counters += 4
	vmovdqa	%xmm2,32(%ebx)
	vmovdqa	%xmm3,48(%ebx)
	vmovdqa	%xmm4,64(%ebx)
	vmovdqa	%xmm5,80(%ebx)
	vmovdqa	%xmm6,96(%ebx)
	vmovdqa	%xmm7,112(%ebx)
	vmovdqa	%xmm4,64(%ebp)
	vmovdqa	-128(%ebp),%xmm0
	vmovdqa	%xmm4,%xmm6
	vmovdqa	-64(%ebp),%xmm3
	vmovdqa	(%ebp),%xmm4
	vmovdqa	16(%ebp),%xmm5
	movl	$10,%edx
	nop
.align	5,0x90
L018loop:
	# .byte 143,232,120,194,REG,IMM = vprotd $IMM (rotate left by 16/12/8/7).
	vpaddd	%xmm3,%xmm0,%xmm0
	vpxor	%xmm0,%xmm6,%xmm6
.byte	143,232,120,194,246,16
	vpaddd	%xmm6,%xmm4,%xmm4
	vpxor	%xmm4,%xmm3,%xmm2
	vmovdqa	-112(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-48(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	80(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-128(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,64(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
	vmovdqa	%xmm4,(%ebx)
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-64(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	32(%ebx),%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-96(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vmovdqa	-32(%ebx),%xmm2
	vpaddd	%xmm3,%xmm1,%xmm1
	vmovdqa	96(%ebx),%xmm6
	vpxor	%xmm1,%xmm7,%xmm7
	vpaddd	%xmm2,%xmm0,%xmm0
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-112(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	%xmm7,80(%ebx)
	vpxor	%xmm5,%xmm3,%xmm3
	vpxor	%xmm0,%xmm6,%xmm6
.byte	143,232,120,194,219,7
	vmovdqa	%xmm5,16(%ebx)
.byte	143,232,120,194,246,16
	vmovdqa	%xmm3,-48(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	48(%ebx),%xmm5
	vpxor	%xmm4,%xmm2,%xmm2
	vmovdqa	-80(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-16(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	112(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-96(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,96(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-32(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-128(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vmovdqa	-48(%ebx),%xmm2
	vpaddd	%xmm3,%xmm1,%xmm1
	vpxor	%xmm1,%xmm7,%xmm7
	vpaddd	%xmm2,%xmm0,%xmm0
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-80(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vpxor	%xmm5,%xmm3,%xmm3
	vpxor	%xmm0,%xmm7,%xmm6
.byte	143,232,120,194,219,7
.byte	143,232,120,194,246,16
	vmovdqa	%xmm3,-16(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vpxor	%xmm4,%xmm2,%xmm2
	vmovdqa	-112(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-32(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	64(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-128(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,112(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
	vmovdqa	%xmm4,32(%ebx)
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-48(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	(%ebx),%xmm4
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-96(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vmovdqa	-16(%ebx),%xmm2
	vpaddd	%xmm3,%xmm1,%xmm1
	vmovdqa	80(%ebx),%xmm6
	vpxor	%xmm1,%xmm7,%xmm7
	vpaddd	%xmm2,%xmm0,%xmm0
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-112(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	%xmm7,64(%ebx)
	vpxor	%xmm5,%xmm3,%xmm3
	vpxor	%xmm0,%xmm6,%xmm6
.byte	143,232,120,194,219,7
	vmovdqa	%xmm5,48(%ebx)
.byte	143,232,120,194,246,16
	vmovdqa	%xmm3,-32(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	16(%ebx),%xmm5
	vpxor	%xmm4,%xmm2,%xmm2
	vmovdqa	-80(%ebx),%xmm1
.byte	143,232,120,194,210,12
	vmovdqa	-64(%ebx),%xmm3
	vpaddd	%xmm2,%xmm0,%xmm0
	vmovdqa	96(%ebx),%xmm7
	vpxor	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm3,%xmm1,%xmm1
.byte	143,232,120,194,246,8
	vmovdqa	%xmm0,-96(%ebx)
	vpaddd	%xmm6,%xmm4,%xmm4
	vmovdqa	%xmm6,80(%ebx)
	vpxor	%xmm4,%xmm2,%xmm2
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,255,16
	vmovdqa	%xmm2,-16(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vpxor	%xmm5,%xmm3,%xmm3
	vmovdqa	-128(%ebx),%xmm0
.byte	143,232,120,194,219,12
	vpaddd	%xmm3,%xmm1,%xmm1
	vmovdqa	64(%ebx),%xmm6
	vpxor	%xmm1,%xmm7,%xmm7
.byte	143,232,120,194,255,8
	vmovdqa	%xmm1,-80(%ebx)
	vpaddd	%xmm7,%xmm5,%xmm5
	vmovdqa	%xmm7,96(%ebx)
	vpxor	%xmm5,%xmm3,%xmm3
.byte	143,232,120,194,219,7
	decl	%edx
	jnz	L018loop
	# Add master state back, transpose, XOR, store — four 64-byte rows.
	vmovdqa	%xmm3,-64(%ebx)
	vmovdqa	%xmm4,(%ebx)
	vmovdqa	%xmm5,16(%ebx)
	vmovdqa	%xmm6,64(%ebx)
	vmovdqa	%xmm7,96(%ebx)
	vmovdqa	-112(%ebx),%xmm1
	vmovdqa	-96(%ebx),%xmm2
	vmovdqa	-80(%ebx),%xmm3
	vpaddd	-128(%ebp),%xmm0,%xmm0
	vpaddd	-112(%ebp),%xmm1,%xmm1
	vpaddd	-96(%ebp),%xmm2,%xmm2
	vpaddd	-80(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	16(%esi),%esi
	vmovdqa	-64(%ebx),%xmm0
	vmovdqa	-48(%ebx),%xmm1
	vmovdqa	-32(%ebx),%xmm2
	vmovdqa	-16(%ebx),%xmm3
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	vpaddd	-64(%ebp),%xmm0,%xmm0
	vpaddd	-48(%ebp),%xmm1,%xmm1
	vpaddd	-32(%ebp),%xmm2,%xmm2
	vpaddd	-16(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	16(%esi),%esi
	vmovdqa	(%ebx),%xmm0
	vmovdqa	16(%ebx),%xmm1
	vmovdqa	32(%ebx),%xmm2
	vmovdqa	48(%ebx),%xmm3
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	vpaddd	(%ebp),%xmm0,%xmm0
	vpaddd	16(%ebp),%xmm1,%xmm1
	vpaddd	32(%ebp),%xmm2,%xmm2
	vpaddd	48(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	16(%esi),%esi
	vmovdqa	64(%ebx),%xmm0
	vmovdqa	80(%ebx),%xmm1
	vmovdqa	96(%ebx),%xmm2
	vmovdqa	112(%ebx),%xmm3
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	16(%edi),%edi
	vpaddd	64(%ebp),%xmm0,%xmm0
	vpaddd	80(%ebp),%xmm1,%xmm1
	vpaddd	96(%ebp),%xmm2,%xmm2
	vpaddd	112(%ebp),%xmm3,%xmm3
	vpunpckldq	%xmm1,%xmm0,%xmm6
	vpunpckldq	%xmm3,%xmm2,%xmm7
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm7,%xmm6,%xmm1
	vpunpckhqdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm2,%xmm0,%xmm7
	vpunpckhqdq	%xmm2,%xmm0,%xmm3
	vpxor	-128(%esi),%xmm1,%xmm4
	vpxor	-64(%esi),%xmm6,%xmm5
	vpxor	(%esi),%xmm7,%xmm6
	vpxor	64(%esi),%xmm3,%xmm7
	leal	208(%esi),%esi		# net advance: 256 bytes consumed
	vmovdqu	%xmm4,-128(%edi)
	vmovdqu	%xmm5,-64(%edi)
	vmovdqu	%xmm6,(%edi)
	vmovdqu	%xmm7,64(%edi)
	leal	208(%edi),%edi
	subl	$256,%ecx
	jnc	L017outer_loop
	addl	$256,%ecx
	jz	L019done
	movl	520(%esp),%ebx
	leal	-128(%esi),%esi
	movl	516(%esp),%edx
	leal	-128(%edi),%edi
	vmovd	64(%ebp),%xmm2
	vmovdqu	(%ebx),%xmm3
	vpaddd	96(%eax),%xmm2,%xmm2	# counter += 4 (scalar lane)
	vpand	112(%eax),%xmm3,%xmm3	# mask counter lane, keep nonce
	vpor	%xmm2,%xmm3,%xmm3
L0161x:
	# One-block loop; vprotd does the 16/12/8/7 rotates directly.
	vmovdqa	32(%eax),%xmm0		# sigma
	vmovdqu	(%edx),%xmm1
	vmovdqu	16(%edx),%xmm2
	vmovdqa	(%eax),%xmm6
	vmovdqa	16(%eax),%xmm7
	movl	%ebp,48(%esp)
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	movl	$10,%edx
	jmp	L020loop1x
.align	4,0x90
L021outer1x:
	vmovdqa	80(%eax),%xmm3		# counter += 1
	vmovdqa	(%esp),%xmm0
	vmovdqa	16(%esp),%xmm1
	vmovdqa	32(%esp),%xmm2
	vpaddd	48(%esp),%xmm3,%xmm3
	movl	$10,%edx
	vmovdqa	%xmm3,48(%esp)
	jmp	L020loop1x
.align	4,0x90
L020loop1x:
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,16
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,12
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,8
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,7
	vpshufd	$78,%xmm2,%xmm2		# diagonalize
	vpshufd	$57,%xmm1,%xmm1
	vpshufd	$147,%xmm3,%xmm3
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,16
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,12
	vpaddd	%xmm1,%xmm0,%xmm0
	vpxor	%xmm0,%xmm3,%xmm3
.byte	143,232,120,194,219,8
	vpaddd	%xmm3,%xmm2,%xmm2
	vpxor	%xmm2,%xmm1,%xmm1
.byte	143,232,120,194,201,7
	vpshufd	$78,%xmm2,%xmm2		# un-diagonalize
	vpshufd	$147,%xmm1,%xmm1
	vpshufd	$57,%xmm3,%xmm3
	decl	%edx
	jnz	L020loop1x
	vpaddd	(%esp),%xmm0,%xmm0
	vpaddd	16(%esp),%xmm1,%xmm1
	vpaddd	32(%esp),%xmm2,%xmm2
	vpaddd	48(%esp),%xmm3,%xmm3
	cmpl	$64,%ecx
	jb	L022tail
	vpxor	(%esi),%xmm0,%xmm0
	vpxor	16(%esi),%xmm1,%xmm1
	vpxor	32(%esi),%xmm2,%xmm2
	vpxor	48(%esi),%xmm3,%xmm3
	leal	64(%esi),%esi
	vmovdqu	%xmm0,(%edi)
	vmovdqu	%xmm1,16(%edi)
	vmovdqu	%xmm2,32(%edi)
	vmovdqu	%xmm3,48(%edi)
	leal	64(%edi),%edi
	subl	$64,%ecx
	jnz	L021outer1x
	jmp	L019done
L022tail:
	# Final partial block: keystream to stack, byte-wise XOR.
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	xorl	%eax,%eax
	xorl	%edx,%edx
	xorl	%ebp,%ebp
L023tail_loop:
	movb	(%esp,%ebp,1),%al
	movb	(%esi,%ebp,1),%dl
	leal	1(%ebp),%ebp
	xorb	%dl,%al
	movb	%al,-1(%edi,%ebp,1)
	decl	%ecx
	jnz	L023tail_loop
L019done:
	vzeroupper			# ABI hygiene after AVX/XOP code
	movl	512(%esp),%esp
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.comm	_OPENSSL_ia32cap_P,16