/*
 * Poly1305 for 32-bit x86 -- GNU as, AT&T syntax, ELF, position independent.
 * All arguments are passed on the stack; every entry point that is reached
 * through a function pointer pushes %ebp,%ebx,%esi,%edi, so the first
 * argument is found at 20(%esp).
 *
 * Context layout as used by the code below:
 *    0..16   accumulator h (five 32-bit words)
 *   20       flag: 0 = h is kept in base 2^32, 1 = h is kept in base 2^26
 *   24..36   clamped key r (four 32-bit words)
 *   48..191  table written by _poly1305_init_sse2/_poly1305_init_avx2:
 *            five rows of 26-bit limbs followed by four rows of 5*limb
 *
 * This file is run through the C preprocessor (#ifdef __CET__), so only
 * C-style comments are used.
 *
 * NOTE(review): .Lconst_sse2 and OPENSSL_ia32cap_P are defined outside this
 * chunk.  64(%ebx) is used as the per-limb mask and (%ebx) is OR-ed into the
 * top limb of every input block -- presumably 2^26-1 and the 2^128 pad bit;
 * confirm against the table definition.
 */
.text
.align	64
.globl	poly1305_init
.type	poly1305_init,@function
.align	16
/*
 * poly1305_init(arg0 = ctx, arg1 = key, arg2 = function table)
 *
 * Zeroes ctx[0..23].  If key is NULL returns 0 without touching anything
 * else.  Otherwise stores the selected "blocks" and "emit" entry points at
 * arg2[0] and arg2[1], stores the clamped 16-byte key at ctx+24 and
 * returns 1.
 */
poly1305_init:
.L_poly1305_init_begin:
#ifdef __CET__
	.byte	243,15,30,251		/* F3 0F 1E FB = endbr32 */
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ebp
	xorl	%eax,%eax
	/* clear h[0..4] and the base 2^26 flag */
	movl	%eax,(%edi)
	movl	%eax,4(%edi)
	movl	%eax,8(%edi)
	movl	%eax,12(%edi)
	movl	%eax,16(%edi)
	movl	%eax,20(%edi)
	cmpl	$0,%esi
	je	.L000nokey
	call	.L001pic_point
.L001pic_point:
	popl	%ebx
	/* default: scalar implementation */
	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
	leal	poly1305_emit-.L001pic_point(%ebx),%edx
	leal	OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
	/* both bit 24 and bit 26 of capability word 0 must be set */
	/* (mask 0x05000000; presumably FXSR and SSE2 -- see label name) */
	movl	(%edi),%ecx
	andl	$83886080,%ecx
	cmpl	$83886080,%ecx
	jne	.L002no_sse2
	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
	/* bit 5 of capability word 2 selects the AVX2 block function */
	movl	8(%edi),%ecx
	testl	$32,%ecx
	jz	.L002no_sse2
	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
	movl	20(%esp),%edi
	movl	%eax,(%ebp)
	movl	%edx,4(%ebp)
	/* clamp r: word 0 & 0x0fffffff, words 1..3 & 0x0ffffffc */
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	andl	$268435455,%eax
	andl	$268435452,%ebx
	andl	$268435452,%ecx
	andl	$268435452,%edx
	movl	%eax,24(%edi)
	movl	%ebx,28(%edi)
	movl	%ecx,32(%edi)
	movl	%edx,36(%edi)
	movl	$1,%eax
.L000nokey:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_init,.-.L_poly1305_init_begin
.globl	poly1305_blocks
.type	poly1305_blocks,@function
.align	16
/*
 * poly1305_blocks(arg0 = ctx, arg1 = inp, arg2 = len, arg3 = padbit)
 *
 * Scalar path, h kept in base 2^32.  For every 16-byte block:
 *   h = (h + block + padbit*2^128) * r, partially reduced mod 2^130-5.
 * .Lenter_blocks is also the target of the SIMD entry points, which jump
 * here with %edi = ctx, %esi = inp, %ecx = len and the same four registers
 * already pushed.
 *
 * Frame after "subl $64,%esp":
 *    0..16(%esp)  scratch copies of h0, h3, h4
 *   20..28(%esp)  d0..d2 (low words of the product)
 *   36..48(%esp)  r0..r3
 *   52..60(%esp)  s1..s3 where s = r + (r >> 2)
 *   84(%esp) ctx, 92(%esp) end-of-input pointer (overwrites len),
 *   96(%esp) padbit
 */
poly1305_blocks:
.L_poly1305_blocks_begin:
#ifdef __CET__
	.byte	243,15,30,251		/* F3 0F 1E FB = endbr32 */
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
.Lenter_blocks:
	/*
	 * Round len down to a whole number of 16-byte blocks.  The mask has
	 * to be -16: a mask of -15 leaves bit 0 set, and because the loop
	 * below advances by 16 and exits only on exact pointer equality, an
	 * odd length would never terminate.
	 */
	andl	$-16,%ecx
	jz	.L003nodata
	subl	$64,%esp
	movl	24(%edi),%eax
	movl	28(%edi),%ebx
	leal	(%esi,%ecx,1),%ebp
	movl	32(%edi),%ecx
	movl	36(%edi),%edx
	movl	%ebp,92(%esp)
	movl	%esi,%ebp
	movl	%eax,36(%esp)
	movl	%ebx,%eax
	shrl	$2,%eax
	movl	%ebx,40(%esp)
	addl	%ebx,%eax
	movl	%ecx,%ebx
	shrl	$2,%ebx
	movl	%ecx,44(%esp)
	addl	%ecx,%ebx
	movl	%edx,%ecx
	shrl	$2,%ecx
	movl	%edx,48(%esp)
	addl	%edx,%ecx
	movl	%eax,52(%esp)
	movl	%ebx,56(%esp)
	movl	%ecx,60(%esp)
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%esi
	movl	16(%edi),%edi
	jmp	.L004loop
.align	32
.L004loop:
	/* h += block, top word += padbit */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%esi
	leal	16(%ebp),%ebp
	adcl	96(%esp),%edi
	movl	%eax,(%esp)
	movl	%esi,12(%esp)
	/* d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1 */
	mull	36(%esp)
	movl	%edi,16(%esp)
	movl	%eax,%edi
	movl	%ebx,%eax
	movl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	56(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	52(%esp)
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	%edx,%esi
	/* d1 = h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1 */
	mull	40(%esp)
	movl	%edi,20(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	60(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	56(%esp)
	addl	%eax,%esi
	movl	16(%esp),%eax
	adcl	%edx,%edi
	imull	52(%esp),%eax
	addl	%eax,%esi
	movl	(%esp),%eax
	adcl	$0,%edi
	/* d2 = h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2 */
	mull	44(%esp)
	movl	%esi,24(%esp)
	xorl	%esi,%esi
	addl	%eax,%edi
	movl	%ebx,%eax
	adcl	%edx,%esi
	mull	40(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	36(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	16(%esp),%eax
	adcl	%edx,%esi
	imull	56(%esp),%eax
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	$0,%esi
	/* d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3 */
	mull	48(%esp)
	movl	%edi,28(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	44(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	40(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	16(%esp),%ecx
	adcl	%edx,%edi
	movl	%ecx,%edx
	imull	60(%esp),%ecx
	addl	%ecx,%esi
	movl	20(%esp),%eax
	adcl	$0,%edi
	/* d4 = carry + h4*r0 */
	imull	36(%esp),%edx
	addl	%edi,%edx
	movl	24(%esp),%ebx
	movl	28(%esp),%ecx
	/* fold bits above 2^130 back in: h += (d4 >> 2) * 5, h4 = d4 & 3 */
	movl	%edx,%edi
	shrl	$2,%edx
	andl	$3,%edi
	leal	(%edx,%edx,4),%edx
	addl	%edx,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%esi
	adcl	$0,%edi
	cmpl	92(%esp),%ebp
	jne	.L004loop
	movl	84(%esp),%edx
	addl	$64,%esp
	movl	%eax,(%edx)
	movl	%ebx,4(%edx)
	movl	%ecx,8(%edx)
	movl	%esi,12(%edx)
	movl	%edi,16(%edx)
.L003nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_blocks,.-.L_poly1305_blocks_begin
.globl	poly1305_emit
.type	poly1305_emit,@function
.align	16
/*
 * poly1305_emit(arg0 = ctx, arg1 = out, arg2 = addend)
 *
 * Scalar path.  Computes h+5; if that carries into bit 2 of the fifth
 * word, h+5 is selected, otherwise h (both choices are computed and merged
 * with masks, no branch).  The 128-bit value at arg2 is then added and the
 * low 16 bytes are written to out.  .Lenter_emit is also the target of
 * _poly1305_emit_sse2 (entered with %ebp = ctx and four registers pushed).
 */
poly1305_emit:
.L_poly1305_emit_begin:
#ifdef __CET__
	.byte	243,15,30,251		/* F3 0F 1E FB = endbr32 */
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
.Lenter_emit:
	movl	24(%esp),%edi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	addl	$5,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* %esi = all-ones if h+5 reached 2^130, else 0 */
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	notl	%esi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	28(%esp),%ebp
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	orl	(%edi),%eax
	orl	4(%edi),%ebx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%edx
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_emit,.-.L_poly1305_emit_begin
.align	32
.type	_poly1305_init_sse2,@function
.align	16
/*
 * _poly1305_init_sse2 -- internal helper with a private register contract.
 *   In:   %edi = ctx, %ebx = address of .Lconst_sse2
 *   Out:  table at ctx+48..ctx+191 filled in; %edi restored to ctx;
 *         %xmm7 = mask loaded from 64(%ebx)
 *   Uses: %ebp (holds the caller's %esp), %ecx, %edx, %xmm0-%xmm7
 *
 * Splits the key at ctx+24 into five 26-bit limbs and runs the multiply
 * loop twice.  Each stored 16-byte row holds one limb of four values in
 * the dword order produced by "pshufd $141" (r^4, r^3, r^2, r^1); rows
 * 80..128 hold 5*limb for limbs 1..4.
 */
_poly1305_init_sse2:
#ifdef __CET__
	.byte	243,15,30,251		/* F3 0F 1E FB = endbr32 */
#endif
	movdqu	24(%edi),%xmm4
	leal	48(%edi),%edi
	movl	%esp,%ebp
	subl	$224,%esp
	andl	$-16,%esp
	movq	64(%ebx),%xmm7
	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	movdqa	%xmm4,%xmm2
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm1
	psrldq	$6,%xmm2
	pand	%xmm7,%xmm1
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	psrldq	$13,%xmm4
	leal	144(%esp),%edx
	movl	$2,%ecx
.L005square:
	/* save limbs at 0..64(%esp) and 5*limb at 80..128(%esp) */
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqa	%xmm6,80(%esp)
	movdqa	%xmm5,96(%esp)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqa	%xmm6,112(%esp)
	movdqa	%xmm5,128(%esp)
	/* broadcast the low qword of every limb to 0..64(%edx) */
	pshufd	$68,%xmm0,%xmm6
	movdqa	%xmm1,%xmm5
	pshufd	$68,%xmm1,%xmm1
	pshufd	$68,%xmm2,%xmm2
	pshufd	$68,%xmm3,%xmm3
	pshufd	$68,%xmm4,%xmm4
	movdqa	%xmm6,(%edx)
	movdqa	%xmm1,16(%edx)
	movdqa	%xmm2,32(%edx)
	movdqa	%xmm3,48(%edx)
	movdqa	%xmm4,64(%edx)
	/* five-limb multiply, sums accumulated in %xmm0..%xmm4 */
	pmuludq	%xmm0,%xmm4
	pmuludq	%xmm0,%xmm3
	pmuludq	%xmm0,%xmm2
	pmuludq	%xmm0,%xmm1
	pmuludq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	80(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%edx),%xmm6
	movdqa	32(%esp),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	96(%esp),%xmm7
	pmuludq	(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%edx),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%edx),%xmm5
	movdqa	48(%esp),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	112(%esp),%xmm5
	pmuludq	(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%edx),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%edx),%xmm7
	movdqa	64(%esp),%xmm5
	paddq	%xmm6,%xmm1
	movdqa	128(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%edx),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation back to 26-bit limbs; the carry out of the */
	/* top limb is multiplied by 5 (x + 4x) and added to limb 0 */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	decl	%ecx
	jz	.L006square_break
	/* keep the new value in the low qword, previous one in the high */
	punpcklqdq	(%esp),%xmm0
	punpcklqdq	16(%esp),%xmm1
	punpcklqdq	32(%esp),%xmm2
	punpcklqdq	48(%esp),%xmm3
	punpcklqdq	64(%esp),%xmm4
	jmp	.L005square
.L006square_break:
	/* interleave with the values saved at (%esp) and store the table */
	psllq	$32,%xmm0
	psllq	$32,%xmm1
	psllq	$32,%xmm2
	psllq	$32,%xmm3
	psllq	$32,%xmm4
	por	(%esp),%xmm0
	por	16(%esp),%xmm1
	por	32(%esp),%xmm2
	por	48(%esp),%xmm3
	por	64(%esp),%xmm4
	pshufd	$141,%xmm0,%xmm0
	pshufd	$141,%xmm1,%xmm1
	pshufd	$141,%xmm2,%xmm2
	pshufd	$141,%xmm3,%xmm3
	pshufd	$141,%xmm4,%xmm4
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	movdqu	%xmm4,64(%edi)
	/* rows 80..128: 5*limb for limbs 1..4 */
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqu	%xmm6,80(%edi)
	movdqu	%xmm5,96(%edi)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqu	%xmm6,112(%edi)
	movdqu	%xmm5,128(%edi)
	movl	%ebp,%esp
	leal	-48(%edi),%edi
	ret
.size	_poly1305_init_sse2,.-_poly1305_init_sse2
.align	32
.type	_poly1305_blocks_sse2,@function
.align	16
/*
 * _poly1305_blocks_sse2(arg0 = ctx, arg1 = inp, arg2 = len, arg3 = padbit)
 *
 * SSE2 path, h kept in base 2^26 once the table has been built.
 *   - len is rounded down to a multiple of 16; nothing to do if it is 0.
 *   - If len < 64 and the base 2^26 flag at ctx+20 is still 0, the scalar
 *     loop is used instead (jump to .Lenter_blocks).
 *   - On first use the table is built with _poly1305_init_sse2, h is
 *     converted to five 26-bit limbs and the flag is set to 1.
 *   - If (len & 31) != 0 one block is processed on its own first, then
 *     the main loop consumes 64 bytes per iteration and the tail code
 *     handles the final 32 or 64 bytes.
 * The five limbs of h are written back to ctx+0..16 at .L013done.
 */
_poly1305_blocks_sse2:
#ifdef __CET__
	.byte	243,15,30,251		/* F3 0F 1E FB = endbr32 */
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	20(%edi),%eax
	andl	$-16,%ecx
	jz	.L007nodata
	cmpl	$64,%ecx
	jae	.L008enter_sse2
	testl	%eax,%eax
	jz	.Lenter_blocks
.align	16
.L008enter_sse2:
	call	.L009pic_point
.L009pic_point:
	popl	%ebx
	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
	testl	%eax,%eax
	jnz	.L010base2_26
	call	_poly1305_init_sse2
	/* convert h from base 2^32 to five 26-bit limbs */
	movl	(%edi),%eax
	movl	3(%edi),%ecx
	movl	6(%edi),%edx
	movl	9(%edi),%esi
	movl	13(%edi),%ebp
	movl	$1,20(%edi)
	shrl	$2,%ecx
	andl	$67108863,%eax
	shrl	$4,%edx
	andl	$67108863,%ecx
	shrl	$6,%esi
	andl	$67108863,%edx
	movd	%eax,%xmm0
	movd	%ecx,%xmm1
	movd	%edx,%xmm2
	movd	%esi,%xmm3
	movd	%ebp,%xmm4
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	jmp	.L011base2_32
.align	16
.L010base2_26:
	movd	(%edi),%xmm0
	movd	4(%edi),%xmm1
	movd	8(%edi),%xmm2
	movd	12(%edi),%xmm3
	movd	16(%edi),%xmm4
	movdqa	64(%ebx),%xmm7
.L011base2_32:
	movl	32(%esp),%eax
	movl	%esp,%ebp
	subl	$528,%esp
	andl	$-16,%esp
	leal	48(%edi),%edi
	shll	$24,%eax
	testl	$31,%ecx
	jz	.L012even
	/* odd number of blocks: h = (h + block) * r using dword 3 of */
	/* each table row */
	movdqu	(%esi),%xmm6
	leal	16(%esi),%esi
	movdqa	%xmm6,%xmm5
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	psrlq	$26,%xmm5
	psrldq	$6,%xmm6
	pand	%xmm7,%xmm5
	paddd	%xmm5,%xmm1
	movdqa	%xmm6,%xmm5
	psrlq	$4,%xmm6
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm2
	movdqa	%xmm5,%xmm6
	psrlq	$30,%xmm5
	pand	%xmm7,%xmm5
	psrldq	$7,%xmm6
	paddd	%xmm5,%xmm3
	movd	%eax,%xmm5
	paddd	%xmm6,%xmm4
	movd	12(%edi),%xmm6
	paddd	%xmm5,%xmm4
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	pmuludq	%xmm6,%xmm0
	pmuludq	%xmm6,%xmm1
	pmuludq	%xmm6,%xmm2
	movd	28(%edi),%xmm5
	pmuludq	%xmm6,%xmm3
	pmuludq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movd	92(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	movd	44(%edi),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	movd	108(%edi),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	movd	60(%edi),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	movd	124(%edi),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	movd	76(%edi),%xmm5
	paddq	%xmm6,%xmm1
	movd	140(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation back to 26-bit limbs */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	subl	$16,%ecx
	jz	.L013done
.L012even:
	/*
	 * Expand the table onto the stack: the low two dwords of every row
	 * go to 0..128(%edx), the high two dwords to -144..-16(%edx).
	 * The flags set by "subl $64,%ecx" stay live until the jbe below
	 * (only lea, cmov and SSE instructions follow).
	 */
	leal	384(%esp),%edx
	leal	-32(%esi),%eax
	subl	$64,%ecx
	movdqu	(%edi),%xmm5
	pshufd	$68,%xmm5,%xmm6
	cmovbl	%eax,%esi
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,(%edx)
	leal	160(%esp),%eax
	movdqu	16(%edi),%xmm6
	movdqa	%xmm5,-144(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,16(%edx)
	movdqu	32(%edi),%xmm5
	movdqa	%xmm6,-128(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,32(%edx)
	movdqu	48(%edi),%xmm6
	movdqa	%xmm5,-112(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,48(%edx)
	movdqu	64(%edi),%xmm5
	movdqa	%xmm6,-96(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,64(%edx)
	movdqu	80(%edi),%xmm6
	movdqa	%xmm5,-80(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,80(%edx)
	movdqu	96(%edi),%xmm5
	movdqa	%xmm6,-64(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,96(%edx)
	movdqu	112(%edi),%xmm6
	movdqa	%xmm5,-48(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,112(%edx)
	movdqu	128(%edi),%xmm5
	movdqa	%xmm6,-32(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,128(%edx)
	movdqa	%xmm5,-16(%edx)
	/* load two blocks and split them into 26-bit limbs */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	jbe	.L014skip_loop
	jmp	.L015loop
.align	32
.L015loop:
	/* first half: multiply by the rows at -144..-16(%edx) */
	movdqa	-144(%edx),%xmm7
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	pmuludq	-16(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	-128(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	-96(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	-80(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-128(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-112(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	-96(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-32(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-16(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	-128(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-16(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	-128(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	-16(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	-64(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* load the other two blocks of this iteration, add saved h */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	leal	-32(%esi),%eax
	/* flags from this subl are consumed by cmovbl and by the "ja" */
	/* at the bottom of the loop */
	subl	$64,%ecx
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	cmovbl	%eax,%esi
	leal	160(%esp),%eax
	/* second half: multiply by the rows at 0..128(%edx) */
	movdqa	(%edx),%xmm7
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	paddq	%xmm0,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	paddq	16(%esp),%xmm6
	paddq	32(%esp),%xmm2
	paddq	48(%esp),%xmm3
	paddq	64(%esp),%xmm4
	pmuludq	128(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	16(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	112(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	128(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	128(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	128(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	80(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation back to 26-bit limbs */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	/* preload and split the next pair of blocks */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	ja	.L015loop
.L014skip_loop:
	/*
	 * Tail.  The flags set by "addl $32,%ecx" select the long tail here
	 * (jnz) and are tested again by "jz .L017short_tail" further down;
	 * everything in between is SSE or lea and leaves them untouched.
	 */
	pshufd	$16,-144(%edx),%xmm7
	addl	$32,%ecx
	jnz	.L016long_tail
	paddd	%xmm0,%xmm5
	paddd	%xmm1,%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
.L016long_tail:
	movdqa	%xmm5,(%eax)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	pmuludq	%xmm7,%xmm5
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	movdqa	%xmm5,%xmm0
	pshufd	$16,-128(%edx),%xmm5
	pmuludq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm1
	pmuludq	%xmm7,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%eax),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,-64(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%eax),%xmm6
	pshufd	$16,-112(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%eax),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%eax),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,-48(%edx),%xmm7
	pmuludq	(%eax),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%eax),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%eax),%xmm5
	pshufd	$16,-96(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%eax),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,-32(%edx),%xmm5
	pmuludq	(%eax),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%eax),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%eax),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%eax),%xmm7
	pshufd	$16,-80(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,-16(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%eax),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%eax),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	jz	.L017short_tail
	/* long tail only: fold in the remaining pair of blocks plus h */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	pshufd	$16,(%edx),%xmm7
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	movdqa	%xmm5,(%esp)
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,16(%esp)
	pmuludq	%xmm7,%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm2,%xmm5
	pmuludq	%xmm7,%xmm2
	paddq	%xmm6,%xmm1
	movdqa	%xmm3,%xmm6
	pmuludq	%xmm7,%xmm3
	paddq	32(%esp),%xmm2
	movdqa	%xmm5,32(%esp)
	pshufd	$16,16(%edx),%xmm5
	paddq	48(%esp),%xmm3
	movdqa	%xmm6,48(%esp)
	movdqa	%xmm4,%xmm6
	pmuludq	%xmm7,%xmm4
	paddq	64(%esp),%xmm4
	movdqa	%xmm6,64(%esp)
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,80(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	pshufd	$16,32(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,96(%edx),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	pshufd	$16,48(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,112(%edx),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	pshufd	$16,64(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,128(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
.L017short_tail:
	/* add the two 64-bit lanes of every sum, then propagate carries */
	pshufd	$78,%xmm4,%xmm6
	pshufd	$78,%xmm3,%xmm5
	paddq	%xmm6,%xmm4
	paddq	%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm6
	pshufd	$78,%xmm1,%xmm5
	paddq	%xmm6,%xmm0
	paddq	%xmm5,%xmm1
	pshufd	$78,%xmm2,%xmm6
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm6,%xmm2
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
.L013done:
	/* %edi was advanced by 48 above, so -48(%edi) is ctx+0 */
	movd	%xmm0,-48(%edi)
	movd	%xmm1,-44(%edi)
	movd	%xmm2,-40(%edi)
	movd	%xmm3,-36(%edi)
	movd	%xmm4,-32(%edi)
	movl	%ebp,%esp
.L007nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2
.align	32
.type	_poly1305_emit_sse2,@function
.align	16
/*
 * _poly1305_emit_sse2(arg0 = ctx, arg1 = out, arg2 = addend)
 *
 * If the base 2^26 flag at ctx+20 is 0, h is still in base 2^32 and the
 * scalar code at .Lenter_emit is used.  Otherwise the five 26-bit limbs
 * are packed into base 2^32, bits above 2^130 are folded back in
 * (multiplied by 5), and the same h / h+5 selection and final addition as
 * in poly1305_emit are performed.  %xmm0-%xmm3 are used only as scratch
 * storage for the four words of h.
 */
_poly1305_emit_sse2:
#ifdef __CET__
	.byte	243,15,30,251		/* F3 0F 1E FB = endbr32 */
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
	cmpl	$0,20(%ebp)
	je	.Lenter_emit
	movl	(%ebp),%eax
	movl	4(%ebp),%edi
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	/* pack 5 x 26-bit limbs into 32-bit words */
	movl	%edi,%ebx
	shll	$26,%edi
	shrl	$6,%ebx
	addl	%edi,%eax
	movl	%ecx,%edi
	adcl	$0,%ebx
	shll	$20,%edi
	shrl	$12,%ecx
	addl	%edi,%ebx
	movl	%edx,%edi
	adcl	$0,%ecx
	shll	$14,%edi
	shrl	$18,%edx
	addl	%edi,%ecx
	movl	%esi,%edi
	adcl	$0,%edx
	shll	$8,%edi
	shrl	$24,%esi
	addl	%edi,%edx
	adcl	$0,%esi
	/* h += (top >> 2) * 5, top &= 3 */
	movl	%esi,%edi
	andl	$3,%esi
	shrl	$2,%edi
	leal	(%edi,%edi,4),%ebp
	movl	24(%esp),%edi
	addl	%ebp,%eax
	movl	28(%esp),%ebp
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* keep h in %xmm0-%xmm3 while h+5 is computed in the GPRs */
	movd	%eax,%xmm0
	addl	$5,%eax
	movd	%ebx,%xmm1
	adcl	$0,%ebx
	movd	%ecx,%xmm2
	adcl	$0,%ecx
	movd	%edx,%xmm3
	adcl	$0,%edx
	adcl	$0,%esi
	/* %esi = all-ones if h+5 reached 2^130, else 0 */
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	movl	%eax,(%edi)
	movd	%xmm0,%eax
	movl	%ebx,4(%edi)
	movd	%xmm1,%ebx
	movl	%ecx,8(%edi)
	movd	%xmm2,%ecx
	movl	%edx,12(%edi)
	movd	%xmm3,%edx
	notl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	orl	(%edi),%eax
	andl	%esi,%ecx
	orl	4(%edi),%ebx
	andl	%esi,%edx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	/* add the 128-bit value at arg2 and store the result */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	movl	%eax,(%edi)
	adcl	8(%ebp),%ecx
	movl	%ebx,4(%edi)
	adcl	12(%ebp),%edx
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2
.align	32
.type	_poly1305_init_avx2,@function
.align	16
/*
 * _poly1305_init_avx2 -- VEX-encoded counterpart of _poly1305_init_sse2
 * with the same private register contract:
 *   In:   %edi = ctx, %ebx = address of .Lconst_sse2
 *   Out:  table at ctx+48..ctx+191 filled in; %edi restored to ctx;
 *         %xmm7 = mask loaded from 64(%ebx)
 *   Uses: %ebp (holds the caller's %esp), %ecx, %edx, %xmm0-%xmm7
 */
_poly1305_init_avx2:
#ifdef __CET__
	.byte	243,15,30,251		/* F3 0F 1E FB = endbr32 */
#endif
	vmovdqu	24(%edi),%xmm4
	leal	48(%edi),%edi
	movl	%esp,%ebp
	subl	$224,%esp
	andl	$-16,%esp
	vmovdqa	64(%ebx),%xmm7
	vpand	%xmm7,%xmm4,%xmm0
	vpsrlq	$26,%xmm4,%xmm1
	vpsrldq	$6,%xmm4,%xmm3
	vpand	%xmm7,%xmm1,%xmm1
	vpsrlq	$4,%xmm3,%xmm2
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm7,%xmm2,%xmm2
	vpand	%xmm7,%xmm3,%xmm3
	vpsrldq	$13,%xmm4,%xmm4
	leal	144(%esp),%edx
	movl	$2,%ecx
.L018square:
	/* save limbs at 0..64(%esp) and 5*limb at 80..128(%esp) */
	vmovdqa	%xmm0,(%esp)
	vmovdqa	%xmm1,16(%esp)
	vmovdqa	%xmm2,32(%esp)
	vmovdqa	%xmm3,48(%esp)
	vmovdqa	%xmm4,64(%esp)
	vpslld	$2,%xmm1,%xmm6
	vpslld	$2,%xmm2,%xmm5
	vpaddd	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm2,%xmm5,%xmm5
	vmovdqa	%xmm6,80(%esp)
	vmovdqa	%xmm5,96(%esp)
	vpslld	$2,%xmm3,%xmm6
	vpslld	$2,%xmm4,%xmm5
	vpaddd	%xmm3,%xmm6,%xmm6
	vpaddd	%xmm4,%xmm5,%xmm5
	vmovdqa	%xmm6,112(%esp)
	vmovdqa	%xmm5,128(%esp)
	/* broadcast the low qword of every limb to 0..64(%edx) */
	vpshufd	$68,%xmm0,%xmm5
	vmovdqa	%xmm1,%xmm6
	vpshufd	$68,%xmm1,%xmm1
	vpshufd	$68,%xmm2,%xmm2
	vpshufd	$68,%xmm3,%xmm3
	vpshufd	$68,%xmm4,%xmm4
	vmovdqa	%xmm5,(%edx)
	vmovdqa	%xmm1,16(%edx)
	vmovdqa	%xmm2,32(%edx)
	vmovdqa	%xmm3,48(%edx)
	vmovdqa	%xmm4,64(%edx)
	/* five-limb multiply, sums accumulated in %xmm0..%xmm4 */
	vpmuludq	%xmm0,%xmm4,%xmm4
	vpmuludq	%xmm0,%xmm3,%xmm3
	vpmuludq	%xmm0,%xmm2,%xmm2
	vpmuludq	%xmm0,%xmm1,%xmm1
	vpmuludq	%xmm0,%xmm5,%xmm0
	vpmuludq	48(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm4,%xmm4
	vpmuludq	32(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm3,%xmm3
	vpmuludq	16(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	vmovdqa	80(%esp),%xmm7
	vpmuludq	(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm1,%xmm1
	vmovdqa	32(%esp),%xmm5
	vpmuludq	64(%edx),%xmm7,%xmm7
	vpaddq	%xmm7,%xmm0,%xmm0
	vpmuludq	32(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm4,%xmm4
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm3,%xmm3
	vmovdqa	96(%esp),%xmm6
	vpmuludq	(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	vpmuludq	64(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm1,%xmm1
	vmovdqa	48(%esp),%xmm5
	vpmuludq	48(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm0,%xmm0
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm4,%xmm4
	vmovdqa	112(%esp),%xmm6
	vpmuludq	(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm3,%xmm3
	vpmuludq	64(%edx),%xmm6,%xmm7
	vpaddq	%xmm7,%xmm2,%xmm2
	vpmuludq	48(%edx),%xmm6,%xmm5
	vpaddq	%xmm5,%xmm1,%xmm1
	vmovdqa	64(%esp),%xmm7
	vpmuludq	32(%edx),%xmm6,%xmm6
	vpaddq	%xmm6,%xmm0,%xmm0
	vmovdqa	128(%esp),%xmm5
	vpmuludq	(%edx),%xmm7,%xmm7
	vpaddq	%xmm7,%xmm4,%xmm4
	vpmuludq	64(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm3,%xmm3
	vpmuludq	16(%edx),%xmm5,%xmm7
	vpaddq	%xmm7,%xmm0,%xmm0
	vpmuludq	32(%edx),%xmm5,%xmm6
	vpaddq	%xmm6,%xmm1,%xmm1
	vmovdqa	64(%ebx),%xmm7
	vpmuludq	48(%edx),%xmm5,%xmm5
	vpaddq	%xmm5,%xmm2,%xmm2
	/* carry propagation back to 26-bit limbs */
	vpsrlq	$26,%xmm3,%xmm5
	vpand	%xmm7,%xmm3,%xmm3
	vpsrlq	$26,%xmm0,%xmm6
	vpand	%xmm7,%xmm0,%xmm0
	vpaddq	%xmm5,%xmm4,%xmm4
	vpaddq	%xmm6,%xmm1,%xmm1
	vpsrlq	$26,%xmm4,%xmm5
	vpand	%xmm7,%xmm4,%xmm4
	vpsrlq	$26,%xmm1,%xmm6
	vpand	%xmm7,%xmm1,%xmm1
	vpaddq	%xmm6,%xmm2,%xmm2
	vpaddd	%xmm5,%xmm0,%xmm0
	vpsllq	$2,%xmm5,%xmm5
	vpsrlq	$26,%xmm2,%xmm6
	vpand	%xmm7,%xmm2,%xmm2
	vpaddd	%xmm5,%xmm0,%xmm0
	vpaddd	%xmm6,%xmm3,%xmm3
	vpsrlq	$26,%xmm3,%xmm6
	vpsrlq	$26,%xmm0,%xmm5
	vpand	%xmm7,%xmm0,%xmm0
	vpand	%xmm7,%xmm3,%xmm3
	vpaddd	%xmm5,%xmm1,%xmm1
	vpaddd	%xmm6,%xmm4,%xmm4
	decl	%ecx
	jz	.L019square_break
	/* keep the new value in the low qword, previous one in the high */
	vpunpcklqdq	(%esp),%xmm0,%xmm0
	vpunpcklqdq	16(%esp),%xmm1,%xmm1
	vpunpcklqdq	32(%esp),%xmm2,%xmm2
	vpunpcklqdq	48(%esp),%xmm3,%xmm3
	vpunpcklqdq	64(%esp),%xmm4,%xmm4
	jmp	.L018square
.L019square_break:
	/* interleave with the values saved at (%esp) and store the table */
	vpsllq	$32,%xmm0,%xmm0
	vpsllq	$32,%xmm1,%xmm1
	vpsllq	$32,%xmm2,%xmm2
	vpsllq	$32,%xmm3,%xmm3
	vpsllq	$32,%xmm4,%xmm4
	vpor	(%esp),%xmm0,%xmm0
	vpor	16(%esp),%xmm1,%xmm1
	vpor	32(%esp),%xmm2,%xmm2
	vpor	48(%esp),%xmm3,%xmm3
	vpor	64(%esp),%xmm4,%xmm4
	vpshufd	$141,%xmm0,%xmm0
	vpshufd	$141,%xmm1,%xmm1
	vpshufd	$141,%xmm2,%xmm2
	vpshufd	$141,%xmm3,%xmm3
	vpshufd	$141,%xmm4,%xmm4
	vmovdqu	%xmm0,(%edi)
	vmovdqu	%xmm1,16(%edi)
	vmovdqu	%xmm2,32(%edi)
	vmovdqu	%xmm3,48(%edi)
	vmovdqu	%xmm4,64(%edi)
	/* rows 80..128: 5*limb for limbs 1..4 */
	vpslld	$2,%xmm1,%xmm6
	vpslld	$2,%xmm2,%xmm5
	vpaddd	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm2,%xmm5,%xmm5
	vmovdqu	%xmm6,80(%edi)
	vmovdqu	%xmm5,96(%edi)
	vpslld	$2,%xmm3,%xmm6
	vpslld	$2,%xmm4,%xmm5
	vpaddd	%xmm3,%xmm6,%xmm6
	vpaddd	%xmm4,%xmm5,%xmm5
	vmovdqu	%xmm6,112(%edi)
	vmovdqu	%xmm5,128(%edi)
	movl	%ebp,%esp
	leal	-48(%edi),%edi
	ret
.size	_poly1305_init_avx2,.-_poly1305_init_avx2
.align	32
.type	_poly1305_blocks_avx2,@function
.align	16
/*
 * _poly1305_blocks_avx2(arg0 = ctx, arg1 = inp, arg2 = len, arg3 = padbit)
 *
 * Same entry logic as _poly1305_blocks_sse2 (short inputs with h still in
 * base 2^32 go to .Lenter_blocks), except that after the first-use
 * conversion the 26-bit limbs of h are written straight back to ctx.
 *
 * NOTE(review): only the beginning of this function is inside the
 * reviewed chunk.  The statements below are reproduced unchanged and the
 * last instruction continues on the following source line.
 */
_poly1305_blocks_avx2:
#ifdef __CET__
	.byte	243,15,30,251		/* F3 0F 1E FB = endbr32 */
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	20(%edi),%eax
	andl	$-16,%ecx
	jz	.L020nodata
	cmpl	$64,%ecx
	jae	.L021enter_avx2
	testl	%eax,%eax
	jz	.Lenter_blocks
.L021enter_avx2:
	vzeroupper
	call	.L022pic_point
.L022pic_point:
	popl	%ebx
	leal	.Lconst_sse2-.L022pic_point(%ebx),%ebx
	testl	%eax,%eax
	jnz	.L023base2_26
	call	_poly1305_init_avx2
	/* convert h from base 2^32 to five 26-bit limbs, in place */
	movl	(%edi),%eax
	movl	3(%edi),%ecx
	movl	6(%edi),%edx
	movl	9(%edi),%esi
	movl	13(%edi),%ebp
	shrl	$2,%ecx
	andl	$67108863,%eax
	shrl	$4,%edx
	andl	$67108863,%ecx
	shrl	$6,%esi
	andl	$67108863,%edx
	movl	%eax,(%edi)
	movl	%ecx,4(%edi)
	movl	%edx,8(%edi)
	movl	%esi,12(%edi)
	movl	%ebp,16(%edi)
	movl	$1,20(%edi)
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
.L023base2_26:
	movl	32(%esp),%eax
	movl	%esp,%ebp
	subl	$448,%esp
	andl	$-512,%esp
	vmovdqu	48(%edi),%xmm0
	leal	288(%esp),%edx
	vmovdqu	64(%edi),%xmm1
	vmovdqu	80(%edi),%xmm2
	vmovdqu	96(%edi),%xmm3
	vmovdqu	112(%edi),%xmm4
	leal	48(%edi),%edi
	vpermq	$64,%ymm0,%ymm0
	vpermq	$64,%ymm1,%ymm1
	vpermq	$64,%ymm2,%ymm2
	vpermq	$64,%ymm3,%ymm3
	vpermq	$64,%ymm4,%ymm4
	vpshufd	$200,%ymm0,%ymm0
	vpshufd	$200,%ymm1,%ymm1
	vpshufd	$200,%ymm2,%ymm2
	vpshufd	$200,%ymm3,%ymm3
	vpshufd	$200,%ymm4,%ymm4
	vmovdqa	
%ymm0,-128(%edx) 1628 vmovdqu 80(%edi),%xmm0 1629 vmovdqa %ymm1,-96(%edx) 1630 vmovdqu 96(%edi),%xmm1 1631 vmovdqa %ymm2,-64(%edx) 1632 vmovdqu 112(%edi),%xmm2 1633 vmovdqa %ymm3,-32(%edx) 1634 vmovdqu 128(%edi),%xmm3 1635 vmovdqa %ymm4,(%edx) 1636 vpermq $64,%ymm0,%ymm0 1637 vpermq $64,%ymm1,%ymm1 1638 vpermq $64,%ymm2,%ymm2 1639 vpermq $64,%ymm3,%ymm3 1640 vpshufd $200,%ymm0,%ymm0 1641 vpshufd $200,%ymm1,%ymm1 1642 vpshufd $200,%ymm2,%ymm2 1643 vpshufd $200,%ymm3,%ymm3 1644 vmovdqa %ymm0,32(%edx) 1645 vmovd -48(%edi),%xmm0 1646 vmovdqa %ymm1,64(%edx) 1647 vmovd -44(%edi),%xmm1 1648 vmovdqa %ymm2,96(%edx) 1649 vmovd -40(%edi),%xmm2 1650 vmovdqa %ymm3,128(%edx) 1651 vmovd -36(%edi),%xmm3 1652 vmovd -32(%edi),%xmm4 1653 vmovdqa 64(%ebx),%ymm7 1654 negl %eax 1655 testl $63,%ecx 1656 jz .L024even 1657 movl %ecx,%edx 1658 andl $-64,%ecx 1659 andl $63,%edx 1660 vmovdqu (%esi),%xmm5 1661 cmpl $32,%edx 1662 jb .L025one 1663 vmovdqu 16(%esi),%xmm6 1664 je .L026two 1665 vinserti128 $1,32(%esi),%ymm5,%ymm5 1666 leal 48(%esi),%esi 1667 leal 8(%ebx),%ebx 1668 leal 296(%esp),%edx 1669 jmp .L027tail 1670.L026two: 1671 leal 32(%esi),%esi 1672 leal 16(%ebx),%ebx 1673 leal 304(%esp),%edx 1674 jmp .L027tail 1675.L025one: 1676 leal 16(%esi),%esi 1677 vpxor %ymm6,%ymm6,%ymm6 1678 leal 32(%ebx,%eax,8),%ebx 1679 leal 312(%esp),%edx 1680 jmp .L027tail 1681.align 32 1682.L024even: 1683 vmovdqu (%esi),%xmm5 1684 vmovdqu 16(%esi),%xmm6 1685 vinserti128 $1,32(%esi),%ymm5,%ymm5 1686 vinserti128 $1,48(%esi),%ymm6,%ymm6 1687 leal 64(%esi),%esi 1688 subl $64,%ecx 1689 jz .L027tail 1690.L028loop: 1691 vmovdqa %ymm2,64(%esp) 1692 vpsrldq $6,%ymm5,%ymm2 1693 vmovdqa %ymm0,(%esp) 1694 vpsrldq $6,%ymm6,%ymm0 1695 vmovdqa %ymm1,32(%esp) 1696 vpunpckhqdq %ymm6,%ymm5,%ymm1 1697 vpunpcklqdq %ymm6,%ymm5,%ymm5 1698 vpunpcklqdq %ymm0,%ymm2,%ymm2 1699 vpsrlq $30,%ymm2,%ymm0 1700 vpsrlq $4,%ymm2,%ymm2 1701 vpsrlq $26,%ymm5,%ymm6 1702 vpsrlq $40,%ymm1,%ymm1 1703 vpand %ymm7,%ymm2,%ymm2 1704 vpand 
%ymm7,%ymm5,%ymm5 1705 vpand %ymm7,%ymm6,%ymm6 1706 vpand %ymm7,%ymm0,%ymm0 1707 vpor (%ebx),%ymm1,%ymm1 1708 vpaddq 64(%esp),%ymm2,%ymm2 1709 vpaddq (%esp),%ymm5,%ymm5 1710 vpaddq 32(%esp),%ymm6,%ymm6 1711 vpaddq %ymm3,%ymm0,%ymm0 1712 vpaddq %ymm4,%ymm1,%ymm1 1713 vpmuludq -96(%edx),%ymm2,%ymm3 1714 vmovdqa %ymm6,32(%esp) 1715 vpmuludq -64(%edx),%ymm2,%ymm4 1716 vmovdqa %ymm0,96(%esp) 1717 vpmuludq 96(%edx),%ymm2,%ymm0 1718 vmovdqa %ymm1,128(%esp) 1719 vpmuludq 128(%edx),%ymm2,%ymm1 1720 vpmuludq -128(%edx),%ymm2,%ymm2 1721 vpmuludq -32(%edx),%ymm5,%ymm7 1722 vpaddq %ymm7,%ymm3,%ymm3 1723 vpmuludq (%edx),%ymm5,%ymm6 1724 vpaddq %ymm6,%ymm4,%ymm4 1725 vpmuludq -128(%edx),%ymm5,%ymm7 1726 vpaddq %ymm7,%ymm0,%ymm0 1727 vmovdqa 32(%esp),%ymm7 1728 vpmuludq -96(%edx),%ymm5,%ymm6 1729 vpaddq %ymm6,%ymm1,%ymm1 1730 vpmuludq -64(%edx),%ymm5,%ymm5 1731 vpaddq %ymm5,%ymm2,%ymm2 1732 vpmuludq -64(%edx),%ymm7,%ymm6 1733 vpaddq %ymm6,%ymm3,%ymm3 1734 vpmuludq -32(%edx),%ymm7,%ymm5 1735 vpaddq %ymm5,%ymm4,%ymm4 1736 vpmuludq 128(%edx),%ymm7,%ymm6 1737 vpaddq %ymm6,%ymm0,%ymm0 1738 vmovdqa 96(%esp),%ymm6 1739 vpmuludq -128(%edx),%ymm7,%ymm5 1740 vpaddq %ymm5,%ymm1,%ymm1 1741 vpmuludq -96(%edx),%ymm7,%ymm7 1742 vpaddq %ymm7,%ymm2,%ymm2 1743 vpmuludq -128(%edx),%ymm6,%ymm5 1744 vpaddq %ymm5,%ymm3,%ymm3 1745 vpmuludq -96(%edx),%ymm6,%ymm7 1746 vpaddq %ymm7,%ymm4,%ymm4 1747 vpmuludq 64(%edx),%ymm6,%ymm5 1748 vpaddq %ymm5,%ymm0,%ymm0 1749 vmovdqa 128(%esp),%ymm5 1750 vpmuludq 96(%edx),%ymm6,%ymm7 1751 vpaddq %ymm7,%ymm1,%ymm1 1752 vpmuludq 128(%edx),%ymm6,%ymm6 1753 vpaddq %ymm6,%ymm2,%ymm2 1754 vpmuludq 128(%edx),%ymm5,%ymm7 1755 vpaddq %ymm7,%ymm3,%ymm3 1756 vpmuludq 32(%edx),%ymm5,%ymm6 1757 vpaddq %ymm6,%ymm0,%ymm0 1758 vpmuludq -128(%edx),%ymm5,%ymm7 1759 vpaddq %ymm7,%ymm4,%ymm4 1760 vmovdqa 64(%ebx),%ymm7 1761 vpmuludq 64(%edx),%ymm5,%ymm6 1762 vpaddq %ymm6,%ymm1,%ymm1 1763 vpmuludq 96(%edx),%ymm5,%ymm5 1764 vpaddq %ymm5,%ymm2,%ymm2 1765 vpsrlq $26,%ymm3,%ymm5 1766 vpand 
%ymm7,%ymm3,%ymm3 1767 vpsrlq $26,%ymm0,%ymm6 1768 vpand %ymm7,%ymm0,%ymm0 1769 vpaddq %ymm5,%ymm4,%ymm4 1770 vpaddq %ymm6,%ymm1,%ymm1 1771 vpsrlq $26,%ymm4,%ymm5 1772 vpand %ymm7,%ymm4,%ymm4 1773 vpsrlq $26,%ymm1,%ymm6 1774 vpand %ymm7,%ymm1,%ymm1 1775 vpaddq %ymm6,%ymm2,%ymm2 1776 vpaddq %ymm5,%ymm0,%ymm0 1777 vpsllq $2,%ymm5,%ymm5 1778 vpsrlq $26,%ymm2,%ymm6 1779 vpand %ymm7,%ymm2,%ymm2 1780 vpaddq %ymm5,%ymm0,%ymm0 1781 vpaddq %ymm6,%ymm3,%ymm3 1782 vpsrlq $26,%ymm3,%ymm6 1783 vpsrlq $26,%ymm0,%ymm5 1784 vpand %ymm7,%ymm0,%ymm0 1785 vpand %ymm7,%ymm3,%ymm3 1786 vpaddq %ymm5,%ymm1,%ymm1 1787 vpaddq %ymm6,%ymm4,%ymm4 1788 vmovdqu (%esi),%xmm5 1789 vmovdqu 16(%esi),%xmm6 1790 vinserti128 $1,32(%esi),%ymm5,%ymm5 1791 vinserti128 $1,48(%esi),%ymm6,%ymm6 1792 leal 64(%esi),%esi 1793 subl $64,%ecx 1794 jnz .L028loop 1795.L027tail: 1796 vmovdqa %ymm2,64(%esp) 1797 vpsrldq $6,%ymm5,%ymm2 1798 vmovdqa %ymm0,(%esp) 1799 vpsrldq $6,%ymm6,%ymm0 1800 vmovdqa %ymm1,32(%esp) 1801 vpunpckhqdq %ymm6,%ymm5,%ymm1 1802 vpunpcklqdq %ymm6,%ymm5,%ymm5 1803 vpunpcklqdq %ymm0,%ymm2,%ymm2 1804 vpsrlq $30,%ymm2,%ymm0 1805 vpsrlq $4,%ymm2,%ymm2 1806 vpsrlq $26,%ymm5,%ymm6 1807 vpsrlq $40,%ymm1,%ymm1 1808 vpand %ymm7,%ymm2,%ymm2 1809 vpand %ymm7,%ymm5,%ymm5 1810 vpand %ymm7,%ymm6,%ymm6 1811 vpand %ymm7,%ymm0,%ymm0 1812 vpor (%ebx),%ymm1,%ymm1 1813 andl $-64,%ebx 1814 vpaddq 64(%esp),%ymm2,%ymm2 1815 vpaddq (%esp),%ymm5,%ymm5 1816 vpaddq 32(%esp),%ymm6,%ymm6 1817 vpaddq %ymm3,%ymm0,%ymm0 1818 vpaddq %ymm4,%ymm1,%ymm1 1819 vpmuludq -92(%edx),%ymm2,%ymm3 1820 vmovdqa %ymm6,32(%esp) 1821 vpmuludq -60(%edx),%ymm2,%ymm4 1822 vmovdqa %ymm0,96(%esp) 1823 vpmuludq 100(%edx),%ymm2,%ymm0 1824 vmovdqa %ymm1,128(%esp) 1825 vpmuludq 132(%edx),%ymm2,%ymm1 1826 vpmuludq -124(%edx),%ymm2,%ymm2 1827 vpmuludq -28(%edx),%ymm5,%ymm7 1828 vpaddq %ymm7,%ymm3,%ymm3 1829 vpmuludq 4(%edx),%ymm5,%ymm6 1830 vpaddq %ymm6,%ymm4,%ymm4 1831 vpmuludq -124(%edx),%ymm5,%ymm7 1832 vpaddq %ymm7,%ymm0,%ymm0 1833 vmovdqa 
32(%esp),%ymm7 1834 vpmuludq -92(%edx),%ymm5,%ymm6 1835 vpaddq %ymm6,%ymm1,%ymm1 1836 vpmuludq -60(%edx),%ymm5,%ymm5 1837 vpaddq %ymm5,%ymm2,%ymm2 1838 vpmuludq -60(%edx),%ymm7,%ymm6 1839 vpaddq %ymm6,%ymm3,%ymm3 1840 vpmuludq -28(%edx),%ymm7,%ymm5 1841 vpaddq %ymm5,%ymm4,%ymm4 1842 vpmuludq 132(%edx),%ymm7,%ymm6 1843 vpaddq %ymm6,%ymm0,%ymm0 1844 vmovdqa 96(%esp),%ymm6 1845 vpmuludq -124(%edx),%ymm7,%ymm5 1846 vpaddq %ymm5,%ymm1,%ymm1 1847 vpmuludq -92(%edx),%ymm7,%ymm7 1848 vpaddq %ymm7,%ymm2,%ymm2 1849 vpmuludq -124(%edx),%ymm6,%ymm5 1850 vpaddq %ymm5,%ymm3,%ymm3 1851 vpmuludq -92(%edx),%ymm6,%ymm7 1852 vpaddq %ymm7,%ymm4,%ymm4 1853 vpmuludq 68(%edx),%ymm6,%ymm5 1854 vpaddq %ymm5,%ymm0,%ymm0 1855 vmovdqa 128(%esp),%ymm5 1856 vpmuludq 100(%edx),%ymm6,%ymm7 1857 vpaddq %ymm7,%ymm1,%ymm1 1858 vpmuludq 132(%edx),%ymm6,%ymm6 1859 vpaddq %ymm6,%ymm2,%ymm2 1860 vpmuludq 132(%edx),%ymm5,%ymm7 1861 vpaddq %ymm7,%ymm3,%ymm3 1862 vpmuludq 36(%edx),%ymm5,%ymm6 1863 vpaddq %ymm6,%ymm0,%ymm0 1864 vpmuludq -124(%edx),%ymm5,%ymm7 1865 vpaddq %ymm7,%ymm4,%ymm4 1866 vmovdqa 64(%ebx),%ymm7 1867 vpmuludq 68(%edx),%ymm5,%ymm6 1868 vpaddq %ymm6,%ymm1,%ymm1 1869 vpmuludq 100(%edx),%ymm5,%ymm5 1870 vpaddq %ymm5,%ymm2,%ymm2 1871 vpsrldq $8,%ymm4,%ymm5 1872 vpsrldq $8,%ymm3,%ymm6 1873 vpaddq %ymm5,%ymm4,%ymm4 1874 vpsrldq $8,%ymm0,%ymm5 1875 vpaddq %ymm6,%ymm3,%ymm3 1876 vpsrldq $8,%ymm1,%ymm6 1877 vpaddq %ymm5,%ymm0,%ymm0 1878 vpsrldq $8,%ymm2,%ymm5 1879 vpaddq %ymm6,%ymm1,%ymm1 1880 vpermq $2,%ymm4,%ymm6 1881 vpaddq %ymm5,%ymm2,%ymm2 1882 vpermq $2,%ymm3,%ymm5 1883 vpaddq %ymm6,%ymm4,%ymm4 1884 vpermq $2,%ymm0,%ymm6 1885 vpaddq %ymm5,%ymm3,%ymm3 1886 vpermq $2,%ymm1,%ymm5 1887 vpaddq %ymm6,%ymm0,%ymm0 1888 vpermq $2,%ymm2,%ymm6 1889 vpaddq %ymm5,%ymm1,%ymm1 1890 vpaddq %ymm6,%ymm2,%ymm2 1891 vpsrlq $26,%ymm3,%ymm5 1892 vpand %ymm7,%ymm3,%ymm3 1893 vpsrlq $26,%ymm0,%ymm6 1894 vpand %ymm7,%ymm0,%ymm0 1895 vpaddq %ymm5,%ymm4,%ymm4 1896 vpaddq %ymm6,%ymm1,%ymm1 1897 vpsrlq $26,%ymm4,%ymm5 
1898 vpand %ymm7,%ymm4,%ymm4 1899 vpsrlq $26,%ymm1,%ymm6 1900 vpand %ymm7,%ymm1,%ymm1 1901 vpaddq %ymm6,%ymm2,%ymm2 1902 vpaddq %ymm5,%ymm0,%ymm0 1903 vpsllq $2,%ymm5,%ymm5 1904 vpsrlq $26,%ymm2,%ymm6 1905 vpand %ymm7,%ymm2,%ymm2 1906 vpaddq %ymm5,%ymm0,%ymm0 1907 vpaddq %ymm6,%ymm3,%ymm3 1908 vpsrlq $26,%ymm3,%ymm6 1909 vpsrlq $26,%ymm0,%ymm5 1910 vpand %ymm7,%ymm0,%ymm0 1911 vpand %ymm7,%ymm3,%ymm3 1912 vpaddq %ymm5,%ymm1,%ymm1 1913 vpaddq %ymm6,%ymm4,%ymm4 1914 cmpl $0,%ecx 1915 je .L029done 1916 vpshufd $252,%xmm0,%xmm0 1917 leal 288(%esp),%edx 1918 vpshufd $252,%xmm1,%xmm1 1919 vpshufd $252,%xmm2,%xmm2 1920 vpshufd $252,%xmm3,%xmm3 1921 vpshufd $252,%xmm4,%xmm4 1922 jmp .L024even 1923.align 16 1924.L029done: 1925 vmovd %xmm0,-48(%edi) 1926 vmovd %xmm1,-44(%edi) 1927 vmovd %xmm2,-40(%edi) 1928 vmovd %xmm3,-36(%edi) 1929 vmovd %xmm4,-32(%edi) 1930 vzeroupper 1931 movl %ebp,%esp 1932.L020nodata: 1933 popl %edi 1934 popl %esi 1935 popl %ebx 1936 popl %ebp 1937 ret 1938.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2 1939.align 64 1940.Lconst_sse2: 1941.long 16777216,0,16777216,0,16777216,0,16777216,0 1942.long 0,0,0,0,0,0,0,0 1943.long 67108863,0,67108863,0,67108863,0,67108863,0 1944.long 268435455,268435452,268435452,268435452 1945.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 1946.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 1947.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 1948.byte 114,103,62,0 1949.align 4 1950.comm OPENSSL_ia32cap_P,16,4 1951 1952 .section ".note.gnu.property", "a" 1953 .p2align 2 1954 .long 1f - 0f 1955 .long 4f - 1f 1956 .long 5 19570: 1958 .asciz "GNU" 19591: 1960 .p2align 2 1961 .long 0xc0000002 1962 .long 3f - 2f 19632: 1964 .long 3 19653: 1966 .p2align 2 19674: 1968