# Poly1305 for x86_64: scalar, AVX and AVX2 code paths (CRYPTOGAMS-style
# generated assembly). Comments marked "Annotation:" are editorial additions
# and are not part of the generated output.
.text

.globl poly1305_init
.hidden poly1305_init
.globl poly1305_blocks
.hidden poly1305_blocks
.globl poly1305_emit
.hidden poly1305_emit

# Annotation: poly1305_init(ctx=%rdi, key=%rsi, func[2]=%rdx) zeroes the
# accumulator, clamps the first half of the key into r with the masks
# 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc and, based on OPENSSL_ia32cap_P
# feature bits, publishes pointers to the AVX or AVX2 blocks/emit routines
# through func[].
.type poly1305_init,@function
.align 32
poly1305_init:
.cfi_startproc
	xorq %rax,%rax
	movq %rax,0(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)

	cmpq $0,%rsi
	je .Lno_key

	leaq poly1305_blocks(%rip),%r10
	leaq poly1305_emit(%rip),%r11
	movq OPENSSL_ia32cap_P+4(%rip),%r9
	leaq poly1305_blocks_avx(%rip),%rax
	leaq poly1305_emit_avx(%rip),%rcx
	btq $28,%r9
	cmovcq %rax,%r10
	cmovcq %rcx,%r11
	leaq poly1305_blocks_avx2(%rip),%rax
	btq $37,%r9
	cmovcq %rax,%r10
	movq $0x0ffffffc0fffffff,%rax
	movq $0x0ffffffc0ffffffc,%rcx
	andq 0(%rsi),%rax
	andq 8(%rsi),%rcx
	movq %rax,24(%rdi)
	movq %rcx,32(%rdi)
	movq %r10,0(%rdx)
	movq %r11,8(%rdx)
	movl $1,%eax
.Lno_key:
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_init,.-poly1305_init

# Annotation: poly1305_blocks(ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx),
# scalar base 2^64 path. For every 16-byte block it computes
# h = (h + block + padbit*2^128) * r mod 2^130-5, folding the top bits back
# in via 2^130 = 5 (mod p) (the "andq $-4 / shrq $2" sequence).
.type poly1305_blocks,@function
.align 32
poly1305_blocks:
.cfi_startproc
.Lblocks:
	shrq $4,%rdx
	jz .Lno_data

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lblocks_body:

	movq %rdx,%r15

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movq 0(%rdi),%r14
	movq 8(%rdi),%rbx
	movq 16(%rdi),%rbp

	movq %r13,%r12
	shrq $2,%r13
	movq %r12,%rax
	addq %r12,%r13
	jmp .Loop

.align 32
.Loop:
	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp
	mulq %r14
	movq %rax,%r9
	movq %r11,%rax
	movq %rdx,%r10

	mulq %r14
	movq %rax,%r14
	movq %r11,%rax
	movq %rdx,%r8

	mulq %rbx
	addq %rax,%r9
	movq %r13,%rax
	adcq %rdx,%r10

	mulq %rbx
	movq %rbp,%rbx
	addq %rax,%r14
	adcq %rdx,%r8

	imulq %r13,%rbx
	addq %rbx,%r9
	movq %r8,%rbx
	adcq $0,%r10

	imulq %r11,%rbp
	addq %r9,%rbx
	movq $-4,%rax
	adcq %rbp,%r10

	andq %r10,%rax
	movq %r10,%rbp
	shrq $2,%r10
	andq $3,%rbp
	addq %r10,%rax
	addq %rax,%r14
	adcq $0,%rbx
	adcq $0,%rbp
	movq %r12,%rax
	decq %r15
	jnz .Loop

	movq %r14,0(%rdi)
	movq %rbx,8(%rdi)
	movq %rbp,16(%rdi)

	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data:
.Lblocks_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks,.-poly1305_blocks

# Annotation: poly1305_emit(ctx=%rdi, mac=%rsi, nonce=%rdx) performs the
# final reduction of h modulo 2^130-5 (add 5 and keep the sum only if it
# overflows 2^130), then adds the 128-bit nonce and stores the 16-byte tag.
.type poly1305_emit,@function
.align 32
poly1305_emit:
.cfi_startproc
.Lemit:
	movq 0(%rdi),%r8
	movq 8(%rdi),%r9
	movq 16(%rdi),%r10

	movq %r8,%rax
	addq $5,%r8
	movq %r9,%rcx
	adcq $0,%r9
	adcq $0,%r10
	shrq $2,%r10
	cmovnzq %r8,%rax
	cmovnzq %r9,%rcx

	addq 0(%rdx),%rax
	adcq 8(%rdx),%rcx
	movq %rax,0(%rsi)
	movq %rcx,8(%rsi)

	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_emit,.-poly1305_emit
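# Annotation: __poly1305_block is the shared inner step used by all code
# paths. On entry the accumulator is in %r14:%rbx:%rbp (base 2^64), r0 is in
# %r11, r1 in %r12 with s1 = r1 + (r1 >> 2) precomputed in %r13, and %rax
# holds r1 for the first mulq. It leaves h = (h * r) mod 2^130-5 in the same
# registers.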
.type __poly1305_block,@function
.align 32
__poly1305_block:
.cfi_startproc
	mulq %r14
	movq %rax,%r9
	movq %r11,%rax
	movq %rdx,%r10

	mulq %r14
	movq %rax,%r14
	movq %r11,%rax
	movq %rdx,%r8

	mulq %rbx
	addq %rax,%r9
	movq %r13,%rax
	adcq %rdx,%r10

	mulq %rbx
	movq %rbp,%rbx
	addq %rax,%r14
	adcq %rdx,%r8

	imulq %r13,%rbx
	addq %rbx,%r9
	movq %r8,%rbx
	adcq $0,%r10

	imulq %r11,%rbp
	addq %r9,%rbx
	movq $-4,%rax
	adcq %rbp,%r10

	andq %r10,%rax
	movq %r10,%rbp
	shrq $2,%r10
	andq $3,%rbp
	addq %r10,%rax
	addq %rax,%r14
	adcq $0,%rbx
	adcq $0,%rbp
	.byte 0xf3,0xc3
.cfi_endproc
.size __poly1305_block,.-__poly1305_block
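# Annotation: __poly1305_init_avx computes r^2, r^3 and r^4 by repeated
# calls to __poly1305_block (with a zero pad bit), splits each power into
# five 26-bit limbs and stores them, interleaved and together with the
# 5*limb multiples used by the 2^130 = 5 reduction, into the table that
# follows the scalar state in the context (%rdi is temporarily biased by
# 48+64 bytes).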
.type __poly1305_init_avx,@function
.align 32
__poly1305_init_avx:
.cfi_startproc
	movq %r11,%r14
	movq %r12,%rbx
	xorq %rbp,%rbp

	leaq 48+64(%rdi),%rdi

	movq %r12,%rax
	call __poly1305_block

	movl $0x3ffffff,%eax
	movl $0x3ffffff,%edx
	movq %r14,%r8
	andl %r14d,%eax
	movq %r11,%r9
	andl %r11d,%edx
	movl %eax,-64(%rdi)
	shrq $26,%r8
	movl %edx,-60(%rdi)
	shrq $26,%r9

	movl $0x3ffffff,%eax
	movl $0x3ffffff,%edx
	andl %r8d,%eax
	andl %r9d,%edx
	movl %eax,-48(%rdi)
	leal (%rax,%rax,4),%eax
	movl %edx,-44(%rdi)
	leal (%rdx,%rdx,4),%edx
	movl %eax,-32(%rdi)
	shrq $26,%r8
	movl %edx,-28(%rdi)
	shrq $26,%r9

	movq %rbx,%rax
	movq %r12,%rdx
	shlq $12,%rax
	shlq $12,%rdx
	orq %r8,%rax
	orq %r9,%rdx
	andl $0x3ffffff,%eax
	andl $0x3ffffff,%edx
	movl %eax,-16(%rdi)
	leal (%rax,%rax,4),%eax
	movl %edx,-12(%rdi)
	leal (%rdx,%rdx,4),%edx
	movl %eax,0(%rdi)
	movq %rbx,%r8
	movl %edx,4(%rdi)
	movq %r12,%r9

	movl $0x3ffffff,%eax
	movl $0x3ffffff,%edx
	shrq $14,%r8
	shrq $14,%r9
	andl %r8d,%eax
	andl %r9d,%edx
	movl %eax,16(%rdi)
	leal (%rax,%rax,4),%eax
	movl %edx,20(%rdi)
	leal (%rdx,%rdx,4),%edx
	movl %eax,32(%rdi)
	shrq $26,%r8
	movl %edx,36(%rdi)
	shrq $26,%r9

	movq %rbp,%rax
	shlq $24,%rax
	orq %rax,%r8
	movl %r8d,48(%rdi)
	leaq (%r8,%r8,4),%r8
	movl %r9d,52(%rdi)
	leaq (%r9,%r9,4),%r9
	movl %r8d,64(%rdi)
	movl %r9d,68(%rdi)

	movq %r12,%rax
	call __poly1305_block

	movl $0x3ffffff,%eax
	movq %r14,%r8
	andl %r14d,%eax
	shrq $26,%r8
	movl %eax,-52(%rdi)

	movl $0x3ffffff,%edx
	andl %r8d,%edx
	movl %edx,-36(%rdi)
	leal (%rdx,%rdx,4),%edx
	shrq $26,%r8
	movl %edx,-20(%rdi)

	movq %rbx,%rax
	shlq $12,%rax
	orq %r8,%rax
	andl $0x3ffffff,%eax
	movl %eax,-4(%rdi)
	leal (%rax,%rax,4),%eax
	movq %rbx,%r8
	movl %eax,12(%rdi)

	movl $0x3ffffff,%edx
	shrq $14,%r8
	andl %r8d,%edx
	movl %edx,28(%rdi)
	leal (%rdx,%rdx,4),%edx
	shrq $26,%r8
	movl %edx,44(%rdi)

	movq %rbp,%rax
	shlq $24,%rax
	orq %rax,%r8
	movl %r8d,60(%rdi)
	leaq (%r8,%r8,4),%r8
	movl %r8d,76(%rdi)

	movq %r12,%rax
	call __poly1305_block

	movl $0x3ffffff,%eax
	movq %r14,%r8
	andl %r14d,%eax
	shrq $26,%r8
	movl %eax,-56(%rdi)

	movl $0x3ffffff,%edx
	andl %r8d,%edx
	movl %edx,-40(%rdi)
	leal (%rdx,%rdx,4),%edx
	shrq $26,%r8
	movl %edx,-24(%rdi)

	movq %rbx,%rax
	shlq $12,%rax
	orq %r8,%rax
	andl $0x3ffffff,%eax
	movl %eax,-8(%rdi)
	leal (%rax,%rax,4),%eax
	movq %rbx,%r8
	movl %eax,8(%rdi)

	movl $0x3ffffff,%edx
	shrq $14,%r8
	andl %r8d,%edx
	movl %edx,24(%rdi)
	leal (%rdx,%rdx,4),%edx
	shrq $26,%r8
	movl %edx,40(%rdi)

	movq %rbp,%rax
	shlq $24,%rax
	orq %rax,%r8
	movl %r8d,56(%rdi)
	leaq (%r8,%r8,4),%r8
	movl %r8d,72(%rdi)

	leaq -48-64(%rdi),%rdi
	.byte 0xf3,0xc3
.cfi_endproc
.size __poly1305_init_avx,.-__poly1305_init_avx
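# Annotation: poly1305_blocks_avx(ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx).
# The flag at 20(%rdi) records whether the accumulator is currently kept in
# base 2^26 (vector) or base 2^64 (scalar) form. Short or oddly sized inputs
# are routed to the scalar code / single __poly1305_block calls; otherwise
# the state is converted to five 26-bit limbs and the SIMD path below
# consumes 64 bytes (four blocks) per loop iteration, two blocks per 128-bit
# vector multiplication, using the precomputed powers of r.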
.type poly1305_blocks_avx,@function
.align 32
poly1305_blocks_avx:
.cfi_startproc
	movl 20(%rdi),%r8d
	cmpq $128,%rdx
	jae .Lblocks_avx
	testl %r8d,%r8d
	jz .Lblocks

.Lblocks_avx:
	andq $-16,%rdx
	jz .Lno_data_avx

	vzeroupper

	testl %r8d,%r8d
	jz .Lbase2_64_avx

	testq $31,%rdx
	jz .Leven_avx

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lblocks_avx_body:

	movq %rdx,%r15

	movq 0(%rdi),%r8
	movq 8(%rdi),%r9
	movl 16(%rdi),%ebp

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movl %r8d,%r14d
	andq $-2147483648,%r8
	movq %r9,%r12
	movl %r9d,%ebx
	andq $-2147483648,%r9

	shrq $6,%r8
	shlq $52,%r12
	addq %r8,%r14
	shrq $12,%rbx
	shrq $18,%r9
	addq %r12,%r14
	adcq %r9,%rbx

	movq %rbp,%r8
	shlq $40,%r8
	shrq $24,%rbp
	addq %r8,%rbx
	adcq $0,%rbp

	movq $-4,%r9
	movq %rbp,%r8
	andq %rbp,%r9
	shrq $2,%r8
	andq $3,%rbp
	addq %r9,%r8
	addq %r8,%r14
	adcq $0,%rbx
	adcq $0,%rbp

	movq %r13,%r12
	movq %r13,%rax
	shrq $2,%r13
	addq %r12,%r13

	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp

	call __poly1305_block

	testq %rcx,%rcx
	jz .Lstore_base2_64_avx

	movq %r14,%rax
	movq %r14,%rdx
	shrq $52,%r14
	movq %rbx,%r11
	movq %rbx,%r12
	shrq $26,%rdx
	andq $0x3ffffff,%rax
	shlq $12,%r11
	andq $0x3ffffff,%rdx
	shrq $14,%rbx
	orq %r11,%r14
	shlq $24,%rbp
	andq $0x3ffffff,%r14
	shrq $40,%r12
	andq $0x3ffffff,%rbx
	orq %r12,%rbp

	subq $16,%r15
	jz .Lstore_base2_26_avx

	vmovd %eax,%xmm0
	vmovd %edx,%xmm1
	vmovd %r14d,%xmm2
	vmovd %ebx,%xmm3
	vmovd %ebp,%xmm4
	jmp .Lproceed_avx

.align 32
.Lstore_base2_64_avx:
	movq %r14,0(%rdi)
	movq %rbx,8(%rdi)
	movq %rbp,16(%rdi)
	jmp .Ldone_avx

.align 16
.Lstore_base2_26_avx:
	movl %eax,0(%rdi)
	movl %edx,4(%rdi)
	movl %r14d,8(%rdi)
	movl %ebx,12(%rdi)
	movl %ebp,16(%rdi)
.align 16
.Ldone_avx:
	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data_avx:
.Lblocks_avx_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc

.align 32
.Lbase2_64_avx:
.cfi_startproc
	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lbase2_64_avx_body:

	movq %rdx,%r15

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movq 0(%rdi),%r14
	movq 8(%rdi),%rbx
	movl 16(%rdi),%ebp

	movq %r13,%r12
	movq %r13,%rax
	shrq $2,%r13
	addq %r12,%r13

	testq $31,%rdx
	jz .Linit_avx

	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp
	subq $16,%r15

	call __poly1305_block

.Linit_avx:

	movq %r14,%rax
	movq %r14,%rdx
	shrq $52,%r14
	movq %rbx,%r8
	movq %rbx,%r9
	shrq $26,%rdx
	andq $0x3ffffff,%rax
	shlq $12,%r8
	andq $0x3ffffff,%rdx
	shrq $14,%rbx
	orq %r8,%r14
	shlq $24,%rbp
	andq $0x3ffffff,%r14
	shrq $40,%r9
	andq $0x3ffffff,%rbx
	orq %r9,%rbp

	vmovd %eax,%xmm0
	vmovd %edx,%xmm1
	vmovd %r14d,%xmm2
	vmovd %ebx,%xmm3
	vmovd %ebp,%xmm4
	movl $1,20(%rdi)

	call __poly1305_init_avx

.Lproceed_avx:
	movq %r15,%rdx

	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rax
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lbase2_64_avx_epilogue:
	jmp .Ldo_avx
.cfi_endproc
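# Annotation: .Leven_avx / .Ldo_avx is the vector main path. It loads h as
# five 26-bit limbs into %xmm0-%xmm4, pre-splits one pair of input blocks
# into 26-bit limbs (%xmm5-%xmm9), spreads the precomputed powers of r
# across a stack frame addressed via %r11/%rsp, and falls into .Loop_avx.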
.align 32
.Leven_avx:
.cfi_startproc
	vmovd 0(%rdi),%xmm0
	vmovd 4(%rdi),%xmm1
	vmovd 8(%rdi),%xmm2
	vmovd 12(%rdi),%xmm3
	vmovd 16(%rdi),%xmm4

.Ldo_avx:
	leaq -88(%rsp),%r11
.cfi_def_cfa %r11,0x60
	subq $0x178,%rsp
	subq $64,%rdx
	leaq -32(%rsi),%rax
	cmovcq %rax,%rsi

	vmovdqu 48(%rdi),%xmm14
	leaq 112(%rdi),%rdi
	leaq .Lconst(%rip),%rcx

	vmovdqu 32(%rsi),%xmm5
	vmovdqu 48(%rsi),%xmm6
	vmovdqa 64(%rcx),%xmm15

	vpsrldq $6,%xmm5,%xmm7
	vpsrldq $6,%xmm6,%xmm8
	vpunpckhqdq %xmm6,%xmm5,%xmm9
	vpunpcklqdq %xmm6,%xmm5,%xmm5
	vpunpcklqdq %xmm8,%xmm7,%xmm8

	vpsrlq $40,%xmm9,%xmm9
	vpsrlq $26,%xmm5,%xmm6
	vpand %xmm15,%xmm5,%xmm5
	vpsrlq $4,%xmm8,%xmm7
	vpand %xmm15,%xmm6,%xmm6
	vpsrlq $30,%xmm8,%xmm8
	vpand %xmm15,%xmm7,%xmm7
	vpand %xmm15,%xmm8,%xmm8
	vpor 32(%rcx),%xmm9,%xmm9

	jbe .Lskip_loop_avx

	vmovdqu -48(%rdi),%xmm11
	vmovdqu -32(%rdi),%xmm12
	vpshufd $0xEE,%xmm14,%xmm13
	vpshufd $0x44,%xmm14,%xmm10
	vmovdqa %xmm13,-144(%r11)
	vmovdqa %xmm10,0(%rsp)
	vpshufd $0xEE,%xmm11,%xmm14
	vmovdqu -16(%rdi),%xmm10
	vpshufd $0x44,%xmm11,%xmm11
	vmovdqa %xmm14,-128(%r11)
	vmovdqa %xmm11,16(%rsp)
	vpshufd $0xEE,%xmm12,%xmm13
	vmovdqu 0(%rdi),%xmm11
	vpshufd $0x44,%xmm12,%xmm12
	vmovdqa %xmm13,-112(%r11)
	vmovdqa %xmm12,32(%rsp)
	vpshufd $0xEE,%xmm10,%xmm14
	vmovdqu 16(%rdi),%xmm12
	vpshufd $0x44,%xmm10,%xmm10
	vmovdqa %xmm14,-96(%r11)
	vmovdqa %xmm10,48(%rsp)
	vpshufd $0xEE,%xmm11,%xmm13
	vmovdqu 32(%rdi),%xmm10
	vpshufd $0x44,%xmm11,%xmm11
	vmovdqa %xmm13,-80(%r11)
	vmovdqa %xmm11,64(%rsp)
	vpshufd $0xEE,%xmm12,%xmm14
	vmovdqu 48(%rdi),%xmm11
	vpshufd $0x44,%xmm12,%xmm12
	vmovdqa %xmm14,-64(%r11)
	vmovdqa %xmm12,80(%rsp)
	vpshufd $0xEE,%xmm10,%xmm13
	vmovdqu 64(%rdi),%xmm12
	vpshufd $0x44,%xmm10,%xmm10
	vmovdqa %xmm13,-48(%r11)
	vmovdqa %xmm10,96(%rsp)
	vpshufd $0xEE,%xmm11,%xmm14
	vpshufd $0x44,%xmm11,%xmm11
	vmovdqa %xmm14,-32(%r11)
	vmovdqa %xmm11,112(%rsp)
	vpshufd $0xEE,%xmm12,%xmm13
	vmovdqa 0(%rsp),%xmm14
	vpshufd $0x44,%xmm12,%xmm12
	vmovdqa %xmm13,-16(%r11)
	vmovdqa %xmm12,128(%rsp)

	jmp .Loop_avx
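# Annotation: .Loop_avx multiplies the accumulated 26-bit limb vectors by the
# precomputed powers of r limb-by-limb with vpmuludq (every product and the
# sums of products fit in 64 bits), folds the high limbs back in through the
# stored 5*r multiples, interleaves loading and splitting of the next 64
# input bytes, and ends with a lazy carry pass (vpsrlq $26 / vpand .Lmask26).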
.align 32
.Loop_avx:

	vpmuludq %xmm5,%xmm14,%xmm10
	vpmuludq %xmm6,%xmm14,%xmm11
	vmovdqa %xmm2,32(%r11)
	vpmuludq %xmm7,%xmm14,%xmm12
	vmovdqa 16(%rsp),%xmm2
	vpmuludq %xmm8,%xmm14,%xmm13
	vpmuludq %xmm9,%xmm14,%xmm14

	vmovdqa %xmm0,0(%r11)
	vpmuludq 32(%rsp),%xmm9,%xmm0
	vmovdqa %xmm1,16(%r11)
	vpmuludq %xmm8,%xmm2,%xmm1
	vpaddq %xmm0,%xmm10,%xmm10
	vpaddq %xmm1,%xmm14,%xmm14
	vmovdqa %xmm3,48(%r11)
	vpmuludq %xmm7,%xmm2,%xmm0
	vpmuludq %xmm6,%xmm2,%xmm1
	vpaddq %xmm0,%xmm13,%xmm13
	vmovdqa 48(%rsp),%xmm3
	vpaddq %xmm1,%xmm12,%xmm12
	vmovdqa %xmm4,64(%r11)
	vpmuludq %xmm5,%xmm2,%xmm2
	vpmuludq %xmm7,%xmm3,%xmm0
	vpaddq %xmm2,%xmm11,%xmm11

	vmovdqa 64(%rsp),%xmm4
	vpaddq %xmm0,%xmm14,%xmm14
	vpmuludq %xmm6,%xmm3,%xmm1
	vpmuludq %xmm5,%xmm3,%xmm3
	vpaddq %xmm1,%xmm13,%xmm13
	vmovdqa 80(%rsp),%xmm2
	vpaddq %xmm3,%xmm12,%xmm12
	vpmuludq %xmm9,%xmm4,%xmm0
	vpmuludq %xmm8,%xmm4,%xmm4
	vpaddq %xmm0,%xmm11,%xmm11
	vmovdqa 96(%rsp),%xmm3
	vpaddq %xmm4,%xmm10,%xmm10

	vmovdqa 128(%rsp),%xmm4
	vpmuludq %xmm6,%xmm2,%xmm1
	vpmuludq %xmm5,%xmm2,%xmm2
	vpaddq %xmm1,%xmm14,%xmm14
	vpaddq %xmm2,%xmm13,%xmm13
	vpmuludq %xmm9,%xmm3,%xmm0
	vpmuludq %xmm8,%xmm3,%xmm1
	vpaddq %xmm0,%xmm12,%xmm12
	vmovdqu 0(%rsi),%xmm0
	vpaddq %xmm1,%xmm11,%xmm11
	vpmuludq %xmm7,%xmm3,%xmm3
	vpmuludq %xmm7,%xmm4,%xmm7
	vpaddq %xmm3,%xmm10,%xmm10

	vmovdqu 16(%rsi),%xmm1
	vpaddq %xmm7,%xmm11,%xmm11
	vpmuludq %xmm8,%xmm4,%xmm8
	vpmuludq %xmm9,%xmm4,%xmm9
	vpsrldq $6,%xmm0,%xmm2
	vpaddq %xmm8,%xmm12,%xmm12
	vpaddq %xmm9,%xmm13,%xmm13
	vpsrldq $6,%xmm1,%xmm3
	vpmuludq 112(%rsp),%xmm5,%xmm9
	vpmuludq %xmm6,%xmm4,%xmm5
	vpunpckhqdq %xmm1,%xmm0,%xmm4
	vpaddq %xmm9,%xmm14,%xmm14
	vmovdqa -144(%r11),%xmm9
	vpaddq %xmm5,%xmm10,%xmm10

	vpunpcklqdq %xmm1,%xmm0,%xmm0
	vpunpcklqdq %xmm3,%xmm2,%xmm3

	vpsrldq $5,%xmm4,%xmm4
	vpsrlq $26,%xmm0,%xmm1
	vpand %xmm15,%xmm0,%xmm0
	vpsrlq $4,%xmm3,%xmm2
	vpand %xmm15,%xmm1,%xmm1
	vpand 0(%rcx),%xmm4,%xmm4
	vpsrlq $30,%xmm3,%xmm3
	vpand %xmm15,%xmm2,%xmm2
	vpand %xmm15,%xmm3,%xmm3
	vpor 32(%rcx),%xmm4,%xmm4

	vpaddq 0(%r11),%xmm0,%xmm0
	vpaddq 16(%r11),%xmm1,%xmm1
	vpaddq 32(%r11),%xmm2,%xmm2
	vpaddq 48(%r11),%xmm3,%xmm3
	vpaddq 64(%r11),%xmm4,%xmm4

	leaq 32(%rsi),%rax
	leaq 64(%rsi),%rsi
	subq $64,%rdx
	cmovcq %rax,%rsi

	vpmuludq %xmm0,%xmm9,%xmm5
	vpmuludq %xmm1,%xmm9,%xmm6
	vpaddq %xmm5,%xmm10,%xmm10
	vpaddq %xmm6,%xmm11,%xmm11
	vmovdqa -128(%r11),%xmm7
	vpmuludq %xmm2,%xmm9,%xmm5
	vpmuludq %xmm3,%xmm9,%xmm6
	vpaddq %xmm5,%xmm12,%xmm12
	vpaddq %xmm6,%xmm13,%xmm13
	vpmuludq %xmm4,%xmm9,%xmm9
	vpmuludq -112(%r11),%xmm4,%xmm5
	vpaddq %xmm9,%xmm14,%xmm14

	vpaddq %xmm5,%xmm10,%xmm10
	vpmuludq %xmm2,%xmm7,%xmm6
	vpmuludq %xmm3,%xmm7,%xmm5
	vpaddq %xmm6,%xmm13,%xmm13
	vmovdqa -96(%r11),%xmm8
	vpaddq %xmm5,%xmm14,%xmm14
	vpmuludq %xmm1,%xmm7,%xmm6
	vpmuludq %xmm0,%xmm7,%xmm7
	vpaddq %xmm6,%xmm12,%xmm12
	vpaddq %xmm7,%xmm11,%xmm11

	vmovdqa -80(%r11),%xmm9
	vpmuludq %xmm2,%xmm8,%xmm5
	vpmuludq %xmm1,%xmm8,%xmm6
	vpaddq %xmm5,%xmm14,%xmm14
	vpaddq %xmm6,%xmm13,%xmm13
	vmovdqa -64(%r11),%xmm7
	vpmuludq %xmm0,%xmm8,%xmm8
	vpmuludq %xmm4,%xmm9,%xmm5
	vpaddq %xmm8,%xmm12,%xmm12
	vpaddq %xmm5,%xmm11,%xmm11
	vmovdqa -48(%r11),%xmm8
	vpmuludq %xmm3,%xmm9,%xmm9
	vpmuludq %xmm1,%xmm7,%xmm6
	vpaddq %xmm9,%xmm10,%xmm10

	vmovdqa -16(%r11),%xmm9
	vpaddq %xmm6,%xmm14,%xmm14
	vpmuludq %xmm0,%xmm7,%xmm7
	vpmuludq %xmm4,%xmm8,%xmm5
	vpaddq %xmm7,%xmm13,%xmm13
	vpaddq %xmm5,%xmm12,%xmm12
	vmovdqu 32(%rsi),%xmm5
	vpmuludq %xmm3,%xmm8,%xmm7
	vpmuludq %xmm2,%xmm8,%xmm8
	vpaddq %xmm7,%xmm11,%xmm11
	vmovdqu 48(%rsi),%xmm6
	vpaddq %xmm8,%xmm10,%xmm10

	vpmuludq %xmm2,%xmm9,%xmm2
	vpmuludq %xmm3,%xmm9,%xmm3
	vpsrldq $6,%xmm5,%xmm7
	vpaddq %xmm2,%xmm11,%xmm11
	vpmuludq %xmm4,%xmm9,%xmm4
	vpsrldq $6,%xmm6,%xmm8
	vpaddq %xmm3,%xmm12,%xmm2
	vpaddq %xmm4,%xmm13,%xmm3
	vpmuludq -32(%r11),%xmm0,%xmm4
	vpmuludq %xmm1,%xmm9,%xmm0
	vpunpckhqdq %xmm6,%xmm5,%xmm9
	vpaddq %xmm4,%xmm14,%xmm4
	vpaddq %xmm0,%xmm10,%xmm0

	vpunpcklqdq %xmm6,%xmm5,%xmm5
	vpunpcklqdq %xmm8,%xmm7,%xmm8

	vpsrldq $5,%xmm9,%xmm9
	vpsrlq $26,%xmm5,%xmm6
	vmovdqa 0(%rsp),%xmm14
	vpand %xmm15,%xmm5,%xmm5
	vpsrlq $4,%xmm8,%xmm7
	vpand %xmm15,%xmm6,%xmm6
	vpand 0(%rcx),%xmm9,%xmm9
	vpsrlq $30,%xmm8,%xmm8
	vpand %xmm15,%xmm7,%xmm7
	vpand %xmm15,%xmm8,%xmm8
	vpor 32(%rcx),%xmm9,%xmm9

	vpsrlq $26,%xmm3,%xmm13
	vpand %xmm15,%xmm3,%xmm3
	vpaddq %xmm13,%xmm4,%xmm4

	vpsrlq $26,%xmm0,%xmm10
	vpand %xmm15,%xmm0,%xmm0
	vpaddq %xmm10,%xmm11,%xmm1

	vpsrlq $26,%xmm4,%xmm10
	vpand %xmm15,%xmm4,%xmm4

	vpsrlq $26,%xmm1,%xmm11
	vpand %xmm15,%xmm1,%xmm1
	vpaddq %xmm11,%xmm2,%xmm2

	vpaddq %xmm10,%xmm0,%xmm0
	vpsllq $2,%xmm10,%xmm10
	vpaddq %xmm10,%xmm0,%xmm0

	vpsrlq $26,%xmm2,%xmm12
	vpand %xmm15,%xmm2,%xmm2
	vpaddq %xmm12,%xmm3,%xmm3

	vpsrlq $26,%xmm0,%xmm10
	vpand %xmm15,%xmm0,%xmm0
	vpaddq %xmm10,%xmm1,%xmm1

	vpsrlq $26,%xmm3,%xmm13
	vpand %xmm15,%xmm3,%xmm3
	vpaddq %xmm13,%xmm4,%xmm4

	ja .Loop_avx

.Lskip_loop_avx:

	vpshufd $0x10,%xmm14,%xmm14
	addq $32,%rdx
	jnz .Long_tail_avx

	vpaddq %xmm2,%xmm7,%xmm7
	vpaddq %xmm0,%xmm5,%xmm5
	vpaddq %xmm1,%xmm6,%xmm6
	vpaddq %xmm3,%xmm8,%xmm8
	vpaddq %xmm4,%xmm9,%xmm9
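# Annotation: past the main loop, .Lskip_loop_avx / .Long_tail_avx handle the
# remaining one or two block pairs, multiplying them by the matching lower
# powers of r taken from the table at %rdi before the final fold below.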
.Long_tail_avx:
	vmovdqa %xmm2,32(%r11)
	vmovdqa %xmm0,0(%r11)
	vmovdqa %xmm1,16(%r11)
	vmovdqa %xmm3,48(%r11)
	vmovdqa %xmm4,64(%r11)

	vpmuludq %xmm7,%xmm14,%xmm12
	vpmuludq %xmm5,%xmm14,%xmm10
	vpshufd $0x10,-48(%rdi),%xmm2
	vpmuludq %xmm6,%xmm14,%xmm11
	vpmuludq %xmm8,%xmm14,%xmm13
	vpmuludq %xmm9,%xmm14,%xmm14

	vpmuludq %xmm8,%xmm2,%xmm0
	vpaddq %xmm0,%xmm14,%xmm14
	vpshufd $0x10,-32(%rdi),%xmm3
	vpmuludq %xmm7,%xmm2,%xmm1
	vpaddq %xmm1,%xmm13,%xmm13
	vpshufd $0x10,-16(%rdi),%xmm4
	vpmuludq %xmm6,%xmm2,%xmm0
	vpaddq %xmm0,%xmm12,%xmm12
	vpmuludq %xmm5,%xmm2,%xmm2
	vpaddq %xmm2,%xmm11,%xmm11
	vpmuludq %xmm9,%xmm3,%xmm3
	vpaddq %xmm3,%xmm10,%xmm10

	vpshufd $0x10,0(%rdi),%xmm2
	vpmuludq %xmm7,%xmm4,%xmm1
	vpaddq %xmm1,%xmm14,%xmm14
	vpmuludq %xmm6,%xmm4,%xmm0
	vpaddq %xmm0,%xmm13,%xmm13
	vpshufd $0x10,16(%rdi),%xmm3
	vpmuludq %xmm5,%xmm4,%xmm4
	vpaddq %xmm4,%xmm12,%xmm12
	vpmuludq %xmm9,%xmm2,%xmm1
	vpaddq %xmm1,%xmm11,%xmm11
	vpshufd $0x10,32(%rdi),%xmm4
	vpmuludq %xmm8,%xmm2,%xmm2
	vpaddq %xmm2,%xmm10,%xmm10

	vpmuludq %xmm6,%xmm3,%xmm0
	vpaddq %xmm0,%xmm14,%xmm14
	vpmuludq %xmm5,%xmm3,%xmm3
	vpaddq %xmm3,%xmm13,%xmm13
	vpshufd $0x10,48(%rdi),%xmm2
	vpmuludq %xmm9,%xmm4,%xmm1
	vpaddq %xmm1,%xmm12,%xmm12
	vpshufd $0x10,64(%rdi),%xmm3
	vpmuludq %xmm8,%xmm4,%xmm0
	vpaddq %xmm0,%xmm11,%xmm11
	vpmuludq %xmm7,%xmm4,%xmm4
	vpaddq %xmm4,%xmm10,%xmm10

	vpmuludq %xmm5,%xmm2,%xmm2
	vpaddq %xmm2,%xmm14,%xmm14
	vpmuludq %xmm9,%xmm3,%xmm1
	vpaddq %xmm1,%xmm13,%xmm13
	vpmuludq %xmm8,%xmm3,%xmm0
	vpaddq %xmm0,%xmm12,%xmm12
	vpmuludq %xmm7,%xmm3,%xmm1
	vpaddq %xmm1,%xmm11,%xmm11
	vpmuludq %xmm6,%xmm3,%xmm3
	vpaddq %xmm3,%xmm10,%xmm10

	jz .Lshort_tail_avx

	vmovdqu 0(%rsi),%xmm0
	vmovdqu 16(%rsi),%xmm1

	vpsrldq $6,%xmm0,%xmm2
	vpsrldq $6,%xmm1,%xmm3
	vpunpckhqdq %xmm1,%xmm0,%xmm4
	vpunpcklqdq %xmm1,%xmm0,%xmm0
	vpunpcklqdq %xmm3,%xmm2,%xmm3

	vpsrlq $40,%xmm4,%xmm4
	vpsrlq $26,%xmm0,%xmm1
	vpand %xmm15,%xmm0,%xmm0
	vpsrlq $4,%xmm3,%xmm2
	vpand %xmm15,%xmm1,%xmm1
	vpsrlq $30,%xmm3,%xmm3
	vpand %xmm15,%xmm2,%xmm2
	vpand %xmm15,%xmm3,%xmm3
	vpor 32(%rcx),%xmm4,%xmm4

	vpshufd $0x32,-64(%rdi),%xmm9
	vpaddq 0(%r11),%xmm0,%xmm0
	vpaddq 16(%r11),%xmm1,%xmm1
	vpaddq 32(%r11),%xmm2,%xmm2
	vpaddq 48(%r11),%xmm3,%xmm3
	vpaddq 64(%r11),%xmm4,%xmm4

	vpmuludq %xmm0,%xmm9,%xmm5
	vpaddq %xmm5,%xmm10,%xmm10
	vpmuludq %xmm1,%xmm9,%xmm6
	vpaddq %xmm6,%xmm11,%xmm11
	vpmuludq %xmm2,%xmm9,%xmm5
	vpaddq %xmm5,%xmm12,%xmm12
	vpshufd $0x32,-48(%rdi),%xmm7
	vpmuludq %xmm3,%xmm9,%xmm6
	vpaddq %xmm6,%xmm13,%xmm13
	vpmuludq %xmm4,%xmm9,%xmm9
	vpaddq %xmm9,%xmm14,%xmm14

	vpmuludq %xmm3,%xmm7,%xmm5
	vpaddq %xmm5,%xmm14,%xmm14
	vpshufd $0x32,-32(%rdi),%xmm8
	vpmuludq %xmm2,%xmm7,%xmm6
	vpaddq %xmm6,%xmm13,%xmm13
	vpshufd $0x32,-16(%rdi),%xmm9
	vpmuludq %xmm1,%xmm7,%xmm5
	vpaddq %xmm5,%xmm12,%xmm12
	vpmuludq %xmm0,%xmm7,%xmm7
	vpaddq %xmm7,%xmm11,%xmm11
	vpmuludq %xmm4,%xmm8,%xmm8
	vpaddq %xmm8,%xmm10,%xmm10

	vpshufd $0x32,0(%rdi),%xmm7
	vpmuludq %xmm2,%xmm9,%xmm6
	vpaddq %xmm6,%xmm14,%xmm14
	vpmuludq %xmm1,%xmm9,%xmm5
	vpaddq %xmm5,%xmm13,%xmm13
	vpshufd $0x32,16(%rdi),%xmm8
	vpmuludq %xmm0,%xmm9,%xmm9
	vpaddq %xmm9,%xmm12,%xmm12
	vpmuludq %xmm4,%xmm7,%xmm6
	vpaddq %xmm6,%xmm11,%xmm11
	vpshufd $0x32,32(%rdi),%xmm9
	vpmuludq %xmm3,%xmm7,%xmm7
	vpaddq %xmm7,%xmm10,%xmm10

	vpmuludq %xmm1,%xmm8,%xmm5
	vpaddq %xmm5,%xmm14,%xmm14
	vpmuludq %xmm0,%xmm8,%xmm8
	vpaddq %xmm8,%xmm13,%xmm13
	vpshufd $0x32,48(%rdi),%xmm7
	vpmuludq %xmm4,%xmm9,%xmm6
	vpaddq %xmm6,%xmm12,%xmm12
	vpshufd $0x32,64(%rdi),%xmm8
	vpmuludq %xmm3,%xmm9,%xmm5
	vpaddq %xmm5,%xmm11,%xmm11
	vpmuludq %xmm2,%xmm9,%xmm9
	vpaddq %xmm9,%xmm10,%xmm10

	vpmuludq %xmm0,%xmm7,%xmm7
	vpaddq %xmm7,%xmm14,%xmm14
	vpmuludq %xmm4,%xmm8,%xmm6
	vpaddq %xmm6,%xmm13,%xmm13
	vpmuludq %xmm3,%xmm8,%xmm5
	vpaddq %xmm5,%xmm12,%xmm12
	vpmuludq %xmm2,%xmm8,%xmm6
	vpaddq %xmm6,%xmm11,%xmm11
	vpmuludq %xmm1,%xmm8,%xmm8
	vpaddq %xmm8,%xmm10,%xmm10
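# Annotation: .Lshort_tail_avx adds the two 64-bit lanes of each product
# register together (vpsrldq $8 + vpaddq), runs one more carry chain, and
# stores h back into the context as five 26-bit limbs at -112(%rdi) onward
# (%rdi still carries the 112-byte bias applied in .Ldo_avx).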
.Lshort_tail_avx:

	vpsrldq $8,%xmm14,%xmm9
	vpsrldq $8,%xmm13,%xmm8
	vpsrldq $8,%xmm11,%xmm6
	vpsrldq $8,%xmm10,%xmm5
	vpsrldq $8,%xmm12,%xmm7
	vpaddq %xmm8,%xmm13,%xmm13
	vpaddq %xmm9,%xmm14,%xmm14
	vpaddq %xmm5,%xmm10,%xmm10
	vpaddq %xmm6,%xmm11,%xmm11
	vpaddq %xmm7,%xmm12,%xmm12

	vpsrlq $26,%xmm13,%xmm3
	vpand %xmm15,%xmm13,%xmm13
	vpaddq %xmm3,%xmm14,%xmm14

	vpsrlq $26,%xmm10,%xmm0
	vpand %xmm15,%xmm10,%xmm10
	vpaddq %xmm0,%xmm11,%xmm11

	vpsrlq $26,%xmm14,%xmm4
	vpand %xmm15,%xmm14,%xmm14

	vpsrlq $26,%xmm11,%xmm1
	vpand %xmm15,%xmm11,%xmm11
	vpaddq %xmm1,%xmm12,%xmm12

	vpaddq %xmm4,%xmm10,%xmm10
	vpsllq $2,%xmm4,%xmm4
	vpaddq %xmm4,%xmm10,%xmm10

	vpsrlq $26,%xmm12,%xmm2
	vpand %xmm15,%xmm12,%xmm12
	vpaddq %xmm2,%xmm13,%xmm13

	vpsrlq $26,%xmm10,%xmm0
	vpand %xmm15,%xmm10,%xmm10
	vpaddq %xmm0,%xmm11,%xmm11

	vpsrlq $26,%xmm13,%xmm3
	vpand %xmm15,%xmm13,%xmm13
	vpaddq %xmm3,%xmm14,%xmm14

	vmovd %xmm10,-112(%rdi)
	vmovd %xmm11,-108(%rdi)
	vmovd %xmm12,-104(%rdi)
	vmovd %xmm13,-100(%rdi)
	vmovd %xmm14,-96(%rdi)
	leaq 88(%r11),%rsp
.cfi_def_cfa %rsp,8
	vzeroupper
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_avx,.-poly1305_blocks_avx
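# Annotation: poly1305_emit_avx(ctx=%rdi, mac=%rsi, nonce=%rdx). If the
# accumulator is still in base 2^64 form (20(%rdi) == 0) it simply jumps to
# the scalar .Lemit. Otherwise it recombines the five 26-bit limbs into
# 130 bits, does the final reduction modulo 2^130-5, adds the nonce and
# stores the 16-byte tag.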
.type poly1305_emit_avx,@function
.align 32
poly1305_emit_avx:
.cfi_startproc
	cmpl $0,20(%rdi)
	je .Lemit

	movl 0(%rdi),%eax
	movl 4(%rdi),%ecx
	movl 8(%rdi),%r8d
	movl 12(%rdi),%r11d
	movl 16(%rdi),%r10d

	shlq $26,%rcx
	movq %r8,%r9
	shlq $52,%r8
	addq %rcx,%rax
	shrq $12,%r9
	addq %rax,%r8
	adcq $0,%r9

	shlq $14,%r11
	movq %r10,%rax
	shrq $24,%r10
	addq %r11,%r9
	shlq $40,%rax
	addq %rax,%r9
	adcq $0,%r10

	movq %r10,%rax
	movq %r10,%rcx
	andq $3,%r10
	shrq $2,%rax
	andq $-4,%rcx
	addq %rcx,%rax
	addq %rax,%r8
	adcq $0,%r9
	adcq $0,%r10

	movq %r8,%rax
	addq $5,%r8
	movq %r9,%rcx
	adcq $0,%r9
	adcq $0,%r10
	shrq $2,%r10
	cmovnzq %r8,%rax
	cmovnzq %r9,%rcx

	addq 0(%rdx),%rax
	adcq 8(%rdx),%rcx
	movq %rax,0(%rsi)
	movq %rcx,8(%rsi)

	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_emit_avx,.-poly1305_emit_avx
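# Annotation: poly1305_blocks_avx2 mirrors the AVX version but uses 256-bit
# registers: four blocks per vector multiplication and 64 bytes per
# .Loop_avx2 iteration, with the powers r^1..r^4 broadcast across lanes via
# vpermd and the .Lpermd_avx2 pattern.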
.type poly1305_blocks_avx2,@function
.align 32
poly1305_blocks_avx2:
.cfi_startproc
	movl 20(%rdi),%r8d
	cmpq $128,%rdx
	jae .Lblocks_avx2
	testl %r8d,%r8d
	jz .Lblocks

.Lblocks_avx2:
	andq $-16,%rdx
	jz .Lno_data_avx2

	vzeroupper

	testl %r8d,%r8d
	jz .Lbase2_64_avx2

	testq $63,%rdx
	jz .Leven_avx2

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lblocks_avx2_body:

	movq %rdx,%r15

	movq 0(%rdi),%r8
	movq 8(%rdi),%r9
	movl 16(%rdi),%ebp

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movl %r8d,%r14d
	andq $-2147483648,%r8
	movq %r9,%r12
	movl %r9d,%ebx
	andq $-2147483648,%r9

	shrq $6,%r8
	shlq $52,%r12
	addq %r8,%r14
	shrq $12,%rbx
	shrq $18,%r9
	addq %r12,%r14
	adcq %r9,%rbx

	movq %rbp,%r8
	shlq $40,%r8
	shrq $24,%rbp
	addq %r8,%rbx
	adcq $0,%rbp

	movq $-4,%r9
	movq %rbp,%r8
	andq %rbp,%r9
	shrq $2,%r8
	andq $3,%rbp
	addq %r9,%r8
	addq %r8,%r14
	adcq $0,%rbx
	adcq $0,%rbp

	movq %r13,%r12
	movq %r13,%rax
	shrq $2,%r13
	addq %r12,%r13

.Lbase2_26_pre_avx2:
	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp
	subq $16,%r15

	call __poly1305_block
	movq %r12,%rax

	testq $63,%r15
	jnz .Lbase2_26_pre_avx2

	testq %rcx,%rcx
	jz .Lstore_base2_64_avx2

	movq %r14,%rax
	movq %r14,%rdx
	shrq $52,%r14
	movq %rbx,%r11
	movq %rbx,%r12
	shrq $26,%rdx
	andq $0x3ffffff,%rax
	shlq $12,%r11
	andq $0x3ffffff,%rdx
	shrq $14,%rbx
	orq %r11,%r14
	shlq $24,%rbp
	andq $0x3ffffff,%r14
	shrq $40,%r12
	andq $0x3ffffff,%rbx
	orq %r12,%rbp

	testq %r15,%r15
	jz .Lstore_base2_26_avx2

	vmovd %eax,%xmm0
	vmovd %edx,%xmm1
	vmovd %r14d,%xmm2
	vmovd %ebx,%xmm3
	vmovd %ebp,%xmm4
	jmp .Lproceed_avx2

.align 32
.Lstore_base2_64_avx2:
	movq %r14,0(%rdi)
	movq %rbx,8(%rdi)
	movq %rbp,16(%rdi)
	jmp .Ldone_avx2

.align 16
.Lstore_base2_26_avx2:
	movl %eax,0(%rdi)
	movl %edx,4(%rdi)
	movl %r14d,8(%rdi)
	movl %ebx,12(%rdi)
	movl %ebp,16(%rdi)
.align 16
.Ldone_avx2:
	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc

.align 32
.Lbase2_64_avx2:
.cfi_startproc
	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lbase2_64_avx2_body:

	movq %rdx,%r15

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movq 0(%rdi),%r14
	movq 8(%rdi),%rbx
	movl 16(%rdi),%ebp

	movq %r13,%r12
	movq %r13,%rax
	shrq $2,%r13
	addq %r12,%r13

	testq $63,%rdx
	jz .Linit_avx2

.Lbase2_64_pre_avx2:
	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp
	subq $16,%r15

	call __poly1305_block
	movq %r12,%rax

	testq $63,%r15
	jnz .Lbase2_64_pre_avx2

.Linit_avx2:

	movq %r14,%rax
	movq %r14,%rdx
	shrq $52,%r14
	movq %rbx,%r8
	movq %rbx,%r9
	shrq $26,%rdx
	andq $0x3ffffff,%rax
	shlq $12,%r8
	andq $0x3ffffff,%rdx
	shrq $14,%rbx
	orq %r8,%r14
	shlq $24,%rbp
	andq $0x3ffffff,%r14
	shrq $40,%r9
	andq $0x3ffffff,%rbx
	orq %r9,%rbp

	vmovd %eax,%xmm0
	vmovd %edx,%xmm1
	vmovd %r14d,%xmm2
	vmovd %ebx,%xmm3
	vmovd %ebp,%xmm4
	movl $1,20(%rdi)

	call __poly1305_init_avx

.Lproceed_avx2:
	movq %r15,%rdx
	movl OPENSSL_ia32cap_P+8(%rip),%r10d
	movl $3221291008,%r11d

	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rax
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lbase2_64_avx2_epilogue:
	jmp .Ldo_avx2
.cfi_endproc
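# Annotation: .Leven_avx2 / .Ldo_avx2 aligns the scratch area on the stack
# (andq $-512,%rsp), expands the r^1..r^4 limb table into 256-bit registers
# with vpermd and spills it, then splits the first 64 bytes of input into
# 26-bit limbs before entering .Loop_avx2.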
.align 32
.Leven_avx2:
.cfi_startproc
	movl OPENSSL_ia32cap_P+8(%rip),%r10d
	vmovd 0(%rdi),%xmm0
	vmovd 4(%rdi),%xmm1
	vmovd 8(%rdi),%xmm2
	vmovd 12(%rdi),%xmm3
	vmovd 16(%rdi),%xmm4

.Ldo_avx2:
	leaq -8(%rsp),%r11
.cfi_def_cfa %r11,16
	subq $0x128,%rsp
	leaq .Lconst(%rip),%rcx
	leaq 48+64(%rdi),%rdi
	vmovdqa 96(%rcx),%ymm7

	vmovdqu -64(%rdi),%xmm9
	andq $-512,%rsp
	vmovdqu -48(%rdi),%xmm10
	vmovdqu -32(%rdi),%xmm6
	vmovdqu -16(%rdi),%xmm11
	vmovdqu 0(%rdi),%xmm12
	vmovdqu 16(%rdi),%xmm13
	leaq 144(%rsp),%rax
	vmovdqu 32(%rdi),%xmm14
	vpermd %ymm9,%ymm7,%ymm9
	vmovdqu 48(%rdi),%xmm15
	vpermd %ymm10,%ymm7,%ymm10
	vmovdqu 64(%rdi),%xmm5
	vpermd %ymm6,%ymm7,%ymm6
	vmovdqa %ymm9,0(%rsp)
	vpermd %ymm11,%ymm7,%ymm11
	vmovdqa %ymm10,32-144(%rax)
	vpermd %ymm12,%ymm7,%ymm12
	vmovdqa %ymm6,64-144(%rax)
	vpermd %ymm13,%ymm7,%ymm13
	vmovdqa %ymm11,96-144(%rax)
	vpermd %ymm14,%ymm7,%ymm14
	vmovdqa %ymm12,128-144(%rax)
	vpermd %ymm15,%ymm7,%ymm15
	vmovdqa %ymm13,160-144(%rax)
	vpermd %ymm5,%ymm7,%ymm5
	vmovdqa %ymm14,192-144(%rax)
	vmovdqa %ymm15,224-144(%rax)
	vmovdqa %ymm5,256-144(%rax)
	vmovdqa 64(%rcx),%ymm5

	vmovdqu 0(%rsi),%xmm7
	vmovdqu 16(%rsi),%xmm8
	vinserti128 $1,32(%rsi),%ymm7,%ymm7
	vinserti128 $1,48(%rsi),%ymm8,%ymm8
	leaq 64(%rsi),%rsi

	vpsrldq $6,%ymm7,%ymm9
	vpsrldq $6,%ymm8,%ymm10
	vpunpckhqdq %ymm8,%ymm7,%ymm6
	vpunpcklqdq %ymm10,%ymm9,%ymm9
	vpunpcklqdq %ymm8,%ymm7,%ymm7

	vpsrlq $30,%ymm9,%ymm10
	vpsrlq $4,%ymm9,%ymm9
	vpsrlq $26,%ymm7,%ymm8
	vpsrlq $40,%ymm6,%ymm6
	vpand %ymm5,%ymm9,%ymm9
	vpand %ymm5,%ymm7,%ymm7
	vpand %ymm5,%ymm8,%ymm8
	vpand %ymm5,%ymm10,%ymm10
	vpor 32(%rcx),%ymm6,%ymm6

	vpaddq %ymm2,%ymm9,%ymm2
	subq $64,%rdx
	jz .Ltail_avx2
	jmp .Loop_avx2
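# Annotation: .Loop_avx2 adds the freshly split input limbs to h, multiplies
# the four block lanes by the precomputed powers of r with vpmuludq on
# 26-bit limbs, and interleaves the lazy carry pass (vpsrlq $26 /
# vpand .Lmask26) with loading and splitting the next 64 bytes; the
# per-block 2^128 pad bit is OR-ed into the top limb as 2^24 from .L129.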
.align 32
.Loop_avx2:

	vpaddq %ymm0,%ymm7,%ymm0
	vmovdqa 0(%rsp),%ymm7
	vpaddq %ymm1,%ymm8,%ymm1
	vmovdqa 32(%rsp),%ymm8
	vpaddq %ymm3,%ymm10,%ymm3
	vmovdqa 96(%rsp),%ymm9
	vpaddq %ymm4,%ymm6,%ymm4
	vmovdqa 48(%rax),%ymm10
	vmovdqa 112(%rax),%ymm5

	vpmuludq %ymm2,%ymm7,%ymm13
	vpmuludq %ymm2,%ymm8,%ymm14
	vpmuludq %ymm2,%ymm9,%ymm15
	vpmuludq %ymm2,%ymm10,%ymm11
	vpmuludq %ymm2,%ymm5,%ymm12

	vpmuludq %ymm0,%ymm8,%ymm6
	vpmuludq %ymm1,%ymm8,%ymm2
	vpaddq %ymm6,%ymm12,%ymm12
	vpaddq %ymm2,%ymm13,%ymm13
	vpmuludq %ymm3,%ymm8,%ymm6
	vpmuludq 64(%rsp),%ymm4,%ymm2
	vpaddq %ymm6,%ymm15,%ymm15
	vpaddq %ymm2,%ymm11,%ymm11
	vmovdqa -16(%rax),%ymm8

	vpmuludq %ymm0,%ymm7,%ymm6
	vpmuludq %ymm1,%ymm7,%ymm2
	vpaddq %ymm6,%ymm11,%ymm11
	vpaddq %ymm2,%ymm12,%ymm12
	vpmuludq %ymm3,%ymm7,%ymm6
	vpmuludq %ymm4,%ymm7,%ymm2
	vmovdqu 0(%rsi),%xmm7
	vpaddq %ymm6,%ymm14,%ymm14
	vpaddq %ymm2,%ymm15,%ymm15
	vinserti128 $1,32(%rsi),%ymm7,%ymm7

	vpmuludq %ymm3,%ymm8,%ymm6
	vpmuludq %ymm4,%ymm8,%ymm2
	vmovdqu 16(%rsi),%xmm8
	vpaddq %ymm6,%ymm11,%ymm11
	vpaddq %ymm2,%ymm12,%ymm12
	vmovdqa 16(%rax),%ymm2
	vpmuludq %ymm1,%ymm9,%ymm6
	vpmuludq %ymm0,%ymm9,%ymm9
	vpaddq %ymm6,%ymm14,%ymm14
	vpaddq %ymm9,%ymm13,%ymm13
	vinserti128 $1,48(%rsi),%ymm8,%ymm8
	leaq 64(%rsi),%rsi

	vpmuludq %ymm1,%ymm2,%ymm6
	vpmuludq %ymm0,%ymm2,%ymm2
	vpsrldq $6,%ymm7,%ymm9
	vpaddq %ymm6,%ymm15,%ymm15
	vpaddq %ymm2,%ymm14,%ymm14
	vpmuludq %ymm3,%ymm10,%ymm6
	vpmuludq %ymm4,%ymm10,%ymm2
	vpsrldq $6,%ymm8,%ymm10
	vpaddq %ymm6,%ymm12,%ymm12
	vpaddq %ymm2,%ymm13,%ymm13
	vpunpckhqdq %ymm8,%ymm7,%ymm6

	vpmuludq %ymm3,%ymm5,%ymm3
	vpmuludq %ymm4,%ymm5,%ymm4
	vpunpcklqdq %ymm8,%ymm7,%ymm7
	vpaddq %ymm3,%ymm13,%ymm2
	vpaddq %ymm4,%ymm14,%ymm3
	vpunpcklqdq %ymm10,%ymm9,%ymm10
	vpmuludq 80(%rax),%ymm0,%ymm4
	vpmuludq %ymm1,%ymm5,%ymm0
	vmovdqa 64(%rcx),%ymm5
	vpaddq %ymm4,%ymm15,%ymm4
	vpaddq %ymm0,%ymm11,%ymm0

	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpaddq %ymm14,%ymm4,%ymm4

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm11,%ymm12,%ymm1

	vpsrlq $26,%ymm4,%ymm15
	vpand %ymm5,%ymm4,%ymm4

	vpsrlq $4,%ymm10,%ymm9

	vpsrlq $26,%ymm1,%ymm12
	vpand %ymm5,%ymm1,%ymm1
	vpaddq %ymm12,%ymm2,%ymm2

	vpaddq %ymm15,%ymm0,%ymm0
	vpsllq $2,%ymm15,%ymm15
	vpaddq %ymm15,%ymm0,%ymm0

	vpand %ymm5,%ymm9,%ymm9
	vpsrlq $26,%ymm7,%ymm8

	vpsrlq $26,%ymm2,%ymm13
	vpand %ymm5,%ymm2,%ymm2
	vpaddq %ymm13,%ymm3,%ymm3

	vpaddq %ymm9,%ymm2,%ymm2
	vpsrlq $30,%ymm10,%ymm10

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm11,%ymm1,%ymm1

	vpsrlq $40,%ymm6,%ymm6

	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpaddq %ymm14,%ymm4,%ymm4

	vpand %ymm5,%ymm7,%ymm7
	vpand %ymm5,%ymm8,%ymm8
	vpand %ymm5,%ymm10,%ymm10
	vpor 32(%rcx),%ymm6,%ymm6

	subq $64,%rdx
	jnz .Loop_avx2

	.byte 0x66,0x90
.Ltail_avx2:

	vpaddq %ymm0,%ymm7,%ymm0
	vmovdqu 4(%rsp),%ymm7
	vpaddq %ymm1,%ymm8,%ymm1
	vmovdqu 36(%rsp),%ymm8
	vpaddq %ymm3,%ymm10,%ymm3
	vmovdqu 100(%rsp),%ymm9
	vpaddq %ymm4,%ymm6,%ymm4
	vmovdqu 52(%rax),%ymm10
	vmovdqu 116(%rax),%ymm5

	vpmuludq %ymm2,%ymm7,%ymm13
	vpmuludq %ymm2,%ymm8,%ymm14
	vpmuludq %ymm2,%ymm9,%ymm15
	vpmuludq %ymm2,%ymm10,%ymm11
	vpmuludq %ymm2,%ymm5,%ymm12

	vpmuludq %ymm0,%ymm8,%ymm6
	vpmuludq %ymm1,%ymm8,%ymm2
	vpaddq %ymm6,%ymm12,%ymm12
	vpaddq %ymm2,%ymm13,%ymm13
	vpmuludq %ymm3,%ymm8,%ymm6
	vpmuludq 68(%rsp),%ymm4,%ymm2
	vpaddq %ymm6,%ymm15,%ymm15
	vpaddq %ymm2,%ymm11,%ymm11

	vpmuludq %ymm0,%ymm7,%ymm6
	vpmuludq %ymm1,%ymm7,%ymm2
	vpaddq %ymm6,%ymm11,%ymm11
	vmovdqu -12(%rax),%ymm8
	vpaddq %ymm2,%ymm12,%ymm12
	vpmuludq %ymm3,%ymm7,%ymm6
	vpmuludq %ymm4,%ymm7,%ymm2
	vpaddq %ymm6,%ymm14,%ymm14
	vpaddq %ymm2,%ymm15,%ymm15

	vpmuludq %ymm3,%ymm8,%ymm6
	vpmuludq %ymm4,%ymm8,%ymm2
	vpaddq %ymm6,%ymm11,%ymm11
	vpaddq %ymm2,%ymm12,%ymm12
	vmovdqu 20(%rax),%ymm2
	vpmuludq %ymm1,%ymm9,%ymm6
	vpmuludq %ymm0,%ymm9,%ymm9
	vpaddq %ymm6,%ymm14,%ymm14
	vpaddq %ymm9,%ymm13,%ymm13

	vpmuludq %ymm1,%ymm2,%ymm6
	vpmuludq %ymm0,%ymm2,%ymm2
	vpaddq %ymm6,%ymm15,%ymm15
	vpaddq %ymm2,%ymm14,%ymm14
	vpmuludq %ymm3,%ymm10,%ymm6
	vpmuludq %ymm4,%ymm10,%ymm2
	vpaddq %ymm6,%ymm12,%ymm12
	vpaddq %ymm2,%ymm13,%ymm13

	vpmuludq %ymm3,%ymm5,%ymm3
	vpmuludq %ymm4,%ymm5,%ymm4
	vpaddq %ymm3,%ymm13,%ymm2
	vpaddq %ymm4,%ymm14,%ymm3
	vpmuludq 84(%rax),%ymm0,%ymm4
	vpmuludq %ymm1,%ymm5,%ymm0
	vmovdqa 64(%rcx),%ymm5
	vpaddq %ymm4,%ymm15,%ymm4
	vpaddq %ymm0,%ymm11,%ymm0

	vpsrldq $8,%ymm12,%ymm8
	vpsrldq $8,%ymm2,%ymm9
	vpsrldq $8,%ymm3,%ymm10
	vpsrldq $8,%ymm4,%ymm6
	vpsrldq $8,%ymm0,%ymm7
	vpaddq %ymm8,%ymm12,%ymm12
	vpaddq %ymm9,%ymm2,%ymm2
	vpaddq %ymm10,%ymm3,%ymm3
	vpaddq %ymm6,%ymm4,%ymm4
	vpaddq %ymm7,%ymm0,%ymm0

	vpermq $0x2,%ymm3,%ymm10
	vpermq $0x2,%ymm4,%ymm6
	vpermq $0x2,%ymm0,%ymm7
	vpermq $0x2,%ymm12,%ymm8
	vpermq $0x2,%ymm2,%ymm9
	vpaddq %ymm10,%ymm3,%ymm3
	vpaddq %ymm6,%ymm4,%ymm4
	vpaddq %ymm7,%ymm0,%ymm0
	vpaddq %ymm8,%ymm12,%ymm12
	vpaddq %ymm9,%ymm2,%ymm2

	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpaddq %ymm14,%ymm4,%ymm4

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm11,%ymm12,%ymm1

	vpsrlq $26,%ymm4,%ymm15
	vpand %ymm5,%ymm4,%ymm4

	vpsrlq $26,%ymm1,%ymm12
	vpand %ymm5,%ymm1,%ymm1
	vpaddq %ymm12,%ymm2,%ymm2

	vpaddq %ymm15,%ymm0,%ymm0
	vpsllq $2,%ymm15,%ymm15
	vpaddq %ymm15,%ymm0,%ymm0

	vpsrlq $26,%ymm2,%ymm13
	vpand %ymm5,%ymm2,%ymm2
	vpaddq %ymm13,%ymm3,%ymm3

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm11,%ymm1,%ymm1

	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpaddq %ymm14,%ymm4,%ymm4

	vmovd %xmm0,-112(%rdi)
	vmovd %xmm1,-108(%rdi)
	vmovd %xmm2,-104(%rdi)
	vmovd %xmm3,-100(%rdi)
	vmovd %xmm4,-96(%rdi)
	leaq 8(%r11),%rsp
.cfi_def_cfa %rsp,8
	vzeroupper
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
.align 64
.Lconst:
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long 16777216,0,16777216,0,16777216,0,16777216,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long 2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long 0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad 0,12,24,64
.L2_44_mask:
.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad 44,44,42,64
.L2_44_shift_lft:
.quad 8,8,10,64

.align 64
.Lx_mask44:
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
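# Annotation: xor128_encrypt_n_pad XORs %rcx bytes of input (%rsi) with the
# keystream buffer at %rdx, writes the result to the output (%rdi) and also
# back into the buffer; xor128_decrypt_n_pad does the same XOR but keeps the
# original input (the ciphertext) in the buffer, presumably so that Poly1305
# is always run over ciphertext. Both zero-pad the buffer's final partial
# 16-byte block and return the advanced buffer pointer in %rax.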
.align 16
.globl xor128_encrypt_n_pad
.type xor128_encrypt_n_pad,@function
.align 16
xor128_encrypt_n_pad:
.cfi_startproc
	subq %rdx,%rsi
	subq %rdx,%rdi
	movq %rcx,%r10
	shrq $4,%rcx
	jz .Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu (%rsi,%rdx,1),%xmm0
	pxor (%rdx),%xmm0
	movdqu %xmm0,(%rdi,%rdx,1)
	movdqa %xmm0,(%rdx)
	leaq 16(%rdx),%rdx
	decq %rcx
	jnz .Loop_enc_xmm

	andq $15,%r10
	jz .Ldone_enc

.Ltail_enc:
	movq $16,%rcx
	subq %r10,%rcx
	xorl %eax,%eax
.Loop_enc_byte:
	movb (%rsi,%rdx,1),%al
	xorb (%rdx),%al
	movb %al,(%rdi,%rdx,1)
	movb %al,(%rdx)
	leaq 1(%rdx),%rdx
	decq %r10
	jnz .Loop_enc_byte

	xorl %eax,%eax
.Loop_enc_pad:
	movb %al,(%rdx)
	leaq 1(%rdx),%rdx
	decq %rcx
	jnz .Loop_enc_pad

.Ldone_enc:
	movq %rdx,%rax
	.byte 0xf3,0xc3
.cfi_endproc
.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad

.globl xor128_decrypt_n_pad
.type xor128_decrypt_n_pad,@function
.align 16
xor128_decrypt_n_pad:
.cfi_startproc
	subq %rdx,%rsi
	subq %rdx,%rdi
	movq %rcx,%r10
	shrq $4,%rcx
	jz .Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu (%rsi,%rdx,1),%xmm0
	movdqa (%rdx),%xmm1
	pxor %xmm0,%xmm1
	movdqu %xmm1,(%rdi,%rdx,1)
	movdqa %xmm0,(%rdx)
	leaq 16(%rdx),%rdx
	decq %rcx
	jnz .Loop_dec_xmm

	pxor %xmm1,%xmm1
	andq $15,%r10
	jz .Ldone_dec

.Ltail_dec:
	movq $16,%rcx
	subq %r10,%rcx
	xorl %eax,%eax
	xorq %r11,%r11
.Loop_dec_byte:
	movb (%rsi,%rdx,1),%r11b
	movb (%rdx),%al
	xorb %r11b,%al
	movb %al,(%rdi,%rdx,1)
	movb %r11b,(%rdx)
	leaq 1(%rdx),%rdx
	decq %r10
	jnz .Loop_dec_byte

	xorl %eax,%eax
.Loop_dec_pad:
	movb %al,(%rdx)
	leaq 1(%rdx),%rdx
	decq %rcx
	jnz .Loop_dec_pad

.Ldone_dec:
	movq %rdx,%rax
	.byte 0xf3,0xc3
.cfi_endproc
.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad