/*
Copyright (c) 2011, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * strlcpy_ssse3 -- exit stubs and source-length scan.
 *
 * Syntax: GNU as, AT&T operand order (op src, dst), 32-bit x86, run
 * through the C preprocessor.
 *
 * The main copy loop comes from the included "ssse3-strcpy-atom.S",
 * configured by the macros below.  That file is also presumably where
 * the L(), POP(), CFI_PUSH(), CFI_POP(), RETURN, RETURN1 and END()
 * macros are defined -- none of them is defined in this file.
 *
 * Register roles, as used by the code in this file (they are set up in
 * the included file, so treat them as assumptions to confirm there):
 *   %ecx  source pointer for the final 1..16 byte chunk
 *   %edx  destination pointer for the final chunk
 *   %esi  offset added to both pointers by the CopyFrom1To16Bytes*
 *         entry points, then restored with POP
 *   %eax  %al / %ah hold a per-byte mask for bytes 0..7 / 8..15 of the
 *         chunk; a set bit is handled as "terminator found here"
 *   %ebx  number of bytes that may still be stored (compared against
 *         1..16); the truncating stubs store %bh as the terminator,
 *         which is zero as long as %ebx < 256 -- TODO confirm the
 *         callers guarantee that
 *   %edi  value subtracted from the terminator address to form the
 *         result, i.e. presumably the original source pointer
 *
 * Result in %eax: (address of the source terminator) - (source start),
 * i.e. the length of the source string.
 */

#define USE_AS_STRNCPY
#define STRCPY	strlcpy_ssse3
#define STRLEN	strlcpy_ssse3
#define USE_AS_STRLCPY
#include "ssse3-strcpy-atom.S"

/*
 * Terminator found in the chunk and enough room to copy through it:
 * binary-search the mask in %al/%ah and dispatch to L(ExitN), where N is
 * the number of bytes to copy (terminator included).
 */
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)
	test	%al, %al
	jz	L(ExitHigh8)

L(CopyFrom1To16BytesLess8):
	mov	%al, %ah
	and	$15, %ah		/* any hit in bytes 0..3? */
	jz	L(ExitHigh4)

	test	$0x01, %al
	jnz	L(Exit1)
	test	$0x02, %al
	jnz	L(Exit2)
	test	$0x04, %al
	jnz	L(Exit3)
L(Exit4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	lea	3(%ecx), %eax		/* address of the terminator */
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(ExitHigh4):
	test	$0x10, %al
	jnz	L(Exit5)
	test	$0x20, %al
	jnz	L(Exit6)
	test	$0x40, %al
	jnz	L(Exit7)
L(Exit8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	lea	7(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(ExitHigh8):
	mov	%ah, %al
	and	$15, %al		/* any hit in bytes 8..11? */
	jz	L(ExitHigh12)

	test	$0x01, %ah
	jnz	L(Exit9)
	test	$0x02, %ah
	jnz	L(Exit10)
	test	$0x04, %ah
	jnz	L(Exit11)
L(Exit12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	lea	11(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(ExitHigh12):
	test	$0x10, %ah
	jnz	L(Exit13)
	test	$0x20, %ah
	jnz	L(Exit14)
	test	$0x40, %ah
	jnz	L(Exit15)
L(Exit16):
	movlpd	(%ecx), %xmm0
	movlpd	8(%ecx), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)

	lea	15(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	CFI_PUSH(%esi)

/*
 * Case 2: a terminator was found in the chunk AND the byte budget in
 * %ebx may run out inside the same chunk.  Walk the bytes in order:
 * a mask hit at byte N-1 goes to L(ExitN) (complete copy), an exhausted
 * budget of N goes to L(StrlcpyExitN) (truncating copy).
 */
	.p2align 4
L(CopyFrom1To16BytesCase2):
	add	$16, %ebx
	add	%esi, %ecx
	add	%esi, %edx

	POP	(%esi)

	test	%al, %al
	jz	L(ExitHighCase2)

	cmp	$8, %ebx
	ja	L(CopyFrom1To16BytesLess8)	/* budget covers bytes 0..7 */

	test	$0x01, %al
	jnz	L(Exit1)
	cmp	$1, %ebx
	je	L(StrlcpyExit1)
	test	$0x02, %al
	jnz	L(Exit2)
	cmp	$2, %ebx
	je	L(StrlcpyExit2)
	test	$0x04, %al
	jnz	L(Exit3)
	cmp	$3, %ebx
	je	L(StrlcpyExit3)
	test	$0x08, %al
	jnz	L(Exit4)
	cmp	$4, %ebx
	je	L(StrlcpyExit4)
	test	$0x10, %al
	jnz	L(Exit5)
	cmp	$5, %ebx
	je	L(StrlcpyExit5)
	test	$0x20, %al
	jnz	L(Exit6)
	cmp	$6, %ebx
	je	L(StrlcpyExit6)
	test	$0x40, %al
	jnz	L(Exit7)
	cmp	$7, %ebx
	je	L(StrlcpyExit7)
	test	$0x80, %al
	jnz	L(Exit8)
	jmp	L(StrlcpyExit8)

	.p2align 4
L(ExitHighCase2):
	cmp	$8, %ebx
	jbe	L(CopyFrom1To16BytesLess8Case3)	/* budget ends before any hit */

	test	$0x01, %ah
	jnz	L(Exit9)
	cmp	$9, %ebx
	je	L(StrlcpyExit9)
	test	$0x02, %ah
	jnz	L(Exit10)
	cmp	$10, %ebx
	je	L(StrlcpyExit10)
	test	$0x04, %ah
	jnz	L(Exit11)
	cmp	$11, %ebx
	je	L(StrlcpyExit11)
	test	$0x8, %ah
	jnz	L(Exit12)
	cmp	$12, %ebx
	je	L(StrlcpyExit12)
	test	$0x10, %ah
	jnz	L(Exit13)
	cmp	$13, %ebx
	je	L(StrlcpyExit13)
	test	$0x20, %ah
	jnz	L(Exit14)
	cmp	$14, %ebx
	je	L(StrlcpyExit14)
	test	$0x40, %ah
	jnz	L(Exit15)
	cmp	$15, %ebx
	je	L(StrlcpyExit15)
	test	$0x80, %ah
	jnz	L(Exit16)
	jmp	L(StrlcpyExit16)

	CFI_PUSH(%esi)

/* Mask non-zero -> case 2 above; mask zero -> fall through to case 3. */
	.p2align 4
L(CopyFrom1To16BytesCase2OrCase3):
	test	%eax, %eax
	jnz	L(CopyFrom1To16BytesCase2)

/*
 * Case 3: no terminator in the chunk, so the byte budget in %ebx alone
 * selects the truncating stub L(StrlcpyExitN).
 */
	.p2align 4
L(CopyFrom1To16BytesCase3):
	add	$16, %ebx
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)

	cmp	$8, %ebx
	ja	L(ExitHigh8Case3)

L(CopyFrom1To16BytesLess8Case3):
	cmp	$4, %ebx
	ja	L(ExitHigh4Case3)

	cmp	$1, %ebx
	je	L(StrlcpyExit1)
	cmp	$2, %ebx
	je	L(StrlcpyExit2)
	cmp	$3, %ebx
	je	L(StrlcpyExit3)

/*
 * L(StrlcpyExitN): truncating exit.  Store the terminator (%bh) at
 * dst[N-1], copy source bytes 0..N-2, then hand over to
 * L(CalculateLengthOfSrc) with
 *   %edx = source address N bytes into the chunk (scan resumes here)
 *   %ecx = the value kept in %edi (start of the measured string)
 * %edi is restored with POP before the jump.
 */
L(StrlcpyExit4):
	movb	%bh, 3(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	lea	4(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(ExitHigh4Case3):
	cmp	$5, %ebx
	je	L(StrlcpyExit5)
	cmp	$6, %ebx
	je	L(StrlcpyExit6)
	cmp	$7, %ebx
	je	L(StrlcpyExit7)
L(StrlcpyExit8):
	movb	%bh, 7(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax		/* overlapping load: bytes 3..6 */
	movl	%eax, 3(%edx)

	lea	8(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(ExitHigh8Case3):
	cmp	$12, %ebx
	ja	L(ExitHigh12Case3)

	cmp	$9, %ebx
	je	L(StrlcpyExit9)
	cmp	$10, %ebx
	je	L(StrlcpyExit10)
	cmp	$11, %ebx
	je	L(StrlcpyExit11)
L(StrlcpyExit12):
	movb	%bh, 11(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax		/* overlapping load: bytes 7..10 */
	movl	%eax, 7(%edx)

	lea	12(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(ExitHigh12Case3):
	cmp	$13, %ebx
	je	L(StrlcpyExit13)
	cmp	$14, %ebx
	je	L(StrlcpyExit14)
	cmp	$15, %ebx
	je	L(StrlcpyExit15)
L(StrlcpyExit16):
	movb	%bh, 15(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0		/* overlapping load: bytes 7..14 */
	movlpd	%xmm0, 7(%edx)

	lea	16(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(StrlcpyExit1):
	movb	%bh, (%edx)

	lea	1(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

/*
 * L(ExitN): complete exit.  Copy N bytes (terminator included, using
 * overlapping loads where N is not a power of two) and return
 * (%ecx + N - 1) - %edi.
 */
	.p2align 4
L(Exit1):
	movb	(%ecx), %al
	movb	%al, (%edx)

	mov	%ecx, %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit2):
	movb	%bh, 1(%edx)
	movb	(%ecx), %al
	movb	%al, (%edx)

	lea	2(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit2):
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	lea	1(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit3):
	movb	%bh, 2(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	lea	3(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit3):
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	lea	2(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit5):
	movb	%bh, 4(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	lea	5(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit5):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	lea	4(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit6):
	movb	%bh, 5(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	lea	6(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit6):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	lea	5(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit7):
	movb	%bh, 6(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	lea	7(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit7):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax
	movl	%eax, 3(%edx)

	lea	6(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit9):
	movb	%bh, 8(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	lea	9(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit9):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	lea	8(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit10):
	movb	%bh, 9(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	lea	10(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit10):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	lea	9(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit11):
	movb	%bh, 10(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	lea	11(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit11):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax
	movl	%eax, 7(%edx)

	lea	10(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit13):
	movb	%bh, 12(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	lea	13(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit13):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0
	movlpd	%xmm0, 5(%edx)

	lea	12(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit14):
	movb	%bh, 13(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0
	movlpd	%xmm0, 5(%edx)

	lea	14(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit14):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0
	movlpd	%xmm0, 6(%edx)

	lea	13(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit15):
	movb	%bh, 14(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0
	movlpd	%xmm0, 6(%edx)

	lea	15(%ecx), %edx
	mov	%edi, %ecx
	POP	(%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit15):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0
	movlpd	%xmm0, 7(%edx)

	lea	14(%ecx), %eax
	sub	%edi, %eax
	RETURN1

	CFI_POP (%edi)

/* Returns 0 without touching memory. */
	.p2align 4
L(StrlcpyExit0):
	movl	$0, %eax
	RETURN

/*
 * "Tail" entry points.  Unlike the stubs above they do not use %edi and
 * do not POP it: %ecx is used directly both as the copy source and as
 * the start of the string for the length result, and the complete
 * exits L(ExitTailN) return the constant N-1.  The bytes are tested one
 * at a time straight from memory instead of through a mask.
 */

/* Bytes 8..14 of the chunk; bytes 0..7 are not re-tested here. */
	.p2align 4
L(StrncpyExit15Bytes):
	cmp	$12, %ebx
	ja	L(StrncpyExit15Bytes1)

	cmpb	$0, 8(%ecx)
	jz	L(ExitTail9)
	cmp	$9, %ebx
	je	L(StrlcpyExitTail9)

	cmpb	$0, 9(%ecx)
	jz	L(ExitTail10)
	cmp	$10, %ebx
	je	L(StrlcpyExitTail10)

	cmpb	$0, 10(%ecx)
	jz	L(ExitTail11)
	cmp	$11, %ebx
	je	L(StrlcpyExitTail11)

	cmpb	$0, 11(%ecx)
	jz	L(ExitTail12)

	/* Budget of 12 exhausted: terminate at dst[11], copy 11 bytes. */
	movb	%bh, 11(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax
	movl	%eax, 7(%edx)

	lea	12(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(StrncpyExit15Bytes1):
	/* Budget > 12: bytes 8..11 only need the terminator test. */
	cmpb	$0, 8(%ecx)
	jz	L(ExitTail9)
	cmpb	$0, 9(%ecx)
	jz	L(ExitTail10)
	cmpb	$0, 10(%ecx)
	jz	L(ExitTail11)
	cmpb	$0, 11(%ecx)
	jz	L(ExitTail12)

	cmpb	$0, 12(%ecx)
	jz	L(ExitTail13)
	cmp	$13, %ebx
	je	L(StrlcpyExitTail13)

	cmpb	$0, 13(%ecx)
	jz	L(ExitTail14)
	cmp	$14, %ebx
	je	L(StrlcpyExitTail14)

	cmpb	$0, 14(%ecx)
	jz	L(ExitTail15)

	/* Budget of 15 exhausted: terminate at dst[14], copy 14 bytes. */
	movb	%bh, 14(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0
	movlpd	%xmm0, 6(%edx)

	lea	15(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

/* Bytes 0..7 of the chunk. */
	.p2align 4
L(StrncpyExit8Bytes):
	cmp	$4, %ebx
	ja	L(StrncpyExit8Bytes1)

	test	%ebx, %ebx
	jz	L(StrlcpyExitTail0)	/* nothing may be stored at all */

	cmpb	$0, (%ecx)
	jz	L(ExitTail1)
	cmp	$1, %ebx
	je	L(StrlcpyExitTail1)

	cmpb	$0, 1(%ecx)
	jz	L(ExitTail2)
	cmp	$2, %ebx
	je	L(StrlcpyExitTail2)

	cmpb	$0, 2(%ecx)
	jz	L(ExitTail3)
	cmp	$3, %ebx
	je	L(StrlcpyExitTail3)

	cmpb	$0, 3(%ecx)
	jz	L(ExitTail4)

	/* Budget of 4 exhausted: terminate at dst[3], copy 3 bytes. */
	movb	%bh, 3(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	lea	4(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(StrncpyExit8Bytes1):
	/* Budget > 4: bytes 0..3 only need the terminator test. */
	cmpb	$0, (%ecx)
	jz	L(ExitTail1)
	cmpb	$0, 1(%ecx)
	jz	L(ExitTail2)
	cmpb	$0, 2(%ecx)
	jz	L(ExitTail3)
	cmpb	$0, 3(%ecx)
	jz	L(ExitTail4)

	cmpb	$0, 4(%ecx)
	jz	L(ExitTail5)
	cmp	$5, %ebx
	je	L(StrlcpyExitTail5)

	cmpb	$0, 5(%ecx)
	jz	L(ExitTail6)
	cmp	$6, %ebx
	je	L(StrlcpyExitTail6)

	cmpb	$0, 6(%ecx)
	jz	L(ExitTail7)
	cmp	$7, %ebx
	je	L(StrlcpyExitTail7)

	cmpb	$0, 7(%ecx)
	jz	L(ExitTail8)

	/* Budget of 8 exhausted: terminate at dst[7], copy 7 bytes. */
	movb	%bh, 7(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax
	movl	%eax, 3(%edx)

	lea	8(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

/*
 * L(StrlcpyExitTailN): truncating tail exit -- terminator at dst[N-1],
 * N-1 bytes copied, scan resumes at %ecx + N.
 * L(ExitTailN): complete tail exit -- N bytes copied, returns N-1.
 */
	.p2align 4
L(StrlcpyExitTail0):
	mov	%ecx, %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(StrlcpyExitTail1):
	movb	%bh, (%edx)

	lea	1(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail1):
	movb	(%ecx), %al
	movb	%al, (%edx)

	mov	$0, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail2):
	movb	%bh, 1(%edx)
	movb	(%ecx), %al
	movb	%al, (%edx)

	lea	2(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail2):
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	mov	$1, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail3):
	movb	%bh, 2(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	lea	3(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail3):
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	mov	$2, %eax
	RETURN

	.p2align 4
L(ExitTail4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	mov	$3, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail5):
	movb	%bh, 4(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	lea	5(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail5):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	mov	$4, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail6):
	movb	%bh, 5(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	lea	6(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail6):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	mov	$5, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail7):
	movb	%bh, 6(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	lea	7(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail7):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax
	movl	%eax, 3(%edx)

	mov	$6, %eax
	RETURN

	.p2align 4
L(ExitTail8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	mov	$7, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail9):
	movb	%bh, 8(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	lea	9(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail9):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	mov	$8, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail10):
	movb	%bh, 9(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	lea	10(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail10):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	mov	$9, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail11):
	movb	%bh, 10(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	lea	11(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail11):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax
	movl	%eax, 7(%edx)

	mov	$10, %eax
	RETURN

	.p2align 4
L(ExitTail12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	mov	$11, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail13):
	movb	%bh, 12(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	lea	13(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail13):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0
	movlpd	%xmm0, 5(%edx)

	mov	$12, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail14):
	movb	%bh, 13(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0
	movlpd	%xmm0, 5(%edx)

	lea	14(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail14):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0
	movlpd	%xmm0, 6(%edx)

	mov	$13, %eax
	RETURN

	.p2align 4
L(ExitTail15):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0
	movlpd	%xmm0, 7(%edx)

	mov	$14, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail16):
	movb	%bh, 15(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0
	movlpd	%xmm0, 7(%edx)

	lea	16(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail16):
	movlpd	(%ecx), %xmm0
	movlpd	8(%ecx), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)

	mov	$15, %eax
	RETURN

/*
 * L(CalculateLengthOfSrc): finish measuring the source after a
 * truncated copy.
 *   In:  %edx = address where the scan for the terminator resumes
 *        %ecx = start of the string being measured
 *   Out: %eax = (address of the first zero byte at or after %edx) - %ecx
 * Uses %edx, %ecx, %xmm0-%xmm3 and %xmm6 as scratch.
 *
 * Three phases: 16 single-byte tests, then 16 unrolled aligned 16-byte
 * SSE2 compares, then a 64-byte-aligned loop handling 64 bytes per
 * iteration.
 */
	.p2align 4
L(CalculateLengthOfSrc):
	xor	%eax, %eax
	cmpb	$0, (%edx)
	jz	L(exit_tail0)
	cmpb	$0, 1(%edx)
	jz	L(exit_tail1)
	cmpb	$0, 2(%edx)
	jz	L(exit_tail2)
	cmpb	$0, 3(%edx)
	jz	L(exit_tail3)

	cmpb	$0, 4(%edx)
	jz	L(exit_tail4)
	cmpb	$0, 5(%edx)
	jz	L(exit_tail5)
	cmpb	$0, 6(%edx)
	jz	L(exit_tail6)
	cmpb	$0, 7(%edx)
	jz	L(exit_tail7)

	cmpb	$0, 8(%edx)
	jz	L(exit_tail8)
	cmpb	$0, 9(%edx)
	jz	L(exit_tail9)
	cmpb	$0, 10(%edx)
	jz	L(exit_tail10)
	cmpb	$0, 11(%edx)
	jz	L(exit_tail11)

	cmpb	$0, 12(%edx)
	jz	L(exit_tail12)
	cmpb	$0, 13(%edx)
	jz	L(exit_tail13)
	cmpb	$0, 14(%edx)
	jz	L(exit_tail14)
	cmpb	$0, 15(%edx)
	jz	L(exit_tail15)

	/*
	 * Switch to aligned 16-byte compares.  %eax is advanced past each
	 * block BEFORE the branch to L(exit), so %ecx is biased by +16 once
	 * here; L(exit) then computes (block address) - (string start).
	 * Each xmm register is zero again after a compare that found no
	 * terminator, so the registers are reused without re-clearing.
	 */
	pxor	%xmm0, %xmm0
	lea	16(%edx), %eax
	add	$16, %ecx
	and	$-16, %eax

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm3
	pmovmskb %xmm3, %edx
	lea	16(%eax), %eax
	test	%edx, %edx
	jnz	L(exit)

	/* Round down to 64 bytes; up to 48 bytes get re-scanned. */
	and	$-0x40, %eax

	/*
	 * 64 bytes per iteration: the unsigned byte-wise minimum of the four
	 * 16-byte blocks has a zero byte iff one of the blocks has one.
	 * %xmm3 is all-zero on entry and is not written inside the loop.
	 */
	.p2align 4
L(aligned_64_loop):
	movaps	(%eax), %xmm0
	movaps	16(%eax), %xmm1
	movaps	32(%eax), %xmm2
	movaps	48(%eax), %xmm6
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqb	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	lea	64(%eax), %eax
	test	%edx, %edx
	jz	L(aligned_64_loop)

	/*
	 * Locate the block holding the terminator.  %eax is now 64 past the
	 * start of the group, so %ecx is re-biased per block to keep
	 * %eax - %ecx equal to (block address) - (string start).
	 */
	pcmpeqb	-64(%eax), %xmm3	/* block 0 */
	pmovmskb %xmm3, %edx
	lea	48(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	%xmm1, %xmm3		/* block 1 */
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	-32(%eax), %xmm3	/* block 2 */
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx
	test	%edx, %edx
	jnz	L(exit)

	pcmpeqb	%xmm6, %xmm3		/* block 3: must hold the terminator */
	pmovmskb %xmm3, %edx
	lea	-16(%ecx), %ecx

/*
 * L(exit): %edx = 16-bit match mask for one block, %eax - %ecx = offset
 * of that block from the string start.  Add the index of the lowest set
 * mask bit to obtain the length.
 */
	.p2align 4
L(exit):
	sub	%ecx, %eax
	test	%dl, %dl
	jz	L(exit_more_8)

	mov	%dl, %cl
	and	$15, %cl
	jz	L(exit_more_4)
	test	$0x01, %dl
	jnz	L(exit_0)
	test	$0x02, %dl
	jnz	L(exit_1)
	test	$0x04, %dl
	jnz	L(exit_2)
	add	$3, %eax
	RETURN

	.p2align 4
L(exit_more_4):
	test	$0x10, %dl
	jnz	L(exit_4)
	test	$0x20, %dl
	jnz	L(exit_5)
	test	$0x40, %dl
	jnz	L(exit_6)
	add	$7, %eax
	RETURN

	.p2align 4
L(exit_more_8):
	mov	%dh, %ch
	and	$15, %ch
	jz	L(exit_more_12)
	test	$0x01, %dh
	jnz	L(exit_8)
	test	$0x02, %dh
	jnz	L(exit_9)
	test	$0x04, %dh
	jnz	L(exit_10)
	add	$11, %eax
	RETURN

	.p2align 4
L(exit_more_12):
	test	$0x10, %dh
	jnz	L(exit_12)
	test	$0x20, %dh
	jnz	L(exit_13)
	test	$0x40, %dh
	jnz	L(exit_14)
	add	$15, %eax
L(exit_0):
	RETURN

/* L(exit_N): add N to the block offset already in %eax and return. */
	.p2align 4
L(exit_1):
	add	$1, %eax
	RETURN

L(exit_2):
	add	$2, %eax
	RETURN

L(exit_3):
	add	$3, %eax
	RETURN

L(exit_4):
	add	$4, %eax
	RETURN

L(exit_5):
	add	$5, %eax
	RETURN

L(exit_6):
	add	$6, %eax
	RETURN

L(exit_7):
	add	$7, %eax
	RETURN

L(exit_8):
	add	$8, %eax
	RETURN

L(exit_9):
	add	$9, %eax
	RETURN

L(exit_10):
	add	$10, %eax
	RETURN

L(exit_11):
	add	$11, %eax
	RETURN

L(exit_12):
	add	$12, %eax
	RETURN

L(exit_13):
	add	$13, %eax
	RETURN

L(exit_14):
	add	$14, %eax
	RETURN

L(exit_15):
	add	$15, %eax
	RETURN

/* L(exit_tailN): terminator found at %edx + N; return (%edx + N) - %ecx. */
L(exit_tail0):
	mov	%edx, %eax
	sub	%ecx, %eax
	RETURN

	.p2align 4
L(exit_tail1):
	lea	1(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail2):
	lea	2(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail3):
	lea	3(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail4):
	lea	4(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail5):
	lea	5(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail6):
	lea	6(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail7):
	lea	7(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail8):
	lea	8(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail9):
	lea	9(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail10):
	lea	10(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail11):
	lea	11(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail12):
	lea	12(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail13):
	lea	13(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail14):
	lea	14(%edx), %eax
	sub	%ecx, %eax
	RETURN

L(exit_tail15):
	lea	15(%edx), %eax
	sub	%ecx, %eax
	RETURN

END (STRCPY)