/*
Copyright (c) 2011, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * wcscpy_ssse3 (dst, src)
 *
 * Syntax:  GNU as, AT&T operand order, run through the C preprocessor.
 * Target:  32-bit x86 with SSSE3 (palignr).
 *
 * Copies 32-bit elements from src to dst up to and including the first
 * element that is zero, and returns the original dst.
 *
 * In:     4(%esp) = dst, 8(%esp) = src (arguments on the stack, return
 *         address at 0(%esp)).
 * Out:    %eax = dst as passed in.
 * Saved:  %edi and %esi are pushed and popped again before every ret that
 *         is reached after the pushes.
 * Clobb:  %eax, %ecx, %edx, %xmm0-%xmm7, flags.
 *
 * Assumptions (not checked by the code): the distance between src and dst
 * modulo 16 is a multiple of 4.  The dispatch below only distinguishes
 * 0, 4 and 8 and sends everything else to the "12" path, which uses
 * aligned loads.
 *
 * Register roles once the main path is entered:
 *   %edx  destination cursor
 *   %ecx  source cursor
 *   %edi  value to return
 *   %esi  byte offset added to both cursors in CopyFrom1To16Bytes
 *   %eax  pmovmskb result of the last compare against zero
 *   %xmm0 all-zero before every pcmpeqd; a compare that finds no zero
 *         element leaves it all-zero, so it is never re-cleared
 *
 * Terminator mask (pmovmskb after pcmpeqd): every 32-bit lane equal to zero
 * sets four consecutive bits, so bit 0 = lane 0, bit 4 = lane 1, bit 8 =
 * lane 2 and bit 12 = lane 3.
 *
 * USE_AS_WCSCAT: when this macro is defined the macro prelude, the entry
 * sequence and END are all omitted and only the code from "PUSH (%esi)"
 * onward is emitted.  The including file must then provide L, PUSH, POP,
 * RETURN and the CFI macros itself.
 * NOTE(review): presumably a wcscat implementation includes this file after
 * setting up %edx/%ecx/%edi and pushing %edi as the entry sequence below
 * does; confirm against the including file.
 */

#ifndef USE_AS_WCSCAT

# ifndef L
#  define L(label)	.L##label
# endif

# ifndef cfi_startproc
#  define cfi_startproc	.cfi_startproc
# endif

# ifndef cfi_endproc
#  define cfi_endproc	.cfi_endproc
# endif

# ifndef cfi_rel_offset
#  define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
# endif

# ifndef cfi_restore
#  define cfi_restore(reg)	.cfi_restore reg
# endif

# ifndef cfi_adjust_cfa_offset
#  define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
# endif

# ifndef ENTRY
#  define ENTRY(name)	\
	.type name, @function;	\
	.globl name;	\
	.p2align 4;	\
name:	\
	cfi_startproc
# endif

# ifndef END
#  define END(name)	\
	cfi_endproc;	\
	.size name, .-name
# endif

/* Unwind bookkeeping for one 4-byte push / pop of REG.  */
# define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

/* Offset of the first stack argument (just past the return address).  */
# define PARMS	4
/* Restore %edi and return.  The trailing CFI_PUSH re-establishes the unwind
   state for the code that follows the ret, which is reached with %edi still
   on the stack.  */
# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)

# define STR1	PARMS
# define STR2	STR1+4
/* LEN is not referenced anywhere in this file.  */
# define LEN	STR2+4

.text
ENTRY (wcscpy_ssse3)
	mov	STR1(%esp), %edx
	mov	STR2(%esp), %ecx

	/* Scalar check of the first four elements.  A terminator here is
	   handled by the ExitTail* stubs, which return without any register
	   having been pushed.  */
	cmpl	$0, (%ecx)
	jz	L(ExitTail4)
	cmpl	$0, 4(%ecx)
	jz	L(ExitTail8)
	cmpl	$0, 8(%ecx)
	jz	L(ExitTail12)
	cmpl	$0, 12(%ecx)
	jz	L(ExitTail16)

	PUSH	(%edi)
	mov	%edx, %edi		/* %edi = dst, the return value.  */
#endif
	PUSH	(%esi)
	lea	16(%ecx), %esi

	and	$-16, %esi		/* First 16-byte boundary above src.  */

	/* Test the aligned block at %esi for a terminator and copy the first
	   16 source bytes with unaligned moves.  */
	pxor	%xmm0, %xmm0
	pcmpeqd	(%esi), %xmm0
	movdqu	(%ecx), %xmm1
	movdqu	%xmm1, (%edx)

	pmovmskb %xmm0, %eax
	sub	%ecx, %esi		/* %esi = distance from src to that block.  */

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	/* Advance dst to the next 16-byte boundary and src by the same
	   amount.  Bytes in between were already written by the movdqu
	   above.  */
	mov	%edx, %eax
	lea	16(%edx), %edx
	and	$-16, %edx
	sub	%edx, %eax

	sub	%eax, %ecx
	mov	%ecx, %eax
	and	$0xf, %eax		/* Source misalignment; sets ZF for jz below.  */
	mov	$0, %esi		/* mov leaves the flags of the and intact.  */

	jz	L(Align16Both)
	cmp	$4, %eax
	je	L(Shl4)
	cmp	$8, %eax
	je	L(Shl8)
	jmp	L(Shl12)

/* Source and destination are both 16-byte aligned.  Six unrolled steps, each
   testing the next source block before the previous one is stored; %esi
   tracks the offset of the block under test.  */
L(Align16Both):
	movaps	(%ecx), %xmm1
	movaps	16(%ecx), %xmm2
	movaps	%xmm1, (%edx)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm3
	movaps	%xmm2, (%edx, %esi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm4
	movaps	%xmm3, (%edx, %esi)
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm1
	movaps	%xmm4, (%edx, %esi)
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm2
	movaps	%xmm1, (%edx, %esi)
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	16(%ecx, %esi), %xmm3
	movaps	%xmm2, (%edx, %esi)
	pcmpeqd	%xmm3, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi

	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	/* Round the source cursor down to a 64-byte boundary and move the
	   destination cursor by the same delta; any overlap with data that
	   was already stored is simply copied again.  */
	movaps	%xmm3, (%edx, %esi)
	mov	%ecx, %eax
	lea	16(%ecx, %esi), %ecx
	and	$-0x40, %ecx
	sub	%ecx, %eax
	sub	%eax, %edx

	/* Inside the loop both cursors are already 64 past the data under
	   test, so the offset handed to CopyFrom1To16Bytes starts at -64.  */
	mov	$-0x40, %esi

/* 64 bytes per iteration.  The byte-wise minimum of the four blocks has a
   zero lane whenever one of the blocks has one.  It can also have a zero lane
   when none of them does, so Aligned64Leave re-tests each block on its own
   and resumes the loop if none matches.  */
L(Aligned64Loop):
	movaps	(%ecx), %xmm2
	movaps	32(%ecx), %xmm3
	movaps	%xmm2, %xmm4
	movaps	16(%ecx), %xmm5
	movaps	%xmm3, %xmm6
	movaps	48(%ecx), %xmm7
	pminub	%xmm5, %xmm2
	pminub	%xmm7, %xmm3
	pminub	%xmm2, %xmm3
	lea	64(%edx), %edx
	pcmpeqd	%xmm0, %xmm3
	lea	64(%ecx), %ecx
	pmovmskb %xmm3, %eax

	test	%eax, %eax
	jnz	L(Aligned64Leave)
	movaps	%xmm4, -64(%edx)
	movaps	%xmm5, -48(%edx)
	movaps	%xmm6, -32(%edx)
	movaps	%xmm7, -16(%edx)
	jmp	L(Aligned64Loop)

L(Aligned64Leave):
	pcmpeqd	%xmm4, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm5, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm4, -64(%edx)
	lea	16(%esi), %esi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqd	%xmm6, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm5, -48(%edx)
	lea	16(%esi), %esi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	movaps	%xmm6, -32(%edx)
	pcmpeqd	%xmm7, %xmm0
	pmovmskb %xmm0, %eax
	lea	16(%esi), %esi
	test	%eax, %eax
	jnz	L(CopyFrom1To16Bytes)

	/* No block held a terminator: finish the stores, reset the offset
	   and go around again.  */
	mov	$-0x40, %esi
	movaps	%xmm7, -16(%edx)
	jmp	L(Aligned64Loop)

/* Source is 4 bytes past a 16-byte boundary while the destination is aligned.
   Aligned blocks are loaded from %ecx-4, %ecx+12, ... and each pair of
   neighbours is merged with palignr $4 into the 16 bytes that start at
   %ecx.  %xmm1 and %xmm3 alternate as the holder of the previous block.  */
	.p2align 4
L(Shl4):
	movaps	-4(%ecx), %xmm1
	movaps	12(%ecx), %xmm2
L(Shl4Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	28(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl4LoopExit)

	palignr	$4, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	28(%ecx), %ecx
	lea	16(%edx), %edx

	/* Round the next aligned source block down to a 64-byte boundary,
	   move the destination by the same delta, then restore the 12-byte
	   bias of the source cursor.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-12(%ecx), %ecx
	sub	%eax, %edx

	movaps	-4(%ecx), %xmm1

/* 64 bytes per iteration.  On a possible terminator (the pminub test may
   also fire without one) control goes back to Shl4Start with %xmm1/%xmm2
   untouched, and the block-by-block path above decides.  */
L(Shl4LoopStart):
	movaps	12(%ecx), %xmm2
	movaps	28(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	44(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	60(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$4, %xmm4, %xmm5
	palignr	$4, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl4Start)

	palignr	$4, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$4, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl4LoopStart)

/* The terminator is in the aligned block at %ecx+12.  Copy the 12 bytes in
   front of that block, step both cursors onto it and dispatch on the mask.  */
L(Shl4LoopExit):
	movlpd	(%ecx), %xmm0
	movl	8(%ecx), %esi
	movlpd	%xmm0, (%edx)
	movl	%esi, 8(%edx)
	POP	(%esi)
	add	$12, %edx
	add	$12, %ecx
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	/* The code below is reached with %esi still pushed.  */
	CFI_PUSH	(%esi)

/* Same scheme as Shl4 with the source 8 bytes past a 16-byte boundary.  */
	.p2align 4
L(Shl8):
	movaps	-8(%ecx), %xmm1
	movaps	8(%ecx), %xmm2
L(Shl8Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	24(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl8LoopExit)

	palignr	$8, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	24(%ecx), %ecx
	lea	16(%edx), %edx

	/* Realign the source to 64 bytes, then restore the 8-byte bias.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-8(%ecx), %ecx
	sub	%eax, %edx

	movaps	-8(%ecx), %xmm1

L(Shl8LoopStart):
	movaps	8(%ecx), %xmm2
	movaps	24(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	40(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	56(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$8, %xmm4, %xmm5
	palignr	$8, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl8Start)

	palignr	$8, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl8LoopStart)

/* Copy the 8 bytes in front of the block holding the terminator, step onto
   that block and dispatch on the mask.  */
L(Shl8LoopExit):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	POP	(%esi)
	add	$8, %edx
	add	$8, %ecx
	test	%al, %al
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit4)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	/* The code below is reached with %esi still pushed.  */
	CFI_PUSH	(%esi)

/* Same scheme as Shl4 with the source 12 bytes past a 16-byte boundary.  */
	.p2align 4
L(Shl12):
	movaps	-12(%ecx), %xmm1
	movaps	4(%ecx), %xmm2
L(Shl12Start):
	pcmpeqd	%xmm2, %xmm0
	pmovmskb %xmm0, %eax
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm1

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx
	movaps	%xmm2, %xmm3

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	20(%ecx), %xmm2

	pcmpeqd	%xmm2, %xmm0
	lea	16(%edx), %edx
	pmovmskb %xmm0, %eax
	lea	16(%ecx), %ecx

	test	%eax, %eax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%edx)
	lea	20(%ecx), %ecx
	lea	16(%edx), %edx

	/* Realign the source to 64 bytes, then restore the 4-byte bias.  */
	mov	%ecx, %eax
	and	$-0x40, %ecx
	sub	%ecx, %eax
	lea	-4(%ecx), %ecx
	sub	%eax, %edx

	movaps	-12(%ecx), %xmm1

L(Shl12LoopStart):
	movaps	4(%ecx), %xmm2
	movaps	20(%ecx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	36(%ecx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	52(%ecx), %xmm5
	pminub	%xmm2, %xmm6
	pminub	%xmm5, %xmm7
	pminub	%xmm6, %xmm7
	pcmpeqd	%xmm0, %xmm7
	pmovmskb %xmm7, %eax
	movaps	%xmm5, %xmm7
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	test	%eax, %eax
	jnz	L(Shl12Start)

	palignr	$12, %xmm2, %xmm3
	lea	64(%ecx), %ecx
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%edx)
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	jmp	L(Shl12LoopStart)

/* Copy the 4 bytes in front of the block holding the terminator and fall
   through into the common tail with a 4-byte cursor adjustment.  */
L(Shl12LoopExit):
	movl	(%ecx), %esi
	movl	%esi, (%edx)
	mov	$4, %esi

/* Common tail.  In: %esi = offset to add to both cursors so that they point
   at the 16-byte block holding the terminator, %eax = mask for that block.
   Copies 4, 8, 12 or 16 bytes depending on which lane is zero.  */
	.p2align 4
L(CopyFrom1To16Bytes):
	add	%esi, %edx
	add	%esi, %ecx

	POP	(%esi)
	test	%al, %al		/* Mask bits 0-7: lanes 0 and 1.  */
	jz	L(ExitHigh)
	test	$0x01, %al		/* Lane 0 is the terminator.  */
	jnz	L(Exit4)
L(Exit8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(ExitHigh):
	test	$0x01, %ah		/* Mask bit 8: lane 2 is the terminator.  */
	jnz	L(Exit12)
L(Exit16):
	movdqu	(%ecx), %xmm0
	movdqu	%xmm0, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(Exit4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edi, %eax
	RETURN

	.p2align 4
L(Exit12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edi, %eax
	RETURN

/* The ExitTail* stubs are reached from the entry checks, before %edi was
   pushed, so they return %edx directly and use a plain ret.  */
CFI_POP	(%edi)

	.p2align 4
L(ExitTail4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)
	movl	%edx, %eax
	ret

	.p2align 4
L(ExitTail16):
	movdqu	(%ecx), %xmm0
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	ret

#ifndef USE_AS_WCSCAT
END (wcscpy_ssse3)
#endif