/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Syntax: GNU as, AT&T operand order, run through the C preprocessor.
   Target: 32-bit x86; the comparison loops use PTEST on XMM registers.  */

/* Local labels get the .L prefix so they stay out of the symbol table.  */
#ifndef L
# define L(label)	.L##label
#endif

/* Thin wrappers around the assembler CFI directives.  Each one is only
   defined here when the including build has not already provided it.  */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef cfi_remember_state
# define cfi_remember_state	.cfi_remember_state
#endif

#ifndef cfi_restore_state
# define cfi_restore_state	.cfi_restore_state
#endif

/* Open a global, 16-byte aligned function symbol and start its CFI.  */
#ifndef ENTRY
# define ENTRY(name)	\
	.type name, @function;	\
	.globl name;	\
	.p2align 4;	\
name:	\
	cfi_startproc
#endif

/* Close the CFI region and record the size of the function.  */
#ifndef END
# define END(name)	\
	cfi_endproc;	\
	.size name, .-name
#endif

/* Exported name; a wrapper may define MEMCMP before including this file.  */
#ifndef MEMCMP
# define MEMCMP	memcmp_sse4
#endif

/* CFI bookkeeping for a 4-byte push / pop of REG.  */
#define CFI_PUSH(REG)	\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)	\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Stack offsets of the three arguments at function entry (the return
   address occupies the first 4 bytes).  */
#define PARMS	4
#define BLK1	PARMS
#define BLK2	BLK1 + 4
#define LEN	BLK2 + 4

/* Restore %ebx and return.  The trailing CFI_PUSH emits no instruction;
   it re-establishes the "%ebx is saved on the stack" unwind state for the
   code that follows the ret in the file.  */
#define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)


#if (defined SHARED || defined __PIC__)
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register that contains
   the index into the jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
    /* We first load PC into EBX.  */	\
	call	__x86.get_pc_thunk.bx;	\
    /* Get the address of the jump table.  */	\
	addl	$(TABLE - .), %ebx;	\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */	\
	addl	(%ebx,INDEX,SCALE), %ebx;	\
    /* EBX now holds the absolute address of the target.  Go.  */	\
	jmp	*%ebx
#else
# define JMPTBL(I, B)	I

/* Branch through an entry of a jump table.  TABLE is a jump table with
   absolute addresses.  INDEX is a register that contains the index into
   the jump table.  SCALE is the scale of INDEX.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
	jmp	*TABLE(,INDEX,SCALE)
#endif


/* Warning!
           wmemcmp has to use SIGNED comparison for elements.
           memcmp has to use UNSIGNED comparison for elements.
*/

/*-----------------------------------------------------------------------
   MEMCMP (first block, second block, length)

   In:    BLK1(%esp) = first block, BLK2(%esp) = second block,
          LEN(%esp)  = length.  With USE_AS_WMEMCMP the length counts
          4-byte elements and is scaled to bytes on entry.
   Out:   %eax = 0 when the blocks are equal, 1 when the first differing
          element of the first block is the larger one, -1 otherwise.
          Exception (byte variant only): for length 1 the result is the
          difference of the two zero-extended bytes.
   Saved: %ebx is pushed before it is used and popped before every ret.
   Clobb: %ecx, %edx, %xmm0, %xmm1, %xmm2, flags.

   Register roles:
     %eax  pointer into the first block, finally the result
     %edx  pointer into the second block
     %ecx  byte count / table index, later data loaded from the first block
     %ebx  data loaded from the second block, loop counter in the
           64-byte loop, or the negative offset of a differing chunk
     %xmm0 all zero; "ptest %xmm2, %xmm0" then sets CF only when %xmm2
           is all zero, so "jnc" means the two 16-byte chunks differ

   Tail dispatch: for lengths up to 64 both pointers are advanced to the
   END of the blocks and the code jumps through table_64bytes indexed by
   the remaining byte count.  Entry L(Nbytes) compares the last N bytes
   using negative offsets and falls through into the shorter entries of
   the same chain.

   Note on "mov $0, %eax": it sits between a cmp and the jne that
   consumes it, so it must not touch the flags; xor would destroy them.
  -----------------------------------------------------------------------*/

	.section .text.sse4.2,"ax",@progbits
ENTRY (MEMCMP)
	movl	BLK1(%esp), %eax
	movl	BLK2(%esp), %edx
	movl	LEN(%esp), %ecx

#ifdef USE_AS_WMEMCMP
	/* Convert the element count to a byte count; zero length is equal.  */
	shl	$2, %ecx
	test	%ecx, %ecx
	jz	L(return0)
#else
	/* Lengths 0 and 1 are handled without saving %ebx.  */
	cmp	$1, %ecx
	jbe	L(less1bytes)
#endif

	pxor	%xmm0, %xmm0
	cmp	$64, %ecx
	ja	L(64bytesormore)
	cmp	$8, %ecx

	/* pushl leaves the flags alone, so jb still tests "length < 8" in
	   either order.  The byte variant of L(less8bytes) uses %bl and
	   therefore needs %ebx saved first; the wide variant does not.  */
#ifndef USE_AS_WMEMCMP
	PUSH	(%ebx)
	jb	L(less8bytes)
#else
	jb	L(less8bytes)
	PUSH	(%ebx)
#endif

	/* 8 <= length <= 64: point past the end and dispatch on the length.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)

#ifndef USE_AS_WMEMCMP
	/* Byte variant, 2 <= length <= 7: compare one byte at a time.  */
	.p2align 4
L(less8bytes):
	mov	(%eax), %bl
	cmpb	(%edx), %bl
	jne	L(nonzero)

	mov	1(%eax), %bl
	cmpb	1(%edx), %bl
	jne	L(nonzero)

	cmp	$2, %ecx
	jz	L(0bytes)

	mov	2(%eax), %bl
	cmpb	2(%edx), %bl
	jne	L(nonzero)

	cmp	$3, %ecx
	jz	L(0bytes)

	mov	3(%eax), %bl
	cmpb	3(%edx), %bl
	jne	L(nonzero)

	cmp	$4, %ecx
	jz	L(0bytes)

	mov	4(%eax), %bl
	cmpb	4(%edx), %bl
	jne	L(nonzero)

	cmp	$5, %ecx
	jz	L(0bytes)

	mov	5(%eax), %bl
	cmpb	5(%edx), %bl
	jne	L(nonzero)

	cmp	$6, %ecx
	jz	L(0bytes)

	mov	6(%eax), %bl
	cmpb	6(%edx), %bl
	je	L(0bytes)

	/* Flags still come from the cmpb that found the difference: popl and
	   mov do not modify them.  Unsigned "above" selects +1, else -1.  */
L(nonzero):
	POP	(%ebx)
	mov	$1, %eax
	ja	L(above)
	neg	%eax
L(above):
	ret
	CFI_PUSH (%ebx)
#endif

	/* Blocks are equal; reached with %ebx saved on the stack.  */
	.p2align 4
L(0bytes):
	POP	(%ebx)
	xor	%eax, %eax
	ret

#ifdef USE_AS_WMEMCMP

/* for wmemcmp, case N == 1 */

	.p2align 4
L(less8bytes):
	mov	(%eax), %ecx
	cmp	(%edx), %ecx
	je	L(return0)
	mov	$1, %eax
	jg	L(find_diff_bigger)
	neg	%eax
	ret

	.p2align 4
L(find_diff_bigger):
	ret

	.p2align 4
L(return0):
	xor	%eax, %eax
	ret
#endif

#ifndef USE_AS_WMEMCMP
	/* Length 0 or 1.  Flags are still those of "cmp $1, %ecx" at entry:
	   below means length 0.  */
	.p2align 4
L(less1bytes):
	jb	L(0bytesend)
	movzbl	(%eax), %eax
	movzbl	(%edx), %edx
	sub	%edx, %eax
	ret

	.p2align 4
L(0bytesend):
	xor	%eax, %eax
	ret
#endif
	/* Length > 64: compare 64 bytes per iteration in four 16-byte chunks.
	   %ecx is fixed at 64 (the stride); %ebx counts the bytes left
	   beyond the current 64-byte window.  */
	.p2align 4
L(64bytesormore):
	PUSH	(%ebx)
	mov	%ecx, %ebx
	mov	$64, %ecx
	sub	$64, %ebx
L(64bytesormore_loop):
	movdqu	(%eax), %xmm1
	movdqu	(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_16diff)

	movdqu	16(%eax), %xmm1
	movdqu	16(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_32diff)

	movdqu	32(%eax), %xmm1
	movdqu	32(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_48diff)

	movdqu	48(%eax), %xmm1
	movdqu	48(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_64diff)
	add	%ecx, %eax
	add	%ecx, %edx
	sub	%ecx, %ebx
	jae	L(64bytesormore_loop)
	/* %ebx went negative: 64 + %ebx bytes (0..63) remain.  Point past
	   the end of both blocks and dispatch on that remainder.  */
	add	%ebx, %ecx
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)

#ifdef USE_AS_WMEMCMP

/* Label needed only to fill the unused slots of table_64bytes.  */
L(unreal_case):
/* no code here */

#endif
	/* A 16-byte chunk of the current 64-byte window differs.  Starting
	   from %ecx = 64, step the pointers to just past that chunk so that
	   L(16bytes) re-examines exactly those 16 bytes.  */
	.p2align 4
L(find_16diff):
	sub	$16, %ecx
L(find_32diff):
	sub	$16, %ecx
L(find_48diff):
	sub	$16, %ecx
L(find_64diff):
	add	%ecx, %edx
	add	%ecx, %eax

#ifndef USE_AS_WMEMCMP
	.p2align 4
L(16bytes):
	mov	-16(%eax), %ecx
	mov	-16(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(12bytes):
	mov	-12(%eax), %ecx
	mov	-12(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(8bytes):
	mov	-8(%eax), %ecx
	mov	-8(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(4bytes):
	mov	-4(%eax), %ecx
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
	/* mov keeps the flags of the cmp above for the jne below.  */
	mov	$0, %eax
	jne	L(find_diff)
	RETURN
#else
	.p2align 4
L(16bytes):
	mov	-16(%eax), %ecx
	cmp	-16(%edx), %ecx
	jne	L(find_diff)
L(12bytes):
	mov	-12(%eax), %ecx
	cmp	-12(%edx), %ecx
	jne	L(find_diff)
L(8bytes):
	mov	-8(%eax), %ecx
	cmp	-8(%edx), %ecx
	jne	L(find_diff)
L(4bytes):
	mov	-4(%eax), %ecx
	cmp	-4(%edx), %ecx
	/* mov keeps the flags of the cmp above for the jne below.  */
	mov	$0, %eax
	jne	L(find_diff)
	RETURN
#endif

	/* Tail entries.  In each 16-byte step %ebx records the (negative)
	   offset of the chunk so that L(less16bytes) can locate the
	   differing dword inside it.  Lengths that are not a multiple of 4
	   exist only in the byte variant.  */
#ifndef USE_AS_WMEMCMP
	.p2align 4
L(49bytes):
	movdqu	-49(%eax), %xmm1
	movdqu	-49(%edx), %xmm2
	mov	$-49, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(33bytes):
	movdqu	-33(%eax), %xmm1
	movdqu	-33(%edx), %xmm2
	mov	$-33, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(17bytes):
	mov	-17(%eax), %ecx
	mov	-17(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(13bytes):
	mov	-13(%eax), %ecx
	mov	-13(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(9bytes):
	mov	-9(%eax), %ecx
	mov	-9(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(5bytes):
	mov	-5(%eax), %ecx
	mov	-5(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzbl	-1(%eax), %ecx
	cmp	-1(%edx), %cl
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(50bytes):
	mov	$-50, %ebx
	movdqu	-50(%eax), %xmm1
	movdqu	-50(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(34bytes):
	mov	$-34, %ebx
	movdqu	-34(%eax), %xmm1
	movdqu	-34(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(18bytes):
	mov	-18(%eax), %ecx
	mov	-18(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(14bytes):
	mov	-14(%eax), %ecx
	mov	-14(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(10bytes):
	mov	-10(%eax), %ecx
	mov	-10(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(6bytes):
	mov	-6(%eax), %ecx
	mov	-6(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(2bytes):
	movzwl	-2(%eax), %ecx
	movzwl	-2(%edx), %ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bh, %ch
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(51bytes):
	mov	$-51, %ebx
	movdqu	-51(%eax), %xmm1
	movdqu	-51(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(35bytes):
	mov	$-35, %ebx
	movdqu	-35(%eax), %xmm1
	movdqu	-35(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(19bytes):
	movl	-19(%eax), %ecx
	movl	-19(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(15bytes):
	movl	-15(%eax), %ecx
	movl	-15(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(11bytes):
	movl	-11(%eax), %ecx
	movl	-11(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(7bytes):
	movl	-7(%eax), %ecx
	movl	-7(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(3bytes):
	movzwl	-3(%eax), %ecx
	movzwl	-3(%edx), %ebx
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
L(1bytes):
	movzbl	-1(%eax), %eax
	cmpb	-1(%edx), %al
	mov	$0, %eax
	jne	L(end)
	RETURN
#endif
	.p2align 4
L(52bytes):
	movdqu	-52(%eax), %xmm1
	movdqu	-52(%edx), %xmm2
	mov	$-52, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(36bytes):
	movdqu	-36(%eax), %xmm1
	movdqu	-36(%edx), %xmm2
	mov	$-36, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(20bytes):
	movdqu	-20(%eax), %xmm1
	movdqu	-20(%edx), %xmm2
	mov	$-20, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	mov	-4(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-4(%edx), %ecx
#endif
	mov	$0, %eax
	jne	L(find_diff)
	RETURN

#ifndef USE_AS_WMEMCMP
	.p2align 4
L(53bytes):
	movdqu	-53(%eax), %xmm1
	movdqu	-53(%edx), %xmm2
	mov	$-53, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(37bytes):
	mov	$-37, %ebx
	movdqu	-37(%eax), %xmm1
	movdqu	-37(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(21bytes):
	mov	$-21, %ebx
	movdqu	-21(%eax), %xmm1
	movdqu	-21(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	mov	-5(%eax), %ecx
	mov	-5(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzbl	-1(%eax), %ecx
	cmp	-1(%edx), %cl
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(54bytes):
	movdqu	-54(%eax), %xmm1
	movdqu	-54(%edx), %xmm2
	mov	$-54, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(38bytes):
	mov	$-38, %ebx
	movdqu	-38(%eax), %xmm1
	movdqu	-38(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(22bytes):
	mov	$-22, %ebx
	movdqu	-22(%eax), %xmm1
	movdqu	-22(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-6(%eax), %ecx
	mov	-6(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-2(%eax), %ecx
	movzwl	-2(%edx), %ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bh, %ch
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(55bytes):
	movdqu	-55(%eax), %xmm1
	movdqu	-55(%edx), %xmm2
	mov	$-55, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(39bytes):
	mov	$-39, %ebx
	movdqu	-39(%eax), %xmm1
	movdqu	-39(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(23bytes):
	mov	$-23, %ebx
	movdqu	-23(%eax), %xmm1
	movdqu	-23(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	movl	-7(%eax), %ecx
	movl	-7(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-3(%eax), %ecx
	movzwl	-3(%edx), %ebx
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	movzbl	-1(%eax), %eax
	cmpb	-1(%edx), %al
	mov	$0, %eax
	jne	L(end)
	RETURN
#endif
	.p2align 4
L(56bytes):
	movdqu	-56(%eax), %xmm1
	movdqu	-56(%edx), %xmm2
	mov	$-56, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(40bytes):
	mov	$-40, %ebx
	movdqu	-40(%eax), %xmm1
	movdqu	-40(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(24bytes):
	mov	$-24, %ebx
	movdqu	-24(%eax), %xmm1
	movdqu	-24(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-8(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-8(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-8(%edx), %ecx
#endif
	jne	L(find_diff)

	mov	-4(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-4(%edx), %ecx
#endif
	mov	$0, %eax
	jne	L(find_diff)
	RETURN

#ifndef USE_AS_WMEMCMP
	.p2align 4
L(57bytes):
	movdqu	-57(%eax), %xmm1
	movdqu	-57(%edx), %xmm2
	mov	$-57, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(41bytes):
	mov	$-41, %ebx
	movdqu	-41(%eax), %xmm1
	movdqu	-41(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(25bytes):
	mov	$-25, %ebx
	movdqu	-25(%eax), %xmm1
	movdqu	-25(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	mov	-9(%eax), %ecx
	mov	-9(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	mov	-5(%eax), %ecx
	mov	-5(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzbl	-1(%eax), %ecx
	cmp	-1(%edx), %cl
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(58bytes):
	movdqu	-58(%eax), %xmm1
	movdqu	-58(%edx), %xmm2
	mov	$-58, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(42bytes):
	mov	$-42, %ebx
	movdqu	-42(%eax), %xmm1
	movdqu	-42(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(26bytes):
	mov	$-26, %ebx
	movdqu	-26(%eax), %xmm1
	movdqu	-26(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-10(%eax), %ecx
	mov	-10(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	-6(%eax), %ecx
	mov	-6(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	movzwl	-2(%eax), %ecx
	movzwl	-2(%edx), %ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bh, %ch
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(59bytes):
	movdqu	-59(%eax), %xmm1
	movdqu	-59(%edx), %xmm2
	mov	$-59, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(43bytes):
	mov	$-43, %ebx
	movdqu	-43(%eax), %xmm1
	movdqu	-43(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(27bytes):
	mov	$-27, %ebx
	movdqu	-27(%eax), %xmm1
	movdqu	-27(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	movl	-11(%eax), %ecx
	movl	-11(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movl	-7(%eax), %ecx
	movl	-7(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-3(%eax), %ecx
	movzwl	-3(%edx), %ebx
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	movzbl	-1(%eax), %eax
	cmpb	-1(%edx), %al
	mov	$0, %eax
	jne	L(end)
	RETURN
#endif
	.p2align 4
L(60bytes):
	movdqu	-60(%eax), %xmm1
	movdqu	-60(%edx), %xmm2
	mov	$-60, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(44bytes):
	mov	$-44, %ebx
	movdqu	-44(%eax), %xmm1
	movdqu	-44(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(28bytes):
	mov	$-28, %ebx
	movdqu	-28(%eax), %xmm1
	movdqu	-28(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-12(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-12(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-12(%edx), %ecx
#endif
	jne	L(find_diff)

	mov	-8(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-8(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-8(%edx), %ecx
#endif
	jne	L(find_diff)

	mov	-4(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-4(%edx), %ecx
#endif
	mov	$0, %eax
	jne	L(find_diff)
	RETURN

#ifndef USE_AS_WMEMCMP
	.p2align 4
L(61bytes):
	movdqu	-61(%eax), %xmm1
	movdqu	-61(%edx), %xmm2
	mov	$-61, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(45bytes):
	mov	$-45, %ebx
	movdqu	-45(%eax), %xmm1
	movdqu	-45(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(29bytes):
	mov	$-29, %ebx
	movdqu	-29(%eax), %xmm1
	movdqu	-29(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-13(%eax), %ecx
	mov	-13(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	-9(%eax), %ecx
	mov	-9(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	-5(%eax), %ecx
	mov	-5(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzbl	-1(%eax), %ecx
	cmp	-1(%edx), %cl
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(62bytes):
	movdqu	-62(%eax), %xmm1
	movdqu	-62(%edx), %xmm2
	mov	$-62, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(46bytes):
	mov	$-46, %ebx
	movdqu	-46(%eax), %xmm1
	movdqu	-46(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(30bytes):
	mov	$-30, %ebx
	movdqu	-30(%eax), %xmm1
	movdqu	-30(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	mov	-14(%eax), %ecx
	mov	-14(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	mov	-10(%eax), %ecx
	mov	-10(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	mov	-6(%eax), %ecx
	mov	-6(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-2(%eax), %ecx
	movzwl	-2(%edx), %ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bh, %ch
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(63bytes):
	movdqu	-63(%eax), %xmm1
	movdqu	-63(%edx), %xmm2
	mov	$-63, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(47bytes):
	mov	$-47, %ebx
	movdqu	-47(%eax), %xmm1
	movdqu	-47(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(31bytes):
	mov	$-31, %ebx
	movdqu	-31(%eax), %xmm1
	movdqu	-31(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	movl	-15(%eax), %ecx
	movl	-15(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movl	-11(%eax), %ecx
	movl	-11(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movl	-7(%eax), %ecx
	movl	-7(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-3(%eax), %ecx
	movzwl	-3(%edx), %ebx
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	movzbl	-1(%eax), %eax
	cmpb	-1(%edx), %al
	mov	$0, %eax
	jne	L(end)
	RETURN
#endif

	.p2align 4
L(64bytes):
	movdqu	-64(%eax), %xmm1
	movdqu	-64(%edx), %xmm2
	mov	$-64, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(48bytes):
	movdqu	-48(%eax), %xmm1
	movdqu	-48(%edx), %xmm2
	mov	$-48, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(32bytes):
	movdqu	-32(%eax), %xmm1
	movdqu	-32(%edx), %xmm2
	mov	$-32, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-16(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-16(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-16(%edx), %ecx
#endif
	jne	L(find_diff)

	mov	-12(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-12(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-12(%edx), %ecx
#endif
	jne	L(find_diff)

	mov	-8(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-8(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-8(%edx), %ecx
#endif
	jne	L(find_diff)

	mov	-4(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-4(%edx), %ecx
#endif
	mov	$0, %eax
	jne	L(find_diff)
	RETURN

	/* A 16-byte chunk at (negative) offset %ebx from both end pointers
	   differs.  Rebase the pointers onto that chunk and compare it one
	   dword at a time to find the differing dword.  */
#ifndef USE_AS_WMEMCMP
	.p2align 4
L(less16bytes):
	add	%ebx, %eax
	add	%ebx, %edx

	mov	(%eax), %ecx
	mov	(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	4(%eax), %ecx
	mov	4(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	8(%eax), %ecx
	mov	8(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	12(%eax), %ecx
	mov	12(%edx), %ebx
	cmp	%ebx, %ecx
	mov	$0, %eax
	jne	L(find_diff)
	RETURN
#else
	.p2align 4
L(less16bytes):
	add	%ebx, %eax
	add	%ebx, %edx

	mov	(%eax), %ecx
	cmp	(%edx), %ecx
	jne	L(find_diff)

	mov	4(%eax), %ecx
	cmp	4(%edx), %ecx
	jne	L(find_diff)

	mov	8(%eax), %ecx
	cmp	8(%edx), %ecx
	jne	L(find_diff)

	mov	12(%eax), %ecx
	cmp	12(%edx), %ecx

	mov	$0, %eax
	jne	L(find_diff)
	RETURN
#endif

	/* Byte variant: %ecx and %ebx hold differing dwords from the first
	   and second block.  Walk them from the lowest-addressed byte up so
	   the flags at L(end) describe the first differing byte, then map
	   unsigned above / below to +1 / -1.
	   Wide variant: the flags of the dword cmp are used directly with a
	   signed test.  popl and mov leave the flags untouched.  */
	.p2align 4
L(find_diff):
#ifndef USE_AS_WMEMCMP
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	shr	$16,%ecx
	shr	$16,%ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
L(end):
	POP	(%ebx)
	mov	$1, %eax
	ja	L(bigger)
	neg	%eax
L(bigger):
	ret
#else
	POP	(%ebx)
	mov	$1, %eax
	jg	L(bigger)
	neg	%eax
	ret

	.p2align 4
L(bigger):
	ret
#endif
END (MEMCMP)

	/* Dispatch table indexed by the number of tail bytes (0..64).
	   Entries are table-relative offsets in PIC builds and absolute
	   addresses otherwise (see JMPTBL).  */
	.section .rodata.sse4.2,"a",@progbits
	.p2align 2
	.type	L(table_64bytes), @object
#ifndef USE_AS_WMEMCMP
L(table_64bytes):
	.int	JMPTBL (L(0bytes), L(table_64bytes))
	.int	JMPTBL (L(1bytes), L(table_64bytes))
	.int	JMPTBL (L(2bytes), L(table_64bytes))
	.int	JMPTBL (L(3bytes), L(table_64bytes))
	.int	JMPTBL (L(4bytes), L(table_64bytes))
	.int	JMPTBL (L(5bytes), L(table_64bytes))
	.int	JMPTBL (L(6bytes), L(table_64bytes))
	.int	JMPTBL (L(7bytes), L(table_64bytes))
	.int	JMPTBL (L(8bytes), L(table_64bytes))
	.int	JMPTBL (L(9bytes), L(table_64bytes))
	.int	JMPTBL (L(10bytes), L(table_64bytes))
	.int	JMPTBL (L(11bytes), L(table_64bytes))
	.int	JMPTBL (L(12bytes), L(table_64bytes))
	.int	JMPTBL (L(13bytes), L(table_64bytes))
	.int	JMPTBL (L(14bytes), L(table_64bytes))
	.int	JMPTBL (L(15bytes), L(table_64bytes))
	.int	JMPTBL (L(16bytes), L(table_64bytes))
	.int	JMPTBL (L(17bytes), L(table_64bytes))
	.int	JMPTBL (L(18bytes), L(table_64bytes))
	.int	JMPTBL (L(19bytes), L(table_64bytes))
	.int	JMPTBL (L(20bytes), L(table_64bytes))
	.int	JMPTBL (L(21bytes), L(table_64bytes))
	.int	JMPTBL (L(22bytes), L(table_64bytes))
	.int	JMPTBL (L(23bytes), L(table_64bytes))
	.int	JMPTBL (L(24bytes), L(table_64bytes))
	.int	JMPTBL (L(25bytes), L(table_64bytes))
	.int	JMPTBL (L(26bytes), L(table_64bytes))
	.int	JMPTBL (L(27bytes), L(table_64bytes))
	.int	JMPTBL (L(28bytes), L(table_64bytes))
	.int	JMPTBL (L(29bytes), L(table_64bytes))
	.int	JMPTBL (L(30bytes), L(table_64bytes))
	.int	JMPTBL (L(31bytes), L(table_64bytes))
	.int	JMPTBL (L(32bytes), L(table_64bytes))
	.int	JMPTBL (L(33bytes), L(table_64bytes))
	.int	JMPTBL (L(34bytes), L(table_64bytes))
	.int	JMPTBL (L(35bytes), L(table_64bytes))
	.int	JMPTBL (L(36bytes), L(table_64bytes))
	.int	JMPTBL (L(37bytes), L(table_64bytes))
	.int	JMPTBL (L(38bytes), L(table_64bytes))
	.int	JMPTBL (L(39bytes), L(table_64bytes))
	.int	JMPTBL (L(40bytes), L(table_64bytes))
	.int	JMPTBL (L(41bytes), L(table_64bytes))
	.int	JMPTBL (L(42bytes), L(table_64bytes))
	.int	JMPTBL (L(43bytes), L(table_64bytes))
	.int	JMPTBL (L(44bytes), L(table_64bytes))
	.int	JMPTBL (L(45bytes), L(table_64bytes))
	.int	JMPTBL (L(46bytes), L(table_64bytes))
	.int	JMPTBL (L(47bytes), L(table_64bytes))
	.int	JMPTBL (L(48bytes), L(table_64bytes))
	.int	JMPTBL (L(49bytes), L(table_64bytes))
	.int	JMPTBL (L(50bytes), L(table_64bytes))
	.int	JMPTBL (L(51bytes), L(table_64bytes))
	.int	JMPTBL (L(52bytes), L(table_64bytes))
	.int	JMPTBL (L(53bytes), L(table_64bytes))
	.int	JMPTBL (L(54bytes), L(table_64bytes))
	.int	JMPTBL (L(55bytes), L(table_64bytes))
	.int	JMPTBL (L(56bytes), L(table_64bytes))
	.int	JMPTBL (L(57bytes), L(table_64bytes))
	.int	JMPTBL (L(58bytes), L(table_64bytes))
	.int	JMPTBL (L(59bytes), L(table_64bytes))
	.int	JMPTBL (L(60bytes), L(table_64bytes))
	.int	JMPTBL (L(61bytes), L(table_64bytes))
	.int	JMPTBL (L(62bytes), L(table_64bytes))
	.int	JMPTBL (L(63bytes), L(table_64bytes))
	.int	JMPTBL (L(64bytes), L(table_64bytes))
#else
	/* Wide variant: byte counts are always multiples of 4, so the other
	   slots are placeholders that are never selected.  */
L(table_64bytes):
	.int	JMPTBL (L(0bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(4bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(8bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(12bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(16bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(20bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(24bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(28bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(32bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(36bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(40bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(44bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(48bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(52bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(56bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(60bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(64bytes), L(table_64bytes))
#endif