/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Syntax: AT&T, i386, run through the C preprocessor.  Every helper
   below that sits inside an #ifndef guard is used as-is when the name
   is already a macro at this point; the guarded text is the default.  */

/* Symbol name of the routine emitted by this file.  */
#ifndef MEMCPY
# define MEMCPY                 ssse3_memcpy5
#endif

/* L(foo) pastes to .Lfoo, i.e. a label carrying the .L prefix.  */
#ifndef L
# define L(label)               .L##label
#endif

/* ALIGN(n) pads the location counter to a 2^n byte boundary.  */
#ifndef ALIGN
# define ALIGN(n)               .p2align n
#endif

/* One-to-one wrappers around the assembler .cfi_* directives.  */
#ifndef cfi_startproc
# define cfi_startproc                  .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc                    .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)       .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)               .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)     .cfi_adjust_cfa_offset off
#endif

#ifndef cfi_remember_state
# define cfi_remember_state             .cfi_remember_state
#endif

#ifndef cfi_restore_state
# define cfi_restore_state              .cfi_restore_state
#endif

/* ENTRY(name): declare NAME as a global symbol of type function, align
   it to 16 bytes, place its label and open the CFI region.  */
#ifndef ENTRY
# define ENTRY(name)            \
    .globl name;                \
    .type name, @function;      \
    .p2align 4;                 \
name:                           \
    cfi_startproc
#endif

/* END(name): close the CFI region and record the size of NAME as the
   distance from its label to this point.  */
#ifndef END
# define END(name)              \
    cfi_endproc;                \
    .size name, .-name
#endif

/* Offsets of the three stack arguments.  PARMS (defined further down,
   it depends on SHARED) is the offset of the first one; each following
   argument is 4 bytes higher.  With USE_AS_BCOPY the first two trade
   places, so the source comes first and the destination second.  */
#ifdef USE_AS_BCOPY
# define SRC                    PARMS
# define DEST                   SRC+4
# define LEN                    DEST+4
#else
# define DEST                   PARMS
# define SRC                    DEST+4
# define LEN                    SRC+4
#endif

/* Unwind annotations describing a 4-byte push / pop of REG.  */
#define CFI_PUSH(REG)                   \
    cfi_adjust_cfa_offset (4);          \
    cfi_rel_offset (REG, 0)

#define CFI_POP(REG)                    \
    cfi_adjust_cfa_offset (-4);         \
    cfi_restore (REG)

/* The push / pop instruction together with its annotation.  */
#define PUSH(REG)               pushl REG; CFI_PUSH (REG)
#define POP(REG)                popl REG; CFI_POP (REG)

#ifdef SHARED
/* SHARED build: EBX is pushed on entry (ENTRANCE) and popped again
   before every ret, so the first argument is found at offset 8 instead
   of 4.  RETURN expands to RETURN_END followed by a fresh CFI_PUSH of
   EBX; presumably that keeps the unwind description valid for code
   assembled after the ret (NOTE(review): confirm against the users of
   RETURN versus RETURN_END).  */
# define PARMS                  8               /* Preserve EBX.  */
# define ENTRANCE               PUSH (%ebx);
# define RETURN_END             POP (%ebx); ret
# define RETURN                 RETURN_END; CFI_PUSH (%ebx)

/* A jump table slot holds the distance of target I from base B.  */
# define JMPTBL(I, B)           I - B

/* Dispatch through TABLE, a jump table of relative offsets.  INDEX is
   the register holding the slot number and SCALE its scale factor.
   The target address is built in EBX, which is overwritten.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)            \
    /* EBX = address of the instruction after the call.  */     \
    call        __i686.get_pc_thunk.bx;                         \
    /* EBX = address of TABLE.  */                              \
    addl        $(TABLE - .), %ebx;                             \
    /* EBX = TABLE + relative offset = absolute target.  */     \
    addl        (%ebx,INDEX,SCALE), %ebx;                       \
    /* Go.  */                                                  \
    jmp         *%ebx

/* The same dispatch split in two halves.  The VALUE half only adds
   (TABLE - .) to EBX, so it produces the address of TABLE only when
   EBX already equals the address of that addl instruction itself.
   The TAIL half then adds the selected slot and jumps.  */
# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)                    \
    addl        $(TABLE - .), %ebx

# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)       \
    addl        (%ebx,INDEX,SCALE), %ebx;                       \
    /* Go.  */                                                  \
    jmp         *%ebx

/* PC thunk: copies the word at the top of the stack, which is the
   return address of the call that got here, into EBX and returns.
   Kept in its own link-once section as a hidden global symbol.  */
    .section    .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
    .globl      __i686.get_pc_thunk.bx
    .hidden     __i686.get_pc_thunk.bx
    ALIGN (4)
    .type       __i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
    movl        (%esp), %ebx
    ret
#else
/* Non-SHARED build: nothing is pushed on entry, so the first argument
   is at offset 4, both return macros are a plain ret, and a jump table
   slot holds the target address I itself.  */
# define PARMS                  4
# define ENTRANCE
# define RETURN_END             ret
# define RETURN                 RETURN_END
# define JMPTBL(I, B)           I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register that contains the index into
   the jump table.  SCALE is the scale of INDEX.
*/ 154# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ 155 jmp *TABLE(,INDEX,SCALE) 156 157# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) 158 159# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ 160 jmp *TABLE(,INDEX,SCALE) 161#endif 162 163 .section .text.ssse3,"ax",@progbits 164ENTRY (MEMCPY) 165 ENTRANCE 166 movl LEN(%esp), %ecx 167 movl SRC(%esp), %eax 168 movl DEST(%esp), %edx 169 170#ifdef USE_AS_MEMMOVE 171 cmp %eax, %edx 172 jb L(copy_forward) 173 je L(fwd_write_0bytes) 174 cmp $32, %ecx 175 jae L(memmove_bwd) 176 jmp L(bk_write_less32bytes_2) 177L(memmove_bwd): 178 add %ecx, %eax 179 cmp %eax, %edx 180 movl SRC(%esp), %eax 181 jb L(copy_backward) 182 183L(copy_forward): 184#endif 185 cmp $48, %ecx 186 jae L(48bytesormore) 187 188L(fwd_write_less32bytes): 189#ifndef USE_AS_MEMMOVE 190 cmp %dl, %al 191 jb L(bk_write) 192#endif 193 add %ecx, %edx 194 add %ecx, %eax 195 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 196#ifndef USE_AS_MEMMOVE 197L(bk_write): 198 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) 199#endif 200 201 ALIGN (4) 202/* ECX > 32 and EDX is 4 byte aligned. 
*/ 203L(48bytesormore): 204 movdqu (%eax), %xmm0 205 PUSH (%edi) 206 movl %edx, %edi 207 and $-16, %edx 208 PUSH (%esi) 209 cfi_remember_state 210 add $16, %edx 211 movl %edi, %esi 212 sub %edx, %edi 213 add %edi, %ecx 214 sub %edi, %eax 215 216#ifdef SHARED_CACHE_SIZE_HALF 217 cmp $SHARED_CACHE_SIZE_HALF, %ecx 218#else 219# ifdef SHARED 220 call __i686.get_pc_thunk.bx 221 add $_GLOBAL_OFFSET_TABLE_, %ebx 222 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx 223# else 224 cmp __x86_shared_cache_size_half, %ecx 225# endif 226#endif 227 228 mov %eax, %edi 229 jae L(large_page) 230 and $0xf, %edi 231 jz L(shl_0) 232 233 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) 234 235 cfi_restore_state 236 cfi_remember_state 237 ALIGN (4) 238L(shl_0): 239 movdqu %xmm0, (%esi) 240 xor %edi, %edi 241 POP (%esi) 242 cmp $127, %ecx 243 ja L(shl_0_gobble) 244 lea -32(%ecx), %ecx 245L(shl_0_loop): 246 movdqa (%eax, %edi), %xmm0 247 movdqa 16(%eax, %edi), %xmm1 248 sub $32, %ecx 249 movdqa %xmm0, (%edx, %edi) 250 movdqa %xmm1, 16(%edx, %edi) 251 lea 32(%edi), %edi 252 jb L(shl_0_end) 253 254 movdqa (%eax, %edi), %xmm0 255 movdqa 16(%eax, %edi), %xmm1 256 sub $32, %ecx 257 movdqa %xmm0, (%edx, %edi) 258 movdqa %xmm1, 16(%edx, %edi) 259 lea 32(%edi), %edi 260 jb L(shl_0_end) 261 262 movdqa (%eax, %edi), %xmm0 263 movdqa 16(%eax, %edi), %xmm1 264 sub $32, %ecx 265 movdqa %xmm0, (%edx, %edi) 266 movdqa %xmm1, 16(%edx, %edi) 267 lea 32(%edi), %edi 268 jb L(shl_0_end) 269 270 movdqa (%eax, %edi), %xmm0 271 movdqa 16(%eax, %edi), %xmm1 272 sub $32, %ecx 273 movdqa %xmm0, (%edx, %edi) 274 movdqa %xmm1, 16(%edx, %edi) 275 lea 32(%edi), %edi 276L(shl_0_end): 277 lea 32(%ecx), %ecx 278 add %ecx, %edi 279 add %edi, %edx 280 add %edi, %eax 281 POP (%edi) 282 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 283 284 CFI_PUSH (%edi) 285L(shl_0_gobble): 286 287#ifdef DATA_CACHE_SIZE_HALF 288 cmp $DATA_CACHE_SIZE_HALF, %ecx 289#else 290# ifdef SHARED 291 call __i686.get_pc_thunk.bx 292 add 
$_GLOBAL_OFFSET_TABLE_, %ebx 293 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx 294# else 295 cmp __x86_data_cache_size_half, %ecx 296# endif 297#endif 298 299 POP (%edi) 300 lea -128(%ecx), %ecx 301 jae L(shl_0_gobble_mem_loop) 302L(shl_0_gobble_cache_loop): 303 movdqa (%eax), %xmm0 304 movdqa 0x10(%eax), %xmm1 305 movdqa 0x20(%eax), %xmm2 306 movdqa 0x30(%eax), %xmm3 307 movdqa 0x40(%eax), %xmm4 308 movdqa 0x50(%eax), %xmm5 309 movdqa 0x60(%eax), %xmm6 310 movdqa 0x70(%eax), %xmm7 311 lea 0x80(%eax), %eax 312 sub $128, %ecx 313 movdqa %xmm0, (%edx) 314 movdqa %xmm1, 0x10(%edx) 315 movdqa %xmm2, 0x20(%edx) 316 movdqa %xmm3, 0x30(%edx) 317 movdqa %xmm4, 0x40(%edx) 318 movdqa %xmm5, 0x50(%edx) 319 movdqa %xmm6, 0x60(%edx) 320 movdqa %xmm7, 0x70(%edx) 321 lea 0x80(%edx), %edx 322 323 jae L(shl_0_gobble_cache_loop) 324 cmp $-0x40, %ecx 325 lea 0x80(%ecx), %ecx 326 jl L(shl_0_cache_less_64bytes) 327 328 movdqa (%eax), %xmm0 329 sub $0x40, %ecx 330 movdqa 0x10(%eax), %xmm1 331 332 movdqa %xmm0, (%edx) 333 movdqa %xmm1, 0x10(%edx) 334 335 movdqa 0x20(%eax), %xmm0 336 movdqa 0x30(%eax), %xmm1 337 add $0x40, %eax 338 339 movdqa %xmm0, 0x20(%edx) 340 movdqa %xmm1, 0x30(%edx) 341 add $0x40, %edx 342L(shl_0_cache_less_64bytes): 343 cmp $0x20, %ecx 344 jb L(shl_0_cache_less_32bytes) 345 movdqa (%eax), %xmm0 346 sub $0x20, %ecx 347 movdqa 0x10(%eax), %xmm1 348 add $0x20, %eax 349 movdqa %xmm0, (%edx) 350 movdqa %xmm1, 0x10(%edx) 351 add $0x20, %edx 352L(shl_0_cache_less_32bytes): 353 cmp $0x10, %ecx 354 jb L(shl_0_cache_less_16bytes) 355 sub $0x10, %ecx 356 movdqa (%eax), %xmm0 357 add $0x10, %eax 358 movdqa %xmm0, (%edx) 359 add $0x10, %edx 360L(shl_0_cache_less_16bytes): 361 add %ecx, %edx 362 add %ecx, %eax 363 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 364 365 366 ALIGN (4) 367L(shl_0_gobble_mem_loop): 368 prefetcht0 0x1c0(%eax) 369 prefetcht0 0x280(%eax) 370 prefetcht0 0x1c0(%edx) 371 372 movdqa (%eax), %xmm0 373 movdqa 0x10(%eax), %xmm1 374 movdqa 
0x20(%eax), %xmm2 375 movdqa 0x30(%eax), %xmm3 376 movdqa 0x40(%eax), %xmm4 377 movdqa 0x50(%eax), %xmm5 378 movdqa 0x60(%eax), %xmm6 379 movdqa 0x70(%eax), %xmm7 380 lea 0x80(%eax), %eax 381 sub $0x80, %ecx 382 movdqa %xmm0, (%edx) 383 movdqa %xmm1, 0x10(%edx) 384 movdqa %xmm2, 0x20(%edx) 385 movdqa %xmm3, 0x30(%edx) 386 movdqa %xmm4, 0x40(%edx) 387 movdqa %xmm5, 0x50(%edx) 388 movdqa %xmm6, 0x60(%edx) 389 movdqa %xmm7, 0x70(%edx) 390 lea 0x80(%edx), %edx 391 392 jae L(shl_0_gobble_mem_loop) 393 cmp $-0x40, %ecx 394 lea 0x80(%ecx), %ecx 395 jl L(shl_0_mem_less_64bytes) 396 397 movdqa (%eax), %xmm0 398 sub $0x40, %ecx 399 movdqa 0x10(%eax), %xmm1 400 401 movdqa %xmm0, (%edx) 402 movdqa %xmm1, 0x10(%edx) 403 404 movdqa 0x20(%eax), %xmm0 405 movdqa 0x30(%eax), %xmm1 406 add $0x40, %eax 407 408 movdqa %xmm0, 0x20(%edx) 409 movdqa %xmm1, 0x30(%edx) 410 add $0x40, %edx 411L(shl_0_mem_less_64bytes): 412 cmp $0x20, %ecx 413 jb L(shl_0_mem_less_32bytes) 414 movdqa (%eax), %xmm0 415 sub $0x20, %ecx 416 movdqa 0x10(%eax), %xmm1 417 add $0x20, %eax 418 movdqa %xmm0, (%edx) 419 movdqa %xmm1, 0x10(%edx) 420 add $0x20, %edx 421L(shl_0_mem_less_32bytes): 422 cmp $0x10, %ecx 423 jb L(shl_0_mem_less_16bytes) 424 sub $0x10, %ecx 425 movdqa (%eax), %xmm0 426 add $0x10, %eax 427 movdqa %xmm0, (%edx) 428 add $0x10, %edx 429L(shl_0_mem_less_16bytes): 430 add %ecx, %edx 431 add %ecx, %eax 432 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 433 434 cfi_restore_state 435 cfi_remember_state 436 ALIGN (4) 437L(shl_1): 438 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 439 lea -1(%eax), %eax 440 movaps (%eax), %xmm1 441 xor %edi, %edi 442 lea -32(%ecx), %ecx 443 movdqu %xmm0, (%esi) 444 POP (%esi) 445L(shl_1_loop): 446 447 movdqa 16(%eax, %edi), %xmm2 448 sub $32, %ecx 449 movdqa 32(%eax, %edi), %xmm3 450 movdqa %xmm3, %xmm4 451 palignr $1, %xmm2, %xmm3 452 palignr $1, %xmm1, %xmm2 453 lea 32(%edi), %edi 454 movdqa %xmm2, -32(%edx, %edi) 455 movdqa %xmm3, -16(%edx, %edi) 456 457 jb 
L(shl_1_end) 458 459 movdqa 16(%eax, %edi), %xmm2 460 sub $32, %ecx 461 movdqa 32(%eax, %edi), %xmm3 462 movdqa %xmm3, %xmm1 463 palignr $1, %xmm2, %xmm3 464 palignr $1, %xmm4, %xmm2 465 lea 32(%edi), %edi 466 movdqa %xmm2, -32(%edx, %edi) 467 movdqa %xmm3, -16(%edx, %edi) 468 469 jae L(shl_1_loop) 470 471L(shl_1_end): 472 lea 32(%ecx), %ecx 473 add %ecx, %edi 474 add %edi, %edx 475 lea 1(%edi, %eax), %eax 476 POP (%edi) 477 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 478 479 cfi_restore_state 480 cfi_remember_state 481 ALIGN (4) 482L(shl_2): 483 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 484 lea -2(%eax), %eax 485 movaps (%eax), %xmm1 486 xor %edi, %edi 487 lea -32(%ecx), %ecx 488 movdqu %xmm0, (%esi) 489 POP (%esi) 490L(shl_2_loop): 491 492 movdqa 16(%eax, %edi), %xmm2 493 sub $32, %ecx 494 movdqa 32(%eax, %edi), %xmm3 495 movdqa %xmm3, %xmm4 496 palignr $2, %xmm2, %xmm3 497 palignr $2, %xmm1, %xmm2 498 lea 32(%edi), %edi 499 movdqa %xmm2, -32(%edx, %edi) 500 movdqa %xmm3, -16(%edx, %edi) 501 502 jb L(shl_2_end) 503 504 movdqa 16(%eax, %edi), %xmm2 505 sub $32, %ecx 506 movdqa 32(%eax, %edi), %xmm3 507 movdqa %xmm3, %xmm1 508 palignr $2, %xmm2, %xmm3 509 palignr $2, %xmm4, %xmm2 510 lea 32(%edi), %edi 511 movdqa %xmm2, -32(%edx, %edi) 512 movdqa %xmm3, -16(%edx, %edi) 513 514 jae L(shl_2_loop) 515 516L(shl_2_end): 517 lea 32(%ecx), %ecx 518 add %ecx, %edi 519 add %edi, %edx 520 lea 2(%edi, %eax), %eax 521 POP (%edi) 522 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 523 524 cfi_restore_state 525 cfi_remember_state 526 ALIGN (4) 527L(shl_3): 528 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 529 lea -3(%eax), %eax 530 movaps (%eax), %xmm1 531 xor %edi, %edi 532 lea -32(%ecx), %ecx 533 movdqu %xmm0, (%esi) 534 POP (%esi) 535L(shl_3_loop): 536 537 movdqa 16(%eax, %edi), %xmm2 538 sub $32, %ecx 539 movdqa 32(%eax, %edi), %xmm3 540 movdqa %xmm3, %xmm4 541 palignr $3, %xmm2, %xmm3 542 palignr $3, %xmm1, %xmm2 543 lea 32(%edi), 
%edi 544 movdqa %xmm2, -32(%edx, %edi) 545 movdqa %xmm3, -16(%edx, %edi) 546 547 jb L(shl_3_end) 548 549 movdqa 16(%eax, %edi), %xmm2 550 sub $32, %ecx 551 movdqa 32(%eax, %edi), %xmm3 552 movdqa %xmm3, %xmm1 553 palignr $3, %xmm2, %xmm3 554 palignr $3, %xmm4, %xmm2 555 lea 32(%edi), %edi 556 movdqa %xmm2, -32(%edx, %edi) 557 movdqa %xmm3, -16(%edx, %edi) 558 559 jae L(shl_3_loop) 560 561L(shl_3_end): 562 lea 32(%ecx), %ecx 563 add %ecx, %edi 564 add %edi, %edx 565 lea 3(%edi, %eax), %eax 566 POP (%edi) 567 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 568 569 cfi_restore_state 570 cfi_remember_state 571 ALIGN (4) 572L(shl_4): 573 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 574 lea -4(%eax), %eax 575 movaps (%eax), %xmm1 576 xor %edi, %edi 577 lea -32(%ecx), %ecx 578 movdqu %xmm0, (%esi) 579 POP (%esi) 580L(shl_4_loop): 581 582 movdqa 16(%eax, %edi), %xmm2 583 sub $32, %ecx 584 movdqa 32(%eax, %edi), %xmm3 585 movdqa %xmm3, %xmm4 586 palignr $4, %xmm2, %xmm3 587 palignr $4, %xmm1, %xmm2 588 lea 32(%edi), %edi 589 movdqa %xmm2, -32(%edx, %edi) 590 movdqa %xmm3, -16(%edx, %edi) 591 592 jb L(shl_4_end) 593 594 movdqa 16(%eax, %edi), %xmm2 595 sub $32, %ecx 596 movdqa 32(%eax, %edi), %xmm3 597 movdqa %xmm3, %xmm1 598 palignr $4, %xmm2, %xmm3 599 palignr $4, %xmm4, %xmm2 600 lea 32(%edi), %edi 601 movdqa %xmm2, -32(%edx, %edi) 602 movdqa %xmm3, -16(%edx, %edi) 603 604 jae L(shl_4_loop) 605 606L(shl_4_end): 607 lea 32(%ecx), %ecx 608 add %ecx, %edi 609 add %edi, %edx 610 lea 4(%edi, %eax), %eax 611 POP (%edi) 612 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 613 614 cfi_restore_state 615 cfi_remember_state 616 ALIGN (4) 617L(shl_5): 618 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 619 lea -5(%eax), %eax 620 movaps (%eax), %xmm1 621 xor %edi, %edi 622 lea -32(%ecx), %ecx 623 movdqu %xmm0, (%esi) 624 POP (%esi) 625L(shl_5_loop): 626 627 movdqa 16(%eax, %edi), %xmm2 628 sub $32, %ecx 629 movdqa 32(%eax, %edi), %xmm3 630 movdqa %xmm3, 
%xmm4 631 palignr $5, %xmm2, %xmm3 632 palignr $5, %xmm1, %xmm2 633 lea 32(%edi), %edi 634 movdqa %xmm2, -32(%edx, %edi) 635 movdqa %xmm3, -16(%edx, %edi) 636 637 jb L(shl_5_end) 638 639 movdqa 16(%eax, %edi), %xmm2 640 sub $32, %ecx 641 movdqa 32(%eax, %edi), %xmm3 642 movdqa %xmm3, %xmm1 643 palignr $5, %xmm2, %xmm3 644 palignr $5, %xmm4, %xmm2 645 lea 32(%edi), %edi 646 movdqa %xmm2, -32(%edx, %edi) 647 movdqa %xmm3, -16(%edx, %edi) 648 649 jae L(shl_5_loop) 650 651L(shl_5_end): 652 lea 32(%ecx), %ecx 653 add %ecx, %edi 654 add %edi, %edx 655 lea 5(%edi, %eax), %eax 656 POP (%edi) 657 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 658 659 cfi_restore_state 660 cfi_remember_state 661 ALIGN (4) 662L(shl_6): 663 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 664 lea -6(%eax), %eax 665 movaps (%eax), %xmm1 666 xor %edi, %edi 667 lea -32(%ecx), %ecx 668 movdqu %xmm0, (%esi) 669 POP (%esi) 670L(shl_6_loop): 671 672 movdqa 16(%eax, %edi), %xmm2 673 sub $32, %ecx 674 movdqa 32(%eax, %edi), %xmm3 675 movdqa %xmm3, %xmm4 676 palignr $6, %xmm2, %xmm3 677 palignr $6, %xmm1, %xmm2 678 lea 32(%edi), %edi 679 movdqa %xmm2, -32(%edx, %edi) 680 movdqa %xmm3, -16(%edx, %edi) 681 682 jb L(shl_6_end) 683 684 movdqa 16(%eax, %edi), %xmm2 685 sub $32, %ecx 686 movdqa 32(%eax, %edi), %xmm3 687 movdqa %xmm3, %xmm1 688 palignr $6, %xmm2, %xmm3 689 palignr $6, %xmm4, %xmm2 690 lea 32(%edi), %edi 691 movdqa %xmm2, -32(%edx, %edi) 692 movdqa %xmm3, -16(%edx, %edi) 693 694 jae L(shl_6_loop) 695 696L(shl_6_end): 697 lea 32(%ecx), %ecx 698 add %ecx, %edi 699 add %edi, %edx 700 lea 6(%edi, %eax), %eax 701 POP (%edi) 702 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 703 704 cfi_restore_state 705 cfi_remember_state 706 ALIGN (4) 707L(shl_7): 708 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 709 lea -7(%eax), %eax 710 movaps (%eax), %xmm1 711 xor %edi, %edi 712 lea -32(%ecx), %ecx 713 movdqu %xmm0, (%esi) 714 POP (%esi) 715L(shl_7_loop): 716 717 movdqa 16(%eax, 
%edi), %xmm2 718 sub $32, %ecx 719 movdqa 32(%eax, %edi), %xmm3 720 movdqa %xmm3, %xmm4 721 palignr $7, %xmm2, %xmm3 722 palignr $7, %xmm1, %xmm2 723 lea 32(%edi), %edi 724 movdqa %xmm2, -32(%edx, %edi) 725 movdqa %xmm3, -16(%edx, %edi) 726 727 jb L(shl_7_end) 728 729 movdqa 16(%eax, %edi), %xmm2 730 sub $32, %ecx 731 movdqa 32(%eax, %edi), %xmm3 732 movdqa %xmm3, %xmm1 733 palignr $7, %xmm2, %xmm3 734 palignr $7, %xmm4, %xmm2 735 lea 32(%edi), %edi 736 movdqa %xmm2, -32(%edx, %edi) 737 movdqa %xmm3, -16(%edx, %edi) 738 739 jae L(shl_7_loop) 740 741L(shl_7_end): 742 lea 32(%ecx), %ecx 743 add %ecx, %edi 744 add %edi, %edx 745 lea 7(%edi, %eax), %eax 746 POP (%edi) 747 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 748 749 cfi_restore_state 750 cfi_remember_state 751 ALIGN (4) 752L(shl_8): 753 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 754 lea -8(%eax), %eax 755 movaps (%eax), %xmm1 756 xor %edi, %edi 757 lea -32(%ecx), %ecx 758 movdqu %xmm0, (%esi) 759 POP (%esi) 760L(shl_8_loop): 761 762 movdqa 16(%eax, %edi), %xmm2 763 sub $32, %ecx 764 movdqa 32(%eax, %edi), %xmm3 765 movdqa %xmm3, %xmm4 766 palignr $8, %xmm2, %xmm3 767 palignr $8, %xmm1, %xmm2 768 lea 32(%edi), %edi 769 movdqa %xmm2, -32(%edx, %edi) 770 movdqa %xmm3, -16(%edx, %edi) 771 772 jb L(shl_8_end) 773 774 movdqa 16(%eax, %edi), %xmm2 775 sub $32, %ecx 776 movdqa 32(%eax, %edi), %xmm3 777 movdqa %xmm3, %xmm1 778 palignr $8, %xmm2, %xmm3 779 palignr $8, %xmm4, %xmm2 780 lea 32(%edi), %edi 781 movdqa %xmm2, -32(%edx, %edi) 782 movdqa %xmm3, -16(%edx, %edi) 783 784 jae L(shl_8_loop) 785 786L(shl_8_end): 787 lea 32(%ecx), %ecx 788 add %ecx, %edi 789 add %edi, %edx 790 lea 8(%edi, %eax), %eax 791 POP (%edi) 792 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 793 794 cfi_restore_state 795 cfi_remember_state 796 ALIGN (4) 797L(shl_9): 798 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 799 lea -9(%eax), %eax 800 movaps (%eax), %xmm1 801 xor %edi, %edi 802 lea -32(%ecx), %ecx 
803 movdqu %xmm0, (%esi) 804 POP (%esi) 805L(shl_9_loop): 806 807 movdqa 16(%eax, %edi), %xmm2 808 sub $32, %ecx 809 movdqa 32(%eax, %edi), %xmm3 810 movdqa %xmm3, %xmm4 811 palignr $9, %xmm2, %xmm3 812 palignr $9, %xmm1, %xmm2 813 lea 32(%edi), %edi 814 movdqa %xmm2, -32(%edx, %edi) 815 movdqa %xmm3, -16(%edx, %edi) 816 817 jb L(shl_9_end) 818 819 movdqa 16(%eax, %edi), %xmm2 820 sub $32, %ecx 821 movdqa 32(%eax, %edi), %xmm3 822 movdqa %xmm3, %xmm1 823 palignr $9, %xmm2, %xmm3 824 palignr $9, %xmm4, %xmm2 825 lea 32(%edi), %edi 826 movdqa %xmm2, -32(%edx, %edi) 827 movdqa %xmm3, -16(%edx, %edi) 828 829 jae L(shl_9_loop) 830 831L(shl_9_end): 832 lea 32(%ecx), %ecx 833 add %ecx, %edi 834 add %edi, %edx 835 lea 9(%edi, %eax), %eax 836 POP (%edi) 837 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 838 839 cfi_restore_state 840 cfi_remember_state 841 ALIGN (4) 842L(shl_10): 843 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 844 lea -10(%eax), %eax 845 movaps (%eax), %xmm1 846 xor %edi, %edi 847 lea -32(%ecx), %ecx 848 movdqu %xmm0, (%esi) 849 POP (%esi) 850L(shl_10_loop): 851 852 movdqa 16(%eax, %edi), %xmm2 853 sub $32, %ecx 854 movdqa 32(%eax, %edi), %xmm3 855 movdqa %xmm3, %xmm4 856 palignr $10, %xmm2, %xmm3 857 palignr $10, %xmm1, %xmm2 858 lea 32(%edi), %edi 859 movdqa %xmm2, -32(%edx, %edi) 860 movdqa %xmm3, -16(%edx, %edi) 861 862 jb L(shl_10_end) 863 864 movdqa 16(%eax, %edi), %xmm2 865 sub $32, %ecx 866 movdqa 32(%eax, %edi), %xmm3 867 movdqa %xmm3, %xmm1 868 palignr $10, %xmm2, %xmm3 869 palignr $10, %xmm4, %xmm2 870 lea 32(%edi), %edi 871 movdqa %xmm2, -32(%edx, %edi) 872 movdqa %xmm3, -16(%edx, %edi) 873 874 jae L(shl_10_loop) 875 876L(shl_10_end): 877 lea 32(%ecx), %ecx 878 add %ecx, %edi 879 add %edi, %edx 880 lea 10(%edi, %eax), %eax 881 POP (%edi) 882 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 883 884 cfi_restore_state 885 cfi_remember_state 886 ALIGN (4) 887L(shl_11): 888 
BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 889 lea -11(%eax), %eax 890 movaps (%eax), %xmm1 891 xor %edi, %edi 892 lea -32(%ecx), %ecx 893 movdqu %xmm0, (%esi) 894 POP (%esi) 895L(shl_11_loop): 896 897 movdqa 16(%eax, %edi), %xmm2 898 sub $32, %ecx 899 movdqa 32(%eax, %edi), %xmm3 900 movdqa %xmm3, %xmm4 901 palignr $11, %xmm2, %xmm3 902 palignr $11, %xmm1, %xmm2 903 lea 32(%edi), %edi 904 movdqa %xmm2, -32(%edx, %edi) 905 movdqa %xmm3, -16(%edx, %edi) 906 907 jb L(shl_11_end) 908 909 movdqa 16(%eax, %edi), %xmm2 910 sub $32, %ecx 911 movdqa 32(%eax, %edi), %xmm3 912 movdqa %xmm3, %xmm1 913 palignr $11, %xmm2, %xmm3 914 palignr $11, %xmm4, %xmm2 915 lea 32(%edi), %edi 916 movdqa %xmm2, -32(%edx, %edi) 917 movdqa %xmm3, -16(%edx, %edi) 918 919 jae L(shl_11_loop) 920 921L(shl_11_end): 922 lea 32(%ecx), %ecx 923 add %ecx, %edi 924 add %edi, %edx 925 lea 11(%edi, %eax), %eax 926 POP (%edi) 927 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 928 929 cfi_restore_state 930 cfi_remember_state 931 ALIGN (4) 932L(shl_12): 933 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 934 lea -12(%eax), %eax 935 movaps (%eax), %xmm1 936 xor %edi, %edi 937 lea -32(%ecx), %ecx 938 movdqu %xmm0, (%esi) 939 POP (%esi) 940L(shl_12_loop): 941 942 movdqa 16(%eax, %edi), %xmm2 943 sub $32, %ecx 944 movdqa 32(%eax, %edi), %xmm3 945 movdqa %xmm3, %xmm4 946 palignr $12, %xmm2, %xmm3 947 palignr $12, %xmm1, %xmm2 948 lea 32(%edi), %edi 949 movdqa %xmm2, -32(%edx, %edi) 950 movdqa %xmm3, -16(%edx, %edi) 951 952 jb L(shl_12_end) 953 954 movdqa 16(%eax, %edi), %xmm2 955 sub $32, %ecx 956 movdqa 32(%eax, %edi), %xmm3 957 movdqa %xmm3, %xmm1 958 palignr $12, %xmm2, %xmm3 959 palignr $12, %xmm4, %xmm2 960 lea 32(%edi), %edi 961 movdqa %xmm2, -32(%edx, %edi) 962 movdqa %xmm3, -16(%edx, %edi) 963 964 jae L(shl_12_loop) 965 966L(shl_12_end): 967 lea 32(%ecx), %ecx 968 add %ecx, %edi 969 add %edi, %edx 970 lea 12(%edi, %eax), %eax 971 POP (%edi) 972 
BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 973 974 cfi_restore_state 975 cfi_remember_state 976 ALIGN (4) 977L(shl_13): 978 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 979 lea -13(%eax), %eax 980 movaps (%eax), %xmm1 981 xor %edi, %edi 982 lea -32(%ecx), %ecx 983 movdqu %xmm0, (%esi) 984 POP (%esi) 985L(shl_13_loop): 986 987 movdqa 16(%eax, %edi), %xmm2 988 sub $32, %ecx 989 movdqa 32(%eax, %edi), %xmm3 990 movdqa %xmm3, %xmm4 991 palignr $13, %xmm2, %xmm3 992 palignr $13, %xmm1, %xmm2 993 lea 32(%edi), %edi 994 movdqa %xmm2, -32(%edx, %edi) 995 movdqa %xmm3, -16(%edx, %edi) 996 997 jb L(shl_13_end) 998 999 movdqa 16(%eax, %edi), %xmm2 1000 sub $32, %ecx 1001 movdqa 32(%eax, %edi), %xmm3 1002 movdqa %xmm3, %xmm1 1003 palignr $13, %xmm2, %xmm3 1004 palignr $13, %xmm4, %xmm2 1005 lea 32(%edi), %edi 1006 movdqa %xmm2, -32(%edx, %edi) 1007 movdqa %xmm3, -16(%edx, %edi) 1008 1009 jae L(shl_13_loop) 1010 1011L(shl_13_end): 1012 lea 32(%ecx), %ecx 1013 add %ecx, %edi 1014 add %edi, %edx 1015 lea 13(%edi, %eax), %eax 1016 POP (%edi) 1017 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 1018 1019 cfi_restore_state 1020 cfi_remember_state 1021 ALIGN (4) 1022L(shl_14): 1023 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 1024 lea -14(%eax), %eax 1025 movaps (%eax), %xmm1 1026 xor %edi, %edi 1027 lea -32(%ecx), %ecx 1028 movdqu %xmm0, (%esi) 1029 POP (%esi) 1030L(shl_14_loop): 1031 1032 movdqa 16(%eax, %edi), %xmm2 1033 sub $32, %ecx 1034 movdqa 32(%eax, %edi), %xmm3 1035 movdqa %xmm3, %xmm4 1036 palignr $14, %xmm2, %xmm3 1037 palignr $14, %xmm1, %xmm2 1038 lea 32(%edi), %edi 1039 movdqa %xmm2, -32(%edx, %edi) 1040 movdqa %xmm3, -16(%edx, %edi) 1041 1042 jb L(shl_14_end) 1043 1044 movdqa 16(%eax, %edi), %xmm2 1045 sub $32, %ecx 1046 movdqa 32(%eax, %edi), %xmm3 1047 movdqa %xmm3, %xmm1 1048 palignr $14, %xmm2, %xmm3 1049 palignr $14, %xmm4, %xmm2 1050 lea 32(%edi), %edi 1051 movdqa %xmm2, -32(%edx, %edi) 1052 movdqa %xmm3, -16(%edx, %edi) 
1053 1054 jae L(shl_14_loop) 1055 1056L(shl_14_end): 1057 lea 32(%ecx), %ecx 1058 add %ecx, %edi 1059 add %edi, %edx 1060 lea 14(%edi, %eax), %eax 1061 POP (%edi) 1062 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 1063 1064 cfi_restore_state 1065 cfi_remember_state 1066 ALIGN (4) 1067L(shl_15): 1068 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) 1069 lea -15(%eax), %eax 1070 movaps (%eax), %xmm1 1071 xor %edi, %edi 1072 lea -32(%ecx), %ecx 1073 movdqu %xmm0, (%esi) 1074 POP (%esi) 1075L(shl_15_loop): 1076 1077 movdqa 16(%eax, %edi), %xmm2 1078 sub $32, %ecx 1079 movdqa 32(%eax, %edi), %xmm3 1080 movdqa %xmm3, %xmm4 1081 palignr $15, %xmm2, %xmm3 1082 palignr $15, %xmm1, %xmm2 1083 lea 32(%edi), %edi 1084 movdqa %xmm2, -32(%edx, %edi) 1085 movdqa %xmm3, -16(%edx, %edi) 1086 1087 jb L(shl_15_end) 1088 1089 movdqa 16(%eax, %edi), %xmm2 1090 sub $32, %ecx 1091 movdqa 32(%eax, %edi), %xmm3 1092 movdqa %xmm3, %xmm1 1093 palignr $15, %xmm2, %xmm3 1094 palignr $15, %xmm4, %xmm2 1095 lea 32(%edi), %edi 1096 movdqa %xmm2, -32(%edx, %edi) 1097 movdqa %xmm3, -16(%edx, %edi) 1098 1099 jae L(shl_15_loop) 1100 1101L(shl_15_end): 1102 lea 32(%ecx), %ecx 1103 add %ecx, %edi 1104 add %edi, %edx 1105 lea 15(%edi, %eax), %eax 1106 POP (%edi) 1107 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) 1108 1109 1110 ALIGN (4) 1111L(fwd_write_44bytes): 1112 movl -44(%eax), %ecx 1113 movl %ecx, -44(%edx) 1114L(fwd_write_40bytes): 1115 movl -40(%eax), %ecx 1116 movl %ecx, -40(%edx) 1117L(fwd_write_36bytes): 1118 movl -36(%eax), %ecx 1119 movl %ecx, -36(%edx) 1120L(fwd_write_32bytes): 1121 movl -32(%eax), %ecx 1122 movl %ecx, -32(%edx) 1123L(fwd_write_28bytes): 1124 movl -28(%eax), %ecx 1125 movl %ecx, -28(%edx) 1126L(fwd_write_24bytes): 1127 movl -24(%eax), %ecx 1128 movl %ecx, -24(%edx) 1129L(fwd_write_20bytes): 1130 movl -20(%eax), %ecx 1131 movl %ecx, -20(%edx) 1132L(fwd_write_16bytes): 1133 movl -16(%eax), %ecx 1134 movl %ecx, -16(%edx) 1135L(fwd_write_12bytes): 
1136 movl -12(%eax), %ecx 1137 movl %ecx, -12(%edx) 1138L(fwd_write_8bytes): 1139 movl -8(%eax), %ecx 1140 movl %ecx, -8(%edx) 1141L(fwd_write_4bytes): 1142 movl -4(%eax), %ecx 1143 movl %ecx, -4(%edx) 1144L(fwd_write_0bytes): 1145#ifndef USE_AS_BCOPY 1146# ifdef USE_AS_MEMPCPY 1147 movl %edx, %eax 1148# else 1149 movl DEST(%esp), %eax 1150# endif 1151#endif 1152 RETURN 1153 1154 ALIGN (4) 1155L(fwd_write_5bytes): 1156 movl -5(%eax), %ecx 1157 movl -4(%eax), %eax 1158 movl %ecx, -5(%edx) 1159 movl %eax, -4(%edx) 1160#ifndef USE_AS_BCOPY 1161# ifdef USE_AS_MEMPCPY 1162 movl %edx, %eax 1163# else 1164 movl DEST(%esp), %eax 1165# endif 1166#endif 1167 RETURN 1168 1169 ALIGN (4) 1170L(fwd_write_45bytes): 1171 movl -45(%eax), %ecx 1172 movl %ecx, -45(%edx) 1173L(fwd_write_41bytes): 1174 movl -41(%eax), %ecx 1175 movl %ecx, -41(%edx) 1176L(fwd_write_37bytes): 1177 movl -37(%eax), %ecx 1178 movl %ecx, -37(%edx) 1179L(fwd_write_33bytes): 1180 movl -33(%eax), %ecx 1181 movl %ecx, -33(%edx) 1182L(fwd_write_29bytes): 1183 movl -29(%eax), %ecx 1184 movl %ecx, -29(%edx) 1185L(fwd_write_25bytes): 1186 movl -25(%eax), %ecx 1187 movl %ecx, -25(%edx) 1188L(fwd_write_21bytes): 1189 movl -21(%eax), %ecx 1190 movl %ecx, -21(%edx) 1191L(fwd_write_17bytes): 1192 movl -17(%eax), %ecx 1193 movl %ecx, -17(%edx) 1194L(fwd_write_13bytes): 1195 movl -13(%eax), %ecx 1196 movl %ecx, -13(%edx) 1197L(fwd_write_9bytes): 1198 movl -9(%eax), %ecx 1199 movl %ecx, -9(%edx) 1200 movl -5(%eax), %ecx 1201 movl %ecx, -5(%edx) 1202L(fwd_write_1bytes): 1203 movzbl -1(%eax), %ecx 1204 movb %cl, -1(%edx) 1205#ifndef USE_AS_BCOPY 1206# ifdef USE_AS_MEMPCPY 1207 movl %edx, %eax 1208# else 1209 movl DEST(%esp), %eax 1210# endif 1211#endif 1212 RETURN 1213 1214 ALIGN (4) 1215L(fwd_write_46bytes): 1216 movl -46(%eax), %ecx 1217 movl %ecx, -46(%edx) 1218L(fwd_write_42bytes): 1219 movl -42(%eax), %ecx 1220 movl %ecx, -42(%edx) 1221L(fwd_write_38bytes): 1222 movl -38(%eax), %ecx 1223 movl %ecx, -38(%edx) 
1224L(fwd_write_34bytes): 1225 movl -34(%eax), %ecx 1226 movl %ecx, -34(%edx) 1227L(fwd_write_30bytes): 1228 movl -30(%eax), %ecx 1229 movl %ecx, -30(%edx) 1230L(fwd_write_26bytes): 1231 movl -26(%eax), %ecx 1232 movl %ecx, -26(%edx) 1233L(fwd_write_22bytes): 1234 movl -22(%eax), %ecx 1235 movl %ecx, -22(%edx) 1236L(fwd_write_18bytes): 1237 movl -18(%eax), %ecx 1238 movl %ecx, -18(%edx) 1239L(fwd_write_14bytes): 1240 movl -14(%eax), %ecx 1241 movl %ecx, -14(%edx) 1242L(fwd_write_10bytes): 1243 movl -10(%eax), %ecx 1244 movl %ecx, -10(%edx) 1245L(fwd_write_6bytes): 1246 movl -6(%eax), %ecx 1247 movl %ecx, -6(%edx) 1248L(fwd_write_2bytes): 1249 movzwl -2(%eax), %ecx 1250 movw %cx, -2(%edx) 1251#ifndef USE_AS_BCOPY 1252# ifdef USE_AS_MEMPCPY 1253 movl %edx, %eax 1254# else 1255 movl DEST(%esp), %eax 1256# endif 1257#endif 1258 RETURN 1259 1260 ALIGN (4) 1261L(fwd_write_47bytes): 1262 movl -47(%eax), %ecx 1263 movl %ecx, -47(%edx) 1264L(fwd_write_43bytes): 1265 movl -43(%eax), %ecx 1266 movl %ecx, -43(%edx) 1267L(fwd_write_39bytes): 1268 movl -39(%eax), %ecx 1269 movl %ecx, -39(%edx) 1270L(fwd_write_35bytes): 1271 movl -35(%eax), %ecx 1272 movl %ecx, -35(%edx) 1273L(fwd_write_31bytes): 1274 movl -31(%eax), %ecx 1275 movl %ecx, -31(%edx) 1276L(fwd_write_27bytes): 1277 movl -27(%eax), %ecx 1278 movl %ecx, -27(%edx) 1279L(fwd_write_23bytes): 1280 movl -23(%eax), %ecx 1281 movl %ecx, -23(%edx) 1282L(fwd_write_19bytes): 1283 movl -19(%eax), %ecx 1284 movl %ecx, -19(%edx) 1285L(fwd_write_15bytes): 1286 movl -15(%eax), %ecx 1287 movl %ecx, -15(%edx) 1288L(fwd_write_11bytes): 1289 movl -11(%eax), %ecx 1290 movl %ecx, -11(%edx) 1291L(fwd_write_7bytes): 1292 movl -7(%eax), %ecx 1293 movl %ecx, -7(%edx) 1294L(fwd_write_3bytes): 1295 movzwl -3(%eax), %ecx 1296 movzbl -1(%eax), %eax 1297 movw %cx, -3(%edx) 1298 movb %al, -1(%edx) 1299#ifndef USE_AS_BCOPY 1300# ifdef USE_AS_MEMPCPY 1301 movl %edx, %eax 1302# else 1303 movl DEST(%esp), %eax 1304# endif 1305#endif 1306 RETURN_END 1307 
1308 cfi_restore_state 1309 cfi_remember_state 1310 ALIGN (4) 1311L(large_page): 1312 movdqu (%eax), %xmm1 1313 lea 16(%eax), %eax 1314 movdqu %xmm0, (%esi) 1315 movntdq %xmm1, (%edx) 1316 lea 16(%edx), %edx 1317 POP (%esi) 1318 lea -0x90(%ecx), %ecx 1319 POP (%edi) 1320L(large_page_loop): 1321 movdqu (%eax), %xmm0 1322 movdqu 0x10(%eax), %xmm1 1323 movdqu 0x20(%eax), %xmm2 1324 movdqu 0x30(%eax), %xmm3 1325 movdqu 0x40(%eax), %xmm4 1326 movdqu 0x50(%eax), %xmm5 1327 movdqu 0x60(%eax), %xmm6 1328 movdqu 0x70(%eax), %xmm7 1329 lea 0x80(%eax), %eax 1330 1331 sub $0x80, %ecx 1332 movntdq %xmm0, (%edx) 1333 movntdq %xmm1, 0x10(%edx) 1334 movntdq %xmm2, 0x20(%edx) 1335 movntdq %xmm3, 0x30(%edx) 1336 movntdq %xmm4, 0x40(%edx) 1337 movntdq %xmm5, 0x50(%edx) 1338 movntdq %xmm6, 0x60(%edx) 1339 movntdq %xmm7, 0x70(%edx) 1340 lea 0x80(%edx), %edx 1341 jae L(large_page_loop) 1342 cmp $-0x40, %ecx 1343 lea 0x80(%ecx), %ecx 1344 jl L(large_page_less_64bytes) 1345 1346 movdqu (%eax), %xmm0 1347 movdqu 0x10(%eax), %xmm1 1348 movdqu 0x20(%eax), %xmm2 1349 movdqu 0x30(%eax), %xmm3 1350 lea 0x40(%eax), %eax 1351 1352 movntdq %xmm0, (%edx) 1353 movntdq %xmm1, 0x10(%edx) 1354 movntdq %xmm2, 0x20(%edx) 1355 movntdq %xmm3, 0x30(%edx) 1356 lea 0x40(%edx), %edx 1357 sub $0x40, %ecx 1358L(large_page_less_64bytes): 1359 cmp $32, %ecx 1360 jb L(large_page_less_32bytes) 1361 movdqu (%eax), %xmm0 1362 movdqu 0x10(%eax), %xmm1 1363 lea 0x20(%eax), %eax 1364 movntdq %xmm0, (%edx) 1365 movntdq %xmm1, 0x10(%edx) 1366 lea 0x20(%edx), %edx 1367 sub $0x20, %ecx 1368L(large_page_less_32bytes): 1369 add %ecx, %edx 1370 add %ecx, %eax 1371 sfence 1372 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) 1373 1374 1375 ALIGN (4) 1376L(bk_write_44bytes): 1377 movl 40(%eax), %ecx 1378 movl %ecx, 40(%edx) 1379L(bk_write_40bytes): 1380 movl 36(%eax), %ecx 1381 movl %ecx, 36(%edx) 1382L(bk_write_36bytes): 1383 movl 32(%eax), %ecx 1384 movl %ecx, 32(%edx) 1385L(bk_write_32bytes): 1386 movl 28(%eax), %ecx 
1387 movl %ecx, 28(%edx) 1388L(bk_write_28bytes): 1389 movl 24(%eax), %ecx 1390 movl %ecx, 24(%edx) 1391L(bk_write_24bytes): 1392 movl 20(%eax), %ecx 1393 movl %ecx, 20(%edx) 1394L(bk_write_20bytes): 1395 movl 16(%eax), %ecx 1396 movl %ecx, 16(%edx) 1397L(bk_write_16bytes): 1398 movl 12(%eax), %ecx 1399 movl %ecx, 12(%edx) 1400L(bk_write_12bytes): 1401 movl 8(%eax), %ecx 1402 movl %ecx, 8(%edx) 1403L(bk_write_8bytes): 1404 movl 4(%eax), %ecx 1405 movl %ecx, 4(%edx) 1406L(bk_write_4bytes): 1407 movl (%eax), %ecx 1408 movl %ecx, (%edx) 1409L(bk_write_0bytes): 1410#ifndef USE_AS_BCOPY 1411 movl DEST(%esp), %eax 1412# ifdef USE_AS_MEMPCPY 1413 movl LEN(%esp), %ecx 1414 add %ecx, %eax 1415# endif 1416#endif 1417 RETURN 1418 1419 ALIGN (4) 1420L(bk_write_45bytes): 1421 movl 41(%eax), %ecx 1422 movl %ecx, 41(%edx) 1423L(bk_write_41bytes): 1424 movl 37(%eax), %ecx 1425 movl %ecx, 37(%edx) 1426L(bk_write_37bytes): 1427 movl 33(%eax), %ecx 1428 movl %ecx, 33(%edx) 1429L(bk_write_33bytes): 1430 movl 29(%eax), %ecx 1431 movl %ecx, 29(%edx) 1432L(bk_write_29bytes): 1433 movl 25(%eax), %ecx 1434 movl %ecx, 25(%edx) 1435L(bk_write_25bytes): 1436 movl 21(%eax), %ecx 1437 movl %ecx, 21(%edx) 1438L(bk_write_21bytes): 1439 movl 17(%eax), %ecx 1440 movl %ecx, 17(%edx) 1441L(bk_write_17bytes): 1442 movl 13(%eax), %ecx 1443 movl %ecx, 13(%edx) 1444L(bk_write_13bytes): 1445 movl 9(%eax), %ecx 1446 movl %ecx, 9(%edx) 1447L(bk_write_9bytes): 1448 movl 5(%eax), %ecx 1449 movl %ecx, 5(%edx) 1450L(bk_write_5bytes): 1451 movl 1(%eax), %ecx 1452 movl %ecx, 1(%edx) 1453L(bk_write_1bytes): 1454 movzbl (%eax), %ecx 1455 movb %cl, (%edx) 1456#ifndef USE_AS_BCOPY 1457 movl DEST(%esp), %eax 1458# ifdef USE_AS_MEMPCPY 1459 movl LEN(%esp), %ecx 1460 add %ecx, %eax 1461# endif 1462#endif 1463 RETURN 1464 1465 ALIGN (4) 1466L(bk_write_46bytes): 1467 movl 42(%eax), %ecx 1468 movl %ecx, 42(%edx) 1469L(bk_write_42bytes): 1470 movl 38(%eax), %ecx 1471 movl %ecx, 38(%edx) 1472L(bk_write_38bytes): 1473 movl 
34(%eax), %ecx 1474 movl %ecx, 34(%edx) 1475L(bk_write_34bytes): 1476 movl 30(%eax), %ecx 1477 movl %ecx, 30(%edx) 1478L(bk_write_30bytes): 1479 movl 26(%eax), %ecx 1480 movl %ecx, 26(%edx) 1481L(bk_write_26bytes): 1482 movl 22(%eax), %ecx 1483 movl %ecx, 22(%edx) 1484L(bk_write_22bytes): 1485 movl 18(%eax), %ecx 1486 movl %ecx, 18(%edx) 1487L(bk_write_18bytes): 1488 movl 14(%eax), %ecx 1489 movl %ecx, 14(%edx) 1490L(bk_write_14bytes): 1491 movl 10(%eax), %ecx 1492 movl %ecx, 10(%edx) 1493L(bk_write_10bytes): 1494 movl 6(%eax), %ecx 1495 movl %ecx, 6(%edx) 1496L(bk_write_6bytes): 1497 movl 2(%eax), %ecx 1498 movl %ecx, 2(%edx) 1499L(bk_write_2bytes): 1500 movzwl (%eax), %ecx 1501 movw %cx, (%edx) 1502#ifndef USE_AS_BCOPY 1503 movl DEST(%esp), %eax 1504# ifdef USE_AS_MEMPCPY 1505 movl LEN(%esp), %ecx 1506 add %ecx, %eax 1507# endif 1508#endif 1509 RETURN 1510 1511 ALIGN (4) 1512L(bk_write_47bytes): 1513 movl 43(%eax), %ecx 1514 movl %ecx, 43(%edx) 1515L(bk_write_43bytes): 1516 movl 39(%eax), %ecx 1517 movl %ecx, 39(%edx) 1518L(bk_write_39bytes): 1519 movl 35(%eax), %ecx 1520 movl %ecx, 35(%edx) 1521L(bk_write_35bytes): 1522 movl 31(%eax), %ecx 1523 movl %ecx, 31(%edx) 1524L(bk_write_31bytes): 1525 movl 27(%eax), %ecx 1526 movl %ecx, 27(%edx) 1527L(bk_write_27bytes): 1528 movl 23(%eax), %ecx 1529 movl %ecx, 23(%edx) 1530L(bk_write_23bytes): 1531 movl 19(%eax), %ecx 1532 movl %ecx, 19(%edx) 1533L(bk_write_19bytes): 1534 movl 15(%eax), %ecx 1535 movl %ecx, 15(%edx) 1536L(bk_write_15bytes): 1537 movl 11(%eax), %ecx 1538 movl %ecx, 11(%edx) 1539L(bk_write_11bytes): 1540 movl 7(%eax), %ecx 1541 movl %ecx, 7(%edx) 1542L(bk_write_7bytes): 1543 movl 3(%eax), %ecx 1544 movl %ecx, 3(%edx) 1545L(bk_write_3bytes): 1546 movzwl 1(%eax), %ecx 1547 movw %cx, 1(%edx) 1548 movzbl (%eax), %eax 1549 movb %al, (%edx) 1550#ifndef USE_AS_BCOPY 1551 movl DEST(%esp), %eax 1552# ifdef USE_AS_MEMPCPY 1553 movl LEN(%esp), %ecx 1554 add %ecx, %eax 1555# endif 1556#endif 1557 RETURN_END 1558 1559 
/* Jump tables used by BRANCH_TO_JMPTBL_ENTRY.  They are placed in their
   own read-only section and aligned to 4 bytes; every entry is one
   .int built by JMPTBL (target, table).  In the SHARED build JMPTBL
   yields "target - table", i.e. an offset relative to the table.  */
	.pushsection .rodata.ssse3,"a",@progbits
	ALIGN (2)
/* Forward tail handlers; entry N handles N remaining bytes (0..47).  */
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	ALIGN (2)
/* Sixteen entries, L(shl_0) .. L(shl_15).  The handlers are defined
   earlier in the file; presumably the index is the source misalignment
   within 16 bytes -- confirm against the dispatch site.  */
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	ALIGN (2)
/* Backward tail handlers; entry N handles N remaining bytes (0..47).  */
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection

#ifdef USE_AS_MEMMOVE
	ALIGN (4)
/* Backward copy, built only for memmove.

   In:  EAX = source, EDX = destination, ECX = byte count.
   ESI is saved and used as the source end pointer; EDX becomes the
   destination end pointer.  Both move down as bytes are copied and ECX
   counts the bytes still to copy.  The last (fewer than 32) bytes are
   handed to the L(bk_write_<N>bytes) handlers through
   L(table_48_bytes_bwd).  */
L(copy_backward):
	PUSH (%esi)
	movl	%eax, %esi
	lea	(%ecx,%edx,1),%edx	/* EDX = destination + count.  */
	lea	(%ecx,%esi,1),%esi	/* ESI = source + count.  */
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	/* The destination end is 4-byte aligned, or the copy was too
	   short to be worth aligning.  */
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy one 32-byte block as eight dwords, highest address
	   first.  */
	sub	$32, %ecx
	movl	-4(%esi), %eax
	movl	%eax, -4(%edx)
	movl	-8(%esi), %eax
	movl	%eax, -8(%edx)
	movl	-12(%esi), %eax
	movl	%eax, -12(%edx)
	movl	-16(%esi), %eax
	movl	%eax, -16(%edx)
	movl	-20(%esi), %eax
	movl	%eax, -20(%edx)
	movl	-24(%esi), %eax
	movl	%eax, -24(%edx)
	movl	-28(%esi), %eax
	movl	%eax, -28(%edx)
	movl	-32(%esi), %eax
	movl	%eax, -32(%edx)
	sub	$32, %edx
	sub	$32, %esi

L(bk_write_less32bytes):
	/* Rewind both pointers to the start of the remaining bytes,
	   which is what the backward tail handlers expect in EAX and
	   EDX, then restore ESI and dispatch on the count.  */
	movl	%esi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%esi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* The code below still runs with ESI pushed; re-declare it for
	   the unwinder after the POP above.  */
	CFI_PUSH (%esi)
	ALIGN (4)
L(bk_align):
	/* With 8 bytes or fewer, skip alignment and copy the tail
	   directly.  */
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	/* Odd end address: copy one byte.  */
	sub	$1, %esi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%esi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	/* Copy one 16-bit word to reach 4-byte alignment.  */
	sub	$2, %esi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%esi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	ALIGN (4)
L(bk_write_more64bytes):
	/* Check whether the destination end pointer is 16-byte
	   aligned.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is 4-byte aligned but not 16-byte aligned: copy up to three
   dwords until it is.  */
L(bk_ssse3_align):
	sub	$4, %esi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%esi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %esi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%esi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %esi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%esi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	/* The alignment steps may have left fewer than 64 bytes.  */
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

L(bk_ssse3_cpy):
	/* Copy 64 bytes per iteration, highest block first: unaligned
	   loads from the source, aligned stores to the destination.  */
	sub	$64, %esi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%esi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%esi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%esi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%esi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif

END (MEMCPY)