/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.

  * Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.

  * Neither the name of Intel Corporation nor the names of its contributors
  * may be used to endorse or promote products derived from this software
  * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/


#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original)	\
	.globl alias;		\
	.equ alias, original
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (8);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-8);	\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	\
	PUSH (%rbx);	\
	CFI_PUSH (%rbx);
#define RETURN_END	\
	POP (%rbx);	\
	CFI_POP (%rbx);	\
	ret
#define RETURN		RETURN_END;

	.section .text.sse2,"ax",@progbits
ENTRY (__memcpy_chk)
	cmp	%rcx, %rdx
	ja	__memcpy_chk_fail
/* Fall through to memcpy/memmove. */
END (__memcpy_chk)
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax

/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)
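
/* Each of the short-copy blocks below loads both the head and the tail of
	the region into registers before issuing any store, so a possibly
	overlapping source is fully read first and memmove semantics hold
	without branching on the exact length inside a size class.  */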

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Aligning the address of destination.  */
/* save first unaligned 64 bytes */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8		/* r8 now aligned to next 64 byte boundary */
	sub	%rdi, %rsi		/* rsi = src - dst = diff */

	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)

	cmp	__x86_shared_cache_size_half(%rip), %rdx

	ja	L(mm_overlapping_check_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* The dst range from %rdi up to the aligned position %r8 has been copied.
	%rdx now holds how many bytes are left to copy past %r8.  */
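
/* %rsi still holds (src - dst) here, so %r8 + %rsi below is the source
	address that corresponds to the aligned destination cursor in %r8.  */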
	lea	(%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
	the main loop stops.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):
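
/* This dispatcher is entered directly when dst > src, and it is re-entered
	from L(mm_recalc_len) above to finish the low-address remainder that
	the aligned backward loops leave uncopied.  */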

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
	separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Aligning the address of destination. We need to save the last
	64 bytes of the source in order not to overwrite them.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9		/* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8		/* r8 = src - dst, diff */

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

	cmp	__x86_shared_cache_size_half(%rip), %rdx

	ja	L(mm_overlapping_check_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN
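
/* The two paths below are used once the copy length exceeds
	__x86_shared_cache_size_half.  Each first checks whether the combined
	span of source and destination still fits in __x86_shared_cache_size;
	if it does, the regular cached loops are used, otherwise the data is
	streamed with non-temporal movntdq stores and finished with sfence.  */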

/* Big length copy forward part.  */

	.p2align 4

L(mm_overlapping_check_forward):
	mov	%rsi, %r9
	add	%rdx, %r9
	cmp	__x86_shared_cache_size(%rip), %r9
	jbe	L(mm_main_loop_forward)

L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4

L(mm_overlapping_check_backward):
	mov	%rdi, %r11
	sub	%rsi, %r11		/* r11 = dst - src, diff */
	add	%rdx, %r11
	cmp	__x86_shared_cache_size(%rip), %r11
	jbe	L(mm_main_loop_backward)

L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)

ALIAS_SYMBOL(memcpy, MEMMOVE)