/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef ALIAS_SYMBOL
# define ALIAS_SYMBOL(alias, original)	\
	.globl alias;			\
	.equ alias, original
#endif

#ifndef END
# define END(name)	\
	cfi_endproc;	\
	.size name, .-name
#endif

#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	mov	%rdi, %rax

/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

/* Now do checks for lengths.  We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_forward)
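/* All of the small-size buckets above and below use the same trick: load a
   chunk from the head of the region and a chunk that ends exactly at its
   tail, then store both.  For any length inside the bucket the two chunks
   overlap in the middle, so every byte is covered without a per-byte loop
   and without touching memory outside the region.  A minimal C sketch of
   the idea for a 17..32 byte copy (illustrative only, not part of this
   build; the chunk16 type and copy_17_to_32 name are placeholders for one
   unaligned 16-byte SSE load/store):

	typedef struct { unsigned char b[16]; } chunk16;

	void copy_17_to_32(unsigned char *dst, const unsigned char *src,
			   unsigned long len)
	{
		chunk16 head = *(const chunk16 *)src;			// movdqu (%rsi)
		chunk16 tail = *(const chunk16 *)(src + len - 16);	// movdqu -16(%rsi,%rdx)
		*(chunk16 *)dst = head;
		*(chunk16 *)(dst + len - 16) = tail;
	}
*/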
/* Copy [0..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
/* Align the destination address.  */
/* Save the first (possibly unaligned) 64 bytes.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%rdi), %r8
	and	$-64, %r8		/* r8 is now aligned to the next 64-byte boundary */
	sub	%rdi, %rsi		/* rsi = src - dst = diff */

	movdqu	(%r8, %rsi), %xmm4
	movdqu	16(%r8, %rsi), %xmm5
	movdqu	32(%r8, %rsi), %xmm6
	movdqu	48(%r8, %rsi), %xmm7

	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqa	%xmm4, (%r8)
	movaps	%xmm5, 16(%r8)
	movaps	%xmm6, 32(%r8)
	movaps	%xmm7, 48(%r8)
	add	$64, %r8

	lea	(%rdi, %rdx), %rbx
	and	$-64, %rbx
	cmp	%r8, %rbx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%r8, %rsi)

	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movdqa	%xmm0, (%r8)
	movaps	%xmm1, 16(%r8)
	movaps	%xmm2, 32(%r8)
	movaps	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%rdi, %rdx
	sub	%r8, %rdx
/* Everything up to %r8 in the destination has already been copied;
   %rdx now holds the number of bytes left to copy.  Recover the
   matching source position in %r9 below.  */
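/* The forward 128+ path replaced %rsi with the constant difference
   src - dst, so the source address that corresponds to any destination
   pointer can be recovered with a single addition; that is what the
   indexed addressing (%r8, %rsi) and the lea below compute.  A rough C
   sketch (illustrative only, not part of this build; names are
   placeholders):

	// diff = src - dst, kept in %rsi after the 128+ forward setup
	static const unsigned char *src_for(const unsigned char *dst_pos, long diff)
	{
		return dst_pos + diff;	// what "lea (%r8, %rsi), %r9" computes for %r8
	}

   Keeping the difference instead of a second pointer means the main loop
   only has to advance %r8.  */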
	lea	(%r8, %rsi), %r9

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r9,%rdx), %esi
	movzbl	(%r9), %ebx
	movb	%sil, -1(%r8,%rdx)
	movb	%bl, (%r8)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	16(%r9), %xmm1
	movdqu	-32(%r9, %rdx), %xmm2
	movdqu	-16(%r9, %rdx), %xmm3
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, 16(%r8)
	movdqu	%xmm2, -32(%r8, %rdx)
	movdqu	%xmm3, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r9), %xmm0
	movdqu	-16(%r9, %rdx), %xmm1
	movdqu	%xmm0, (%r8)
	movdqu	%xmm1, -16(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r9), %esi
	movl	-4(%r9,%rdx), %ebx
	movl	%esi, (%r8)
	movl	%ebx, -4(%r8,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r9), %rsi
	mov	-8(%r9, %rdx), %rbx
	mov	%rsi, (%r8)
	mov	%rbx, -8(%r8, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r9,%rdx), %esi
	movzwl	(%r9), %ebx
	movw	%si, -2(%r8,%rdx)
	movw	%bx, (%r8)
	jmp	L(mm_return)

L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %rdx how many bytes are left to copy after
   the main loop stops.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths.  We do [0..16], [16..32], [32..64], [64..128]
   separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	ja	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	ja	L(mm_len_64_or_more_backward)
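/* The backward-direction buckets below mirror the forward ones: each copy
   block issues its loads before its stores, so the same overlapping
   head-and-tail pattern is correct no matter which way the regions
   overlap.  Note that L(mm_len_0_or_more_backward) above is reached both
   on entry, when the destination lies above the source, and again from
   L(mm_recalc_len) after a 64-byte-aligned backward loop finishes, with
   %rdx recomputed to the number of bytes still left at the front of the
   region.  */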
/* Copy [0..64] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	ja	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
/* Align the destination address.  We need to load the last 64 bytes
   from the source before they can be overwritten.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9		/* r9 = aligned dst */

	mov	%rsi, %r8
	sub	%rdi, %r8		/* r8 = src - dst, diff */

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movaps	%xmm5, -32(%r9)
	movaps	%xmm6, -48(%r9)
	movaps	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

	cmp	%r9, %rbx
	jae	L(mm_recalc_len)

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movaps	%xmm1, -48(%r9)
	movaps	%xmm2, -32(%r9)
	movaps	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

L(mm_return):
	RETURN
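/* When the length is at least SHARED_CACHE_SIZE_HALF, the two loops below
   switch from movdqa/movaps to movntdq.  Non-temporal stores bypass the
   cache, so a copy that would not fit in cache anyway does not evict the
   working set, and the trailing sfence makes the weakly-ordered streaming
   stores globally visible before the tail copy and the return.  A rough
   intrinsics sketch of one forward 64-byte iteration (illustrative only,
   not part of this build; function and variable names are placeholders):

	#include <emmintrin.h>

	// dst64 is 64-byte aligned; src may be unaligned.
	static void stream_copy_64(unsigned char *dst64, const unsigned char *src)
	{
		__m128i a = _mm_loadu_si128((const __m128i *)(src +  0));
		__m128i b = _mm_loadu_si128((const __m128i *)(src + 16));
		__m128i c = _mm_loadu_si128((const __m128i *)(src + 32));
		__m128i d = _mm_loadu_si128((const __m128i *)(src + 48));
		_mm_stream_si128((__m128i *)(dst64 +  0), a);
		_mm_stream_si128((__m128i *)(dst64 + 16), b);
		_mm_stream_si128((__m128i *)(dst64 + 32), c);
		_mm_stream_si128((__m128i *)(dst64 + 48), d);
	}
	// After the last iteration the caller must execute _mm_sfence().
*/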
/* Big length copy forward part.  */
	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%r8, %rsi), %xmm0
	movdqu	16(%r8, %rsi), %xmm1
	movdqu	32(%r8, %rsi), %xmm2
	movdqu	48(%r8, %rsi), %xmm3
	movntdq	%xmm0, (%r8)
	movntdq	%xmm1, 16(%r8)
	movntdq	%xmm2, 32(%r8)
	movntdq	%xmm3, 48(%r8)
	lea	64(%r8), %r8
	cmp	%r8, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	sfence
	jmp	L(mm_recalc_len)

END (MEMMOVE)

ALIAS_SYMBOL(memcpy, MEMMOVE)
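/* memcpy is exported as an alias of MEMMOVE above, so calls to memcpy also
   resolve to this routine and inherit its overlap-safe behavior in both
   directions.  */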