/*
 * Copyright 2010 Tilera Corporation. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, version 2.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT. See the GNU General Public License for
 * more details.
 */

#include <arch/chip.h>


/*
 * This file shares the implementation of the userspace memcpy and
 * the kernel's memcpy, copy_to_user and copy_from_user.
 */

#include <linux/linkage.h>

#define IS_MEMCPY		0
#define IS_COPY_FROM_USER	1
#define IS_COPY_TO_USER		-1

	.section .text.memcpy_common, "ax"
	.align 64

/* Use this to preface each bundle that can cause an exception so
 * the kernel can clean up properly. The special cleanup code should
 * not use these, since it knows what it is doing.
 */
#define EX \
	.pushsection __ex_table, "a"; \
	.align 4; \
	.word 9f, memcpy_common_fixup; \
	.popsection; \
	9


/* raw_copy_from_user takes the kernel target address in r0,
 * the user source in r1, and the bytes to copy in r2.
 * It returns the number of uncopiable bytes (hopefully zero) in r0.
 */
ENTRY(raw_copy_from_user)
.type raw_copy_from_user, @function
	FEEDBACK_ENTER_EXPLICIT(raw_copy_from_user, \
		.text.memcpy_common, \
		.Lend_memcpy_common - raw_copy_from_user)
	{ movei r29, IS_COPY_FROM_USER; j memcpy_common }
	.size raw_copy_from_user, . - raw_copy_from_user

/* raw_copy_to_user takes the user target address in r0,
 * the kernel source in r1, and the bytes to copy in r2.
 * It returns the number of uncopiable bytes (hopefully zero) in r0.
 */
ENTRY(raw_copy_to_user)
.type raw_copy_to_user, @function
	FEEDBACK_REENTER(raw_copy_from_user)
	{ movei r29, IS_COPY_TO_USER; j memcpy_common }
	.size raw_copy_to_user, . - raw_copy_to_user

ENTRY(memcpy)
.type memcpy, @function
	FEEDBACK_REENTER(raw_copy_from_user)
	{ movei r29, IS_MEMCPY }
	.size memcpy, . - memcpy
	/* Fall through */

	.type memcpy_common, @function
memcpy_common:
	/* On entry, r29 holds one of the IS_* macro values from above. */


	/* r0 is the dest, r1 is the source, r2 is the size. */

	/* Save aside original dest so we can return it at the end. */
	{ sw sp, lr; move r23, r0; or r4, r0, r1 }

	/* Check for an empty size. */
	{ bz r2, .Ldone; andi r4, r4, 3 }

	/* Save aside original values in case of a fault. */
	{ move r24, r1; move r25, r2 }
	move r27, lr

	/* Check for an unaligned source or dest. */
	{ bnz r4, .Lcopy_unaligned_maybe_many; addli r4, r2, -256 }

.Lcheck_aligned_copy_size:
	/* If we are copying < 256 bytes, branch to simple case. */
	{ blzt r4, .Lcopy_8_check; slti_u r8, r2, 8 }

	/* Copying >= 256 bytes, so jump to complex prefetching loop. */
	{ andi r6, r1, 63; j .Lcopy_many }

/*
 *
 * Aligned 4 byte at a time copy loop
 *
 */

.Lcopy_8_loop:
	/* Copy two words at a time to hide load latency. */
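	/* Both loads issue before either store, so each store consumes
	 * its value two bundles after the load that produced it rather
	 * than in the very next bundle.
	 */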
EX:	{ lw r3, r1; addi r1, r1, 4; slti_u r8, r2, 16 }
EX:	{ lw r4, r1; addi r1, r1, 4 }
EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
EX:	{ sw r0, r4; addi r0, r0, 4; addi r2, r2, -4 }
.Lcopy_8_check:
	{ bzt r8, .Lcopy_8_loop; slti_u r4, r2, 4 }

	/* Copy odd leftover word, if any. */
	{ bnzt r4, .Lcheck_odd_stragglers }
EX:	{ lw r3, r1; addi r1, r1, 4 }
EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }

.Lcheck_odd_stragglers:
	{ bnz r2, .Lcopy_unaligned_few }

.Ldone:
	/* For memcpy return original dest address, else zero. */
	{ mz r0, r29, r23; jrp lr }


/*
 *
 * Prefetching multiple cache line copy handler (for large transfers).
 *
 */

	/* Copy words until r1 is cache-line-aligned. */
.Lalign_loop:
EX:	{ lw r3, r1; addi r1, r1, 4 }
	{ andi r6, r1, 63 }
EX:	{ sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 }
.Lcopy_many:
	{ bnzt r6, .Lalign_loop; addi r9, r0, 63 }

	{ addi r3, r1, 60; andi r9, r9, -64 }

	/* No need to prefetch dst, we'll just do the wh64
	 * right before we copy a line.
	 */
EX:	{ lw r5, r3; addi r3, r3, 64; movei r4, 1 }
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
	{ bnzt zero, .; move r27, lr }
EX:	{ lw r6, r3; addi r3, r3, 64 }
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
	{ bnzt zero, . }
EX:	{ lw r7, r3; addi r3, r3, 64 }
	/* Intentionally stall for a few cycles to leave L2 cache alone. */
	{ bz zero, .Lbig_loop2 }

	/* On entry to this loop:
	 * - r0 points to the start of dst line 0
	 * - r1 points to start of src line 0
	 * - r2 >= (256 - 60), only the first time the loop trips.
	 * - r3 contains r1 + 128 + 60 [pointer to end of source line 2]
	 *   This is our prefetch address. When we get near the end
	 *   rather than prefetching off the end this is changed to point
	 *   to some "safe" recently loaded address.
	 * - r5 contains *(r1 + 60) [i.e. last word of source line 0]
	 * - r6 contains *(r1 + 64 + 60) [i.e. last word of source line 1]
	 * - r9 contains ((r0 + 63) & -64)
	 *   [start of next dst cache line.]
	 */

.Lbig_loop:
	{ jal .Lcopy_line2; add r15, r1, r2 }

.Lbig_loop2:
	/* Copy line 0, first stalling until r5 is ready. */
EX:	{ move r12, r5; lw r16, r1 }
	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
	/* Prefetch several lines ahead. */
EX:	{ lw r5, r3; addi r3, r3, 64 }
	{ jal .Lcopy_line }

	/* Copy line 1, first stalling until r6 is ready. */
EX:	{ move r12, r6; lw r16, r1 }
	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
	/* Prefetch several lines ahead. */
EX:	{ lw r6, r3; addi r3, r3, 64 }
	{ jal .Lcopy_line }

	/* Copy line 2, first stalling until r7 is ready. */
EX:	{ move r12, r7; lw r16, r1 }
	{ bz r4, .Lcopy_8_check; slti_u r8, r2, 8 }
	/* Prefetch several lines ahead. */
EX:	{ lw r7, r3; addi r3, r3, 64 }
	/* Use up a caches-busy cycle by jumping back to the top of the
	 * loop. Might as well get it out of the way now.
	 */
	{ j .Lbig_loop }


	/* On entry:
	 * - r0 points to the destination line.
	 * - r1 points to the source line.
	 * - r3 is the next prefetch address.
	 * - r9 holds the last address used for wh64.
	 * - r12 = WORD_15
	 * - r16 = WORD_0.
	 * - r17 == r1 + 16.
	 * - r27 holds saved lr to restore.
	 *
	 * On exit:
	 * - r0 is incremented by 64.
	 * - r1 is incremented by 64, unless that would point to a word
	 *   beyond the end of the source array, in which case it is redirected
	 *   to point to an arbitrary word already in the cache.
	 * - r2 is decremented by 64.
	 * - r3 is unchanged, unless it points to a word beyond the
	 *   end of the source array, in which case it is redirected
	 *   to point to an arbitrary word already in the cache.
	 *   Redirecting is OK since if we are that close to the end
	 *   of the array we will not come back to this subroutine
	 *   and use the contents of the prefetched address.
	 * - r4 is nonzero iff r2 >= 64.
	 * - r9 is incremented by 64, unless it points beyond the
	 *   end of the last full destination cache line, in which
	 *   case it is redirected to a "safe address" that can be
	 *   clobbered (sp - 64)
	 * - lr contains the value in r27.
	 */

/* r26 unused */

.Lcopy_line:
	/* TODO: when r3 goes past the end, we would like to redirect it
	 * to prefetch the last partial cache line (if any) just once, for the
	 * benefit of the final cleanup loop. But we don't want to
	 * prefetch that line more than once, or subsequent prefetches
	 * will go into the RTF. But then .Lbig_loop should unconditionally
	 * branch to top of loop to execute final prefetch, and its
	 * nop should become a conditional branch.
	 */

	/* We need two non-memory cycles here to cover the resources
	 * used by the loads initiated by the caller.
	 */
	{ add r15, r1, r2 }
.Lcopy_line2:
	{ slt_u r13, r3, r15; addi r17, r1, 16 }

	/* NOTE: this will stall for one cycle as L1 is busy. */

	/* Fill second L1D line. */
EX:	{ lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 }	/* r17 = WORD_4 */

	/* Prepare destination line for writing. */
EX:	{ wh64 r9; addi r9, r9, 64 }
	/* Load seven words that are L1D hits to cover wh64 L2 usage. */

	/* Load the three remaining words from the last L1D line, which
	 * we know has already filled the L1D.
	 */
EX:	{ lw r4, r1; addi r1, r1, 4; addi r20, r1, 16 }	/* r4 = WORD_12 */
EX:	{ lw r8, r1; addi r1, r1, 4; slt_u r13, r20, r15 }	/* r8 = WORD_13 */
EX:	{ lw r11, r1; addi r1, r1, -52; mvz r20, r13, r1 }	/* r11 = WORD_14 */

	/* Load the three remaining words from the first L1D line, first
	 * stalling until it has filled by "looking at" r16.
	 */
EX:	{ lw r13, r1; addi r1, r1, 4; move zero, r16 }	/* r13 = WORD_1 */
EX:	{ lw r14, r1; addi r1, r1, 4 }	/* r14 = WORD_2 */
EX:	{ lw r15, r1; addi r1, r1, 8; addi r10, r0, 60 }	/* r15 = WORD_3 */

	/* Load second word from the second L1D line, first
	 * stalling until it has filled by "looking at" r17.
	 */
EX:	{ lw r19, r1; addi r1, r1, 4; move zero, r17 }	/* r19 = WORD_5 */

	/* Store last word to the destination line, potentially dirtying it
	 * for the first time, which keeps the L2 busy for two cycles.
	 */
EX:	{ sw r10, r12 }	/* store(WORD_15) */

	/* Use two L1D hits to cover the sw L2 access above. */
EX:	{ lw r10, r1; addi r1, r1, 4 }	/* r10 = WORD_6 */
EX:	{ lw r12, r1; addi r1, r1, 4 }	/* r12 = WORD_7 */

	/* Fill third L1D line. */
EX:	{ lw r18, r1; addi r1, r1, 4 }	/* r18 = WORD_8 */

	/* Store first L1D line. */
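	/* The spare slots in the next few store bundles also recompute
	 * the wh64 target: r16 ends up nonzero iff the cache line at the
	 * incremented r9 still lies entirely within the destination, and
	 * the mvz in the WORD_4 store otherwise redirects r9 to the
	 * clobberable scratch word at sp - 64 noted in the exit
	 * conditions above.
	 */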
EX:	{ sw r0, r16; addi r0, r0, 4; add r16, r0, r2 }	/* store(WORD_0) */
EX:	{ sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 }	/* store(WORD_1) */
EX:	{ sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 }	/* store(WORD_2) */
EX:	{ sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 }	/* store(WORD_3) */
	/* Store second L1D line. */
EX:	{ sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }	/* store(WORD_4) */
EX:	{ sw r0, r19; addi r0, r0, 4 }	/* store(WORD_5) */
EX:	{ sw r0, r10; addi r0, r0, 4 }	/* store(WORD_6) */
EX:	{ sw r0, r12; addi r0, r0, 4 }	/* store(WORD_7) */

EX:	{ lw r13, r1; addi r1, r1, 4; move zero, r18 }	/* r13 = WORD_9 */
EX:	{ lw r14, r1; addi r1, r1, 4 }	/* r14 = WORD_10 */
EX:	{ lw r15, r1; move r1, r20 }	/* r15 = WORD_11 */

	/* Store third L1D line. */
EX:	{ sw r0, r18; addi r0, r0, 4 }	/* store(WORD_8) */
EX:	{ sw r0, r13; addi r0, r0, 4 }	/* store(WORD_9) */
EX:	{ sw r0, r14; addi r0, r0, 4 }	/* store(WORD_10) */
EX:	{ sw r0, r15; addi r0, r0, 4 }	/* store(WORD_11) */

	/* Store rest of fourth L1D line. */
EX:	{ sw r0, r4; addi r0, r0, 4 }	/* store(WORD_12) */
	{
EX:	sw r0, r8	/* store(WORD_13) */
	addi r0, r0, 4
	/* Will r2 be > 64 after we subtract 64 below? */
	shri r4, r2, 7
	}
	{
EX:	sw r0, r11	/* store(WORD_14) */
	addi r0, r0, 8
	/* Record 64 bytes successfully copied. */
	addi r2, r2, -64
	}

	{ jrp lr; move lr, r27 }

	/* Convey to the backtrace library that the stack frame is size
	 * zero, and the real return address is on the stack rather than
	 * in 'lr'.
	 */
	{ info 8 }

	.align 64
.Lcopy_unaligned_maybe_many:
	/* Skip the setup overhead if we aren't copying many bytes. */
	{ slti_u r8, r2, 20; sub r4, zero, r0 }
	{ bnzt r8, .Lcopy_unaligned_few; andi r4, r4, 3 }
	{ bz r4, .Ldest_is_word_aligned; add r18, r1, r2 }

/*
 *
 * unaligned 4 byte at a time copy handler.
 *
 */

	/* Copy single bytes until r0 == 0 mod 4, so we can store words. */
.Lalign_dest_loop:
EX:	{ lb_u r3, r1; addi r1, r1, 1; addi r4, r4, -1 }
EX:	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
	{ bnzt r4, .Lalign_dest_loop; andi r3, r1, 3 }

	/* If source and dest are now *both* aligned, do an aligned copy. */
	{ bz r3, .Lcheck_aligned_copy_size; addli r4, r2, -256 }

.Ldest_is_word_aligned:

EX:	{ andi r8, r0, 63; lwadd_na r6, r1, 4 }
	{ slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned }

	/* This copies unaligned words until either there are fewer
	 * than 4 bytes left to copy, or until the destination pointer
	 * is cache-aligned, whichever comes first.
	 *
	 * On entry:
	 * - r0 is the next store address.
	 * - r1 points 4 bytes past the load address corresponding to r0.
	 * - r2 >= 4
	 * - r6 is the next aligned word loaded.
	 */
.Lcopy_unaligned_src_words:
EX:	{ lwadd_na r7, r1, 4; slti_u r8, r2, 4 + 4 }
	/* stall */
	{ dword_align r6, r7, r1; slti_u r9, r2, 64 + 4 }
EX:	{ swadd r0, r6, 4; addi r2, r2, -4 }
	{ bnz r8, .Lcleanup_unaligned_words; andi r8, r0, 63 }
	{ bnzt r8, .Lcopy_unaligned_src_words; move r6, r7 }

	/* On entry:
	 * - r0 is the next store address.
	 * - r1 points 4 bytes past the load address corresponding to r0.
	 * - r2 >= 4 (# of bytes left to store).
	 * - r6 is the next aligned src word value.
	 * - r9 = (r2 < 64U).
	 * - r18 points one byte past the end of source memory.
	 */
.Ldest_is_L2_line_aligned:

	{
	/* Not a full cache line remains. */
	bnz r9, .Lcleanup_unaligned_words
	move r7, r6
	}

	/* r2 >= 64 */

	/* Kick off two prefetches, but don't go past the end. */
	{ addi r3, r1, 63 - 4; addi r8, r1, 64 + 63 - 4 }
	{ prefetch r3; move r3, r8; slt_u r8, r8, r18 }
	{ mvz r3, r8, r1; addi r8, r3, 64 }
	{ prefetch r3; move r3, r8; slt_u r8, r8, r18 }
	{ mvz r3, r8, r1; movei r17, 0 }

.Lcopy_unaligned_line:
	/* Prefetch another line. */
	{ prefetch r3; addi r15, r1, 60; addi r3, r3, 64 }
	/* Fire off a load of the last word we are about to copy. */
EX:	{ lw_na r15, r15; slt_u r8, r3, r18 }

EX:	{ mvz r3, r8, r1; wh64 r0 }

	/* This loop runs twice.
	 *
	 * On entry:
	 * - r17 is even before the first iteration, and odd before
	 *   the second. It is incremented inside the loop. Encountering
	 *   an even value at the end of the loop makes it stop.
	 */
.Lcopy_half_an_unaligned_line:
EX:	{
	/* Stall until the last byte is ready. In the steady state this
	 * guarantees all words to load below will be in the L2 cache, which
	 * avoids shunting the loads to the RTF.
	 */
	move zero, r15
	lwadd_na r7, r1, 16
	}
EX:	{ lwadd_na r11, r1, 12 }
EX:	{ lwadd_na r14, r1, -24 }
EX:	{ lwadd_na r8, r1, 4 }
EX:	{ lwadd_na r9, r1, 4 }
EX:	{
	lwadd_na r10, r1, 8
	/* r16 = (r2 < 64), after we subtract 32 from r2 below. */
	slti_u r16, r2, 64 + 32
	}
EX:	{ lwadd_na r12, r1, 4; addi r17, r17, 1 }
EX:	{ lwadd_na r13, r1, 8; dword_align r6, r7, r1 }
EX:	{ swadd r0, r6, 4; dword_align r7, r8, r1 }
EX:	{ swadd r0, r7, 4; dword_align r8, r9, r1 }
EX:	{ swadd r0, r8, 4; dword_align r9, r10, r1 }
EX:	{ swadd r0, r9, 4; dword_align r10, r11, r1 }
EX:	{ swadd r0, r10, 4; dword_align r11, r12, r1 }
EX:	{ swadd r0, r11, 4; dword_align r12, r13, r1 }
EX:	{ swadd r0, r12, 4; dword_align r13, r14, r1 }
EX:	{ swadd r0, r13, 4; addi r2, r2, -32 }
	{ move r6, r14; bbst r17, .Lcopy_half_an_unaligned_line }

	{ bzt r16, .Lcopy_unaligned_line; move r7, r6 }

	/* On entry:
	 * - r0 is the next store address.
	 * - r1 points 4 bytes past the load address corresponding to r0.
	 * - r2 >= 0 (# of bytes left to store).
	 * - r7 is the next aligned src word value.
	 */
.Lcleanup_unaligned_words:
	/* Handle any trailing bytes. */
	{ bz r2, .Lcopy_unaligned_done; slti_u r8, r2, 4 }
	{ bzt r8, .Lcopy_unaligned_src_words; move r6, r7 }

	/* Move r1 back to the point where it corresponds to r0. */
	{ addi r1, r1, -4 }

	/* Fall through */

/*
 *
 * 1 byte at a time copy handler.
 *
 */

.Lcopy_unaligned_few:
EX:	{ lb_u r3, r1; addi r1, r1, 1 }
EX:	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
	{ bnzt r2, .Lcopy_unaligned_few }

.Lcopy_unaligned_done:

	/* For memcpy return original dest address, else zero. */
	{ mz r0, r29, r23; jrp lr }

.Lend_memcpy_common:
	.size memcpy_common, .Lend_memcpy_common - memcpy_common

	.section .fixup,"ax"
memcpy_common_fixup:
	.type memcpy_common_fixup, @function

	/* Skip any bytes we already successfully copied.
	 * r2 (num remaining) is correct, but r0 (dst) and r1 (src)
	 * may not be quite right because of unrolling and prefetching.
	 * So we need to recompute their values as the address just
	 * after the last byte we are sure was successfully loaded and
	 * then stored.
	 */

	/* Determine how many bytes we successfully copied. */
	{ sub r3, r25, r2 }

	/* Add this to the original r0 and r1 to get their new values. */
	{ add r0, r23, r3; add r1, r24, r3 }

	{ bzt r29, memcpy_fixup_loop }
	{ blzt r29, copy_to_user_fixup_loop }

copy_from_user_fixup_loop:
	/* Try copying the rest one byte at a time, expecting a load fault. */
.Lcfu:	{ lb_u r3, r1; addi r1, r1, 1 }
	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
	{ bnzt r2, copy_from_user_fixup_loop }

.Lcopy_from_user_fixup_zero_remainder:
	move lr, r27
	{ move r0, r2; jrp lr }

copy_to_user_fixup_loop:
	/* Try copying the rest one byte at a time, expecting a store fault. */
	{ lb_u r3, r1; addi r1, r1, 1 }
.Lctu:	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
	{ bnzt r2, copy_to_user_fixup_loop }
.Lcopy_to_user_fixup_done:
	move lr, r27
	{ move r0, r2; jrp lr }

memcpy_fixup_loop:
	/* Try copying the rest one byte at a time. We expect a disastrous
	 * fault to happen since we are in fixup code, but let it happen.
	 */
	{ lb_u r3, r1; addi r1, r1, 1 }
	{ sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 }
	{ bnzt r2, memcpy_fixup_loop }
	/* This should be unreachable, we should have faulted again.
	 * But be paranoid and handle it in case some interrupt changed
	 * the TLB or something.
	 */
	move lr, r27
	{ move r0, r23; jrp lr }

	.size memcpy_common_fixup, . - memcpy_common_fixup

	.section __ex_table,"a"
	.align 4
	.word .Lcfu, .Lcopy_from_user_fixup_zero_remainder
	.word .Lctu, .Lcopy_to_user_fixup_done