/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */

/*
 * EXC(inst_reg, addr, handler)
 *
 * Emit "inst_reg, addr" at local label 9 and record the pair
 * (address of that instruction, handler) in the __ex_table section, so
 * that a fault on this one instruction is redirected to 'handler'.
 */
#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */

#define LOAD   ld
#define LOADL  ldl
#define LOADR  ldr
#define STOREL sdl
#define STORER sdr
#define STORE  sd
#define ADD    daddu
#define SUB    dsubu
#define SRL    dsrl
#define SRA    dsra
#define SLL    dsll
#define SLLV   dsllv
#define SRLV   dsrlv
#define NBYTES 8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the o32
 * ABI register definitions), we need to redefine the register definitions
 * from the n64 ABI register naming to the o32 ABI register naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

/*
 * Unaligned access helpers: which of the left/right partial load/store
 * instructions touches the first bytes of a unit depends on endianness.
 */
#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST LOADR
#define LDREST	LOADL
#define STFIRST STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST LOADL
#define LDREST	LOADR
#define STFIRST STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

/* Byte offsets of the first byte, last byte and start of unit number 'unit'. */
#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)

	.text
	.set	noreorder
	.set	noat

/*
 * NOTE: the assembler runs with ".set noreorder", so the instruction
 * written directly after every branch/jump is its delay slot and is
 * executed whether or not the branch is taken.  Delay-slot instructions
 * are indented by one extra space.
 */

/*
 * __copy_user_inatomic: same as __copy_user, but enters the common body
 * with t7 = 1.
 *
 * t7 is used as a flag to note inatomic mode.  The load-fault handler
 * (l_exc) skips zeroing the rest of the destination when t7 is non-zero.
 */
LEAF(__copy_user_inatomic)
	b	__copy_user_common
	 li	t7, 1
	END(__copy_user_inatomic)

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 *
 * In:	  a0 = dst, a1 = src, a2 = len
 *	  (for __copy_user, AT = one past the end of the source, see above)
 * Out:	  v0 = dst (memcpy entry only), a2 = bytes not copied
 * Temps: t0-t3 (data and scratch), t7 (inatomic flag), t8 (rem)
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__copy_user)
	li	t7, 0				/* not inatomic */
__copy_user_common:
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 */
	/*
	 * Octeon doesn't care if the destination is unaligned. The hardware
	 * can fix it faster than we can special case the assembly.
	 */
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	 and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	 sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	 sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	 sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	 sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	 sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	 pref	0, 128(src)		# We must not prefetch invalid addresses
	/*
	 * This is where we loop while more than 256 bytes are left, so
	 * that the prefetch below stays inside the source buffer.
	 */
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
	/*
	 * This is where we loop if we can't prefetch anymore.
	 * Each iteration copies 16 units (16*NBYTES bytes).  src and dst
	 * are advanced half way through, hence the negative UNIT() offsets
	 * and the *_rewind16 load handler in the second half.
	 */
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 16*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
EXC(	LOAD	t0, UNIT(-8)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-7)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-6)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-5)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-8)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t1, UNIT(-3)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t2, UNIT(-2)(src),	l_exc_copy_rewind16)
EXC(	LOAD	t3, UNIT(-1)(src),	l_exc_copy_rewind16)
EXC(	STORE	t0, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst),	s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	 sltu	t0, len, 128		# See if we can loop one more time
	beqz	t0, 1b
	 nop
	/*
	 * Jump here if there are less than 16*NBYTES left.
	 */
cleanup_both_aligned:
	beqz	len, done
	 sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	 nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	 ADD	dst, dst, 8*NBYTES
	/*
	 * Jump here if there are less than 8*NBYTES left.
	 */
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	 nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	 ADD	dst, dst, 4*NBYTES
	/*
	 * Jump here if there are less than 4*NBYTES left. This means
	 * we may need to copy up to 3 NBYTES words.
	 */
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	 nop
	/*
	 * 1) Copy NBYTES, then check length again
	 */
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	 ADD	dst, dst, NBYTES
	/*
	 * 2) Copy NBYTES, then check length again
	 */
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	 ADD	dst, dst, NBYTES
	/*
	 * 3) Copy NBYTES, then check length again.
	 * dst has already been advanced when the store in the delay slot
	 * runs, hence the -NBYTES offset.
	 */
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
	b	copy_bytes_checklen
EXC(	 STORE	t0, -NBYTES(dst),	s_exc_p1u)

	/*
	 * src is not NBYTES-aligned: assemble each unit with an
	 * LDFIRST/LDREST pair, four units per iteration, until len == rem.
	 */
src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	 # +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	 and	rem, len, (4*NBYTES-1)	 # rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	bne	len, rem, 1b
	 ADD	dst, dst, 4*NBYTES

	/* Fewer than 4 units left with unaligned src: one unit per iteration. */
cleanup_src_unaligned:
	beqz	len, done
	 and	rem, len, NBYTES-1  # rem = len % NBYTES
	beq	rem, len, copy_bytes
	 nop
1:
EXC(	LDFIRST t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	 ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	 nop
copy_bytes:
	/* 0 < len < NBYTES  */
/*
 * COPY_BYTE(N): load byte N, decrement len, and store the byte from the
 * delay slot of the "are we finished" branch, so the store happens on
 * both the taken and the fall-through path.
 */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	 sb	t0, N(dst), s_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
	/* Last possible byte (index NBYTES-2): return unconditionally. */
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	 sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	 nop
	END(memcpy)

/*
 * Load-fault handlers.
 *
 * l_exc_copy_rewind16 is used by the second half of the 16-unit loop,
 * where src and dst have already been advanced by 16*NBYTES.
 */
l_exc_copy_rewind16:
	/* Rewind src and dst by 16*NBYTES for l_exc_copy */
	SUB	src, src, 16*NBYTES
	SUB	dst, dst, 16*NBYTES
l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	/* can't fault -- we're copy_from_user */
	bne	src, t0, 1b
	 ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	SUB	len, AT, t0		# len number of uncopied bytes
	bnez	t7, 2f		/* Skip the zeroing out part if inatomic */
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 *
	 * The ADD below sits in the delay slot of the bnez above, so it
	 * also executes on the inatomic path before returning.
	 */
	 ADD	dst, t0			# compute start address in a0
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 * src is no longer needed as a pointer and is reused as the
	 * down-counter (len - 1 ... 0).
	 */
	beqz	len, done
	 SUB	src, len, 1
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	bnez	src, 1b
	 SUB	src, src, 1
2:	jr	ra
	 nop


/*
 * Store-fault handlers.
 *
 * SEXC(n) defines s_exc_p<n>u, which adds n*NBYTES back to len (in the
 * delay slot of the return) and returns to the caller.
 */
#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	 ADD	len, len, n*NBYTES

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

/* Byte-store fault: add the single byte back to len and return. */
s_exc_p1:
	jr	ra
	 ADD	len, len, 1
/* Store fault that needs no len adjustment: just return. */
s_exc:
	jr	ra
	 nop

/*
 * memmove: a0=dst a1=src a2=len, sets v0 to dst.
 *
 * If [dst, dst+len) and [src, src+len) do not overlap, branch to
 * __memcpy.  Otherwise fall through to __rmemcpy, or return through
 * r_out when len is 0.
 */
	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	 move	v0, a0				/* return value */
	beqz	a2, r_out
	END(memmove)

	/*
	 * fall through to __rmemcpy; its first instruction is the delay
	 * slot of the "beqz a2, r_out" above.
	 */
/*
 * __rmemcpy: byte-at-a-time copy for overlapping buffers.
 *
 * Copies backwards from the end when src < dst, forwards otherwise.
 * Returns with a2 = 0.  Both loops test len only after decrementing it,
 * so len must be non-zero when a loop is entered.
 */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	 sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up		# src >= dst
	 nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	 SUB	a0, a0, 0x1

r_out:
	jr	ra
	 move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	 ADD	a0, a0, 0x1

	jr	ra
	 move	a2, zero
	END(__rmemcpy)