/* SPDX-License-Identifier: GPL-2.0 */
/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
/* Modified by SuperH, Inc. September 2003 */
!
! Fast SH memcpy
!
! by Toshiyasu Morita (tm@netcom.com)
! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
! SH5 code Copyright 2002 SuperH Ltd.
!
! Entry: ARG0: destination pointer
!        ARG1: source pointer
!        ARG2: byte count
!
! Exit:  RESULT: destination pointer
!        any other registers in the range r0-r7: trashed
!
! Notes: Usually one wants to do small reads and write a longword, but
!        unfortunately it is difficult in some cases to concatenate bytes
!        into a longword on the SH, so this does a longword read and small
!        writes.
!
! This implementation makes two assumptions about how it is called:
!
! 1.: If the byte count is nonzero, the address of the last byte to be
!     copied is unsigned greater than the address of the first byte to
!     be copied.  This could be easily swapped for a signed comparison,
!     but the algorithm used needs some comparison.
!
! 2.: When there are two or three bytes in the last word of an 11-or-more
!     bytes memory chunk to be copied, the rest of the word can be read
!     without side effects.
!     This could be easily changed by increasing the minimum size of
!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
!     however, this would cost a few extra cycles on average.
!     For SHmedia, the assumption is that any quadword can be read in its
!     entirety if at least one byte is included in the copy.
!
/*
 * memcpy, SHmedia implementation.
 *
 * Register use visible in this routine:
 *   r2   destination pointer; only ever used as a store base and never
 *        written, so it still holds the destination on return
 *   r3   source pointer
 *   r4   byte count
 *   r18  return address (moved to tr1 with ptabs before the final blink)
 *
 * Counts below 25 are dispatched with a computed branch into a table of
 * 32-byte code slots that starts at L1.  Counts of 25 and above branch to
 * Large.
 *
 * The distance between slot entries is part of the dispatch arithmetic
 * (see the shlli by 5 below).  Do not add, remove or reorder instructions
 * between L1 and Large without re-checking every slot offset.
 */

	.section .text..SHmedia32,"ax"
	.globl memcpy
	.type memcpy, @function
	.align 5

memcpy:

/*
 * Unaligned quadword (Q, offsets O and O+7) and longword (L, offsets O
 * and O+3) load/store pairs.  The two loaded halves land in D0 and D1;
 * the code below combines them with "or".
 */
#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1

	/* Load result goes to r63, i.e. is discarded; presumably an early
	   touch of the first source byte - TODO confirm intent. */
	ld.b r3,0,r63
	pta/l Large,tr0
	movi 25,r0
	/* count >= 25: take the Large path. */
	bgeu/u r4,r0,tr0
	/* NOTE(review): the table layout relies on nsb yielding 63 for a
	   count of 0, 62 for 1, 61 for 2..3, 60 for 4..7, 59 for 8..15 and
	   58 for 16..24 - confirm against the SH-5 ISA manual. */
	nsb r4,r0
	/* Scale by 32, the size of one handler slot. */
	shlli r0,5,r0
	/* NOTE(review): the "+ 1" presumably sets the SHmedia mode bit of
	   the branch target - confirm. */
	movi (L1-L0+63*32 + 1) & 0xffff,r1
	/* r0 = offset, relative to L0, of the slot for this size class:
	   (L1 - L0) + 32 * (63 - nsb(count)). */
	sub r1, r0, r0
L0:	ptrel r0,tr0
	/* r5 = one past the last destination byte. */
	add r2,r4,r5
	/* tr1 = return address for the slots that return directly. */
	ptabs r18,tr1
	/* r6 = one past the last source byte. */
	add r3,r4,r6
	/* Enter the selected slot; r63 as link register means no link. */
	blink tr0,r63

/* Rearranged to make cut2 safe */
	.balign 8
	/* Continuation of the 4..7 byte slot.  It sits before L1, outside
	   the table, and is reached through "pta L4_7". */
L4_7:	/* 4..7 byte memcpy cntd. */
	stlo.l r2, 0, r0
	or r6, r7, r6
	sthi.l r5, -1, r6
	stlo.l r5, -4, r6
	blink tr1,r63

	.balign 8
	/* Start of the slot table.  This slot holds six instructions and
	   is followed by the two-instruction L2_3 fragment; assuming
	   4-byte SHmedia instructions, that puts the next slot entry at
	   L1 + 32. */
L1:	/* 0 byte memcpy */
	nop
	blink tr1,r63
	nop
	nop
	nop
	nop

	/* Tail of the 2-or-3 byte copy, placed here to fill the rest of
	   the first 32 bytes.  r6 holds the last source byte, loaded by
	   the 2-or-3 byte slot before it branches here. */
L2_3:	/* 2 or 3 byte memcpy cntd. */
	st.b r5,-1,r6
	blink tr1,r63

	/* 1 byte memcpy */
	ld.b r3,0,r0
	st.b r2,0,r0
	blink tr1,r63

	/* Tail of the 8..15 byte copy; fills the rest of the 1 byte slot. */
L8_15:	/* 8..15 byte memcpy cntd. */
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

	/* The first two bytes are copied individually and the last byte is
	   copied through r6/r5.  For a count of 2 the last byte is also
	   the second byte, so it is simply stored twice. */
	/* 2 or 3 byte memcpy */
	ld.b r3,0,r0
	ld.b r2,0,r63
	ld.b r3,1,r1
	st.b r2,0,r0
	pta/l L2_3,tr0
	ld.b r6,-1,r6
	st.b r2,1,r1
	blink tr0, r63

	/* One unaligned longword from the start and one from the end
	   (possibly overlapping) cover the whole range. */
	/* 4 .. 7 byte memcpy */
	LDUAL (r3, 0, r0, r1)
	pta L4_7, tr0
	ldlo.l r6, -4, r7
	or r0, r1, r0
	sthi.l r2, 3, r0
	ldhi.l r6, -1, r6
	blink tr0, r63

	/* Same scheme as 4..7, using quadwords. */
	/* 8 .. 15 byte memcpy */
	LDUAQ (r3, 0, r0, r1)
	pta L8_15, tr0
	ldlo.q r6, -8, r7
	or r0, r1, r0
	sthi.q r2, 7, r0
	ldhi.q r6, -1, r6
	blink tr0, r63

	/* Two unaligned quadwords from the start plus one from the end.
	   All loads are issued before the stores that complete the copy. */
	/* 16 .. 24 byte memcpy */
	LDUAQ (r3, 0, r0, r1)
	LDUAQ (r3, 8, r8, r9)
	or r0, r1, r0
	sthi.q r2, 7, r0
	or r8, r9, r8
	sthi.q r2, 15, r8
	ldlo.q r6, -8, r7
	ldhi.q r6, -1, r6
	stlo.q r2, 8, r8
	stlo.q r2, 0, r0
	or r6, r7, r6
	sthi.q r5, -1, r6
	stlo.q r5, -8, r6
	blink tr1,r63

/*
 * Counts of 25 and above.
 *
 * After the setup below:
 *   r6   = src - dst
 *   r22  = dst + 8 - (src & 7), so r22 + r6 is an 8-byte aligned source
 *          address and "ldx.q r22, r6, ..." is an aligned source load
 *   r5   = dst + count - 16
 * Stores to the destination go through the stlo.q / sthi.q pairs.
 */
Large:
	/* Discarded load (r63); presumably touches the destination -
	   TODO confirm intent. */
	ld.b r2, 0, r63
	pta/l Loop_ua, tr1
	/* r7 = (r3 & 7) - 8, i.e. a value in -8..-1. */
	ori r3, -8, r7
	sub r2, r7, r22
	sub r3, r2, r6
	add r2, r4, r5
	ldlo.q r3, 0, r0
	addi r5, -16, r5
	movi 64+8, r27 // could subtract r7 from that.
	stlo.q r2, 0, r0
	sthi.q r2, 7, r0
	ldx.q r22, r6, r0
	/* Fewer than 72 bytes: skip the 32-byte-per-iteration loop. */
	bgtu/l r27, r4, tr1

	/* r27 = dst + count - 64, the loop bound for Loop_line. */
	addi r5, -48, r27
	pta/l Loop_line, tr0
	/* Index registers for source loads relative to r22: +64 for the
	   discarded load, and -24 / -16 / -8 for the three quadwords behind
	   the already advanced r22. */
	addi r6, 64, r36
	addi r6, -24, r19
	addi r6, -16, r20
	addi r6, -8, r21

	/* Copies 32 bytes per iteration (r22 advances by 32).  r0 always
	   carries the next aligned source quadword into the following
	   iteration. */
Loop_line:
	/* Discarded load 64 bytes ahead in the source; presumably a
	   prefetch - TODO confirm. */
	ldx.q r22, r36, r63
	/* NOTE(review): presumably allocates the destination cache line
	   without fetching it - confirm against the ISA manual. */
	alloco r22, 32
	addi r22, 32, r22
	ldx.q r22, r19, r23
	sthi.q r22, -25, r0
	ldx.q r22, r20, r24
	ldx.q r22, r21, r25
	stlo.q r22, -32, r0
	ldx.q r22, r6, r0
	sthi.q r22, -17, r23
	sthi.q r22, -9, r24
	sthi.q r22, -1, r25
	stlo.q r22, -24, r23
	stlo.q r22, -16, r24
	stlo.q r22, -8, r25
	/* Repeat while r27 >= r22 (unsigned). */
	bgeu r27, r22, tr0

	/* Copies 8 bytes per iteration.  tr1 still points here from the
	   "pta/l Loop_ua, tr1" in Large. */
Loop_ua:
	addi r22, 8, r22
	sthi.q r22, -1, r0
	stlo.q r22, -8, r0
	ldx.q r22, r6, r0
	/* Repeat while r5 > r22 (unsigned). */
	bgtu/l r5, r22, tr1

	/* Tail: store the pending quadword in r0, then copy the final
	   8 source bytes (ending at src + count) to the final 8 destination
	   bytes.  r5 + 8 .. r5 + 15 is that range, since r5 = dst + count
	   - 16. */
	add r3, r4, r7
	ldlo.q r7, -8, r1
	sthi.q r22, 7, r0
	ldhi.q r7, -1, r7
	/* tr1 was used as the loop target; reload it with the return
	   address. */
	ptabs r18,tr1
	stlo.q r22, 0, r0
	or r1, r7, r1
	sthi.q r5, 15, r1
	stlo.q r5, 8, r1
	blink tr1, r63

	.size memcpy,.-memcpy