• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/* SPDX-License-Identifier: GPL-2.0 */
2/* Cloned and hacked for uClibc by Paul Mundt, December 2003 */
3/* Modified by SuperH, Inc. September 2003 */
4!
5! Fast SH memcpy
6!
7! by Toshiyasu Morita (tm@netcom.com)
8! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
9! SH5 code Copyright 2002 SuperH Ltd.
10!
11! Entry: ARG0: destination pointer
12!        ARG1: source pointer
13!        ARG2: byte count
14!
15! Exit:  RESULT: destination pointer
16!        any other registers in the range r0-r7: trashed
17!
18! Notes: Usually one wants to do small reads and write a longword, but
19!        unfortunately it is difficult in some cases to concatenate bytes
20!        into a longword on the SH, so this does a longword read and small
21!        writes.
22!
23! This implementation makes two assumptions about how it is called:
24!
25! 1.: If the byte count is nonzero, the address of the last byte to be
26!     copied is unsigned greater than the address of the first byte to
27!     be copied.  This could be easily swapped for a signed comparison,
28!     but the algorithm used needs some comparison.
29!
30! 2.: When there are two or three bytes in the last word of an 11-or-more
31!     bytes memory chunk to be copied, the rest of the word can be read
32!     without side effects.
33!     This could be easily changed by increasing the minimum size of
34!     a fast memcpy and the amount subtracted from r7 before L_2l_loop be 2,
35!     however, this would cost a few extra cycles on average.
36!     For SHmedia, the assumption is that any quadword can be read in its
37!     entirety if at least one byte is included in the copy.
38!
39
40	.section .text..SHmedia32,"ax"
41	.globl	memcpy
42	.type	memcpy, @function
43	.align	5
44
45memcpy:
46
/*
 * Unaligned access helpers.  Each expands to an lo/hi instruction pair at
 * offsets O and O+7 (quadword) or O and O+3 (longword).  For the loads the
 * two parts land in D0 and D1 and the callers below OR them together.
 * STUAQ and STUAL are not invoked in the code visible in this file.
 */
47#define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
48#define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
49#define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
50#define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
51
/*
 * Entry and size dispatch.
 * In the code below r2 is the store base (destination), r3 the load base
 * (source) and r4 the byte count.
 *
 * Counts of 25 or more branch to Large.  For smaller counts a branch
 * target is computed relative to L0:
 *     L0 + (L1 - L0 + 63*32 + 1) - 32*nsb(r4)  =  L1 + 32*(63 - nsb(r4)) + 1
 * so each size class gets its own 32-byte slot starting at L1.
 * NOTE(review): nsb presumably counts the redundant sign bits of r4, and
 * the "+ 1" is presumably the SHmedia mode bit of the branch target --
 * confirm against the SHmedia instruction manual.
 *
 * Before the jump: r5 = r2 + r4 (one past the last destination byte),
 * r6 = r3 + r4 (one past the last source byte), tr1 = target taken from
 * r18 (used by every handler as the return branch).
 */
52	ld.b r3,0,r63	/* byte load from the source start; result goes to r63 (presumably discarded -- confirm) */
53	pta/l Large,tr0
54	movi 25,r0
55	bgeu/u r4,r0,tr0	/* r4 >= 25 (unsigned): take the Large path */
56	nsb r4,r0
57	shlli r0,5,r0	/* r0 = 32 * nsb(r4) */
58	movi (L1-L0+63*32 + 1) & 0xffff,r1
59	sub r1, r0, r0	/* r0 = offset of the selected slot relative to L0 */
60L0:	ptrel r0,tr0
61	add r2,r4,r5	/* r5 = destination end */
62	ptabs r18,tr1	/* tr1 = return target */
63	add r3,r4,r6	/* r6 = source end */
64	blink tr0,r63	/* jump to the handler slot */
65
/*
 * Handlers for copies shorter than 25 bytes.
 *
 * The entry code jumps to L1 + 32*(63 - nsb(r4)), so the handlers form a
 * table of 32-byte slots beginning at L1 (8 instructions per slot,
 * assuming 4-byte instructions): 0 bytes, 1 byte, 2..3, 4..7, 8..15 and
 * 16..24 bytes.  L4_7, L2_3 and L8_15 are continuation code reached with
 * pta/blink from a slot; L2_3 and L8_15 fill the unused tail of the
 * 0-byte and 1-byte slots.  Do not add, remove or reorder instructions
 * here without re-checking the slot layout.
 *
 * On entry to every slot: r2 = destination, r3 = source,
 * r5 = r2 + count, r6 = r3 + count, tr1 = return target.
 * Each handler copies the head and the tail of the range separately;
 * the two stores may overlap.
 */
66/* Rearranged to make cut2 safe */
67	.balign 8
68L4_7:	/* 4..7 byte memcpy cntd. */
69	stlo.l r2, 0, r0	/* second half of the unaligned store of the first longword */
70	or r6, r7, r6	/* merge the two parts of the last source longword */
71	sthi.l r5, -1, r6	/* store the last longword so that it ends at r5 - 1 */
72	stlo.l r5, -4, r6
73	blink tr1,r63	/* return */
74
75	.balign 8
76L1:	/* 0 byte memcpy */
77	nop
78	blink tr1,r63	/* nothing to copy: return */
79	nop	/* padding up to L2_3 */
80	nop
81	nop
82	nop
83
84L2_3:	/* 2 or 3 byte memcpy cntd. */
85	st.b r5,-1,r6	/* store the last byte at destination end - 1 */
86	blink tr1,r63	/* return */
87
88	/* 1 byte memcpy */
89	ld.b r3,0,r0
90	st.b r2,0,r0
91	blink tr1,r63	/* return */
92
93L8_15:	/* 8..15 byte memcpy cntd. */
94	stlo.q r2, 0, r0	/* second half of the unaligned store of the first quadword */
95	or r6, r7, r6	/* merge the two parts of the last source quadword */
96	sthi.q r5, -1, r6	/* store the last quadword so that it ends at r5 - 1 */
97	stlo.q r5, -8, r6
98	blink tr1,r63	/* return */
99
100	/* 2 or 3 byte memcpy */
101	ld.b r3,0,r0	/* first source byte */
102	ld.b r2,0,r63	/* byte load from the destination; result goes to r63 (presumably discarded -- confirm) */
103	ld.b r3,1,r1	/* second source byte */
104	st.b r2,0,r0
105	pta/l L2_3,tr0
106	ld.b r6,-1,r6	/* last source byte (same as the second byte when count is 2) */
107	st.b r2,1,r1
108	blink tr0, r63	/* continue at L2_3 */
109
110	/* 4 .. 7 byte memcpy */
111	LDUAL (r3, 0, r0, r1)	/* first source longword, in two parts */
112	pta L4_7, tr0
113	ldlo.l r6, -4, r7	/* last source longword, first part */
114	or r0, r1, r0	/* merge the parts of the first longword */
115	sthi.l r2, 3, r0	/* first half of the unaligned store of the first longword */
116	ldhi.l r6, -1, r6	/* last source longword, second part */
117	blink tr0, r63	/* continue at L4_7 */
118
119	/* 8 .. 15 byte memcpy */
120	LDUAQ (r3, 0, r0, r1)	/* first source quadword, in two parts */
121	pta L8_15, tr0
122	ldlo.q r6, -8, r7	/* last source quadword, first part */
123	or r0, r1, r0	/* merge the parts of the first quadword */
124	sthi.q r2, 7, r0	/* first half of the unaligned store of the first quadword */
125	ldhi.q r6, -1, r6	/* last source quadword, second part */
126	blink tr0, r63	/* continue at L8_15 */
127
128	/* 16 .. 24 byte memcpy */
129	LDUAQ (r3, 0, r0, r1)	/* source bytes 0..7 */
130	LDUAQ (r3, 8, r8, r9)	/* source bytes 8..15 */
131	or r0, r1, r0
132	sthi.q r2, 7, r0
133	or r8, r9, r8
134	sthi.q r2, 15, r8
135	ldlo.q r6, -8, r7	/* last 8 source bytes, first part */
136	ldhi.q r6, -1, r6	/* last 8 source bytes, second part */
137	stlo.q r2, 8, r8
138	stlo.q r2, 0, r0
139	or r6, r7, r6
140	sthi.q r5, -1, r6	/* last quadword ends at destination end - 1 */
141	stlo.q r5, -8, r6
142	blink tr1,r63	/* return */
143
/*
 * Large: copy path taken by the entry code when the count in r4 is 25 or
 * more.  r2 = destination, r3 = source, r4 = byte count.
 *
 * Register setup (derived from the arithmetic below):
 *   r7  = r3 | -8, i.e. (r3 & 7) - 8
 *   r22 = r2 - r7      destination cursor
 *   r6  = r3 - r2      source-minus-destination distance
 * so r22 + r6 = r3 - (r3 & 7) + 8, the next 8-byte boundary above r3;
 * every "ldx.q r22, <r6-based index>" therefore reads the source at an
 * 8-byte-aligned address while the stores use sthi.q/stlo.q pairs.
 *   r5  = r2 + r4 - 16 loop limit for Loop_ua and base for the tail stores
 *   r0  = quadword waiting to be stored (software pipelining)
 *
 * The first 8 bytes are copied up front with an unaligned store.  Counts
 * below 72 skip Loop_line and go straight to Loop_ua.
 */
144Large:
145	ld.b r2, 0, r63	/* byte load from the destination; result goes to r63 (presumably discarded -- confirm) */
146	pta/l  Loop_ua, tr1
147	ori r3, -8, r7
148	sub r2, r7, r22
149	sub r3, r2, r6
150	add r2, r4, r5
151	ldlo.q r3, 0, r0	/* first source quadword part */
152	addi r5, -16, r5	/* r5 = destination end - 16 */
153	movi 64+8, r27 // could subtract r7 from that.
154	stlo.q r2, 0, r0	/* store the head bytes at the destination start */
155	sthi.q r2, 7, r0
156	ldx.q r22, r6, r0	/* preload the first aligned source quadword */
157	bgtu/l r27, r4, tr1	/* 72 > count (unsigned): skip the 32-byte loop */
158
159	addi r5, -48, r27	/* r27 = destination end - 64: Loop_line limit */
160	pta/l Loop_line, tr0
161	addi r6, 64, r36	/* index 64 bytes ahead, used by the discarded load in Loop_line */
162	addi r6, -24, r19	/* indexes of the 2nd..4th quadword after r22 += 32 */
163	addi r6, -16, r20
164	addi r6, -8, r21
165
/*
 * Loop_line: copies 32 bytes per iteration.  r0 carries the first
 * quadword of the current chunk in; the load at index r6 fetches the
 * first quadword of the next chunk before the stores complete.
 * Repeats while r27 >= r22 (unsigned).
 */
166Loop_line:
167	ldx.q r22, r36, r63	/* source load 64 bytes ahead into r63 (presumably a prefetch -- confirm) */
168	alloco r22, 32	/* NOTE(review): presumably allocates the destination cache line without fetching it -- confirm */
169	addi r22, 32, r22
170	ldx.q r22, r19, r23
171	sthi.q r22, -25, r0
172	ldx.q r22, r20, r24
173	ldx.q r22, r21, r25
174	stlo.q r22, -32, r0
175	ldx.q r22, r6,  r0	/* next chunk's first quadword */
176	sthi.q r22, -17, r23
177	sthi.q r22,  -9, r24
178	sthi.q r22,  -1, r25
179	stlo.q r22, -24, r23
180	stlo.q r22, -16, r24
181	stlo.q r22,  -8, r25
182	bgeu r27, r22, tr0
183
/*
 * Loop_ua: copies 8 bytes per iteration -- stores the pending quadword
 * r0 at r22 - 8 and loads the next one.  tr1 still points here (set at
 * Large), so the branch below loops while r5 > r22 (unsigned).
 */
184Loop_ua:
185	addi r22, 8, r22
186	sthi.q r22, -1, r0
187	stlo.q r22, -8, r0
188	ldx.q r22, r6, r0
189	bgtu/l r5, r22, tr1
190
/*
 * Tail: store the last pending quadword at r22, then copy the final
 * 8 source bytes (ending at r3 + r4 - 1) to the final 8 destination
 * bytes (r5 + 8 .. r5 + 15, with r5 = destination end - 16).  This
 * store may overlap bytes already written.
 */
191	add r3, r4, r7	/* r7 = source end */
192	ldlo.q r7, -8, r1
193	sthi.q r22, 7, r0
194	ldhi.q r7, -1, r7
195	ptabs r18,tr1	/* tr1 = return target */
196	stlo.q r22, 0, r0
197	or r1, r7, r1	/* merge the two parts of the last source quadword */
198	sthi.q r5, 15, r1
199	stlo.q r5, 8, r1
200	blink tr1, r63	/* return */
201
202	.size memcpy,.-memcpy
203