/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * and re-licensed under GPLv2 for the Linux kernel. The original code can
 * be found @
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

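/* asm/cache.h provides L1_CACHE_SHIFT, used below to cache-line align the main copy loop. */
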
/*
 * Move a buffer from src to dest (alignment handled by the hardware).
 * If dest <= src, call memcpy, otherwise copy in reverse order.
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *	x2 - n
 * Returns:
 *	x0 - dest
 */
dstin	.req	x0
src	.req	x1
count	.req	x2
tmp1	.req	x3
tmp1w	.req	w3
tmp2	.req	x4
tmp2w	.req	w4
tmp3	.req	x5
tmp3w	.req	w5
dst	.req	x6

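/*
 * A..D are four pairs of data registers; each pair holds 16 bytes for
 * the ldp/stp copy sequences below.
 */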
A_l	.req	x7
A_h	.req	x8
B_l	.req	x9
B_h	.req	x10
C_l	.req	x11
C_h	.req	x12
D_l	.req	x13
D_h	.req	x14

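/*
 * memmove is weak, most likely so that an instrumented implementation
 * (e.g. KASAN's) can override it while __memmove remains callable.
 */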
	.weak memmove
ENTRY(__memmove)
ENTRY(memmove)
	cmp	dstin, src
	b.lo	__memcpy
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	__memcpy		/* No overlap.  */

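	/*
	* dst lies within [src, src + count): copy backwards from the ends
	* of the buffers so that no source byte is overwritten before it
	* has been read.
	*/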
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #16
	b.lo	.Ltail15  /* fewer than 16 bytes: probably unaligned accesses. */

	ands	tmp2, src, #15     /* Bytes to reach alignment.  */
	b.eq	.LSrcAligned
	sub	count, count, tmp2
	/*
	* Copy the unaligned head first so that src becomes aligned; the
	* cost of these extra instructions is acceptable, and the accesses
	* that follow are then based on an aligned address.
	*/
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:
	tbz	tmp2, #1, 2f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
2:
	tbz	tmp2, #2, 3f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
3:
	tbz	tmp2, #3, .LSrcAligned
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!

.LSrcAligned:
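	/* src is now 16-byte aligned; any remaining misalignment of dst is handled by the hardware. */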
	cmp	count, #64
	b.ge	.Lcpy_over64

	/*
	* Deal with small copies quickly by dropping straight into the
	* exit block.
	*/
.Ltail63:
	/*
	* Copy up to 48 bytes of data. At this point we only need the
	* bottom 6 bits of count to be accurate.
	*/
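	/*
	* count & 0x30 selects the entry point: fall through to copy
	* 48 bytes, enter at 1 for 32 bytes, at 2 for 16 bytes, or skip
	* to .Ltail15 when no whole 16-byte block remains.
	*/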
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
1:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!
2:
	ldp	A_l, A_h, [src, #-16]!
	stp	A_l, A_h, [dst, #-16]!

.Ltail15:
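	/* Copy the remaining 0-15 bytes, driven by the low four bits of count. */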
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 2f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
2:
	tbz	count, #1, 3f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
3:
	tbz	count, #0, .Lexitfunc
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]

.Lexitfunc:
	ret

.Lcpy_over64:
	subs	count, count, #128
	b.ge	.Lcpy_body_large
	/*
	* Less than 128 bytes to copy, so handle 64 bytes here and then jump
	* to the tail.
	*/
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	D_l, D_h, [src, #-64]!
	stp	D_l, D_h, [dst, #-64]!

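	/* Any remainder below 64 bytes is finished by the shared tail code. */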
	tst	count, #0x3f
	b.ne	.Ltail63
	ret

	/*
	* Critical loop. Start at a new cache line boundary. Assuming
	* 64 bytes per line, this ensures the entire loop is in one line.
	*/
	.p2align	L1_CACHE_SHIFT
.Lcpy_body_large:
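	/*
	* count has been biased down by 128: 64 bytes for the pre-load
	* below and 64 for the block stored after the loop exits.
	*/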
	/* pre-load 64 bytes of data. */
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	/*
	* Interleave the load of the next 64-byte block with the store of
	* the previously loaded 64 bytes.
	*/
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
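	/* Store the final pre-loaded 64-byte block. */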
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!

	tst	count, #0x3f
	b.ne	.Ltail63
	ret
ENDPIPROC(memmove)
ENDPROC(__memmove)