/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2012-2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_lw	w10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14
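/* G_l/G_h, H_l/H_h and tmp1 alias registers that already hold count, dst,
   src, srcend and E_l; they are only used once those original values are
   no longer needed.  */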

/* This implementation handles overlaps and supports both memcpy and memmove
   from a single entry point.  It uses unaligned accesses and branchless
   sequences to keep the code small and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check is negligible since it is only required for large copies.

   Large copies use a software pipelined loop processing 64 bytes per iteration.
   The destination pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/

ENTRY_ALIAS (__memmove_aarch64)
ENTRY (__memcpy_aarch64)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	prfm	PLDL1KEEP, [src]
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 128
	b.hi	L(copy_long)
	cmp	count, 32
	b.hi	L(copy32_128)

	/* Small copies: 0..32 bytes.  */
	cmp	count, 16
	b.lo	L(copy16)
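	/* 16..32 bytes: load 16 bytes from each end of the buffer and store
	   both.  The two stores may overlap in the middle when count < 32;
	   since all loads happen before any store, this also works when
	   src and dst overlap.  */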
	ldp	A_l, A_h, [src]
	ldp	D_l, D_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

	/* Copy 8-15 bytes.  */
L(copy16):
	tbz	count, 3, L(copy8)
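	/* count is 0..15 here, so bit 3 being set means 8 <= count <= 15:
	   copy one 8-byte word from each end; the accesses may overlap.  */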
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 3
	/* Copy 4-7 bytes.  */
L(copy8):
	tbz	count, 2, L(copy4)
	ldr	A_lw, [src]
	ldr	B_lw, [srcend, -4]
	str	A_lw, [dstin]
	str	B_lw, [dstend, -4]
	ret

	/* Copy 0..3 bytes using a branchless sequence.  */
L(copy4):
	cbz	count, L(copy0)
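	/* Store the first byte, the middle byte (at count/2) and the last
	   byte: for count == 1 all three are the same byte, for count == 2
	   they cover bytes 0 and 1, and for count == 3 bytes 0, 1 and 2.  */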
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	C_lw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	C_lw, [dstend, -1]
L(copy0):
	ret

	.p2align 4
	/* Medium copies: 33..128 bytes.  */
L(copy32_128):
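	/* Load 32 bytes from the start and 32 from the end up front.  If
	   count <= 64, the four stores below cover the whole buffer, with
	   possible overlap in the middle.  */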
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	ldp	D_l, D_h, [srcend, -16]
	cmp	count, 64
	b.hi	L(copy128)
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
L(copy128):
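	/* A/B and C/D already hold the first 32 and the last 32 bytes.  Load
	   the next 32 bytes from the start, and for count > 96 another 32
	   bytes ending 32 bytes before the end, so the stores below cover
	   65..128 bytes with possible overlaps.  */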
	ldp	E_l, E_h, [src, 32]
	ldp	F_l, F_h, [src, 48]
	cmp	count, 96
	b.ls	L(copy96)
	ldp	G_l, G_h, [srcend, -64]
	ldp	H_l, H_h, [srcend, -48]
	stp	G_l, G_h, [dstend, -64]
	stp	H_l, H_h, [dstend, -48]
L(copy96):
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	E_l, E_h, [dstin, 32]
	stp	F_l, F_h, [dstin, 48]
	stp	C_l, C_h, [dstend, -32]
	stp	D_l, D_h, [dstend, -16]
	ret

	.p2align 4
	/* Copy more than 128 bytes.  */
L(copy_long):
	/* Use backwards copy if there is an overlap.  */
	sub	tmp1, dstin, src
	cbz	tmp1, L(copy0)
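	/* If dstin - src is less than count (as an unsigned value), the
	   destination starts inside the source buffer and a forward copy
	   would overwrite bytes before they are read, so copy backwards.
	   When dst is below src the subtraction wraps to a large value and
	   the forward path is taken, which is safe for that overlap.  */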
	cmp	tmp1, count
	b.lo	L(copy_long_backwards)

	/* Copy 16 bytes and then align dst to 16-byte alignment.  */

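	/* dst is rounded down to a 16-byte boundary and src is moved back by
	   the same amount; the first 16 bytes are stored via dstin directly,
	   so the loop can work in aligned 16-byte blocks from dst + 16.  */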
	ldp	D_l, D_h, [src]
	and	tmp1, dstin, 15
	bic	dst, dstin, 15
	sub	src, src, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end)

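	/* Software pipelined loop: each iteration stores the 64 bytes loaded
	   by the previous iteration while loading the next 64 bytes.  */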
L(loop64):
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!
	ldp	D_l, D_h, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64)

	/* Write the last iteration and copy 64 bytes from the end.  */
L(copy64_from_end):
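	/* The last 64 bytes are always copied from srcend - 64, regardless of
	   how many bytes the loop left over, so no further size checks are
	   needed; these stores may overlap the final loop stores.  */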
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret

	.p2align 4

	/* Large backwards copy for overlapping copies.
	   Copy 16 bytes and then align dstend to 16-byte alignment.  */
L(copy_long_backwards):
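	/* Mirror image of the forward path: copy the last 16 bytes, align
	   dstend down to 16 bytes, then loop towards the start copying
	   64 bytes per iteration.  */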
	ldp	D_l, D_h, [srcend, -16]
	and	tmp1, dstend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldp	A_l, A_h, [srcend, -16]
	stp	D_l, D_h, [dstend, -16]
	ldp	B_l, B_h, [srcend, -32]
	ldp	C_l, C_h, [srcend, -48]
	ldp	D_l, D_h, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start)

L(loop64_backwards):
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [srcend, -16]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [srcend, -48]
	stp	D_l, D_h, [dstend, -64]!
	ldp	D_l, D_h, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards)

	/* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
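	/* Mirrors copy64_from_end: the first 64 bytes are always copied from
	   the start of the buffer, possibly overlapping the last stores of
	   the backwards loop.  */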
	ldp	G_l, G_h, [src, 48]
	stp	A_l, A_h, [dstend, -16]
	ldp	A_l, A_h, [src, 32]
	stp	B_l, B_h, [dstend, -32]
	ldp	B_l, B_h, [src, 16]
	stp	C_l, C_h, [dstend, -48]
	ldp	C_l, C_h, [src]
	stp	D_l, D_h, [dstend, -64]
	stp	G_l, G_h, [dstin, 48]
	stp	A_l, A_h, [dstin, 32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin]
	ret

END (__memcpy_aarch64)