/*
 * memcpy/memmove using SIMD registers
 *
 * Copyright (c) 2019, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include "../asmdefs.h"

#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	x14
#define E_h	x15
#define F_l	x16
#define F_h	x17
#define G_l	count
#define G_h	dst
#define H_l	src
#define H_h	srcend
#define tmp1	x14

#define A_q	q0
#define B_q	q1
#define C_q	q2
#define D_q	q3
#define E_q	q4
#define F_q	q5
#define G_q	q6
#define H_q	q7

/* This implementation of memcpy correctly handles overlaps, therefore
   __memmove_aarch64_simd aliases to __memcpy_aarch64_simd. By moving the
   src and dst buffer overlap check from the start of memmove code to the
   beginning of large copy code, the overhead of combining memcpy
   and memmove implementations is negligible.

   Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..128 bytes which are fully unrolled, and large
   copies (moves).

   Large forward moves align the source and use an unrolled loop
   processing 64 bytes per iteration.

   Large backward moves align srcend and use an unrolled loop processing
   64 bytes per iteration.
*/
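
/* Illustrative only: a rough C sketch of the dispatch logic below.  It is
   not part of the build, and the helper names are hypothetical stand-ins
   for the labelled blocks that follow.

     #include <stddef.h>
     #include <stdint.h>

     // Hypothetical helpers standing in for the labelled blocks below.
     void copy_0_16 (void *, const void *, size_t);
     void copy_17_128 (void *, const void *, size_t);
     void move_forwards (void *, const void *, size_t);
     void move_backwards (void *, const void *, size_t);

     void *sketch_memmove (void *dstin, const void *src, size_t count)
     {
       uintptr_t diff = (uintptr_t) dstin - (uintptr_t) src;
       if (count <= 16)
         copy_0_16 (dstin, src, count);
       else if (count <= 128)
         copy_17_128 (dstin, src, count);
       else if (diff == 0)
         ;                                   // dst == src: nothing to do
       else if (diff < count)
         move_backwards (dstin, src, count); // dst > src, buffers overlap
       else
         move_forwards (dstin, src, count);
       return dstin;
     }
*/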

ENTRY (__memcpy_aarch64_simd)
ENTRY_ALIAS (__memmove_aarch64_simd)
	add	srcend, src, count
	add	dstend, dstin, count
	cmp	count, 16
	b.ls	L(copy16_simd)
	cmp	count, 128
	b.hi	L(move_long_simd)

	/* Medium copies: 17..128 bytes.  */
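	/* For 17..32 bytes only the first and last 16 bytes are copied; when
	   count is below 32 the two stores overlap in the middle, which is
	   harmless and avoids a separate path for those sizes.  */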
	ldr	A_q, [src]
	ldr	D_q, [srcend, -16]
	cmp	count, 32
	b.hi	L(copy33_128_simd)
	str	A_q, [dstin]
	str	D_q, [dstend, -16]
	ret

	.p2align 4
	/* Small copies: 0..16 bytes.  */
L(copy16_simd):
	/* 8-16 bytes.  */
	cmp	count, 8
	b.lo	1f
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret

	.p2align 4
1:
	/* 4-7 bytes.  */
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	.p2align 4
	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
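	/* Worked example of the sequence below, with tmp1 = count >> 1:
	     count == 1: tmp1 = 0, byte offsets 0, 0 and 0 are copied.
	     count == 2: tmp1 = 1, byte offsets 0, 1 and 1 are copied.
	     count == 3: tmp1 = 1, byte offsets 0, 1 and 2 are copied.
	   Every byte in [0, count) is written at least once, with no branch
	   other than the initial count == 0 check.  */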
1:
	cbz	count, 2f
	lsr	tmp1, count, 1
	ldrb	A_lw, [src]
	ldrb	A_hw, [srcend, -1]
	ldrb	B_lw, [src, tmp1]
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

	.p2align 4
	/* Copy 33..128 bytes.  */
L(copy33_128_simd):
	ldr	B_q, [src, 16]
	ldr	C_q, [srcend, -32]
	cmp	count, 64
	b.hi	L(copy65_128_simd)
	str	A_q, [dstin]
	str	D_q, [dstend, -16]
	str	B_q, [dstin, 16]
	str	C_q, [dstend, -32]
	ret

	.p2align 4
	/* Copy 65..128 bytes.  */
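	/* The first 64 and the last 64 bytes are each copied as four 16-byte
	   chunks; for counts below 128 the two halves overlap in the middle.
	   All loads are issued before any store, so overlapping src and dst
	   buffers are also handled correctly here.  */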
L(copy65_128_simd):
	ldr	E_q, [src, 32]
	ldr	F_q, [src, 48]
	ldr	G_q, [srcend, -64]
	ldr	H_q, [srcend, -48]
	str	A_q, [dstin]
	str	D_q, [dstend, -16]
	str	B_q, [dstin, 16]
	str	C_q, [dstend, -32]
	str	E_q, [dstin, 32]
	str	F_q, [dstin, 48]
	str	G_q, [dstend, -64]
	str	H_q, [dstend, -48]
	ret

	.p2align 4
	/* Move more than 128 bytes.  */
L(move_long_simd):
	sub	tmp1, dstin, src	/* Overlap check.  */
	cbz	tmp1, L(copy0_simd)
	cmp	tmp1, count
	b.lo	L(move_long_backwards_simd)

	/* Align src to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 128 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
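	/* The 128 + 16 adjustment below accounts for the 16 bytes already
	   stored at dstin, the 64 bytes preloaded into A_q..D_q, and the
	   final 64 bytes that copy64_from_end_simd copies from srcend.  */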

	ldr	D_q, [src]
	and	tmp1, src, 15
	bic	src, src, 15
	sub	dst, dstin, tmp1
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldr	A_q, [src, 16]
	str	D_q, [dstin]
	ldr	B_q, [src, 32]
	ldr	C_q, [src, 48]
	ldr	D_q, [src, 64]!
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	L(copy64_from_end_simd)

L(loop64_simd):
	str	A_q, [dst, 16]
	ldr	A_q, [src, 16]
	str	B_q, [dst, 32]
	ldr	B_q, [src, 32]
	str	C_q, [dst, 48]
	ldr	C_q, [src, 48]
	str	D_q, [dst, 64]!
	ldr	D_q, [src, 64]!
	subs	count, count, 64
	b.hi	L(loop64_simd)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
L(copy64_from_end_simd):
	ldr	E_q, [srcend, -64]
	str	A_q, [dst, 16]
	ldr	A_q, [srcend, -48]
	str	B_q, [dst, 32]
	ldr	B_q, [srcend, -32]
	str	C_q, [dst, 48]
	ldr	C_q, [srcend, -16]
	str	D_q, [dst, 64]
	str	E_q, [dstend, -64]
	str	A_q, [dstend, -48]
	str	B_q, [dstend, -32]
	str	C_q, [dstend, -16]

L(copy0_simd):
	ret

	.p2align 4

	/* Move more than 128 bytes where src and dst buffers overlap
	   and dst > src.

	   Align srcend to 16 byte alignment so that we don't cross cache line
	   boundaries on both loads and stores.  There are at least 128 bytes
	   to copy, so copy 16 bytes unaligned and then align.  The loop
	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
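	/* Subtracting tmp1 and then 128 from count accounts for the unaligned
	   tail already covered by the first store of D_q, the 64 bytes
	   preloaded into A_q..D_q, and the final 64 bytes that
	   copy64_from_start_simd copies from src.  */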

L(move_long_backwards_simd):
	ldr	D_q, [srcend, -16]
	and	tmp1, srcend, 15
	sub	srcend, srcend, tmp1
	sub	count, count, tmp1
	ldr	A_q, [srcend, -16]
	str	D_q, [dstend, -16]
	ldr	B_q, [srcend, -32]
	ldr	C_q, [srcend, -48]
	ldr	D_q, [srcend, -64]!
	sub	dstend, dstend, tmp1
	subs	count, count, 128
	b.ls	L(copy64_from_start_simd)

L(loop64_backwards_simd):
	str	A_q, [dstend, -16]
	ldr	A_q, [srcend, -16]
	str	B_q, [dstend, -32]
	ldr	B_q, [srcend, -32]
	str	C_q, [dstend, -48]
	ldr	C_q, [srcend, -48]
	str	D_q, [dstend, -64]!
	ldr	D_q, [srcend, -64]!
	subs	count, count, 64
	b.hi	L(loop64_backwards_simd)

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the start even if
	   there is just 1 byte left.  */
L(copy64_from_start_simd):
	ldr	G_q, [src, 48]
	str	A_q, [dstend, -16]
	ldr	A_q, [src, 32]
	str	B_q, [dstend, -32]
	ldr	B_q, [src, 16]
	str	C_q, [dstend, -48]
	ldr	C_q, [src]
	str	D_q, [dstend, -64]
	str	G_q, [dstin, 48]
	str	A_q, [dstin, 32]
	str	B_q, [dstin, 16]
	str	C_q, [dstin]
	ret

END (__memcpy_aarch64_simd)