/*
 * strcpy/stpcpy - copy a string returning pointer to start/end.
 *
 * Copyright (c) 2013-2019, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

#include "../asmdefs.h"

/* To build as stpcpy, define BUILD_STPCPY before compiling this file.

   To test the page crossing code path more thoroughly, compile with
   -DSTRCPY_TEST_PAGE_CROSS - this will force all copies through the slower
   entry path.  This option is not intended for production use.  */
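
/* For example, with a GCC- or Clang-style driver (the file name and exact
   invocation below are only illustrative):

     cc -c strcpy.S -o strcpy.o                   builds __strcpy_aarch64
     cc -c -DBUILD_STPCPY strcpy.S -o stpcpy.o    builds __stpcpy_aarch64

   Adding -DSTRCPY_TEST_PAGE_CROSS to either command exercises the slow
   entry path on every copy.  */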

/* Arguments and results.  */
#define dstin		x0
#define srcin		x1

/* Locals and temporaries.  */
#define src		x2
#define dst		x3
#define data1		x4
#define data1w		w4
#define data2		x5
#define data2w		w5
#define has_nul1	x6
#define has_nul2	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define tmp4		x11
#define zeroones	x12
#define data1a		x13
#define data2a		x14
#define pos		x15
#define len		x16
#define to_align	x17

#ifdef BUILD_STPCPY
#define STRCPY __stpcpy_aarch64
#else
#define STRCPY __strcpy_aarch64
#endif

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
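
	/* Illustrative example of the syndrome above (not part of the build;
	   a C sketch, assuming <stdint.h>):

	     static inline uint64_t
	     has_nul_syndrome (uint64_t x)
	     {
	       return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
	     }

	   For little-endian data the lowest set syndrome bit marks the first
	   NUL byte, so __builtin_ctzll (has_nul_syndrome (x)) >> 3 is its
	   byte index; e.g. x = 0x4141410041414141 gives syndrome
	   0x0000008000000000 and index 4.  Bytes above the first NUL may set
	   spurious syndrome bits because of borrow propagation, which is why
	   only the first match is ever used.  */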

	/* AArch64 systems have a minimum page size of 4k.  We can do a quick
	   page size check for crossing this boundary on entry and if we
	   do not, then we can short-circuit much of the entry code.  We
	   expect early page-crossing strings to be rare (probability of
	   16/MIN_PAGE_SIZE ~= 0.4%), so the branch should be quite
	   predictable, even with random strings.

	   We don't bother checking for larger page sizes; the cost of setting
	   up the correct page size is just not worth the extra gain from
	   a small reduction in the cases taking the slow path.  Note that
	   we only care about whether the first fetch, which may be
	   misaligned, crosses a page boundary - after that we move to aligned
	   fetches for the remainder of the string.  */

#ifdef STRCPY_TEST_PAGE_CROSS
	/* Make everything that isn't Qword aligned look like a page cross.  */
#define MIN_PAGE_P2 4
#else
#define MIN_PAGE_P2 12
#endif

#define MIN_PAGE_SIZE (1 << MIN_PAGE_P2)
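
	/* Illustrative C sketch of the entry check below (not part of the
	   build; assuming <stdint.h>).  The first, possibly misaligned,
	   16-byte fetch can only straddle a page boundary when srcin lies
	   in the last 15 bytes of a page:

	     static inline int
	     first_fetch_may_cross_page (uintptr_t srcin)
	     {
	       return (srcin & (MIN_PAGE_SIZE - 1)) > (MIN_PAGE_SIZE - 16);
	     }

	   With MIN_PAGE_SIZE == 4096 this triggers for 15 of every 4096
	   start addresses, in line with the rough 16/MIN_PAGE_SIZE estimate
	   above.  */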

ENTRY (STRCPY)
	/* For moderately short strings, the fastest way to do the copy is to
	   calculate the length of the string in the same way as strlen, then
	   essentially do a memcpy of the result.  This avoids the need for
	   multiple byte copies and further means that by the time we
	   reach the bulk copy loop we know we can always use DWord
	   accesses.  We expect __strcpy_aarch64 to rarely be called repeatedly
	   with the same source string, so branch prediction is likely to
	   always be difficult - we mitigate this by preferring
	   conditional select operations over branches whenever this is
	   feasible.  */
	and	tmp2, srcin, #(MIN_PAGE_SIZE - 1)
	mov	zeroones, #REP8_01
	and	to_align, srcin, #15
	cmp	tmp2, #(MIN_PAGE_SIZE - 16)
	neg	tmp1, to_align
	/* The first fetch will straddle a (possible) page boundary iff
	   srcin + 15 causes bit[MIN_PAGE_P2] to change value.  A 16-byte
	   aligned string will never fail the page align check, so will
	   always take the fast path.  */
	b.gt	L(page_cross)
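
	/* Illustrative C sketch of the short-string strategy described above
	   (not part of the build; strcpy_ref is a hypothetical reference
	   routine, assuming <string.h>):

	     char *
	     strcpy_ref (char *dst, const char *src)
	     {
	       size_t n = strlen (src);
	       memcpy (dst, src, n + 1);	// copy the terminating NUL too
	       return dst;			// stpcpy would return dst + n
	     }

	   The code below gets the same effect for strings of 16 bytes or
	   fewer without a separate strlen pass, by deriving the length from
	   the NUL syndrome of the first 16 bytes.  */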

L(page_cross_ok):
	ldp	data1, data2, [srcin]
#ifdef __AARCH64EB__
	/* Because we expect the end to be found within 16 characters
	   (profiling shows this is the most common case), it's worth
	   swapping the bytes now to save having to recalculate the
	   termination syndrome later.  We preserve data1 and data2
	   so that we can re-use the values later on.  */
	rev	tmp2, data1
	sub	tmp1, tmp2, zeroones
	orr	tmp2, tmp2, #REP8_7f
	bics	has_nul1, tmp1, tmp2
	b.ne	L(fp_le8)
	rev	tmp4, data2
	sub	tmp3, tmp4, zeroones
	orr	tmp4, tmp4, #REP8_7f
#else
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	bics	has_nul1, tmp1, tmp2
	b.ne	L(fp_le8)
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
#endif
	bics	has_nul2, tmp3, tmp4
	b.eq	L(bulk_entry)

	/* The string is short (<=16 bytes), but we don't yet know exactly
	   how short.  Work out the exact length so that we can quickly
	   select the optimal copy strategy.  */
L(fp_gt8):
	rev	has_nul2, has_nul2
	clz	pos, has_nul2
	mov	tmp2, #56
	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
	sub	pos, tmp2, pos
#ifdef __AARCH64EB__
	lsr	data2, data2, pos
#else
	lsl	data2, data2, pos
#endif
	str	data2, [dst, #1]
	str	data1, [dstin]
#ifdef BUILD_STPCPY
	add	dstin, dst, #8
#endif
	ret
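
	/* Illustrative C sketch of the 9..16 byte case above (not part of
	   the build; assuming <stdint.h>/<string.h>).  With the terminating
	   NUL at byte index i (8 <= i <= 15), the copy is done as two 8-byte
	   stores that may overlap:

	     uint64_t lo, hi;
	     memcpy (&lo, src, 8);		// bytes 0..7
	     memcpy (&hi, src + i - 7, 8);	// the 8 bytes ending at the NUL
	     memcpy (dst + i - 7, &hi, 8);
	     memcpy (dst, &lo, 8);

	   In the assembly the second value is built by shifting data2 rather
	   than reloading memory, which leaves its low bytes zero, so the
	   data1 store must come last to overwrite them.  */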

L(fp_le8):
	rev	has_nul1, has_nul1
	clz	pos, has_nul1
	add	dst, dstin, pos, lsr #3		/* Bits to bytes.  */
	subs	tmp2, pos, #24			/* Pos in bits. */
	b.lt	L(fp_lt4)
#ifdef __AARCH64EB__
	mov	tmp2, #56
	sub	pos, tmp2, pos
	lsr	data2, data1, pos
	lsr	data1, data1, #32
#else
	lsr	data2, data1, tmp2
#endif
	/* 4->7 bytes to copy.  */
	str	data2w, [dst, #-3]
	str	data1w, [dstin]
#ifdef BUILD_STPCPY
	mov	dstin, dst
#endif
	ret
L(fp_lt4):
	cbz	pos, L(fp_lt2)
	/* 2->3 bytes to copy.  */
#ifdef __AARCH64EB__
	lsr	data1, data1, #48
#endif
	strh	data1w, [dstin]
	/* Fall-through, one byte (max) to go.  */
L(fp_lt2):
	/* Null-terminated string.  Last character must be zero!  */
	strb	wzr, [dst]
#ifdef BUILD_STPCPY
	mov	dstin, dst
#endif
	ret
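
	/* Illustrative C sketch of the <=8 byte cases above (not part of the
	   build; assuming <stdint.h>/<string.h>).  With the terminating NUL
	   at byte index i (0 <= i <= 7), at most two overlapping narrow
	   stores are needed:

	     if (i >= 3)			// 4..8 bytes in total
	       {
	         uint32_t lo, hi;
	         memcpy (&lo, src, 4);
	         memcpy (&hi, src + i - 3, 4);	// the 4 bytes ending at the NUL
	         memcpy (dst + i - 3, &hi, 4);
	         memcpy (dst, &lo, 4);
	       }
	     else
	       {
	         if (i >= 1)			// 2..3 bytes: leading halfword
	           memcpy (dst, src, 2);
	         dst[i] = 0;			// the terminator itself
	       }

	   which mirrors the word/halfword/byte stores above.  */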

	.p2align 6
	/* Aligning here ensures that the entry code and main loop all lie
	   within one 64-byte cache line.  */
L(bulk_entry):
	sub	to_align, to_align, #16
	stp	data1, data2, [dstin]
	sub	src, srcin, to_align
	sub	dst, dstin, to_align
	b	L(entry_no_page_cross)
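
	/* Illustrative note (not part of the build): the first 16 bytes have
	   already been stored above, so src is rounded up to the next
	   16-byte boundary and dst is moved by the same amount, in
	   pseudocode:

	     src = (srcin & ~15) + 16;		// first aligned fetch address
	     dst = dstin + (src - srcin);	// keep dst - dstin == src - srcin

	   When srcin is not 16-byte aligned the first loop iteration then
	   re-copies up to 15 bytes that were already written, which is
	   harmless and keeps every later load 16-byte aligned.  */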

	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */
L(main_loop):
	stp	data1, data2, [dst], #16
L(entry_no_page_cross):
	ldp	data1, data2, [src], #16
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
	b.eq	L(main_loop)
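
	/* Illustrative C sketch of the loop above (not part of the build;
	   has_nul_syndrome is the helper sketched earlier):

	     uint64_t d1, d2;
	     for (;;)
	       {
	         memcpy (&d1, src, 8);
	         memcpy (&d2, src + 8, 8);
	         src += 16;
	         if (has_nul_syndrome (d1) | has_nul_syndrome (d2))
	           break;			// a NUL lies somewhere in d1/d2
	         memcpy (dst, &d1, 8);
	         memcpy (dst + 8, &d2, 8);
	         dst += 16;
	       }

	   In the assembly the store is issued at the top of the next
	   iteration, so the pair containing the NUL is never stored by the
	   loop; the tail code below recopies the final 16 bytes instead.  */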

	/* Since we know we are copying at least 16 bytes, the fastest way
	   to deal with the tail is to determine the location of the
	   trailing NUL, then (re)copy the 16 bytes leading up to that.  */
	cmp	has_nul1, #0
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	csel	data1, data1, data2, ne
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, ne
#endif
	rev	has_nul1, has_nul1
	clz	pos, has_nul1
	add	tmp1, pos, #72
	add	pos, pos, #8
	csel	pos, pos, tmp1, ne
	add	src, src, pos, lsr #3
	add	dst, dst, pos, lsr #3
	ldp	data1, data2, [src, #-32]
	stp	data1, data2, [dst, #-16]
#ifdef BUILD_STPCPY
	sub	dstin, dst, #1
#endif
	ret
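
	/* Illustrative C sketch of the tail handling above (not part of the
	   build).  On exit from the loop src points 16 bytes past the pair
	   holding the NUL and dst points at the slot that pair would have
	   been stored to.  With the NUL at index i (0..15) within that pair,
	   let t = i + 1 be the number of bytes still to copy:

	     memcpy (dst + t - 16, src + t - 32, 16);
	     // stpcpy's result is dst + t - 1, the address of the written NUL.

	   The rev/clz/csel sequence above computes t * 8 in pos, and the
	   final 16-byte copy may overlap bytes already stored by the loop,
	   which is fine since the data is identical.  */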

L(page_cross):
	bic	src, srcin, #15
	/* Start by loading two words at [srcin & ~15], then forcing the
	   bytes that precede srcin to 0xff.  This means they never look
	   like termination bytes.  */
	ldp	data1, data2, [src]
	lsl	tmp1, tmp1, #3	/* Bytes beyond alignment -> bits.  */
	tst	to_align, #7
	csetm	tmp2, ne
#ifdef __AARCH64EB__
	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#else
	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
#endif
	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2
	cmp	to_align, #8
	csinv	data1, data1, xzr, lt
	csel	data2, data2, data2a, lt
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bics	has_nul2, tmp3, tmp4
	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
	b.eq	L(page_cross_ok)
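
	/* Illustrative C sketch of the masking above (not part of the
	   build).  The aligned 16-byte block at srcin & ~15 never leaves the
	   current page, so it is always safe to load; the bytes that precede
	   srcin are then forced to 0xff so they can never match the NUL
	   test:

	     uint8_t block[16];
	     memcpy (block, (const char *) (srcin & ~(uintptr_t) 15), 16);
	     for (size_t j = 0; j < to_align; j++)
	       block[j] = 0xff;		// bytes before srcin, never a terminator

	   If no NUL is found in the masked block, the string provably
	   continues into the next page, so the branch above back to
	   L(page_cross_ok), which reloads from srcin and may read into that
	   page, is safe.  */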
	/* We now need to make data1 and data2 look like they've been
	   loaded directly from srcin.  Do a rotate on the 128-bit value.  */
	lsl	tmp1, to_align, #3	/* Bytes->bits.  */
	neg	tmp2, to_align, lsl #3
#ifdef __AARCH64EB__
	lsl	data1a, data1, tmp1
	lsr	tmp4, data2, tmp2
	lsl	data2, data2, tmp1
	orr	tmp4, tmp4, data1a
	cmp	to_align, #8
	csel	data1, tmp4, data2, lt
	rev	tmp2, data1
	rev	tmp4, data2
	sub	tmp1, tmp2, zeroones
	orr	tmp2, tmp2, #REP8_7f
	sub	tmp3, tmp4, zeroones
	orr	tmp4, tmp4, #REP8_7f
#else
	lsr	data1a, data1, tmp1
	lsl	tmp4, data2, tmp2
	lsr	data2, data2, tmp1
	orr	tmp4, tmp4, data1a
	cmp	to_align, #8
	csel	data1, tmp4, data2, lt
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
#endif
	bic	has_nul1, tmp1, tmp2
	cbnz	has_nul1, L(fp_le8)
	bic	has_nul2, tmp3, tmp4
	b	L(fp_gt8)
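
	/* Illustrative little-endian C sketch of the realignment above (not
	   part of the build; uses the GCC/Clang unsigned __int128 extension
	   for brevity).  The 128-bit pair {data2:data1} is shifted right by
	   8 * to_align bits so it looks as if it had been loaded from srcin;
	   the bytes shifted in at the top are never examined because the NUL
	   is known to be inside the block:

	     unsigned __int128 v = ((unsigned __int128) data2 << 64) | data1;
	     v >>= 8 * to_align;
	     data1 = (uint64_t) v;
	     data2 = (uint64_t) (v >> 64);

	   The string is then finished by the same <=16 byte code as the fast
	   path (L(fp_le8) / L(fp_gt8)).  */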

END (STRCPY)