/*
 * strnlen - calculate the length of a string with limit.
 *
 * Copyright (c) 2013, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#include "../asmdefs.h"

/* Arguments and results (AAPCS64: x0 = string, x1 = limit; result in x0).
   Note len aliases srcin: the result is built in the argument register.  */
#define srcin		x0
#define len		x0
#define limit		x1

/* Locals and temporaries.  All scratch registers are caller-saved under
   AAPCS64, so nothing needs to be preserved across the routine.  */
#define src		x2
#define data1		x3
#define data2		x4
#define data2a		x5
#define has_nul1	x6
#define has_nul2	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define tmp4		x11
#define zeroones	x12
#define pos		x13
#define limit_wd	x14

/* A byte value replicated into each of the 8 byte lanes of an Xreg.  */
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

.text
	.p2align	6
L(start):
	/* Pre-pad to ensure critical loop begins an icache line.  */
	.rep 7
	nop
	.endr
	/* Put this code here to avoid wasting more space with pre-padding.  */
L(hit_limit):
	/* No NUL found within limit bytes: strnlen returns limit.  */
	mov	len, limit
	ret

/*-----------------------------------------------------------------------
 * size_t __strnlen_aarch64 (const char *s, size_t limit)
 * ABI:   AAPCS64
 * In:    x0 = s (srcin), x1 = limit
 * Out:   x0 = MIN (strlen (s), limit)
 * Uses:  x2-x14 as scratch (all caller-saved); no stack, leaf function.
 * ENTRY_ALIGN alignment is 0 because the .p2align/nop pre-padding above
 * already positions the entry so the hot loop starts a cache line.
 *---------------------------------------------------------------------*/
ENTRY_ALIGN (__strnlen_aarch64, 0)
	cbz	limit, L(hit_limit)	/* limit == 0 => result is 0.  */
	mov	zeroones, #REP8_01
	bic	src, srcin, #15		/* Align src down to 16 bytes.  */
	ands	tmp1, srcin, #15	/* tmp1 = misalignment offset.  */
	b.ne	L(misaligned)
	/* Calculate the number of full and partial words -1.  */
	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow.  */
	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	/* The inner loop deals with two Dwords at a time.  This has a
	   slightly higher start-up cost, but we should win quite quickly,
	   especially on cores with a high number of issue slots per
	   cycle, as we get much better parallelism out of the operations.  */

	/* Start of critical section -- keep to one 64Byte cache line.  */
L(loop):
	ldp	data1, data2, [src], #16
L(realigned):
	sub	tmp1, data1, zeroones	/* data1 - 0x01..01  */
	orr	tmp2, data1, #REP8_7f	/* data1 | 0x7f..7f  */
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2	/* Non-zero iff NUL in data1.  */
	bic	has_nul2, tmp3, tmp4	/* Non-zero iff NUL in data2.  */
	subs	limit_wd, limit_wd, #1	/* Qwords remaining; N set on wrap.  */
	orr	tmp1, has_nul1, has_nul2
	/* If limit not yet exhausted (pl), test tmp1 == 0; otherwise force
	   flags to NZCV = 0000 so the b.eq below falls through.  */
	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
	b.eq	L(loop)
	/* End of critical section -- keep to one 64Byte cache line.  */

	orr	tmp1, has_nul1, has_nul2
	cbz	tmp1, L(hit_limit)	/* No null in final Qword.  */

	/* We know there's a null in the final Qword.  The easiest thing
	   to do now is work out the length of the string and return
	   MIN (len, limit).  */

	sub	len, src, srcin		/* Bytes consumed incl. this Qword.  */
	cbz	has_nul1, L(nul_in_data2)
#ifdef __AARCH64EB__
	mov	data2, data1		/* EB re-syndromes data2 below.  */
#endif
	sub	len, len, #8		/* NUL was in the first Dword.  */
	mov	has_nul2, has_nul1
L(nul_in_data2):
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	rev	data2, data2
	sub	tmp1, data2, zeroones
	orr	tmp2, data2, #REP8_7f
	bic	has_nul2, tmp1, tmp2
#endif
	sub	len, len, #8		/* Back up to start of this Dword.  */
	rev	has_nul2, has_nul2	/* Put first NUL's bits at the top.  */
	clz	pos, has_nul2		/* pos = 8 * index of NUL byte.  */
	add	len, len, pos, lsr #3		/* Bits to bytes.  */
	cmp	len, limit
	csel	len, len, limit, ls		/* Return the lower value.  */
	ret

L(misaligned):
	/* Deal with a partial first word.
	   We're doing two things in parallel here;
	   1) Calculate the number of words (but avoiding overflow if
	      limit is near ULONG_MAX) - to do this we need to work out
	      limit + tmp1 - 1 as a 65-bit value before shifting it;
	   2) Load and mask the initial data words - we force the bytes
	      before the ones we are interested in to 0xff - this ensures
	      early bytes will not hit any zero detection.  */
	sub	limit_wd, limit, #1
	neg	tmp4, tmp1		/* tmp4 = 16 - offset (mod 64 later).  */
	cmp	tmp1, #8		/* Does the offset reach into data2?  */

	and	tmp3, limit_wd, #15
	lsr	limit_wd, limit_wd, #4
	mov	tmp2, #~0		/* All-ones mask to be shifted.  */

	ldp	data1, data2, [src], #16
	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
	add	tmp3, tmp3, tmp1	/* 65-bit (limit + offset - 1) low part.  */

#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp4	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp4	/* Shift (tmp1 & 63).  */
#endif
	add	limit_wd, limit_wd, tmp3, lsr #4	/* Carry into Qword count.  */

	orr	data1, data1, tmp2	/* Mask early bytes of first Dword.  */
	orr	data2a, data2, tmp2	/* Masked copy of second Dword.  */

	/* If offset <= 8 (le) the mask applies to data1 only and data2 is
	   used unmasked; otherwise data1 lies wholly before the string, so
	   force it to all-ones and use the masked data2a instead.  */
	csinv	data1, data1, xzr, le
	csel	data2, data2, data2a, le
	b	L(realigned)

END (__strnlen_aarch64)