/*
 * strcmp for ARMv7
 *
 * Copyright (c) 2012-2019, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */
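
/* A note on the syndrome idiom used throughout this file: UADD8 adds
   0xff to each byte of a word in parallel and sets the four GE flags
   from the per-byte carry-outs, so GE[i] is set exactly when byte i is
   non-zero.  SEL then picks each result byte from its first operand
   when GE[i] is set and from its second otherwise.  Pairing UADD8 on
   data1 with an EOR of data1 and data2, then SEL against const_m1,
   yields a "syndrome" whose bytes are 0xff where data1 has a null and
   the byte difference elsewhere; it is non-zero iff the words differ
   or the string ends.  */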

#include "../asmdefs.h"

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (e.g. because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK	0
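/* Note: the macro is defined unconditionally above, so the option is
   toggled by editing the value here; overriding it with a -D flag
   would provoke a redefinition warning.  */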

	/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO lsl
#define S2LOEQ lsleq
#define S2HI lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not  __ARM_BIG_ENDIAN */
#define S2LO lsr
#define S2LOEQ lsreq
#define S2HI lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not  __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
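	/* For example, if the syndrome's highest set bit is bit 23 (the
	   top bit of the second byte), CLZ returns 8, the LSLs align that
	   bit with bit 31 of both data words, and the two LSR #24
	   extractions compare the eight bits headed by the first failing
	   bit; their unsigned difference has the sign strcmp must
	   return.  */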
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words,
	   but that's slower than this approach.  */
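	/* Instead, REV byte-reverses the syndrome so that CLZ counts the
	   bits preceding the first failing byte, BIC rounds that bit index
	   down to a byte boundary, and shifting both data words right by
	   the result moves the decisive bytes into bits 7:0.  */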
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1

	bx	lr
#endif
	.endm

	.text
	.p2align	5
L(strcmp_start_addr):
#if STRCMP_NO_PRECHECK == 0
L(fastpath_exit):
	sub	r0, r2, r3
	bx	lr
	nop
#endif
ENTRY_ALIGN (__strcmp_arm, 0)
#if STRCMP_NO_PRECHECK == 0
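	/* Pre-check: compare the first byte of each string.  CMP r2, #1
	   sets the carry flag iff src1's first byte is non-zero, so the
	   predicated CMPCS runs only when the string has not already
	   ended; any mismatch, or an immediate null, takes the early exit
	   above, which returns the difference of the two bytes.  */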
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	L(fastpath_exit)
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_def_cfa_offset 16
	.cfi_offset 4, -16
	.cfi_offset 5, -12
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_offset 6, -8
	.cfi_offset 7, -4
	mvn	const_m1, #0
	lsl	r2, tmp1, #29	/* Bits 2:0 of src1 | src2 into bits 31:29.  */
	cbz	r2, L(loop_aligned8)	/* Both pointers 8-byte aligned.  */

L(not_aligned):
	eor	tmp1, src1, src2	/* Differing alignment bits.  */
	tst	tmp1, #7
	bne	L(misaligned8)		/* Alignments differ mod 8.  */

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
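	/* For instance, if both pointers are 3 bytes past an 8-byte
	   boundary, the ORNs below overwrite the three unwanted leading
	   bytes of each loaded word with 0xff, so those bytes compare
	   equal and cannot be mistaken for a null in data1.  */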
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	L(start_realigned8)	/* Offset < 4: first words masked.  */
	orn	data1b, data1b, tmp1	/* Offset >= 4: mask the second words  */
	mov	data1a, const_m1	/* and force the first pair equal.  */
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	L(start_realigned8)

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
L(loop_aligned8):
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
L(start_realigned8):
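	/* Per the syndrome idiom described at the top of the file: UADD8
	   sets GE[i] iff byte i of data1 is non-zero, and SEL then yields
	   0xff for each null byte of data1 and the byte difference
	   elsewhere, so a non-zero syndrome flags a mismatch or the end of
	   the string.  */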
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, L(diff_in_a)
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, L(diff_in_b)

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for a backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0.  */
	beq	L(loop_aligned8)

L(diff_found):
	cbnz	syndrome_a, L(diff_in_a)

L(diff_in_b):
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

L(diff_in_a):
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
L(misaligned8):
	tst	tmp1, #3
	bne	L(misaligned4)		/* Alignments differ mod 4.  */
	ands	tmp1, src1, #3
	bne	L(mutual_align4)	/* Same non-zero offset mod 4.  */

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
L(loop_aligned4):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned4):
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, L(aligned4_done)
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	L(loop_aligned4)

L(aligned4_done):
	strcmp_epilogue_aligned syndrome, data1, data2, 0

L(mutual_align4):
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	L(start_realigned4)

L(misaligned4):
	ands	tmp1, src1, #3
	beq	L(src1_aligned)
	sub	src2, src2, tmp1	/* Rewind src2 by src1's offset.  */
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31		/* Z = !bit 0, C = bit 1 of offset.  */
	ldr	data1, [src1], #4
	beq	L(aligned_m2)		/* Offset 2.  */
	bcs	L(aligned_m1)		/* Offset 3; offset 1 falls through.  */

#if STRCMP_NO_PRECHECK == 1
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m1):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	add	src2, src2, #4
	cbnz	data2, L(src1_aligned)
#else  /* STRCMP_NO_PRECHECK == 0 */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbnz	data2, L(aligned_m1)
#endif

L(misaligned_exit):
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16	/* r5-r7 are unmodified on this path.  */
	.cfi_restore 4
	bx	lr

#if STRCMP_NO_PRECHECK == 0
L(aligned_m1):
	add	src2, src2, #4
#endif
L(src1_aligned):
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C = src2[1], Z set iff src2[0] == 0.  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	L(overlap1)		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	L(overlap2)		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
L(overlap3):
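	/* Here src2 is one byte past a word boundary: each iteration
	   compares the three leading string bytes of data1 with the three
	   valid bytes of data2, then the remaining byte of data1 with the
	   first byte of the next word loaded from src2.  */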
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap3)
4:
	S2LO	data2, data2, #8
	b	L(strcmp_tail)

5:
	bics	syndrome, syndrome, #MSB
	bne	L(strcmp_done_equal)

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	neg	result, result
	bx	lr

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap2):
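	/* Here src2 is two bytes past a word boundary: each iteration
	   compares the two leading string bytes of data1 with the two
	   valid bytes of data2, then the other two bytes of data1 with
	   the start of the next word loaded from src2.  */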
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap2)
4:
	S2LO	data2, data2, #16
	b	L(strcmp_tail)
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	L(strcmp_done_equal)

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	L(strcmp_tail)

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap1):
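	/* Here src2 is three bytes past a word boundary: each iteration
	   compares the leading string byte of data1 with the single valid
	   byte of data2, then the remaining three bytes of data1 with the
	   start of the next word loaded from src2.  */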
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap1)
4:
	S2LO	data2, data2, #24
	b	L(strcmp_tail)
5:
	tst	syndrome, #LSB
	bne	L(strcmp_done_equal)
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	L(strcmp_tail)

L(strcmp_done_equal):
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	bx	lr

L(strcmp_tail):
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
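	/* Recompute the syndrome for the final words, then apply the
	   big-endian extraction described in the epilogue macro: CLZ
	   locates the first failing bit, the shifts bring the decisive
	   bytes to the top, and the result is their unsigned
	   difference.  */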
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	sub	result, result, data2, lsr #24
	bx	lr

END (__strcmp_arm)