/*
 * strcmp for ARMv7
 *
 * Copyright (c) 2012-2019, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use the parallel byte-wise operations
   UADD8 and SEL to optimize the compares.  */

#include "../asmdefs.h"

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (e.g. because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK	0

	/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO lsl
#define S2LOEQ lsleq
#define S2HI lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO lsr
#define S2LOEQ lsreq
#define S2HI lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1, lsr #24
	bx	lr
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
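	/* Illustrative sketch only, not assembled: assuming a non-zero
	   syndrome, the little-endian sequence below behaves roughly like
	   the following C, where the helper and variable names are
	   hypothetical and exist purely for explanation:

	       static int epilogue_le (unsigned synd, unsigned d1, unsigned d2)
	       {
	         unsigned rsynd = __builtin_bswap32 (synd);    // rev
	         unsigned shift = __builtin_clz (rsynd) & ~7u;  // clz; bic #7
	         return (int) ((d1 >> shift) & 255)             // lsr; and
	                - (int) ((d2 >> shift) & 255);          // lsr; and; sub
	       }

	   The byte reverse moves the first (lowest-addressed) marked byte to
	   the top so CLZ can locate it; rounding the count down to a
	   multiple of eight turns it into a whole-byte shift.  */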
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	sub	result, result, r1

	bx	lr
#endif
	.endm

	.text
	.p2align	5
L(strcmp_start_addr):
#if STRCMP_NO_PRECHECK == 0
L(fastpath_exit):
	sub	r0, r2, r3
	bx	lr
	nop
#endif
ENTRY_ALIGN (__strcmp_arm, 0)
#if STRCMP_NO_PRECHECK == 0
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	L(fastpath_exit)
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_def_cfa_offset 16
	.cfi_offset 4, -16
	.cfi_offset 5, -12
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_offset 6, -8
	.cfi_offset 7, -4
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, L(loop_aligned8)

L(not_aligned):
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	L(misaligned8)

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	L(start_realigned8)
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	L(start_realigned8)

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
L(loop_aligned8):
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
L(start_realigned8):
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, L(diff_in_a)
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, L(diff_in_b)

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	L(loop_aligned8)

L(diff_found):
	cbnz	syndrome_a, L(diff_in_a)

L(diff_in_b):
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

L(diff_in_a):
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
L(misaligned8):
	tst	tmp1, #3
	bne	L(misaligned4)
	ands	tmp1, src1, #3
	bne	L(mutual_align4)
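	/* Illustrative sketch only, not assembled: the UADD8/SEL idiom used
	   by the aligned loops in this file produces, for each pair of
	   loaded words, a syndrome that is non-zero iff the words differ or
	   data1 contains a NUL byte.  Roughly, in C (the helper name is
	   hypothetical and for explanation only):

	       static unsigned syndrome_of (unsigned data1, unsigned data2)
	       {
	         unsigned diff = data1 ^ data2;                 // eor
	         unsigned synd = 0;
	         for (int i = 0; i < 4; i++)
	           {
	             unsigned b1 = (data1 >> (8 * i)) & 255;
	             unsigned bd = (diff >> (8 * i)) & 255;
	             // UADD8 of data1 with 0xffffffff sets GE[i] iff b1 != 0;
	             // SEL keeps the EOR byte where GE[i] is set and
	             // substitutes 0xff where it is clear (a NUL in data1).
	             synd |= (b1 != 0 ? bd : 255u) << (8 * i);
	           }
	         return synd;
	       }
	   */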
	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
L(loop_aligned4):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned4):
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, L(aligned4_done)
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	L(loop_aligned4)

L(aligned4_done):
	strcmp_epilogue_aligned syndrome, data1, data2, 0

L(mutual_align4):
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	L(start_realigned4)

L(misaligned4):
	ands	tmp1, src1, #3
	beq	L(src1_aligned)
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	L(aligned_m2)
	bcs	L(aligned_m1)

#if STRCMP_NO_PRECHECK == 1
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m1):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	add	src2, src2, #4
	cbnz	data2, L(src1_aligned)
#else /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbnz	data2, L(aligned_m1)
#endif

L(misaligned_exit):
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	bx	lr

#if STRCMP_NO_PRECHECK == 0
L(aligned_m1):
	add	src2, src2, #4
#endif
L(src1_aligned):
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	L(overlap1)		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	L(overlap2)		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
L(overlap3):
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap3)
4:
	S2LO	data2, data2, #8
	b	L(strcmp_tail)

5:
	bics	syndrome, syndrome, #MSB
	bne	L(strcmp_done_equal)

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
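	/* Illustrative note, not assembled: string 1 terminates in the one
	   byte that has not yet been compared, so the result is simply the
	   negation of the corresponding byte of string 2, roughly

	       return 0 - (int) *(const unsigned char *) src2;

	   which the LDRB/NEG pair below computes.  */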
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	neg	result, result
	bx	lr

6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap2):
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap2)
4:
	S2LO	data2, data2, #16
	b	L(strcmp_tail)
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	L(strcmp_done_equal)

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	L(strcmp_tail)

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap1):
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap1)
4:
	S2LO	data2, data2, #24
	b	L(strcmp_tail)
5:
	tst	syndrome, #LSB
	bne	L(strcmp_done_equal)
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	L(strcmp_tail)

L(strcmp_done_equal):
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	bx	lr

L(strcmp_tail):
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	sub	result, result, data2, lsr #24
	bx	lr

END (__strcmp_arm)
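/* Illustrative reference only, not assembled: the routine above follows
   the standard strcmp contract, comparing bytes as unsigned char and
   returning a negative, zero or positive value according to the first
   pair of bytes that differ.  A plain C model with the same observable
   behaviour (the function name is hypothetical) is:

       int strcmp_ref (const char *s1, const char *s2)
       {
         const unsigned char *p1 = (const unsigned char *) s1;
         const unsigned char *p2 = (const unsigned char *) s2;
         while (*p1 != 0 && *p1 == *p2)
           {
             p1++;
             p2++;
           }
         return (int) *p1 - (int) *p2;
       }
 */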