/*
 * strcmp for ARMv7
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */

#include "asmdefs.h"

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (e.g. because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK	0

/* Ensure the .cantunwind directive is prepended to .fnend.
   Leaf functions cannot throw exceptions - EHABI only supports
   synchronous exceptions.  */
#define IS_LEAF

	/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO lsl
#define S2LOEQ lsleq
#define S2HI lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not __ARM_BIG_ENDIAN */
#define S2LO lsr
#define S2LOEQ lsreq
#define S2HI lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
#else
	/* To use the big-endian trick we'd have to reverse all three words;
	   that's slower than this approach.  */
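	/* A hedged C model of the little-endian sequence below (a sketch
	   only, never assembled; epilogue_le is a hypothetical name, the
	   builtins are the usual GCC ones, uint32_t from <stdint.h>):
	   byte-reverse the syndrome so CLZ counts the leading equal bytes,
	   round the count down to a byte boundary, then compare the two
	   bytes at the first difference.

	   int epilogue_le (uint32_t synd, uint32_t d1, uint32_t d2)
	   {
	     int sh = __builtin_clz (__builtin_bswap32 (synd)) & ~7;
	     return (int)((d1 >> sh) & 0xff) - (int)((d2 >> sh) & 0xff);
	   }
	 */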
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1

	epilogue push_ip=HAVE_PAC_LEAF
#endif
	.endm

ENTRY(__strcmp_arm)
	prologue push_ip=HAVE_PAC_LEAF
#if STRCMP_NO_PRECHECK == 0
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	L(fastpath_exit)
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, L(loop_aligned8)

L(not_aligned):
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	L(misaligned8)

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In Thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	L(start_realigned8)
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	L(start_realigned8)

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
	.p2align 5,,12	/* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
L(loop_aligned8):
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
L(start_realigned8):
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, L(diff_in_a)
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, L(diff_in_b)

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for a backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a	/* Only needed if s_a == 0.  */
	beq	L(loop_aligned8)

L(diff_found):
	cbnz	syndrome_a, L(diff_in_a)

L(diff_in_b):
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

L(diff_in_a):
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
L(misaligned8):
	tst	tmp1, #3
	bne	L(misaligned4)
	ands	tmp1, src1, #3
	bne	L(mutual_align4)

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  The UADD8/SEL syndrome trick used in both aligned
	   loops is sketched in the comment below.  */
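	/* A hedged C model of the UADD8/SEL trick (sketch only, never
	   assembled; syndrome_of is a hypothetical name): adding 0xff to
	   each byte of data1 sets that byte's GE flag exactly when the
	   byte is non-zero, so SEL keeps the XOR difference where data1
	   is non-NUL and substitutes 0xff where data1 holds a NUL.  The
	   syndrome is therefore non-zero iff the words differ or the
	   string has ended.

	   uint32_t syndrome_of (uint32_t d1, uint32_t d2)
	   {
	     uint32_t synd = d1 ^ d2;
	     for (int i = 0; i < 4; i++)
	       if (((d1 >> (8 * i)) & 0xff) == 0)
	         synd |= 0xffu << (8 * i);	// models SEL picking const_m1
	     return synd;
	   }
	 */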
L(loop_aligned4):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned4):
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, L(aligned4_done)
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	L(loop_aligned4)

L(aligned4_done):
	strcmp_epilogue_aligned syndrome, data1, data2, 0

L(mutual_align4):
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In Thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	L(start_realigned4)

L(misaligned4):
	ands	tmp1, src1, #3
	beq	L(src1_aligned)
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	L(aligned_m2)
	bcs	L(aligned_m1)

#if STRCMP_NO_PRECHECK == 1
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m1):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	add	src2, src2, #4
	cbnz	data2, L(src1_aligned)
#else	/* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbnz	data2, L(aligned_m1)
#endif

L(misaligned_exit):
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

#if STRCMP_NO_PRECHECK == 0
L(fastpath_exit):
	.cfi_restore_state
	.cfi_remember_state
	sub	r0, r2, r3
	epilogue push_ip=HAVE_PAC_LEAF

L(aligned_m1):
	.cfi_restore_state
	.cfi_remember_state
	add	src2, src2, #4
#endif
L(src1_aligned):
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	L(overlap1)		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	L(overlap2)		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
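	/* The three overlap loops below all follow one pattern.  As a
	   hedged illustration (C sketch only, never assembled; the helper
	   name is hypothetical and little-endian byte order is assumed):
	   with src2 rounded down to a word boundary and offset = src2 & 3,
	   each unaligned word of src2 spans two aligned loads,

	   uint32_t src2_word (const uint32_t *aligned_src2, unsigned offset)
	   {
	     uint32_t lo = aligned_src2[0], hi = aligned_src2[1];
	     return (lo >> (8 * offset)) | (hi << (8 * (4 - offset)));
	   }

	   but the code never materializes this combined word: it compares
	   the low piece and the high piece against the matching parts of
	   data1 separately, checking data1 for a NUL with UADD8 as it
	   goes.  */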
L(overlap3):
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap3)
4:
	S2LO	data2, data2, #8
	b	L(strcmp_tail)

5:
	bics	syndrome, syndrome, #MSB
	bne	L(strcmp_done_equal)

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 Not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	neg	result, result
	epilogue push_ip=HAVE_PAC_LEAF
6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap2):
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap2)
4:
	S2LO	data2, data2, #16
	b	L(strcmp_tail)
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	L(strcmp_done_equal)

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	L(strcmp_tail)

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap1):
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap1)
4:
	S2LO	data2, data2, #24
	b	L(strcmp_tail)
5:
	tst	syndrome, #LSB
	bne	L(strcmp_done_equal)
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	L(strcmp_tail)

L(strcmp_done_equal):
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

L(strcmp_tail):
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	sub	result, result, data2, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF

END (__strcmp_arm)

#endif	/* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1 */
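/* A minimal, hypothetical smoke test for this entry point from the C
   side (sketch only, never compiled here; check() is an assumed name,
   the prototype matches ISO C strcmp):

   extern int __strcmp_arm (const char *s1, const char *s2);

   int check (void)
   {
     return __strcmp_arm ("abc", "abc") == 0	// equal strings
	 && __strcmp_arm ("abc", "abd") < 0	// 'c' < 'd'
	 && __strcmp_arm ("abd", "abc") > 0;
   }
 */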