/*
 * strncmp - compare two strings
 *
 * Copyright (c) 2013-2021, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#include "../asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define mask		x13
#define endloop		x14
#define count		mask
#define offset		pos
#define neg_offset	x15

/* Define endian dependent shift operations.
   On big-endian early bytes are at MSB and on little-endian LSB.
   LS_FW means shifting towards early bytes.
   LS_BK means shifting towards later bytes.
 */
#ifdef __AARCH64EB__
#define LS_FW lsl
#define LS_BK lsr
#else
#define LS_FW lsr
#define LS_BK lsl
#endif

ENTRY (__strncmp_aarch64_mte)
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7
	and	count, src1, #7
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
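	/* For illustration only (not part of the build): the same test as a
	   C sketch over a 64-bit dword (has_nul_byte is a hypothetical name
	   used only here):

	       uint64_t has_nul_byte (uint64_t x)
	       {
	         return (x - 0x0101010101010101ULL)
	                & ~(x | 0x7f7f7f7f7f7f7f7fULL);
	       }

	   The result is non-zero iff some byte of x is zero.  The marker bit
	   for the least significant zero byte is exact; bytes above it can
	   produce false positives once a borrow has propagated.  On
	   little-endian those bytes lie after the terminator, so the false
	   positives are harmless; on big-endian they are not, which is why
	   the #else path below re-computes the test on a byte-reversed
	   value.  */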
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit, limit, #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, hi	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop.  */

L(full_check):
#ifndef __AARCH64EB__
	orr	syndrome, diff, has_nul
	add	limit, limit, 8	/* Rewind limit to before last subs.  */
L(syndrome_check):
	/* Limit was reached.  Check if the NUL byte or the difference
	   is before the limit.  */
	rev	syndrome, syndrome
	rev	data1, data1
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	cmp	limit, pos, lsr #3
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	csel	result, result, xzr, hi
	ret
#else
	/* Not reached the limit, must have found the end or a diff.  */
	tbz	limit, #63, L(not_limit)
	add	tmp1, limit, 8
	cbz	limit, L(not_limit)

	lsl	limit, tmp1, #3	/* Bytes -> bits.  */
	mov	mask, #~0
	lsr	mask, mask, limit
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

L(not_limit):
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The most-significant-non-zero bit of the syndrome marks either the
	   first bit that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
L(end_quick):
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align).  */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	LS_FW	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
	/* Adjust the limit and ensure it doesn't overflow.  */
	adds	limit, limit, count
	csinv	limit, limit, xzr, lo
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for up to 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	sub	result, data1, data2
	ret

	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	cbz	count, L(src1_aligned)

	neg	count, count
	and	count, count, #7
	sub	limit, limit, count

L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

	/* The following diagram explains the comparison of misaligned strings.
	   The bytes are shown in natural order.  For little-endian, it is
	   reversed in the registers.  The "x" bytes are before the string.
	   The "|" separates data that is loaded at one time.
	   src1 | a a a a a a a a | b b b c c c c c | . . .
	   src2 | x x x x x a a a a a a a a b b b | c c c c c . . .

	   After shifting in each step, the data looks like this:
	               STEP_A             STEP_B             STEP_C
	   data1   a a a a a a a a    b b b c c c c c    b b b c c c c c
	   data2   a a a a a a a a    b b b 0 0 0 0 0    0 0 0 c c c c c

	   The bytes with "0" are eliminated from the syndrome via mask.

	   Align SRC2 down to 16 bytes.  This way we can read 16 bytes at a
	   time from SRC2.  The comparison happens in 3 steps.  After each step
	   the loop can exit, or read from SRC1 or SRC2.  */
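	/* For illustration only (not part of the build): on little-endian,
	   the LS_FW/LS_BK pair in STEP_A below recombines two adjacent
	   dwords loaded from SRC2 into one dword that lines up with data1.
	   A C sketch of the idea (combine is a hypothetical helper name),
	   assuming 0 < offset < 64, where offset stands for the bit-scaled
	   misalignment of SRC2, a non-zero multiple of 8 on this path:

	       uint64_t combine (uint64_t first, uint64_t second,
	                         unsigned offset)
	       {
	         return (first >> offset) | (second << (64 - offset));
	       }

	   On big-endian the two shift directions swap, which is exactly
	   what the LS_FW/LS_BK macros encode.  */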
L(src1_aligned):
	/* Calculate offset from 8 byte alignment to string start in bits.  No
	   need to mask offset since shifts are ignoring upper bits.  */
	lsl	offset, src2, #3
	bic	src2, src2, #0xf
	mov	mask, -1
	neg	neg_offset, offset
	ldr	data1, [src1], #8
	ldp	tmp1, tmp2, [src2], #16
	LS_BK	mask, mask, neg_offset
	and	neg_offset, neg_offset, #63	/* Need actual value for cmp later.  */
	/* Skip the first compare if data in tmp1 is irrelevant.  */
	tbnz	offset, 6, L(misaligned_mid_loop)

L(loop_misaligned):
	/* STEP_A: Compare full 8 bytes when there is enough data from SRC2.  */
	LS_FW	data2, tmp1, offset
	LS_BK	tmp1, tmp2, neg_offset
	subs	limit, limit, #8
	orr	data2, data2, tmp1	/* 8 bytes from SRC2 combined from two regs.  */
	sub	has_nul, data1, zeroones
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	orr	tmp3, data1, #REP8_7f
	csinv	endloop, diff, xzr, hi	/* If limit, set to all ones.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL byte found in SRC1.  */
	orr	tmp3, endloop, has_nul
	cbnz	tmp3, L(full_check)

	ldr	data1, [src1], #8
L(misaligned_mid_loop):
	/* STEP_B: Compare first part of data1 to second part of tmp2.  */
	LS_FW	data2, tmp2, offset
#ifdef __AARCH64EB__
	/* For big-endian we do a byte reverse to avoid the carry-propagation
	   problem described above.  This way we can reuse the has_nul in the
	   next step and also use the syndrome value trick at the end.  */
	rev	tmp3, data1
	#define data1_fixed tmp3
#else
	#define data1_fixed data1
#endif
	sub	has_nul, data1_fixed, zeroones
	orr	tmp3, data1_fixed, #REP8_7f
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	bic	has_nul, has_nul, tmp3	/* Non-zero if NUL terminator.  */
#ifdef __AARCH64EB__
	rev	has_nul, has_nul
#endif
	cmp	limit, neg_offset, lsr #3
	orr	syndrome, diff, has_nul
	bic	syndrome, syndrome, mask	/* Ignore later bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	/* STEP_C: Compare second part of data1 to first part of tmp1.  */
	ldp	tmp1, tmp2, [src2], #16
	cmp	limit, #8
	LS_BK	data2, tmp1, neg_offset
	eor	diff, data2, data1	/* Non-zero if differences found.  */
	orr	syndrome, diff, has_nul
	and	syndrome, syndrome, mask	/* Ignore earlier bytes.  */
	csinv	tmp3, syndrome, xzr, hi	/* If limit, set to all ones.  */
	cbnz	tmp3, L(syndrome_check)

	ldr	data1, [src1], #8
	sub	limit, limit, #8
	b	L(loop_misaligned)

#ifdef __AARCH64EB__
L(syndrome_check):
	clz	pos, syndrome
	cmp	pos, limit, lsl #3
	b.lo	L(end_quick)
#endif

L(ret0):
	mov	result, #0
	ret
END (__strncmp_aarch64_mte)
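
/* For illustration only (not part of the build): the routine is intended
   to match C strncmp semantics, comparing at most limit bytes as unsigned
   char.  A portable sketch with the same sign behaviour (strncmp_ref is a
   hypothetical name used only here; size_t needs <stddef.h>):

       int strncmp_ref (const char *s1, const char *s2, size_t n)
       {
         for (; n != 0; n--)
           {
             unsigned char c1 = *s1++, c2 = *s2++;
             if (c1 != c2)
               return c1 < c2 ? -1 : 1;
             if (c1 == '\0')
               return 0;
           }
         return 0;
       }
*/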