/*
 * strncmp - compare two strings
 *
 * Copyright (c) 2013-2021, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#include "../asmdefs.h"

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

/* Parameters and result.  */
#define src1		x0
#define src2		x1
#define limit		x2
#define result		x0

/* Internal variables.  */
#define data1		x3
#define data1w		w3
#define data2		x4
#define data2w		w4
#define has_nul		x5
#define diff		x6
#define syndrome	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define zeroones	x11
#define pos		x12
#define limit_wd	x13
#define mask		x14
#define endloop		x15
#define count		mask	/* Shares x14 with mask; the two are never live together.  */

/* __strncmp_aarch64

   In:   x0 = src1, x1 = src2, x2 = limit (maximum number of bytes to
	 compare).
   Out:  x0 = 0 if limit is zero or no difference is found within the
	 first limit bytes / before a NUL; otherwise the difference of
	 the first mismatching pair of bytes, each zero-extended.
   Uses: x3-x15 and the condition flags; no stack access.

   Strategy:
     - src1 and src2 equally aligned modulo 8: compare 8 bytes at a time
       from 8-byte aligned addresses (L(loop_aligned)), entering through
       L(mutual_align) when the common alignment is non-zero.
     - otherwise, limit < 16: plain byte loop (L(byte_loop)).
     - otherwise: align src1 bytewise, then compare 8 bytes at a time
       with src2 unaligned (L(loop_misaligned)), switching to a bytewise
       compare whenever src2 is in the last 8 bytes of a 4 KiB region.  */
ENTRY (__strncmp_aarch64)
	/* Macros from asmdefs.h applied to arguments 0-2; their definitions
	   are not visible in this file (presumably pointer/size
	   normalisation for ILP32 - confirm against asmdefs.h).  */
	PTR_ARG (0)
	PTR_ARG (1)
	SIZE_ARG (2)
	cbz	limit, L(ret0)
	eor	tmp1, src1, src2
	mov	zeroones, #REP8_01
	tst	tmp1, #7		/* Do the sources share alignment mod 8?  */
	and	count, src1, #7		/* count = src1's offset within its dword.  */
	b.ne	L(misaligned8)
	cbnz	count, L(mutual_align)
	/* Calculate the number of full and partial words -1.  */
	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
	lsr	limit_wd, limit_wd, #3	/* Convert to Dwords.  */

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  */
	.p2align 4
L(loop_aligned):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned):
	subs	limit_wd, limit_wd, #1	/* Goes negative on the last Dword.  */
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	csinv	endloop, diff, xzr, pl	/* Last Dword or differences.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	endloop, #0, #0, eq
	b.eq	L(loop_aligned)
	/* End of main loop */

	/* If limit_wd has not gone negative the limit was not reached, so
	   we must have found the end or a diff.  */
	tbz	limit_wd, #63, L(not_limit)

	/* Limit % 8 == 0 => all bytes significant.  */
	ands	limit, limit, #7
	b.eq	L(not_limit)

	lsl	limit, limit, #3	/* Bytes -> bits.  */
	mov	mask, #~0
#ifdef __AARCH64EB__
	lsr	mask, mask, limit
#else
	lsl	mask, mask, limit
#endif
	/* mask now covers the bytes beyond the limit: clear them in both
	   data words so they cannot contribute a difference.  */
	bic	data1, data1, mask
	bic	data2, data2, mask

	/* Make sure that the NUL byte is marked in the syndrome.  */
	orr	has_nul, has_nul, mask

	/* Common tail: turn (diff, has_nul, data1, data2) into the result.  */
L(not_limit):
	orr	syndrome, diff, has_nul

#ifndef __AARCH64EB__
	rev	syndrome, syndrome
	rev	data1, data1
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	clz	pos, syndrome
	rev	data2, data2
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#else
	/* For big-endian we cannot use the trick with the syndrome value
	   as carry-propagation can corrupt the upper bits if the trailing
	   bytes in the string contain 0x01.  */
	/* However, if there is no NUL byte in the dword, we can generate
	   the result directly.  We can't just subtract the bytes as the
	   MSB might be significant.  */
	cbnz	has_nul, 1f
	cmp	data1, data2
	cset	result, ne
	cneg	result, result, lo
	ret
1:
	/* Re-compute the NUL-byte detection, using a byte-reversed value.  */
	rev	tmp3, data1
	sub	tmp1, tmp3, zeroones
	orr	tmp2, tmp3, #REP8_7f
	bic	has_nul, tmp1, tmp2
	rev	has_nul, has_nul
	orr	syndrome, diff, has_nul
	clz	pos, syndrome
	/* The MS-non-zero bit of the syndrome marks either the first bit
	   that is different, or the top bit of the first zero byte.
	   Shifting left now will bring the critical information into the
	   top bits.  */
	lsl	data1, data1, pos
	lsl	data2, data2, pos
	/* But we need to zero-extend (char is unsigned) the value and then
	   perform a signed 32-bit subtraction.  */
	lsr	data1, data1, #56
	sub	result, data1, data2, lsr #56
	ret
#endif

L(mutual_align):
	/* Sources are mutually aligned, but are not currently at an
	   alignment boundary.  Round down the addresses and then mask off
	   the bytes that precede the start point.
	   We also need to adjust the limit calculations, but without
	   overflowing if the limit is near ULONG_MAX.  */
	bic	src1, src1, #7
	bic	src2, src2, #7
	ldr	data1, [src1], #8
	neg	tmp3, count, lsl #3	/* 64 - bits(bytes beyond align). */
	ldr	data2, [src2], #8
	mov	tmp2, #~0
	sub	limit_wd, limit, #1	/* limit != 0, so no underflow.  */
#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp3	/* Shift (count & 63).  */
#endif
	and	tmp3, limit_wd, #7
	lsr	limit_wd, limit_wd, #3
	/* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
	add	limit, limit, count
	add	tmp3, tmp3, count
	/* Force the bytes that precede the start point to 0xff in both
	   words, so they compare equal and are never seen as NUL.  */
	orr	data1, data1, tmp2
	orr	data2, data2, tmp2
	add	limit_wd, limit_wd, tmp3, lsr #3
	b	L(start_realigned)

	.p2align 4
	/* Don't bother with dwords for fewer than 16 bytes.  */
L(misaligned8):
	cmp	limit, #16
	b.hs	L(try_misaligned_words)

L(byte_loop):
	/* Perhaps we can do better than this.  */
	/* Continue only while bytes remain after this one (hi), data1 is
	   not NUL (cs after comparing with 1) and the two bytes match.  */
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	subs	limit, limit, #1
	ccmp	data1w, #1, #0, hi	/* NZCV = 0b0000.  */
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.eq	L(byte_loop)
L(done):
	/* data1/data2 each hold a single zero-extended byte here.  */
	sub	result, data1, data2
	ret
	/* Align the SRC1 to a dword by doing a bytewise compare and then do
	   the dword loop.  */
L(try_misaligned_words):
	lsr	limit_wd, limit, #3
	cbz	count, L(do_misaligned)

	/* count = number of bytes needed to bring src1 up to the next
	   8-byte boundary; take them off the limit up front.  */
	neg	count, count
	and	count, count, #7
	sub	limit, limit, count
	lsr	limit_wd, limit, #3

	/* Bytewise compare of count bytes.  Also entered from
	   L(loop_misaligned) with count == 8.  */
L(page_end_loop):
	ldrb	data1w, [src1], #1
	ldrb	data2w, [src2], #1
	cmp	data1w, #1
	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
	b.ne	L(done)
	subs	count, count, #1
	b.hi	L(page_end_loop)

L(do_misaligned):
	/* Prepare ourselves for the next page crossing.  Unlike the aligned
	   loop, we fetch 1 less dword because we risk crossing bounds on
	   SRC2.  */
	mov	count, #8
	subs	limit_wd, limit_wd, #1
	b.lo	L(done_loop)
L(loop_misaligned):
	/* If src2 is within the last 8 bytes of a 4 KiB region, compare the
	   next 8 bytes bytewise instead of with a dword load.  */
	and	tmp2, src2, #0xff8
	eor	tmp2, tmp2, #0xff8
	cbz	tmp2, L(page_end_loop)

	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	diff, #0, #0, eq
	b.ne	L(not_limit)
	subs	limit_wd, limit_wd, #1
	b.pl	L(loop_misaligned)

L(done_loop):
	/* The whole-dword count is exhausted without finding a difference
	   or a NUL; only the trailing limit % 8 bytes remain.  */
	and	limit, limit, #7
	cbz	limit, L(not_limit)
	/* Read the last word: step back 8 and index by the remaining byte
	   count so that the load ends exactly at the limit.  */
	sub	src1, src1, 8
	sub	src2, src2, 8
	ldr	data1, [src1, limit]
	ldr	data2, [src2, limit]
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	eor	diff, data1, data2	/* Non-zero if differences found.  */
	bics	has_nul, tmp1, tmp2	/* Non-zero if NUL terminator.  */
	ccmp	diff, #0, #0, eq
	b.ne	L(not_limit)

L(ret0):
	mov	result, #0
	ret

END ( __strncmp_aarch64)
