/*
 * strlen - calculate the length of a string
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 */

#include "../asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define len		x0

/* Locals and temporaries.  Note that has_nul1/tmp1 share x4 and
   has_nul2/tmp2 share x5, so a syndrome overwrites the temporary it
   was computed from.  */
#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8
#define offset		x9

	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
	   can be done in parallel across the entire word.  A faster check
	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
	   false hits for characters 129..255.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

	/* This implementation is compatible with Memory Tagging.  All loads
	   are 16 bytes in size and 16 bytes aligned.  This also avoids the
	   need for page boundary checks.  This implementation is correct
	   even without Memory Tagging, but other implementations could be
	   more beneficial if Memory Tagging is not enabled.

	   First load is aligned down and can contain bytes that are located
	   before the string.  This is handled by modifying the "zeroones"
	   mask.  The bytes that need to be ignored are set to zero.
	   If the string is aligned in such a way that 8 or more bytes from
	   the first load should be ignored, there is a special case
	   (skip_first_8_bytes) which only compares the second 8 bytes.

	   If there is a NUL byte in the first load, we calculate the length
	   from the 2 8-byte words using conditional select to reduce branch
	   mispredictions.

	   If the string is longer than 16 bytes, we check 32 bytes per
	   iteration using the fast NUL check (main_loop).  If we encounter
	   non-ASCII characters, we fall back to a second loop
	   (nonascii_loop) using the full NUL check.  */

	/* Register contract, as visible in this file:
	     In:       x0 (srcin) = address of the string.
	     Out:      x0 (len)   = number of bytes before the first NUL.
	     Clobbers: x1-x9 and the NZCV condition flags.
	     Stack:    not used (sp is never referenced).
	   ENTRY, END and L() are expected to come from asmdefs.h.  The
	   C-level contract is presumably that of strlen; confirm against
	   the declaring header.  */

ENTRY(__strlen_aarch64_mte)
	bic	src, srcin, 15		/* Align down to 16 bytes.  */
	mov	zeroones, REP8_01
	/* (offset & 63) holds number of bits to ignore in a register.  */
	lsl	offset, srcin, 3
	/* Post-index by -16 so that the pre-indexed [src, 32]! load in
	   main_loop lands on the 16-byte block following this one.  */
	ldp	data1, data2, [src], -16
	lsl	tmp1, zeroones, offset	/* Shift (offset & 63).  */
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   e.g. 0x0100 - 0x0101 = 0xffff, so 0x01 will be mistaken for NUL.
	   Since we expect strings to be small and early-exit,
	   byte-swap the data now so has_nul1/2 will be correct.  */
	rev	data1, data1
	rev	data2, data2
#endif
	/* Bit 3 of the address set means the string starts in the second
	   8-byte word of the aligned 16-byte block.  */
	tbnz	srcin, 3, L(skip_first_8_bytes)
	sub	tmp1, data1, tmp1	/* Shifted mask: ignored bytes get 0.  */
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	/* If comparison happens, C flag is always set.  When has_nul1 is
	   non-zero the comparison is skipped and NZCV is forced to 0.  */
	ccmp	has_nul2, 0, 0, eq
	beq	L(main_loop)		/* Neither word contains a NUL.  */

	/* Enter with C = has_nul1 == 0.  */
	csel	has_nul1, has_nul1, has_nul2, cc
	and	tmp2, srcin, 7		/* Bytes to ignore.  */
	rev	has_nul1, has_nul1
	neg	tmp2, tmp2
	clz	tmp1, has_nul1		/* Count bits before NUL.  */
	/* Add 8 if NUL byte is not in first register.  */
	add	tmp3, tmp2, 8
	csel	len, tmp2, tmp3, cc
	add	len, len, tmp1, lsr 3	/* Bits to bytes.  */
	ret

L(skip_first_8_bytes):
	sub	tmp1, data2, tmp1
	orr	tmp2, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	beq	L(main_loop)

	rev	has_nul1, has_nul1
	lsl	tmp1, has_nul1, offset	/* Ignore bytes before string.  */
	clz	tmp1, tmp1		/* Count bits before NUL.  */
	lsr	len, tmp1, 3
	ret

	/* The inner loop processes 32 bytes per iteration and uses the fast
	   NUL check.  If we encounter non-ASCII characters, use a second
	   loop with the accurate NUL check.  */
	.p2align 4
L(main_loop):
	ldp	data1, data2, [src, 32]!
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7	/* Test bit 7 of every byte.  */
	bne	1f
	/* Second 16 bytes of the iteration: no writeback here, so src is
	   only advanced (below) when this half fails the fast check.  */
	ldp	data1, data2, [src, 16]
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	beq	L(main_loop)
	add	src, src, 16		/* src = block held in data1/data2.  */
1:
	/* The fast check failed, so do the slower, accurate NUL check.
	   tmp1/tmp3 still hold (data - zeroones) from the fast check.  */
	orr	tmp2, data1, REP8_7f
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	/* No NUL after all: the fast check hit a non-ASCII byte.  */
	beq	L(nonascii_loop)

	/* Enter with C = has_nul1 == 0.  */
L(tail):
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.  The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time.  */
	csel	data1, data1, data2, cc
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, cc
#endif
	sub	len, src, srcin		/* Bytes before the current block.  */
	rev	has_nul1, has_nul1
	add	tmp2, len, 8		/* Used if NUL is in the second word.  */
	clz	tmp1, has_nul1
	csel	len, len, tmp2, cc
	add	len, len, tmp1, lsr 3
	ret

	/* Accurate NUL check, 16 bytes per step, unrolled twice.  */
L(nonascii_loop):
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	bne	L(tail)
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)
	b	L(tail)

END(__strlen_aarch64_mte)