/*
 * strlen - calculate the length of a string.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
 * Not MTE compatible.
 */

#include "../asmdefs.h"

#define srcin     x0
#define len       x0

#define src       x1
#define data1     x2
#define data2     x3
#define has_nul1  x4
#define has_nul2  x5
#define tmp1      x4
#define tmp2      x5
#define tmp3      x6
#define tmp4      x7
#define zeroones  x8

#define maskv     v0
#define maskd     d0
#define dataq1    q1
#define dataq2    q2
#define datav1    v1
#define datav2    v2
#define tmp       x2
#define tmpw      w2
#define synd      x3
#define shift     x4

/* For the first 32 bytes, NUL detection works on the principle that
   (X - 1) & (~X) & 0x80 (=> (X - 1) & ~(X | 0x7f)) is non-zero if a
   byte is zero, and can be done in parallel across the entire word.  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* To test the page crossing code path more thoroughly, compile with
   -DTEST_PAGE_CROSS - this will force all calls through the slower
   entry path.  This option is not intended for production use.  */

#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 32
#else
# define MIN_PAGE_SIZE 4096
#endif

/* Core algorithm:

   Since strings are short on average, we check the first 32 bytes of the
   string for a NUL character without aligning the string.  In order to use
   unaligned loads safely we must do a page cross check first.

   If there is a NUL byte we calculate the length from the 2 8-byte words
   using conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 32 bytes, align src so we don't need further
   page cross checks, and process 32 bytes per iteration using a fast SIMD
   loop.

   If the page cross check fails, we read 32 bytes from an aligned address,
   and ignore any characters before the string.  If it contains a NUL
   character, return the length, if not, continue in the main loop.  */

ENTRY (__strlen_aarch64)
        PTR_ARG (0)
        and     tmp1, srcin, MIN_PAGE_SIZE - 1
        cmp     tmp1, MIN_PAGE_SIZE - 32
        b.hi    L(page_cross)

        /* Look for a NUL byte in the first 16 bytes.  */
        ldp     data1, data2, [srcin]
        mov     zeroones, REP8_01

#ifdef __AARCH64EB__
        /* For big-endian, carry propagation (if the final byte in the
           string is 0x01) means we cannot use has_nul1/2 directly.
           Since we expect strings to be small and early-exit,
           byte-swap the data now so has_nul1/2 will be correct.  */
        rev     data1, data1
        rev     data2, data2
#endif
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, REP8_7f
        bics    has_nul1, tmp1, tmp2
        bic     has_nul2, tmp3, tmp4
        ccmp    has_nul2, 0, 0, eq
        b.eq    L(bytes16_31)

        /* Find the exact offset of the first NUL byte in the first 16 bytes
           from the string start.  Enter with C = has_nul1 == 0.  */
        csel    has_nul1, has_nul1, has_nul2, cc
        mov     len, 8
        rev     has_nul1, has_nul1
        csel    len, xzr, len, cc
        clz     tmp1, has_nul1
        add     len, len, tmp1, lsr 3
        ret
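
/* The word-at-a-time NUL test used above can be modelled in C.  The
   sketch below is purely illustrative and not part of the build:
   has_nul and first_nul_index are hypothetical names, and a
   little-endian host is assumed, matching the non-__AARCH64EB__ path.

     #include <stdint.h>
     #include <stdio.h>
     #include <string.h>

     #define REP8_01 0x0101010101010101ULL
     #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

     // Non-zero iff some byte of x is zero: bit 7 of every zero byte
     // of x is set in the result.  Borrows can set spurious bits in
     // bytes above the first zero byte, but never below it, so the
     // lowest set bit is always exact (the big-endian path byte-swaps
     // the data first for the same reason).
     static uint64_t has_nul (uint64_t x)
     {
       return (x - REP8_01) & ~(x | REP8_7f);
     }

     // Byte index of the first NUL, given a non-zero syndrome.  The
     // assembly computes this with rev+clz; in a little-endian C
     // model, count-trailing-zeros gives the same answer.
     static unsigned first_nul_index (uint64_t syndrome)
     {
       return (unsigned) __builtin_ctzll (syndrome) >> 3;
     }

     int main (void)
     {
       char buf[8] = "abcdefg";        // NUL at index 7
       uint64_t x;
       memcpy (&x, buf, sizeof (x));   // stand-in for the ldp load
       printf ("%u\n", first_nul_index (has_nul (x)));  // prints 7
       return 0;
     }
   */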

        .p2align 3
        /* Look for a NUL byte at offset 16..31 in the string.  */
L(bytes16_31):
        ldp     data1, data2, [srcin, 16]
#ifdef __AARCH64EB__
        rev     data1, data1
        rev     data2, data2
#endif
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, REP8_7f
        bics    has_nul1, tmp1, tmp2
        bic     has_nul2, tmp3, tmp4
        ccmp    has_nul2, 0, 0, eq
        b.eq    L(loop_entry)

        /* Find the exact offset of the first NUL byte at offset 16..31 from
           the string start.  Enter with C = has_nul1 == 0.  */
        csel    has_nul1, has_nul1, has_nul2, cc
        mov     len, 24
        rev     has_nul1, has_nul1
        mov     tmp3, 16
        clz     tmp1, has_nul1
        csel    len, tmp3, len, cc
        add     len, len, tmp1, lsr 3
        ret

L(loop_entry):
        bic     src, srcin, 31

        .p2align 5
L(loop):
        ldp     dataq1, dataq2, [src, 32]!
        uminp   maskv.16b, datav1.16b, datav2.16b
        uminp   maskv.16b, maskv.16b, maskv.16b
        cmeq    maskv.8b, maskv.8b, 0
        fmov    synd, maskd
        cbz     synd, L(loop)

        /* Low 32 bits of synd are non-zero if a NUL was found in datav1.  */
        cmeq    maskv.16b, datav1.16b, 0
        sub     len, src, srcin
        tst     synd, 0xffffffff
        b.ne    1f
        cmeq    maskv.16b, datav2.16b, 0
        add     len, len, 16
1:
        /* Generate a bitmask and compute correct byte offset.  */
#ifdef __AARCH64EB__
        bic     maskv.8h, 0xf0
#else
        bic     maskv.8h, 0x0f, lsl 8
#endif
        umaxp   maskv.16b, maskv.16b, maskv.16b
        fmov    synd, maskd
#ifndef __AARCH64EB__
        rbit    synd, synd
#endif
        clz     tmp, synd
        add     len, len, tmp, lsr 2
        ret

        .p2align 4

L(page_cross):
        bic     src, srcin, 31
        mov     tmpw, 0x0c03
        movk    tmpw, 0xc030, lsl 16
        ld1     {datav1.16b, datav2.16b}, [src]
        dup     maskv.4s, tmpw
        cmeq    datav1.16b, datav1.16b, 0
        cmeq    datav2.16b, datav2.16b, 0
        and     datav1.16b, datav1.16b, maskv.16b
        and     datav2.16b, datav2.16b, maskv.16b
        addp    maskv.16b, datav1.16b, datav2.16b
        addp    maskv.16b, maskv.16b, maskv.16b
        fmov    synd, maskd
        lsl     shift, srcin, 1
        lsr     synd, synd, shift
        cbz     synd, L(loop)

        rbit    synd, synd
        clz     len, synd
        lsr     len, len, 1
        ret

END (__strlen_aarch64)
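
/* For illustration only (not part of the build): the entry check that
   guards the unaligned read of the first 32 bytes is equivalent to
   the C predicate below.  page_cross_possible is a hypothetical name,
   and 4096 stands for MIN_PAGE_SIZE in a normal (non-TEST_PAGE_CROSS)
   build.

     #include <stdint.h>

     // True if an unaligned 32-byte load from s could cross into the
     // next page, i.e. fewer than 32 bytes remain between s and the
     // end of its page.  Mirrors the and/cmp/b.hi sequence at the
     // entry point (b.hi is an unsigned "higher" compare).
     static int page_cross_possible (const void *s)
     {
       uintptr_t off = (uintptr_t) s & (4096 - 1);
       return off > 4096 - 32;
     }

   When the predicate is true, L(page_cross) instead loads the whole
   32-byte-aligned block containing s, which cannot straddle a page,
   and shifts away the syndrome bits of the bytes before the string.  */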