/*
 * strlen - calculate the length of a string
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64.
 */

#include "../asmdefs.h"

/* Arguments and results.  */
#define srcin           x0
#define len             x0

/* Locals and temporaries.  */
#define src             x1
#define data1           x2
#define data2           x3
#define has_nul1        x4
#define has_nul2        x5
#define tmp1            x4
#define tmp2            x5
#define tmp3            x6
#define tmp4            x7
#define zeroones        x8
#define offset          x9

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word.
   A faster check (X - 1) & 0x80 is zero for non-NUL ASCII characters,
   but gives false hits for characters 129..255.
   (A C sketch of both checks appears at the end of this file.)  */

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f

/* This implementation is compatible with Memory Tagging. All loads are 16
   bytes in size and 16 bytes aligned. This also avoids the need for page
   boundary checks. This implementation is correct even without Memory
   Tagging, but other implementations may be preferable if Memory Tagging
   is not enabled.

   The first load is aligned down and can contain bytes that are located
   before the string. This is handled by modifying the "zeroones" mask:
   the bytes that need to be ignored are set to zero. If the string is
   aligned in such a way that 8 or more bytes from the first load should
   be ignored, there is a special case (skip_first_8_bytes) which only
   compares the second 8 bytes.

   If there is a NUL byte in the first load, we calculate the length from
   the two 8-byte words using conditional select to reduce branch
   mispredictions.

   If the string is longer than 16 bytes, we check 32 bytes per iteration
   using the fast NUL check (main_loop). If we encounter non-ASCII
   characters, we fall back to a second loop (nonascii_loop) using the
   full NUL check.  */

ENTRY(__strlen_aarch64_mte)
        bic     src, srcin, 15          /* Align down to 16 bytes.  */
        mov     zeroones, REP8_01
        /* (offset & 63) holds the number of bits to ignore in a register.  */
        lsl     offset, srcin, 3
        ldp     data1, data2, [src], -16
        lsl     tmp1, zeroones, offset  /* Shift (offset & 63).  */
#ifdef __AARCH64EB__
        /* For big-endian, carry propagation (if the final byte in the
           string is 0x01) means we cannot use has_nul1/2 directly.
           e.g. 0x0100 - 0x0101 = 0xffff, so 0x01 will be mistaken for NUL.
           Since we expect strings to be small and early-exit, byte-swap
           the data now so has_nul1/2 will be correct.  */
        rev     data1, data1
        rev     data2, data2
#endif
        tbnz    srcin, 3, L(skip_first_8_bytes)
        sub     tmp1, data1, tmp1
        orr     tmp2, data1, REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, REP8_7f
        bics    has_nul1, tmp1, tmp2
        bic     has_nul2, tmp3, tmp4
        /* If the comparison happens, the C flag is always set.  */
        ccmp    has_nul2, 0, 0, eq
        beq     L(main_loop)

        /* Enter with C = has_nul1 == 0.  */
        csel    has_nul1, has_nul1, has_nul2, cc
        and     tmp2, srcin, 7          /* Bytes to ignore.  */
        rev     has_nul1, has_nul1
        neg     tmp2, tmp2
        clz     tmp1, has_nul1          /* Count bits before NUL.  */
        /* Add 8 if the NUL byte is not in the first register.  */
        add     tmp3, tmp2, 8
        csel    len, tmp2, tmp3, cc
        add     len, len, tmp1, lsr 3
        ret

L(skip_first_8_bytes):
        sub     tmp1, data2, tmp1
        orr     tmp2, data2, REP8_7f
        bics    has_nul1, tmp1, tmp2
        beq     L(main_loop)

        rev     has_nul1, has_nul1
        lsl     tmp1, has_nul1, offset  /* Ignore bytes before string.  */
        clz     tmp1, tmp1              /* Count bits before NUL.  */
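        /* Bytes before the string start were already discarded by the shift
           above, so the bit count converts directly to the final length.  */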
        lsr     len, tmp1, 3
        ret

/* The inner loop processes 32 bytes per iteration and uses the fast NUL
   check. If we encounter non-ASCII characters, use a second loop with the
   accurate NUL check.  */
        .p2align 4
L(main_loop):
        ldp     data1, data2, [src, 32]!
        sub     tmp1, data1, zeroones
        sub     tmp3, data2, zeroones
        orr     tmp2, tmp1, tmp3
        tst     tmp2, zeroones, lsl 7
        bne     1f
        ldp     data1, data2, [src, 16]
        sub     tmp1, data1, zeroones
        sub     tmp3, data2, zeroones
        orr     tmp2, tmp1, tmp3
        tst     tmp2, zeroones, lsl 7
        beq     L(main_loop)
        add     src, src, 16
1:
        /* The fast check failed, so do the slower, accurate NUL check.  */
        orr     tmp2, data1, REP8_7f
        orr     tmp4, data2, REP8_7f
        bics    has_nul1, tmp1, tmp2
        bic     has_nul2, tmp3, tmp4
        ccmp    has_nul2, 0, 0, eq
        beq     L(nonascii_loop)

        /* Enter with C = has_nul1 == 0.  */
L(tail):
#ifdef __AARCH64EB__
        /* For big-endian, carry propagation (if the final byte in the
           string is 0x01) means we cannot use has_nul1/2 directly.  The
           easiest way to get the correct byte is to byte-swap the data
           and calculate the syndrome a second time.  */
        csel    data1, data1, data2, cc
        rev     data1, data1
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, REP8_7f
        bic     has_nul1, tmp1, tmp2
#else
        csel    has_nul1, has_nul1, has_nul2, cc
#endif
        sub     len, src, srcin
        rev     has_nul1, has_nul1
        add     tmp2, len, 8
        clz     tmp1, has_nul1
        csel    len, len, tmp2, cc
        add     len, len, tmp1, lsr 3
        ret

L(nonascii_loop):
        ldp     data1, data2, [src, 16]!
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, REP8_7f
        bics    has_nul1, tmp1, tmp2
        bic     has_nul2, tmp3, tmp4
        ccmp    has_nul2, 0, 0, eq
        bne     L(tail)
        ldp     data1, data2, [src, 16]!
        sub     tmp1, data1, zeroones
        orr     tmp2, data1, REP8_7f
        sub     tmp3, data2, zeroones
        orr     tmp4, data2, REP8_7f
        bics    has_nul1, tmp1, tmp2
        bic     has_nul2, tmp3, tmp4
        ccmp    has_nul2, 0, 0, eq
        beq     L(nonascii_loop)
        b       L(tail)

END(__strlen_aarch64_mte)
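
/* For reference only (not part of the build): a rough C sketch of the
   word-at-a-time NUL checks described at the top of this file.  The helper
   names below are made up for illustration, and REP8_80 is not defined
   above; it is introduced here just for the sketch.  The accurate check
   mirrors (X - REP8_01) & ~X & REP8_80, which the code computes as
   bic(X - REP8_01, X | REP8_7f); the fast check drops the ~X term and can
   report false hits for bytes in the range 0x81..0xff.

   #include <stdint.h>

   #define REP8_01 0x0101010101010101ULL
   #define REP8_80 0x8080808080808080ULL

   // Non-zero iff some byte of x is 0x00 (accurate check).
   static inline uint64_t has_nul_accurate(uint64_t x)
   {
       return (x - REP8_01) & ~x & REP8_80;
   }

   // Cheaper check used in main_loop: exact for ASCII data, but may
   // also fire for bytes 0x81..0xff even when no byte is NUL.
   static inline uint64_t has_nul_fast(uint64_t x)
   {
       return (x - REP8_01) & REP8_80;
   }
*/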