/* strnlen - calculate the length of a string with limit.
 *
 * Copyright (c) 2013, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#include "../asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define len		x0
#define limit		x1

/* Locals and temporaries.  */
#define src		x2
#define data1		x3
#define data2		x4
#define data2a		x5
#define has_nul1	x6
#define has_nul2	x7
#define tmp1		x8
#define tmp2		x9
#define tmp3		x10
#define tmp4		x11
#define zeroones	x12
#define pos		x13
#define limit_wd	x14

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

	.text
	.p2align 6
L(start):
	/* Seven nops of pre-padding so that the hot loop further down
	   begins on a fresh icache line.  */
	.rep 7
	nop
	.endr
	/* Tuck this exit stub into the padding area so the alignment
	   bytes are not wasted.  */
L(hit_limit):
	mov	len, limit
	ret

ENTRY_ALIGN (__strnlen_aarch64, 0)
	cbz	limit, L(hit_limit)
	mov	zeroones, #REP8_01
	bic	src, srcin, #15
	ands	tmp1, srcin, #15
	b.ne	L(misaligned)
	/* Number of full and partial Qwords, minus one.  limit is known
	   to be non-zero here, so the subtraction cannot underflow.  */
	sub	limit_wd, limit, #1
	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */

	/* NUL detection: (X - 0x01..01) & ~(X | 0x7f..7f) is non-zero
	   iff some byte of X is zero, and evaluates all eight bytes of
	   a Dword in parallel.  */
	/* Each iteration of the inner loop processes two Dwords.  The
	   start-up cost is slightly higher, but the extra instruction-
	   level parallelism wins quickly, especially on cores with many
	   issue slots per cycle.  */

	/* Start of critical section -- keep to one 64Byte cache line.  */
L(loop):
	ldp	data1, data2, [src], #16
L(realigned):
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, #REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, #REP8_7f
	bic	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	subs	limit_wd, limit_wd, #1
	orr	tmp1, has_nul1, has_nul2
	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
	b.eq	L(loop)
	/* End of critical section -- keep to one 64Byte cache line.  */

	orr	tmp1, has_nul1, has_nul2
	cbz	tmp1, L(hit_limit)	/* No null in final Qword.  */

	/* A NUL lies somewhere in the final Qword.  The simplest course
	   now is to compute the string length and return
	   MIN (len, limit).  */

	sub	len, src, srcin
	cbz	has_nul1, L(nul_in_data2)
#ifdef __AARCH64EB__
	mov	data2, data1
#endif
	sub	len, len, #8
	mov	has_nul2, has_nul1
L(nul_in_data2):
#ifdef __AARCH64EB__
	/* On big-endian, carry propagation (when the final byte of the
	   string is 0x01) means the syndrome cannot be used directly.
	   The easiest way to find the right byte is to byte-reverse the
	   data and recompute the syndrome.  */
	rev	data2, data2
	sub	tmp1, data2, zeroones
	orr	tmp2, data2, #REP8_7f
	bic	has_nul2, tmp1, tmp2
#endif
	sub	len, len, #8
	rev	has_nul2, has_nul2
	clz	pos, has_nul2
	add	len, len, pos, lsr #3	/* Bits to bytes.  */
	cmp	len, limit
	csel	len, len, limit, ls	/* Return the lower value.  */
	ret

L(misaligned):
	/* Handle a partial first word.  Two things happen in parallel:
	   1) Compute the number of words while avoiding overflow when
	      limit is near ULONG_MAX -- this needs limit + tmp1 - 1
	      formed as a 65-bit value before shifting it down;
	   2) Load and mask the first data words -- the bytes before the
	      ones of interest are forced to 0xff so they can never
	      trigger the zero detection.  */
	sub	limit_wd, limit, #1
	neg	tmp4, tmp1
	cmp	tmp1, #8

	and	tmp3, limit_wd, #15
	lsr	limit_wd, limit_wd, #4
	mov	tmp2, #~0

	ldp	data1, data2, [src], #16
	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
	add	tmp3, tmp3, tmp1

#ifdef __AARCH64EB__
	/* Big-endian.  Early bytes are at MSB.  */
	lsl	tmp2, tmp2, tmp4	/* Shift (tmp1 & 63).  */
#else
	/* Little-endian.  Early bytes are at LSB.  */
	lsr	tmp2, tmp2, tmp4	/* Shift (tmp1 & 63).  */
#endif
	add	limit_wd, limit_wd, tmp3, lsr #4

	orr	data1, data1, tmp2
	orr	data2a, data2, tmp2

	csinv	data1, data1, xzr, le
	csel	data2, data2, data2a, le
	b	L(realigned)

END (__strnlen_aarch64)