/*
 * memchr - find a character in a memory zone
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "../asmdefs.h"

/* Arguments and results.  */
#define srcin		x0	/* start of the region to search */
#define chrin		w1	/* byte to look for */
#define cntin		x2	/* byte count; reused as the remaining-bytes counter */

#define result		x0

/* General-purpose scratch registers.  */
#define src		x3	/* cursor over 32-byte aligned blocks (post-incremented) */
#define tmp		x4	/* shift amounts and count adjustments */
#define wtmp2		w5	/* holds the 0x40100401 lane-mask constant */
#define synd		x6	/* 64-bit syndrome, two bits per source byte */
#define soff		x9	/* srcin & 31: offset of srcin inside its block */
#define cntrem		x10	/* original cntin & 31 */

/* Vector scratch registers.  */
#define vrepchr		v0	/* chrin replicated into all 16 byte lanes */
#define vdata1		v1	/* bytes  0..15 of the current block */
#define vdata2		v2	/* bytes 16..31 of the current block */
#define vhas_chr1	v3	/* per-byte compare result for vdata1 */
#define vhas_chr2	v4	/* per-byte compare result for vdata2 */
#define vrepmask	v5	/* 0x40100401 replicated into every 32-bit lane */
#define vend		v6	/* folded compare results */

/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * the bytes occur in the original buffer, counting trailing zeros lets us
 * identify exactly which byte has matched.
 */

/*
 * __memchr_aarch64
 *
 * In:    srcin  (x0) = start of the region to search
 *        chrin  (w1) = byte to look for (replicated with a byte-wide dup, so
 *                      only its low 8 bits take part in the comparison)
 *        cntin  (x2) = number of bytes to search
 * Out:   result (x0) = address of the first matching byte, or 0 when none of
 *                      the cntin bytes match or cntin is 0
 * Uses:  x3, x4, x5, x6, x9, x10, v0-v6 and the condition flags as scratch.
 *        The instructions below never reference sp.
 *
 * Note: data is always loaded as whole 32-byte aligned blocks.  Bytes of the
 * first and last block that lie outside [srcin, srcin + cntin) are therefore
 * read, and their bits are masked out of the syndrome afterwards.
 */
ENTRY (__memchr_aarch64)
	/* Do not dereference srcin if no bytes to compare.  */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.  Replicated into every 32-bit lane, its bytes
	 * are 0x01, 0x04, 0x10, 0x40 from least to most significant, so after
	 * the AND below byte i of a vector keeps only bit 2 * (i % 4).
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31		/* Z set <=> srcin is 32-byte aligned */
	and	cntrem, cntin, #31		/* does not touch the flags */
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32 bytes block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32
	/*
	 * cntin -= (32 - soff), the number of wanted bytes in this block.
	 * The flags set here are consumed by the b.ls further down; none of
	 * the instructions in between modify them.
	 */
	adds	cntin, cntin, tmp
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	/*
	 * Two pairwise byte adds fold the 32 masked bytes into the low 64
	 * bits of vend: bit 2 * i is set iff byte i of the block matched.
	 */
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
	/* The first block can also be the last */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	/* Flags from this subs feed both b.ls below and b.hi in L(end).  */
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result */
	b.ls	L(end)
	/* Use a fast check for the termination condition */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
	/* We're not out of data, loop if we haven't found the character */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only do the clear for the last possible block */
	b.hi	L(tail)

L(masklast):
	/*
	 * Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits.
	 * When (cntrem + soff) % 32 is 0 the whole block is wanted: the
	 * computed count is 64, which a register-specified shift takes
	 * modulo 64, so nothing is cleared.
	 */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1		/* tmp = (32 - tmp_before_sub) * 2 */
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing */
	rbit	synd, synd
	/* Compensate the last post-increment */
	sub	src, src, #32
	/* Check that we have found a character */
	cmp	synd, #0
	/* And count the leading zeros */
	clz	synd, synd
	/* Compute the potential result (two syndrome bits per byte) */
	add	result, src, synd, lsr #1
	/* Select result or NULL */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret

END (__memchr_aarch64)