//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the company nor the names of its contributors
//       may be used to endorse or promote products derived from this
//       software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-a, AArch64
// Neon Available.
//

// Arguments and results.
36#define srcin x0 37#define cntin x1 38#define chrin w2 39 40#define result x0 41 42#define src x3 43#define tmp x4 44#define wtmp2 w5 45#define synd x6 46#define soff x9 47#define cntrem x10 48 49#define vrepchr v0 50#define vdata1 v1 51#define vdata2 v2 52#define vhas_chr1 v3 53#define vhas_chr2 v4 54#define vrepmask v5 55#define vend v6 56 57// 58// Core algorithm: 59// 60// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits 61// per byte. For each tuple, bit 0 is set if the relevant byte matched the 62// requested character and bit 1 is not used (faster than using a 32bit 63// syndrome). Since the bits in the syndrome reflect exactly the order in which 64// things occur in the original string, counting trailing zeros allows to 65// identify exactly which byte has matched. 66// 67 68ASM_GLOBAL ASM_PFX(InternalMemScanMem8) 69ASM_PFX(InternalMemScanMem8): 70 // Do not dereference srcin if no bytes to compare. 71 cbz cntin, .Lzero_length 72 // 73 // Magic constant 0x40100401 allows us to identify which lane matches 74 // the requested byte. 75 // 76 mov wtmp2, #0x0401 77 movk wtmp2, #0x4010, lsl #16 78 dup vrepchr.16b, chrin 79 // Work with aligned 32-byte chunks 80 bic src, srcin, #31 81 dup vrepmask.4s, wtmp2 82 ands soff, srcin, #31 83 and cntrem, cntin, #31 84 b.eq .Lloop 85 86 // 87 // Input string is not 32-byte aligned. We calculate the syndrome 88 // value for the aligned 32 bytes block containing the first bytes 89 // and mask the irrelevant part. 
90 // 91 92 ld1 {vdata1.16b, vdata2.16b}, [src], #32 93 sub tmp, soff, #32 94 adds cntin, cntin, tmp 95 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b 96 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b 97 and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b 98 and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b 99 addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 100 addp vend.16b, vend.16b, vend.16b // 128->64 101 mov synd, vend.d[0] 102 // Clear the soff*2 lower bits 103 lsl tmp, soff, #1 104 lsr synd, synd, tmp 105 lsl synd, synd, tmp 106 // The first block can also be the last 107 b.ls .Lmasklast 108 // Have we found something already? 109 cbnz synd, .Ltail 110 111.Lloop: 112 ld1 {vdata1.16b, vdata2.16b}, [src], #32 113 subs cntin, cntin, #32 114 cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b 115 cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b 116 // If we're out of data we finish regardless of the result 117 b.ls .Lend 118 // Use a fast check for the termination condition 119 orr vend.16b, vhas_chr1.16b, vhas_chr2.16b 120 addp vend.2d, vend.2d, vend.2d 121 mov synd, vend.d[0] 122 // We're not out of data, loop if we haven't found the character 123 cbz synd, .Lloop 124 125.Lend: 126 // Termination condition found, let's calculate the syndrome value 127 and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b 128 and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b 129 addp vend.16b, vhas_chr1.16b, vhas_chr2.16b // 256->128 130 addp vend.16b, vend.16b, vend.16b // 128->64 131 mov synd, vend.d[0] 132 // Only do the clear for the last possible block 133 b.hi .Ltail 134 135.Lmasklast: 136 // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits 137 add tmp, cntrem, soff 138 and tmp, tmp, #31 139 sub tmp, tmp, #32 140 neg tmp, tmp, lsl #1 141 lsl synd, synd, tmp 142 lsr synd, synd, tmp 143 144.Ltail: 145 // Count the trailing zeros using bit reversing 146 rbit synd, synd 147 // Compensate the last post-increment 148 sub src, src, #32 149 // Check that we have found a character 150 cmp synd, #0 151 
// And count the leading zeros 152 clz synd, synd 153 // Compute the potential result 154 add result, src, synd, lsr #1 155 // Select result or NULL 156 csel result, xzr, result, eq 157 ret 158 159.Lzero_length: 160 mov result, #0 161 ret 162