//
// Copyright (c) 2014, ARM Limited
// All rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above copyright
//       notice, this list of conditions and the following disclaimer in the
//       documentation and/or other materials provided with the distribution.
//     * Neither the name of the company nor the names of its contributors
//       may be used to endorse or promote products derived from this
//       software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//

// Assumptions:
//
// ARMv8-A, AArch64
// NEON available.
//

// Arguments and results.
#define srcin     x0
#define cntin     x1
#define chrin     w2

#define result    x0

#define src       x3
#define tmp       x4
#define wtmp2     w5
#define synd      x6
#define soff      x9
#define cntrem    x10

#define vrepchr   v0
#define vdata1    v1
#define vdata2    v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask  v5
#define vend      v6

//
// Core algorithm:
//
// For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
// per byte. Within each two-bit tuple, bit 0 is set if the relevant byte
// matched the requested character and bit 1 is left unused (this is faster
// than computing a 32-bit syndrome). Since the bits in the syndrome reflect
// exactly the order in which bytes occur in the scanned buffer, counting
// trailing zeros identifies exactly which byte matched.
//

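//
// Illustrative example: a match at byte offset 3 of a chunk sets bit 6
// (3 * 2) as the lowest syndrome bit, so bit-reversal plus clz yields 6,
// and a final shift right by one recovers the byte offset 3.
//
// For reference, a rough C sketch of the syndrome computation (a scalar
// model of the NEON code below; syndrome64 is a hypothetical name, not
// part of this file):
//
//   uint64_t syndrome64 (const uint8_t *chunk, uint8_t c) {
//     uint64_t synd = 0;
//     for (int i = 0; i < 32; i++) {
//       if (chunk[i] == c) {
//         synd |= 1ULL << (2 * i);  // bit 0 of byte i's two-bit tuple
//       }
//     }
//     return synd;
//   }
//
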
ASM_GLOBAL ASM_PFX(InternalMemScanMem8)
ASM_PFX(InternalMemScanMem8):
    // Do not dereference srcin if no bytes to compare.
    cbz     cntin, .Lzero_length
    //
    // Magic constant 0x40100401 allows us to identify which lane matches
    // the requested byte.
    //
    mov     wtmp2, #0x0401
    movk    wtmp2, #0x4010, lsl #16
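    // Each 32-bit lane of vrepmask will hold the bytes 0x01, 0x04, 0x10,
    // 0x40: a distinct two-bit-spaced weight for each byte position.
    // ANDing the 0x00/0xFF compare results with this mask and then
    // pairwise-adding twice folds 32 byte lanes into the 64-bit syndrome
    // without any of the weights colliding.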
    dup     vrepchr.16b, chrin
    // Work with aligned 32-byte chunks
    bic     src, srcin, #31
    dup     vrepmask.4s, wtmp2
    ands    soff, srcin, #31
    and     cntrem, cntin, #31
    b.eq    .Lloop

    //
    // The input buffer is not 32-byte aligned. We calculate the syndrome
    // value for the aligned 32-byte block containing the first bytes and
    // mask off the irrelevant part.
    //

    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    sub     tmp, soff, #32
    adds    cntin, cntin, tmp
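    // cntin is now cntin + soff - 32: the bytes covered by this first
    // aligned block have been deducted, and the flags set by adds feed
    // the b.ls below (the first block may also be the last).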
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b        // 256->128
    addp    vend.16b, vend.16b, vend.16b                  // 128->64
    mov     synd, vend.d[0]
    // Clear the soff*2 lower bits
    lsl     tmp, soff, #1
    lsr     synd, synd, tmp
    lsl     synd, synd, tmp
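    // (The shift pair discards the syndrome bits of the soff bytes that
    // lie before srcin, so a match there can never be reported.)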
    // The first block can also be the last
    b.ls    .Lmasklast
    // Have we found something already?
    cbnz    synd, .Ltail

.Lloop:
    ld1     {vdata1.16b, vdata2.16b}, [src], #32
    subs    cntin, cntin, #32
    cmeq    vhas_chr1.16b, vdata1.16b, vrepchr.16b
    cmeq    vhas_chr2.16b, vdata2.16b, vrepchr.16b
    // If we're out of data we finish regardless of the result
    b.ls    .Lend
    // Use a fast check for the termination condition
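    // (ORing the unmasked compare results and folding once with addp.2d
    // is cheaper than the full masked reduction; any nonzero value means
    // at least one byte in this chunk matched.)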
    orr     vend.16b, vhas_chr1.16b, vhas_chr2.16b
    addp    vend.2d, vend.2d, vend.2d
    mov     synd, vend.d[0]
    // We're not out of data, loop if we haven't found the character
    cbz     synd, .Lloop

.Lend:
    // Termination condition found, let's calculate the syndrome value
    and     vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
    and     vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
    addp    vend.16b, vhas_chr1.16b, vhas_chr2.16b      // 256->128
    addp    vend.16b, vend.16b, vend.16b                // 128->64
    mov     synd, vend.d[0]
    // Only do the clear for the last possible block
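    // (The flags still come from the subs above: hi means more data
    // remains after this chunk, so no trailing bytes need masking.)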
    b.hi    .Ltail

.Lmasklast:
    // Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits
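    // tmp ends up as ((cntrem + soff) & 31) - 32, a negative value; the
    // neg with lsl #1 then turns it into the positive doubled bit count
    // in a single instruction.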
    add     tmp, cntrem, soff
    and     tmp, tmp, #31
    sub     tmp, tmp, #32
    neg     tmp, tmp, lsl #1
    lsl     synd, synd, tmp
    lsr     synd, synd, tmp

.Ltail:
    // Count the trailing zeros using bit reversing
    rbit    synd, synd
    // Compensate the last post-increment
    sub     src, src, #32
    // Check that we have found a character
    cmp     synd, #0
    // And count the leading zeros
    clz     synd, synd
    // Compute the potential result
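    // (clz of the bit-reversed syndrome is the bit index of the first
    // match; shifting right by one converts the two-bits-per-byte index
    // into a byte offset from the start of the block.)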
    add     result, src, synd, lsr #1
    // Select result or NULL
    csel    result, xzr, result, eq
    ret

.Lzero_length:
    mov     result, #0
    ret