• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* stringlib: fastsearch implementation */
2 
3 #ifndef STRINGLIB_FASTSEARCH_H
4 #define STRINGLIB_FASTSEARCH_H
5 
6 /* fast search/count implementation, based on a mix between boyer-
7    moore and horspool, with a few more bells and whistles on the top.
8    for some more background, see: http://effbot.org/zone/stringlib.htm */
9 
10 /* note: fastsearch may access s[n], which isn't a problem when using
11    Python's ordinary string types, but may cause problems if you're
12    using this code in other contexts.  also, the count mode returns -1
13    if there cannot possible be a match in the target string, and 0 if
14    it has actually checked for matches, but didn't find any.  callers
15    beware! */
16 
17 #define FAST_COUNT 0
18 #define FAST_SEARCH 1
19 #define FAST_RSEARCH 2
20 
21 #if LONG_BIT >= 128
22 #define STRINGLIB_BLOOM_WIDTH 128
23 #elif LONG_BIT >= 64
24 #define STRINGLIB_BLOOM_WIDTH 64
25 #elif LONG_BIT >= 32
26 #define STRINGLIB_BLOOM_WIDTH 32
27 #else
28 #error "LONG_BIT is smaller than 32"
29 #endif
30 
31 #define STRINGLIB_BLOOM_ADD(mask, ch) \
32     ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
33 #define STRINGLIB_BLOOM(mask, ch)     \
34     ((mask &  (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1)))))
35 
36 Py_LOCAL_INLINE(Py_ssize_t)
fastsearch(const STRINGLIB_CHAR * s,Py_ssize_t n,const STRINGLIB_CHAR * p,Py_ssize_t m,Py_ssize_t maxcount,int mode)37 fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
38            const STRINGLIB_CHAR* p, Py_ssize_t m,
39            Py_ssize_t maxcount, int mode)
40 {
41     unsigned long mask;
42     Py_ssize_t skip, count = 0;
43     Py_ssize_t i, j, mlast, w;
44 
45     w = n - m;
46 
47     if (w < 0 || (mode == FAST_COUNT && maxcount == 0))
48         return -1;
49 
50     /* look for special cases */
51     if (m <= 1) {
52         if (m <= 0)
53             return -1;
54         /* use special case for 1-character strings */
55         if (mode == FAST_COUNT) {
56             for (i = 0; i < n; i++)
57                 if (s[i] == p[0]) {
58                     count++;
59                     if (count == maxcount)
60                         return maxcount;
61                 }
62             return count;
63         } else if (mode == FAST_SEARCH) {
64             for (i = 0; i < n; i++)
65                 if (s[i] == p[0])
66                     return i;
67         } else {    /* FAST_RSEARCH */
68             for (i = n - 1; i > -1; i--)
69                 if (s[i] == p[0])
70                     return i;
71         }
72         return -1;
73     }
74 
75     mlast = m - 1;
76     skip = mlast - 1;
77     mask = 0;
78 
79     if (mode != FAST_RSEARCH) {
80 
81         /* create compressed boyer-moore delta 1 table */
82 
83         /* process pattern[:-1] */
84         for (i = 0; i < mlast; i++) {
85             STRINGLIB_BLOOM_ADD(mask, p[i]);
86             if (p[i] == p[mlast])
87                 skip = mlast - i - 1;
88         }
89         /* process pattern[-1] outside the loop */
90         STRINGLIB_BLOOM_ADD(mask, p[mlast]);
91 
92         for (i = 0; i <= w; i++) {
93             /* note: using mlast in the skip path slows things down on x86 */
94             if (s[i+m-1] == p[m-1]) {
95                 /* candidate match */
96                 for (j = 0; j < mlast; j++)
97                     if (s[i+j] != p[j])
98                         break;
99                 if (j == mlast) {
100                     /* got a match! */
101                     if (mode != FAST_COUNT)
102                         return i;
103                     count++;
104                     if (count == maxcount)
105                         return maxcount;
106                     i = i + mlast;
107                     continue;
108                 }
109                 /* miss: check if next character is part of pattern */
110                 if (!STRINGLIB_BLOOM(mask, s[i+m]))
111                     i = i + m;
112                 else
113                     i = i + skip;
114             } else {
115                 /* skip: check if next character is part of pattern */
116                 if (!STRINGLIB_BLOOM(mask, s[i+m]))
117                     i = i + m;
118             }
119         }
120     } else {    /* FAST_RSEARCH */
121 
122         /* create compressed boyer-moore delta 1 table */
123 
124         /* process pattern[0] outside the loop */
125         STRINGLIB_BLOOM_ADD(mask, p[0]);
126         /* process pattern[:0:-1] */
127         for (i = mlast; i > 0; i--) {
128             STRINGLIB_BLOOM_ADD(mask, p[i]);
129             if (p[i] == p[0])
130                 skip = i - 1;
131         }
132 
133         for (i = w; i >= 0; i--) {
134             if (s[i] == p[0]) {
135                 /* candidate match */
136                 for (j = mlast; j > 0; j--)
137                     if (s[i+j] != p[j])
138                         break;
139                 if (j == 0)
140                     /* got a match! */
141                     return i;
142                 /* miss: check if previous character is part of pattern */
143                 if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
144                     i = i - m;
145                 else
146                     i = i - skip;
147             } else {
148                 /* skip: check if previous character is part of pattern */
149                 if (i > 0 && !STRINGLIB_BLOOM(mask, s[i-1]))
150                     i = i - m;
151             }
152         }
153     }
154 
155     if (mode != FAST_COUNT)
156         return -1;
157     return count;
158 }
159 
160 #endif
161