• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 //     * Redistributions of source code must retain the above copyright
7 //       notice, this list of conditions and the following disclaimer.
8 //     * Redistributions in binary form must reproduce the above
9 //       copyright notice, this list of conditions and the following
10 //       disclaimer in the documentation and/or other materials provided
11 //       with the distribution.
12 //     * Neither the name of Google Inc. nor the names of its
13 //       contributors may be used to endorse or promote products derived
14 //       from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #ifndef V8_STRING_SEARCH_H_
29 #define V8_STRING_SEARCH_H_
30 
31 namespace v8 {
32 namespace internal {
33 
34 
35 //---------------------------------------------------------------------
36 // String Search object.
37 //---------------------------------------------------------------------
38 
39 // Class holding constants and methods that apply to all string search variants,
40 // independently of subject and pattern char size.
41 class StringSearchBase {
42  protected:
43   // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
44   // limit, we can fix the size of tables. For a needle longer than this limit,
45   // search will not be optimal, since we only build tables for a suffix
46   // of the string, but it is a safe approximation.
47   static const int kBMMaxShift = Isolate::kBMMaxShift;
48 
49   // Reduce alphabet to this size.
50   // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
51   // proportional to the input alphabet. We reduce the alphabet size by
52   // equating input characters modulo a smaller alphabet size. This gives
53   // a potentially less efficient searching, but is a safe approximation.
54   // For needles using only characters in the same Unicode 256-code point page,
55   // there is no search speed degradation.
56   static const int kAsciiAlphabetSize = 128;
57   static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
58 
59   // Bad-char shift table stored in the state. It's length is the alphabet size.
60   // For patterns below this length, the skip length of Boyer-Moore is too short
61   // to compensate for the algorithmic overhead compared to simple brute force.
62   static const int kBMMinPatternLength = 7;
63 
IsAsciiString(Vector<const char>)64   static inline bool IsAsciiString(Vector<const char>) {
65     return true;
66   }
67 
IsAsciiString(Vector<const uc16> string)68   static inline bool IsAsciiString(Vector<const uc16> string) {
69     return String::IsAscii(string.start(), string.length());
70   }
71 
72   friend class Isolate;
73 };
74 
75 
76 template <typename PatternChar, typename SubjectChar>
77 class StringSearch : private StringSearchBase {
78  public:
StringSearch(Isolate * isolate,Vector<const PatternChar> pattern)79   StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
80       : isolate_(isolate),
81         pattern_(pattern),
82         start_(Max(0, pattern.length() - kBMMaxShift)) {
83     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
84       if (!IsAsciiString(pattern_)) {
85         strategy_ = &FailSearch;
86         return;
87       }
88     }
89     int pattern_length = pattern_.length();
90     if (pattern_length < kBMMinPatternLength) {
91       if (pattern_length == 1) {
92         strategy_ = &SingleCharSearch;
93         return;
94       }
95       strategy_ = &LinearSearch;
96       return;
97     }
98     strategy_ = &InitialSearch;
99   }
100 
Search(Vector<const SubjectChar> subject,int index)101   int Search(Vector<const SubjectChar> subject, int index) {
102     return strategy_(this, subject, index);
103   }
104 
AlphabetSize()105   static inline int AlphabetSize() {
106     if (sizeof(PatternChar) == 1) {
107       // ASCII needle.
108       return kAsciiAlphabetSize;
109     } else {
110       ASSERT(sizeof(PatternChar) == 2);
111       // UC16 needle.
112       return kUC16AlphabetSize;
113     }
114   }
115 
116  private:
117   typedef int (*SearchFunction)(  // NOLINT - it's not a cast!
118       StringSearch<PatternChar, SubjectChar>*,
119       Vector<const SubjectChar>,
120       int);
121 
FailSearch(StringSearch<PatternChar,SubjectChar> *,Vector<const SubjectChar>,int)122   static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
123                         Vector<const SubjectChar>,
124                         int) {
125     return -1;
126   }
127 
128   static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
129                               Vector<const SubjectChar> subject,
130                               int start_index);
131 
132   static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
133                           Vector<const SubjectChar> subject,
134                           int start_index);
135 
136   static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
137                            Vector<const SubjectChar> subject,
138                            int start_index);
139 
140   static int BoyerMooreHorspoolSearch(
141       StringSearch<PatternChar, SubjectChar>* search,
142       Vector<const SubjectChar> subject,
143       int start_index);
144 
145   static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
146                               Vector<const SubjectChar> subject,
147                               int start_index);
148 
149   void PopulateBoyerMooreHorspoolTable();
150 
151   void PopulateBoyerMooreTable();
152 
CharOccurrence(int * bad_char_occurrence,SubjectChar char_code)153   static inline int CharOccurrence(int* bad_char_occurrence,
154                                    SubjectChar char_code) {
155     if (sizeof(SubjectChar) == 1) {
156       return bad_char_occurrence[static_cast<int>(char_code)];
157     }
158     if (sizeof(PatternChar) == 1) {
159       if (static_cast<unsigned int>(char_code) > String::kMaxAsciiCharCodeU) {
160         return -1;
161       }
162       return bad_char_occurrence[static_cast<unsigned int>(char_code)];
163     }
164     // Both pattern and subject are UC16. Reduce character to equivalence class.
165     int equiv_class = char_code % kUC16AlphabetSize;
166     return bad_char_occurrence[equiv_class];
167   }
168 
169   // The following tables are shared by all searches.
170   // TODO(lrn): Introduce a way for a pattern to keep its tables
171   // between searches (e.g., for an Atom RegExp).
172 
173   // Store for the BoyerMoore(Horspool) bad char shift table.
174   // Return a table covering the last kBMMaxShift+1 positions of
175   // pattern.
bad_char_table()176   int* bad_char_table() {
177     return isolate_->bad_char_shift_table();
178   }
179 
180   // Store for the BoyerMoore good suffix shift table.
good_suffix_shift_table()181   int* good_suffix_shift_table() {
182     // Return biased pointer that maps the range  [start_..pattern_.length()
183     // to the kGoodSuffixShiftTable array.
184     return isolate_->good_suffix_shift_table() - start_;
185   }
186 
187   // Table used temporarily while building the BoyerMoore good suffix
188   // shift table.
suffix_table()189   int* suffix_table() {
190     // Return biased pointer that maps the range  [start_..pattern_.length()
191     // to the kSuffixTable array.
192     return isolate_->suffix_table() - start_;
193   }
194 
195   Isolate* isolate_;
196   // The pattern to search for.
197   Vector<const PatternChar> pattern_;
198   // Pointer to implementation of the search.
199   SearchFunction strategy_;
200   // Cache value of Max(0, pattern_length() - kBMMaxShift)
201   int start_;
202 };
203 
204 
205 //---------------------------------------------------------------------
206 // Single Character Pattern Search Strategy
207 //---------------------------------------------------------------------
208 
209 template <typename PatternChar, typename SubjectChar>
SingleCharSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)210 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
211     StringSearch<PatternChar, SubjectChar>* search,
212     Vector<const SubjectChar> subject,
213     int index) {
214   ASSERT_EQ(1, search->pattern_.length());
215   PatternChar pattern_first_char = search->pattern_[0];
216   int i = index;
217   if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
218     const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
219         memchr(subject.start() + i,
220                pattern_first_char,
221                subject.length() - i));
222     if (pos == NULL) return -1;
223     return static_cast<int>(pos - subject.start());
224   } else {
225     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
226       if (static_cast<uc16>(pattern_first_char) > String::kMaxAsciiCharCodeU) {
227         return -1;
228       }
229     }
230     SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
231     int n = subject.length();
232     while (i < n) {
233       if (subject[i++] == search_char) return i - 1;
234     }
235     return -1;
236   }
237 }
238 
239 //---------------------------------------------------------------------
240 // Linear Search Strategy
241 //---------------------------------------------------------------------
242 
243 
244 template <typename PatternChar, typename SubjectChar>
CharCompare(const PatternChar * pattern,const SubjectChar * subject,int length)245 inline bool CharCompare(const PatternChar* pattern,
246                         const SubjectChar* subject,
247                         int length) {
248   ASSERT(length > 0);
249   int pos = 0;
250   do {
251     if (pattern[pos] != subject[pos]) {
252       return false;
253     }
254     pos++;
255   } while (pos < length);
256   return true;
257 }
258 
259 
260 // Simple linear search for short patterns. Never bails out.
261 template <typename PatternChar, typename SubjectChar>
LinearSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)262 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
263     StringSearch<PatternChar, SubjectChar>* search,
264     Vector<const SubjectChar> subject,
265     int index) {
266   Vector<const PatternChar> pattern = search->pattern_;
267   ASSERT(pattern.length() > 1);
268   int pattern_length = pattern.length();
269   PatternChar pattern_first_char = pattern[0];
270   int i = index;
271   int n = subject.length() - pattern_length;
272   while (i <= n) {
273     if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
274       const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
275           memchr(subject.start() + i,
276                  pattern_first_char,
277                  n - i + 1));
278       if (pos == NULL) return -1;
279       i = static_cast<int>(pos - subject.start()) + 1;
280     } else {
281       if (subject[i++] != pattern_first_char) continue;
282     }
283     // Loop extracted to separate function to allow using return to do
284     // a deeper break.
285     if (CharCompare(pattern.start() + 1,
286                     subject.start() + i,
287                     pattern_length - 1)) {
288       return i - 1;
289     }
290   }
291   return -1;
292 }
293 
294 //---------------------------------------------------------------------
295 // Boyer-Moore string search
296 //---------------------------------------------------------------------
297 
298 template <typename PatternChar, typename SubjectChar>
BoyerMooreSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int start_index)299 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
300     StringSearch<PatternChar, SubjectChar>* search,
301     Vector<const SubjectChar> subject,
302     int start_index) {
303   Vector<const PatternChar> pattern = search->pattern_;
304   int subject_length = subject.length();
305   int pattern_length = pattern.length();
306   // Only preprocess at most kBMMaxShift last characters of pattern.
307   int start = search->start_;
308 
309   int* bad_char_occurence = search->bad_char_table();
310   int* good_suffix_shift = search->good_suffix_shift_table();
311 
312   PatternChar last_char = pattern[pattern_length - 1];
313   int index = start_index;
314   // Continue search from i.
315   while (index <= subject_length - pattern_length) {
316     int j = pattern_length - 1;
317     int c;
318     while (last_char != (c = subject[index + j])) {
319       int shift =
320           j - CharOccurrence(bad_char_occurence, c);
321       index += shift;
322       if (index > subject_length - pattern_length) {
323         return -1;
324       }
325     }
326     while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
327     if (j < 0) {
328       return index;
329     } else if (j < start) {
330       // we have matched more than our tables allow us to be smart about.
331       // Fall back on BMH shift.
332       index += pattern_length - 1
333           - CharOccurrence(bad_char_occurence,
334                            static_cast<SubjectChar>(last_char));
335     } else {
336       int gs_shift = good_suffix_shift[j + 1];
337       int bc_occ =
338           CharOccurrence(bad_char_occurence, c);
339       int shift = j - bc_occ;
340       if (gs_shift > shift) {
341         shift = gs_shift;
342       }
343       index += shift;
344     }
345   }
346 
347   return -1;
348 }
349 
350 
351 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreTable()352 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
353   int pattern_length = pattern_.length();
354   const PatternChar* pattern = pattern_.start();
355   // Only look at the last kBMMaxShift characters of pattern (from start_
356   // to pattern_length).
357   int start = start_;
358   int length = pattern_length - start;
359 
360   // Biased tables so that we can use pattern indices as table indices,
361   // even if we only cover the part of the pattern from offset start.
362   int* shift_table = good_suffix_shift_table();
363   int* suffix_table = this->suffix_table();
364 
365   // Initialize table.
366   for (int i = start; i < pattern_length; i++) {
367     shift_table[i] = length;
368   }
369   shift_table[pattern_length] = 1;
370   suffix_table[pattern_length] = pattern_length + 1;
371 
372   if (pattern_length <= start) {
373     return;
374   }
375 
376   // Find suffixes.
377   PatternChar last_char = pattern[pattern_length - 1];
378   int suffix = pattern_length + 1;
379   {
380     int i = pattern_length;
381     while (i > start) {
382       PatternChar c = pattern[i - 1];
383       while (suffix <= pattern_length && c != pattern[suffix - 1]) {
384         if (shift_table[suffix] == length) {
385           shift_table[suffix] = suffix - i;
386         }
387         suffix = suffix_table[suffix];
388       }
389       suffix_table[--i] = --suffix;
390       if (suffix == pattern_length) {
391         // No suffix to extend, so we check against last_char only.
392         while ((i > start) && (pattern[i - 1] != last_char)) {
393           if (shift_table[pattern_length] == length) {
394             shift_table[pattern_length] = pattern_length - i;
395           }
396           suffix_table[--i] = pattern_length;
397         }
398         if (i > start) {
399           suffix_table[--i] = --suffix;
400         }
401       }
402     }
403   }
404   // Build shift table using suffixes.
405   if (suffix < pattern_length) {
406     for (int i = start; i <= pattern_length; i++) {
407       if (shift_table[i] == length) {
408         shift_table[i] = suffix - start;
409       }
410       if (i == suffix) {
411         suffix = suffix_table[suffix];
412       }
413     }
414   }
415 }
416 
417 //---------------------------------------------------------------------
418 // Boyer-Moore-Horspool string search.
419 //---------------------------------------------------------------------
420 
421 template <typename PatternChar, typename SubjectChar>
BoyerMooreHorspoolSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int start_index)422 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
423     StringSearch<PatternChar, SubjectChar>* search,
424     Vector<const SubjectChar> subject,
425     int start_index) {
426   Vector<const PatternChar> pattern = search->pattern_;
427   int subject_length = subject.length();
428   int pattern_length = pattern.length();
429   int* char_occurrences = search->bad_char_table();
430   int badness = -pattern_length;
431 
432   // How bad we are doing without a good-suffix table.
433   PatternChar last_char = pattern[pattern_length - 1];
434   int last_char_shift = pattern_length - 1 -
435       CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
436   // Perform search
437   int index = start_index;  // No matches found prior to this index.
438   while (index <= subject_length - pattern_length) {
439     int j = pattern_length - 1;
440     int subject_char;
441     while (last_char != (subject_char = subject[index + j])) {
442       int bc_occ = CharOccurrence(char_occurrences, subject_char);
443       int shift = j - bc_occ;
444       index += shift;
445       badness += 1 - shift;  // at most zero, so badness cannot increase.
446       if (index > subject_length - pattern_length) {
447         return -1;
448       }
449     }
450     j--;
451     while (j >= 0 && pattern[j] == (subject[index + j])) j--;
452     if (j < 0) {
453       return index;
454     } else {
455       index += last_char_shift;
456       // Badness increases by the number of characters we have
457       // checked, and decreases by the number of characters we
458       // can skip by shifting. It's a measure of how we are doing
459       // compared to reading each character exactly once.
460       badness += (pattern_length - j) - last_char_shift;
461       if (badness > 0) {
462         search->PopulateBoyerMooreTable();
463         search->strategy_ = &BoyerMooreSearch;
464         return BoyerMooreSearch(search, subject, index);
465       }
466     }
467   }
468   return -1;
469 }
470 
471 
472 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreHorspoolTable()473 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
474   int pattern_length = pattern_.length();
475 
476   int* bad_char_occurrence = bad_char_table();
477 
478   // Only preprocess at most kBMMaxShift last characters of pattern.
479   int start = start_;
480   // Run forwards to populate bad_char_table, so that *last* instance
481   // of character equivalence class is the one registered.
482   // Notice: Doesn't include the last character.
483   int table_size = AlphabetSize();
484   if (start == 0) {  // All patterns less than kBMMaxShift in length.
485     memset(bad_char_occurrence,
486            -1,
487            table_size * sizeof(*bad_char_occurrence));
488   } else {
489     for (int i = 0; i < table_size; i++) {
490       bad_char_occurrence[i] = start - 1;
491     }
492   }
493   for (int i = start; i < pattern_length - 1; i++) {
494     PatternChar c = pattern_[i];
495     int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
496     bad_char_occurrence[bucket] = i;
497   }
498 }
499 
500 //---------------------------------------------------------------------
501 // Linear string search with bailout to BMH.
502 //---------------------------------------------------------------------
503 
504 // Simple linear search for short patterns, which bails out if the string
505 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
506 template <typename PatternChar, typename SubjectChar>
InitialSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)507 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
508     StringSearch<PatternChar, SubjectChar>* search,
509     Vector<const SubjectChar> subject,
510     int index) {
511   Vector<const PatternChar> pattern = search->pattern_;
512   int pattern_length = pattern.length();
513   // Badness is a count of how much work we have done.  When we have
514   // done enough work we decide it's probably worth switching to a better
515   // algorithm.
516   int badness = -10 - (pattern_length << 2);
517 
518   // We know our pattern is at least 2 characters, we cache the first so
519   // the common case of the first character not matching is faster.
520   PatternChar pattern_first_char = pattern[0];
521   for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
522     badness++;
523     if (badness <= 0) {
524       if (sizeof(SubjectChar) == 1 && sizeof(PatternChar) == 1) {
525         const SubjectChar* pos = reinterpret_cast<const SubjectChar*>(
526             memchr(subject.start() + i,
527                    pattern_first_char,
528                    n - i + 1));
529         if (pos == NULL) {
530           return -1;
531         }
532         i = static_cast<int>(pos - subject.start());
533       } else {
534         if (subject[i] != pattern_first_char) continue;
535       }
536       int j = 1;
537       do {
538         if (pattern[j] != subject[i + j]) {
539           break;
540         }
541         j++;
542       } while (j < pattern_length);
543       if (j == pattern_length) {
544         return i;
545       }
546       badness += j;
547     } else {
548       search->PopulateBoyerMooreHorspoolTable();
549       search->strategy_ = &BoyerMooreHorspoolSearch;
550       return BoyerMooreHorspoolSearch(search, subject, i);
551     }
552   }
553   return -1;
554 }
555 
556 
557 // Perform a a single stand-alone search.
558 // If searching multiple times for the same pattern, a search
559 // object should be constructed once and the Search function then called
560 // for each search.
561 template <typename SubjectChar, typename PatternChar>
SearchString(Isolate * isolate,Vector<const SubjectChar> subject,Vector<const PatternChar> pattern,int start_index)562 int SearchString(Isolate* isolate,
563                  Vector<const SubjectChar> subject,
564                  Vector<const PatternChar> pattern,
565                  int start_index) {
566   StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
567   return search.Search(subject, start_index);
568 }
569 
570 }}  // namespace v8::internal
571 
572 #endif  // V8_STRING_SEARCH_H_
573