• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_STRINGS_STRING_SEARCH_H_
6 #define V8_STRINGS_STRING_SEARCH_H_
7 
8 #include "src/execution/isolate.h"
9 #include "src/utils/vector.h"
10 
11 namespace v8 {
12 namespace internal {
13 
14 //---------------------------------------------------------------------
15 // String Search object.
16 //---------------------------------------------------------------------
17 
18 // Class holding constants and methods that apply to all string search variants,
19 // independently of subject and pattern char size.
20 class StringSearchBase {
21  protected:
22   // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
23   // limit, we can fix the size of tables. For a needle longer than this limit,
24   // search will not be optimal, since we only build tables for a suffix
25   // of the string, but it is a safe approximation.
26   static const int kBMMaxShift = Isolate::kBMMaxShift;
27 
28   // Reduce alphabet to this size.
29   // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
30   // proportional to the input alphabet. We reduce the alphabet size by
31   // equating input characters modulo a smaller alphabet size. This gives
32   // a potentially less efficient searching, but is a safe approximation.
33   // For needles using only characters in the same Unicode 256-code point page,
34   // there is no search speed degradation.
35   static const int kLatin1AlphabetSize = 256;
36   static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
37 
38   // Bad-char shift table stored in the state. It's length is the alphabet size.
39   // For patterns below this length, the skip length of Boyer-Moore is too short
40   // to compensate for the algorithmic overhead compared to simple brute force.
41   static const int kBMMinPatternLength = 7;
42 
IsOneByteString(Vector<const uint8_t> string)43   static inline bool IsOneByteString(Vector<const uint8_t> string) {
44     return true;
45   }
46 
IsOneByteString(Vector<const uc16> string)47   static inline bool IsOneByteString(Vector<const uc16> string) {
48     return String::IsOneByte(string.begin(), string.length());
49   }
50 
51   friend class Isolate;
52 };
53 
54 template <typename PatternChar, typename SubjectChar>
55 class StringSearch : private StringSearchBase {
56  public:
StringSearch(Isolate * isolate,Vector<const PatternChar> pattern)57   StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
58       : isolate_(isolate),
59         pattern_(pattern),
60         start_(Max(0, pattern.length() - kBMMaxShift)) {
61     if (sizeof(PatternChar) > sizeof(SubjectChar)) {
62       if (!IsOneByteString(pattern_)) {
63         strategy_ = &FailSearch;
64         return;
65       }
66     }
67     int pattern_length = pattern_.length();
68     if (pattern_length < kBMMinPatternLength) {
69       if (pattern_length == 1) {
70         strategy_ = &SingleCharSearch;
71         return;
72       }
73       strategy_ = &LinearSearch;
74       return;
75     }
76     strategy_ = &InitialSearch;
77   }
78 
Search(Vector<const SubjectChar> subject,int index)79   int Search(Vector<const SubjectChar> subject, int index) {
80     return strategy_(this, subject, index);
81   }
82 
AlphabetSize()83   static inline int AlphabetSize() {
84     if (sizeof(PatternChar) == 1) {
85       // Latin1 needle.
86       return kLatin1AlphabetSize;
87     } else {
88       DCHECK_EQ(sizeof(PatternChar), 2);
89       // UC16 needle.
90       return kUC16AlphabetSize;
91     }
92   }
93 
94  private:
95   using SearchFunction = int (*)(StringSearch<PatternChar, SubjectChar>*,
96                                  Vector<const SubjectChar>, int);
97 
FailSearch(StringSearch<PatternChar,SubjectChar> *,Vector<const SubjectChar>,int)98   static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
99                         Vector<const SubjectChar>, int) {
100     return -1;
101   }
102 
103   static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
104                               Vector<const SubjectChar> subject,
105                               int start_index);
106 
107   static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
108                           Vector<const SubjectChar> subject, int start_index);
109 
110   static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
111                            Vector<const SubjectChar> subject, int start_index);
112 
113   static int BoyerMooreHorspoolSearch(
114       StringSearch<PatternChar, SubjectChar>* search,
115       Vector<const SubjectChar> subject, int start_index);
116 
117   static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
118                               Vector<const SubjectChar> subject,
119                               int start_index);
120 
121   void PopulateBoyerMooreHorspoolTable();
122 
123   void PopulateBoyerMooreTable();
124 
exceedsOneByte(uint8_t c)125   static inline bool exceedsOneByte(uint8_t c) { return false; }
126 
exceedsOneByte(uint16_t c)127   static inline bool exceedsOneByte(uint16_t c) {
128     return c > String::kMaxOneByteCharCodeU;
129   }
130 
CharOccurrence(int * bad_char_occurrence,SubjectChar char_code)131   static inline int CharOccurrence(int* bad_char_occurrence,
132                                    SubjectChar char_code) {
133     if (sizeof(SubjectChar) == 1) {
134       return bad_char_occurrence[static_cast<int>(char_code)];
135     }
136     if (sizeof(PatternChar) == 1) {
137       if (exceedsOneByte(char_code)) {
138         return -1;
139       }
140       return bad_char_occurrence[static_cast<unsigned int>(char_code)];
141     }
142     // Both pattern and subject are UC16. Reduce character to equivalence class.
143     int equiv_class = char_code % kUC16AlphabetSize;
144     return bad_char_occurrence[equiv_class];
145   }
146 
147   // The following tables are shared by all searches.
148   // TODO(lrn): Introduce a way for a pattern to keep its tables
149   // between searches (e.g., for an Atom RegExp).
150 
151   // Store for the BoyerMoore(Horspool) bad char shift table.
152   // Return a table covering the last kBMMaxShift+1 positions of
153   // pattern.
bad_char_table()154   int* bad_char_table() { return isolate_->bad_char_shift_table(); }
155 
156   // Store for the BoyerMoore good suffix shift table.
good_suffix_shift_table()157   int* good_suffix_shift_table() {
158     // Return biased pointer that maps the range  [start_..pattern_.length()
159     // to the kGoodSuffixShiftTable array.
160     return isolate_->good_suffix_shift_table() - start_;
161   }
162 
163   // Table used temporarily while building the BoyerMoore good suffix
164   // shift table.
suffix_table()165   int* suffix_table() {
166     // Return biased pointer that maps the range  [start_..pattern_.length()
167     // to the kSuffixTable array.
168     return isolate_->suffix_table() - start_;
169   }
170 
171   Isolate* isolate_;
172   // The pattern to search for.
173   Vector<const PatternChar> pattern_;
174   // Pointer to implementation of the search.
175   SearchFunction strategy_;
176   // Cache value of Max(0, pattern_length() - kBMMaxShift)
177   int start_;
178 };
179 
180 template <typename T, typename U>
AlignDown(T value,U alignment)181 inline T AlignDown(T value, U alignment) {
182   return reinterpret_cast<T>(
183       (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
184 }
185 
GetHighestValueByte(uc16 character)186 inline uint8_t GetHighestValueByte(uc16 character) {
187   return Max(static_cast<uint8_t>(character & 0xFF),
188              static_cast<uint8_t>(character >> 8));
189 }
190 
GetHighestValueByte(uint8_t character)191 inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
192 
193 template <typename PatternChar, typename SubjectChar>
FindFirstCharacter(Vector<const PatternChar> pattern,Vector<const SubjectChar> subject,int index)194 inline int FindFirstCharacter(Vector<const PatternChar> pattern,
195                               Vector<const SubjectChar> subject, int index) {
196   const PatternChar pattern_first_char = pattern[0];
197   const int max_n = (subject.length() - pattern.length() + 1);
198 
199   if (sizeof(SubjectChar) == 2 && pattern_first_char == 0) {
200     // Special-case looking for the 0 char in other than one-byte strings.
201     // memchr mostly fails in this case due to every other byte being 0 in text
202     // that is mostly ascii characters.
203     for (int i = index; i < max_n; ++i) {
204       if (subject[i] == 0) return i;
205     }
206     return -1;
207   }
208   const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
209   const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
210   int pos = index;
211   do {
212     DCHECK_GE(max_n - pos, 0);
213     const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
214         memchr(subject.begin() + pos, search_byte,
215                (max_n - pos) * sizeof(SubjectChar)));
216     if (char_pos == nullptr) return -1;
217     char_pos = AlignDown(char_pos, sizeof(SubjectChar));
218     pos = static_cast<int>(char_pos - subject.begin());
219     if (subject[pos] == search_char) return pos;
220   } while (++pos < max_n);
221 
222   return -1;
223 }
224 
225 //---------------------------------------------------------------------
226 // Single Character Pattern Search Strategy
227 //---------------------------------------------------------------------
228 
229 template <typename PatternChar, typename SubjectChar>
SingleCharSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)230 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
231     StringSearch<PatternChar, SubjectChar>* search,
232     Vector<const SubjectChar> subject, int index) {
233   DCHECK_EQ(1, search->pattern_.length());
234   PatternChar pattern_first_char = search->pattern_[0];
235   if (sizeof(PatternChar) > sizeof(SubjectChar)) {
236     if (exceedsOneByte(pattern_first_char)) {
237       return -1;
238     }
239   }
240   return FindFirstCharacter(search->pattern_, subject, index);
241 }
242 
243 //---------------------------------------------------------------------
244 // Linear Search Strategy
245 //---------------------------------------------------------------------
246 
247 template <typename PatternChar, typename SubjectChar>
CharCompare(const PatternChar * pattern,const SubjectChar * subject,int length)248 inline bool CharCompare(const PatternChar* pattern, const SubjectChar* subject,
249                         int length) {
250   DCHECK_GT(length, 0);
251   int pos = 0;
252   do {
253     if (pattern[pos] != subject[pos]) {
254       return false;
255     }
256     pos++;
257   } while (pos < length);
258   return true;
259 }
260 
261 // Simple linear search for short patterns. Never bails out.
262 template <typename PatternChar, typename SubjectChar>
LinearSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)263 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
264     StringSearch<PatternChar, SubjectChar>* search,
265     Vector<const SubjectChar> subject, int index) {
266   Vector<const PatternChar> pattern = search->pattern_;
267   DCHECK_GT(pattern.length(), 1);
268   int pattern_length = pattern.length();
269   int i = index;
270   int n = subject.length() - pattern_length;
271   while (i <= n) {
272     i = FindFirstCharacter(pattern, subject, i);
273     if (i == -1) return -1;
274     DCHECK_LE(i, n);
275     i++;
276     // Loop extracted to separate function to allow using return to do
277     // a deeper break.
278     if (CharCompare(pattern.begin() + 1, subject.begin() + i,
279                     pattern_length - 1)) {
280       return i - 1;
281     }
282   }
283   return -1;
284 }
285 
286 //---------------------------------------------------------------------
287 // Boyer-Moore string search
288 //---------------------------------------------------------------------
289 
290 template <typename PatternChar, typename SubjectChar>
BoyerMooreSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int start_index)291 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
292     StringSearch<PatternChar, SubjectChar>* search,
293     Vector<const SubjectChar> subject, int start_index) {
294   Vector<const PatternChar> pattern = search->pattern_;
295   int subject_length = subject.length();
296   int pattern_length = pattern.length();
297   // Only preprocess at most kBMMaxShift last characters of pattern.
298   int start = search->start_;
299 
300   int* bad_char_occurence = search->bad_char_table();
301   int* good_suffix_shift = search->good_suffix_shift_table();
302 
303   PatternChar last_char = pattern[pattern_length - 1];
304   int index = start_index;
305   // Continue search from i.
306   while (index <= subject_length - pattern_length) {
307     int j = pattern_length - 1;
308     int c;
309     while (last_char != (c = subject[index + j])) {
310       int shift = j - CharOccurrence(bad_char_occurence, c);
311       index += shift;
312       if (index > subject_length - pattern_length) {
313         return -1;
314       }
315     }
316     while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
317     if (j < 0) {
318       return index;
319     } else if (j < start) {
320       // we have matched more than our tables allow us to be smart about.
321       // Fall back on BMH shift.
322       index += pattern_length - 1 -
323                CharOccurrence(bad_char_occurence,
324                               static_cast<SubjectChar>(last_char));
325     } else {
326       int gs_shift = good_suffix_shift[j + 1];
327       int bc_occ = CharOccurrence(bad_char_occurence, c);
328       int shift = j - bc_occ;
329       if (gs_shift > shift) {
330         shift = gs_shift;
331       }
332       index += shift;
333     }
334   }
335 
336   return -1;
337 }
338 
339 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreTable()340 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
341   int pattern_length = pattern_.length();
342   const PatternChar* pattern = pattern_.begin();
343   // Only look at the last kBMMaxShift characters of pattern (from start_
344   // to pattern_length).
345   int start = start_;
346   int length = pattern_length - start;
347 
348   // Biased tables so that we can use pattern indices as table indices,
349   // even if we only cover the part of the pattern from offset start.
350   int* shift_table = good_suffix_shift_table();
351   int* suffix_table = this->suffix_table();
352 
353   // Initialize table.
354   for (int i = start; i < pattern_length; i++) {
355     shift_table[i] = length;
356   }
357   shift_table[pattern_length] = 1;
358   suffix_table[pattern_length] = pattern_length + 1;
359 
360   if (pattern_length <= start) {
361     return;
362   }
363 
364   // Find suffixes.
365   PatternChar last_char = pattern[pattern_length - 1];
366   int suffix = pattern_length + 1;
367   {
368     int i = pattern_length;
369     while (i > start) {
370       PatternChar c = pattern[i - 1];
371       while (suffix <= pattern_length && c != pattern[suffix - 1]) {
372         if (shift_table[suffix] == length) {
373           shift_table[suffix] = suffix - i;
374         }
375         suffix = suffix_table[suffix];
376       }
377       suffix_table[--i] = --suffix;
378       if (suffix == pattern_length) {
379         // No suffix to extend, so we check against last_char only.
380         while ((i > start) && (pattern[i - 1] != last_char)) {
381           if (shift_table[pattern_length] == length) {
382             shift_table[pattern_length] = pattern_length - i;
383           }
384           suffix_table[--i] = pattern_length;
385         }
386         if (i > start) {
387           suffix_table[--i] = --suffix;
388         }
389       }
390     }
391   }
392   // Build shift table using suffixes.
393   if (suffix < pattern_length) {
394     for (int i = start; i <= pattern_length; i++) {
395       if (shift_table[i] == length) {
396         shift_table[i] = suffix - start;
397       }
398       if (i == suffix) {
399         suffix = suffix_table[suffix];
400       }
401     }
402   }
403 }
404 
405 //---------------------------------------------------------------------
406 // Boyer-Moore-Horspool string search.
407 //---------------------------------------------------------------------
408 
409 template <typename PatternChar, typename SubjectChar>
BoyerMooreHorspoolSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int start_index)410 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
411     StringSearch<PatternChar, SubjectChar>* search,
412     Vector<const SubjectChar> subject, int start_index) {
413   Vector<const PatternChar> pattern = search->pattern_;
414   int subject_length = subject.length();
415   int pattern_length = pattern.length();
416   int* char_occurrences = search->bad_char_table();
417   int badness = -pattern_length;
418 
419   // How bad we are doing without a good-suffix table.
420   PatternChar last_char = pattern[pattern_length - 1];
421   int last_char_shift =
422       pattern_length - 1 -
423       CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
424   // Perform search
425   int index = start_index;  // No matches found prior to this index.
426   while (index <= subject_length - pattern_length) {
427     int j = pattern_length - 1;
428     int subject_char;
429     while (last_char != (subject_char = subject[index + j])) {
430       int bc_occ = CharOccurrence(char_occurrences, subject_char);
431       int shift = j - bc_occ;
432       index += shift;
433       badness += 1 - shift;  // at most zero, so badness cannot increase.
434       if (index > subject_length - pattern_length) {
435         return -1;
436       }
437     }
438     j--;
439     while (j >= 0 && pattern[j] == (subject[index + j])) j--;
440     if (j < 0) {
441       return index;
442     } else {
443       index += last_char_shift;
444       // Badness increases by the number of characters we have
445       // checked, and decreases by the number of characters we
446       // can skip by shifting. It's a measure of how we are doing
447       // compared to reading each character exactly once.
448       badness += (pattern_length - j) - last_char_shift;
449       if (badness > 0) {
450         search->PopulateBoyerMooreTable();
451         search->strategy_ = &BoyerMooreSearch;
452         return BoyerMooreSearch(search, subject, index);
453       }
454     }
455   }
456   return -1;
457 }
458 
459 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreHorspoolTable()460 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
461   int pattern_length = pattern_.length();
462 
463   int* bad_char_occurrence = bad_char_table();
464 
465   // Only preprocess at most kBMMaxShift last characters of pattern.
466   int start = start_;
467   // Run forwards to populate bad_char_table, so that *last* instance
468   // of character equivalence class is the one registered.
469   // Notice: Doesn't include the last character.
470   int table_size = AlphabetSize();
471   if (start == 0) {  // All patterns less than kBMMaxShift in length.
472     memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
473   } else {
474     for (int i = 0; i < table_size; i++) {
475       bad_char_occurrence[i] = start - 1;
476     }
477   }
478   for (int i = start; i < pattern_length - 1; i++) {
479     PatternChar c = pattern_[i];
480     int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
481     bad_char_occurrence[bucket] = i;
482   }
483 }
484 
485 //---------------------------------------------------------------------
486 // Linear string search with bailout to BMH.
487 //---------------------------------------------------------------------
488 
489 // Simple linear search for short patterns, which bails out if the string
490 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
491 template <typename PatternChar, typename SubjectChar>
InitialSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)492 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
493     StringSearch<PatternChar, SubjectChar>* search,
494     Vector<const SubjectChar> subject, int index) {
495   Vector<const PatternChar> pattern = search->pattern_;
496   int pattern_length = pattern.length();
497   // Badness is a count of how much work we have done.  When we have
498   // done enough work we decide it's probably worth switching to a better
499   // algorithm.
500   int badness = -10 - (pattern_length << 2);
501 
502   // We know our pattern is at least 2 characters, we cache the first so
503   // the common case of the first character not matching is faster.
504   for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
505     badness++;
506     if (badness <= 0) {
507       i = FindFirstCharacter(pattern, subject, i);
508       if (i == -1) return -1;
509       DCHECK_LE(i, n);
510       int j = 1;
511       do {
512         if (pattern[j] != subject[i + j]) {
513           break;
514         }
515         j++;
516       } while (j < pattern_length);
517       if (j == pattern_length) {
518         return i;
519       }
520       badness += j;
521     } else {
522       search->PopulateBoyerMooreHorspoolTable();
523       search->strategy_ = &BoyerMooreHorspoolSearch;
524       return BoyerMooreHorspoolSearch(search, subject, i);
525     }
526   }
527   return -1;
528 }
529 
530 // Perform a a single stand-alone search.
531 // If searching multiple times for the same pattern, a search
532 // object should be constructed once and the Search function then called
533 // for each search.
534 template <typename SubjectChar, typename PatternChar>
SearchString(Isolate * isolate,Vector<const SubjectChar> subject,Vector<const PatternChar> pattern,int start_index)535 int SearchString(Isolate* isolate, Vector<const SubjectChar> subject,
536                  Vector<const PatternChar> pattern, int start_index) {
537   StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
538   return search.Search(subject, start_index);
539 }
540 
541 // A wrapper function around SearchString that wraps raw pointers to the subject
542 // and pattern as vectors before calling SearchString. Used from the
543 // StringIndexOf builtin.
544 template <typename SubjectChar, typename PatternChar>
SearchStringRaw(Isolate * isolate,const SubjectChar * subject_ptr,int subject_length,const PatternChar * pattern_ptr,int pattern_length,int start_index)545 intptr_t SearchStringRaw(Isolate* isolate, const SubjectChar* subject_ptr,
546                          int subject_length, const PatternChar* pattern_ptr,
547                          int pattern_length, int start_index) {
548   DisallowHeapAllocation no_gc;
549   Vector<const SubjectChar> subject(subject_ptr, subject_length);
550   Vector<const PatternChar> pattern(pattern_ptr, pattern_length);
551   return SearchString(isolate, subject, pattern, start_index);
552 }
553 
554 }  // namespace internal
555 }  // namespace v8
556 
557 #endif  // V8_STRINGS_STRING_SEARCH_H_
558