• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef SRC_STRING_SEARCH_H_
6 #define SRC_STRING_SEARCH_H_
7 
8 #if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
9 
10 #include "util.h"
11 
12 #include <cstring>
13 #include <algorithm>
14 
15 namespace node {
16 namespace stringsearch {
17 
18 template <typename T>
19 class Vector {
20  public:
Vector(T * data,size_t length,bool isForward)21   Vector(T* data, size_t length, bool isForward)
22       : start_(data), length_(length), is_forward_(isForward) {
23     CHECK(length > 0 && data != nullptr);
24   }
25 
26   // Returns the start of the memory range.
27   // For vector v this is NOT necessarily &v[0], see forward().
start()28   const T* start() const { return start_; }
29 
30   // Returns the length of the vector, in characters.
length()31   size_t length() const { return length_; }
32 
33   // Returns true if the Vector is front-to-back, false if back-to-front.
34   // In the latter case, v[0] corresponds to the *end* of the memory range.
forward()35   bool forward() const { return is_forward_; }
36 
37   // Access individual vector elements - checks bounds in debug mode.
38   T& operator[](size_t index) const {
39     DCHECK_LT(index, length_);
40     return start_[is_forward_ ? index : (length_ - index - 1)];
41   }
42 
43  private:
44   T* start_;
45   size_t length_;
46   bool is_forward_;
47 };
48 
49 
50 //---------------------------------------------------------------------
51 // String Search object.
52 //---------------------------------------------------------------------
53 
54 // Class holding constants and methods that apply to all string search variants,
55 // independently of subject and pattern char size.
56 class StringSearchBase {
57  protected:
58   // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
59   // limit, we can fix the size of tables. For a needle longer than this limit,
60   // search will not be optimal, since we only build tables for a suffix
61   // of the string, but it is a safe approximation.
62   static const int kBMMaxShift = 250;
63 
64   // Reduce alphabet to this size.
65   // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
66   // proportional to the input alphabet. We reduce the alphabet size by
67   // equating input characters modulo a smaller alphabet size. This gives
68   // a potentially less efficient searching, but is a safe approximation.
69   // For needles using only characters in the same Unicode 256-code point page,
70   // there is no search speed degradation.
71   static const int kLatin1AlphabetSize = 256;
72   static const int kUC16AlphabetSize = 256;
73 
74   // Bad-char shift table stored in the state. It's length is the alphabet size.
75   // For patterns below this length, the skip length of Boyer-Moore is too short
76   // to compensate for the algorithmic overhead compared to simple brute force.
77   static const int kBMMinPatternLength = 8;
78 
79   // Store for the BoyerMoore(Horspool) bad char shift table.
80   int bad_char_shift_table_[kUC16AlphabetSize];
81   // Store for the BoyerMoore good suffix shift table.
82   int good_suffix_shift_table_[kBMMaxShift + 1];
83   // Table used temporarily while building the BoyerMoore good suffix
84   // shift table.
85   int suffix_table_[kBMMaxShift + 1];
86 };
87 
88 template <typename Char>
89 class StringSearch : private StringSearchBase {
90  public:
91   typedef stringsearch::Vector<const Char> Vector;
92 
StringSearch(Vector pattern)93   explicit StringSearch(Vector pattern)
94       : pattern_(pattern), start_(0) {
95     if (pattern.length() >= kBMMaxShift) {
96       start_ = pattern.length() - kBMMaxShift;
97     }
98 
99     size_t pattern_length = pattern_.length();
100     CHECK_GT(pattern_length, 0);
101     if (pattern_length < kBMMinPatternLength) {
102       if (pattern_length == 1) {
103         strategy_ = SearchStrategy::kSingleChar;
104         return;
105       }
106       strategy_ = SearchStrategy::kLinear;
107       return;
108     }
109     strategy_ = SearchStrategy::kInitial;
110   }
111 
Search(Vector subject,size_t index)112   size_t Search(Vector subject, size_t index) {
113     switch (strategy_) {
114       case kBoyerMooreHorspool:
115         return BoyerMooreHorspoolSearch(subject, index);
116       case kBoyerMoore:
117         return BoyerMooreSearch(subject, index);
118       case kInitial:
119         return InitialSearch(subject, index);
120       case kLinear:
121         return LinearSearch(subject, index);
122       case kSingleChar:
123         return SingleCharSearch(subject, index);
124     }
125     UNREACHABLE();
126   }
127 
AlphabetSize()128   static inline int AlphabetSize() {
129     if (sizeof(Char) == 1) {
130       // Latin1 needle.
131       return kLatin1AlphabetSize;
132     } else {
133       // UC16 needle.
134       return kUC16AlphabetSize;
135     }
136 
137     static_assert(sizeof(Char) == sizeof(uint8_t) ||
138                   sizeof(Char) == sizeof(uint16_t),
139                   "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)");
140   }
141 
142  private:
143   typedef size_t (StringSearch::*SearchFunction)(Vector, size_t);
144   size_t SingleCharSearch(Vector subject, size_t start_index);
145   size_t LinearSearch(Vector subject, size_t start_index);
146   size_t InitialSearch(Vector subject, size_t start_index);
147   size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index);
148   size_t BoyerMooreSearch(Vector subject, size_t start_index);
149 
150   void PopulateBoyerMooreHorspoolTable();
151 
152   void PopulateBoyerMooreTable();
153 
CharOccurrence(int * bad_char_occurrence,Char char_code)154   static inline int CharOccurrence(int* bad_char_occurrence,
155                                    Char char_code) {
156     if (sizeof(Char) == 1) {
157       return bad_char_occurrence[static_cast<int>(char_code)];
158     }
159     // Both pattern and subject are UC16. Reduce character to equivalence class.
160     int equiv_class = char_code % kUC16AlphabetSize;
161     return bad_char_occurrence[equiv_class];
162   }
163 
164   enum SearchStrategy {
165     kBoyerMooreHorspool,
166     kBoyerMoore,
167     kInitial,
168     kLinear,
169     kSingleChar,
170   };
171 
172   // The pattern to search for.
173   Vector pattern_;
174   SearchStrategy strategy_;
175   // Cache value of Max(0, pattern_length() - kBMMaxShift)
176   size_t start_;
177 };
178 
179 
180 template <typename T, typename U>
AlignDown(T value,U alignment)181 inline T AlignDown(T value, U alignment) {
182   return reinterpret_cast<T>(
183       (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
184 }
185 
186 
GetHighestValueByte(uint16_t character)187 inline uint8_t GetHighestValueByte(uint16_t character) {
188   return std::max(static_cast<uint8_t>(character & 0xFF),
189                   static_cast<uint8_t>(character >> 8));
190 }
191 
192 
GetHighestValueByte(uint8_t character)193 inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
194 
195 
196 // Searches for a byte value in a memory buffer, back to front.
197 // Uses memrchr(3) on systems which support it, for speed.
198 // Falls back to a vanilla for loop on non-GNU systems such as Windows.
MemrchrFill(const void * haystack,uint8_t needle,size_t haystack_len)199 inline const void* MemrchrFill(const void* haystack, uint8_t needle,
200                                size_t haystack_len) {
201 #ifdef _GNU_SOURCE
202   return memrchr(haystack, needle, haystack_len);
203 #else
204   const uint8_t* haystack8 = static_cast<const uint8_t*>(haystack);
205   for (size_t i = haystack_len - 1; i != static_cast<size_t>(-1); i--) {
206     if (haystack8[i] == needle) {
207       return haystack8 + i;
208     }
209   }
210   return nullptr;
211 #endif
212 }
213 
214 
215 // Finds the first occurrence of *two-byte* character pattern[0] in the string
216 // `subject`. Does not check that the whole pattern matches.
217 template <typename Char>
FindFirstCharacter(Vector<const Char> pattern,Vector<const Char> subject,size_t index)218 inline size_t FindFirstCharacter(Vector<const Char> pattern,
219                                  Vector<const Char> subject, size_t index) {
220   const Char pattern_first_char = pattern[0];
221   const size_t max_n = (subject.length() - pattern.length() + 1);
222 
223   // For speed, search for the more `rare` of the two bytes in pattern[0]
224   // using memchr / memrchr (which are much faster than a simple for loop).
225   const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
226   size_t pos = index;
227   do {
228     const size_t bytes_to_search = (max_n - pos) * sizeof(Char);
229     const void* void_pos;
230     if (subject.forward()) {
231       // Assert that bytes_to_search won't overflow
232       CHECK_LE(pos, max_n);
233       CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char));
234       void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search);
235     } else {
236       CHECK_LE(pos, subject.length());
237       CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char));
238       void_pos = MemrchrFill(subject.start() + pattern.length() - 1,
239                              search_byte,
240                              bytes_to_search);
241     }
242     const Char* char_pos = static_cast<const Char*>(void_pos);
243     if (char_pos == nullptr)
244       return subject.length();
245 
246     // Then, for each match, verify that the full two bytes match pattern[0].
247     char_pos = AlignDown(char_pos, sizeof(Char));
248     size_t raw_pos = static_cast<size_t>(char_pos - subject.start());
249     pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1);
250     if (subject[pos] == pattern_first_char) {
251       // Match found, hooray.
252       return pos;
253     }
254     // Search byte matched, but the other byte of pattern[0] didn't. Keep going.
255   } while (++pos < max_n);
256 
257   return subject.length();
258 }
259 
260 
261 // Finds the first occurrence of the byte pattern[0] in string `subject`.
262 // Does not verify that the whole pattern matches.
263 template <>
FindFirstCharacter(Vector<const uint8_t> pattern,Vector<const uint8_t> subject,size_t index)264 inline size_t FindFirstCharacter(Vector<const uint8_t> pattern,
265                                  Vector<const uint8_t> subject,
266                                  size_t index) {
267   const uint8_t pattern_first_char = pattern[0];
268   const size_t subj_len = subject.length();
269   const size_t max_n = (subject.length() - pattern.length() + 1);
270 
271   const void* pos;
272   if (subject.forward()) {
273     pos = memchr(subject.start() + index, pattern_first_char, max_n - index);
274   } else {
275     pos = MemrchrFill(subject.start() + pattern.length() - 1,
276                       pattern_first_char,
277                       max_n - index);
278   }
279   const uint8_t* char_pos = static_cast<const uint8_t*>(pos);
280   if (char_pos == nullptr) {
281     return subj_len;
282   }
283 
284   size_t raw_pos = static_cast<size_t>(char_pos - subject.start());
285   return subject.forward() ? raw_pos : (subj_len - raw_pos - 1);
286 }
287 
288 //---------------------------------------------------------------------
289 // Single Character Pattern Search Strategy
290 //---------------------------------------------------------------------
291 
292 template <typename Char>
SingleCharSearch(Vector subject,size_t index)293 size_t StringSearch<Char>::SingleCharSearch(
294     Vector subject,
295     size_t index) {
296   CHECK_EQ(1, pattern_.length());
297   return FindFirstCharacter(pattern_, subject, index);
298 }
299 
300 //---------------------------------------------------------------------
301 // Linear Search Strategy
302 //---------------------------------------------------------------------
303 
304 // Simple linear search for short patterns. Never bails out.
305 template <typename Char>
LinearSearch(Vector subject,size_t index)306 size_t StringSearch<Char>::LinearSearch(
307     Vector subject,
308     size_t index) {
309   CHECK_GT(pattern_.length(), 1);
310   const size_t n = subject.length() - pattern_.length();
311   for (size_t i = index; i <= n; i++) {
312     i = FindFirstCharacter(pattern_, subject, i);
313     if (i == subject.length())
314       return subject.length();
315     CHECK_LE(i, n);
316 
317     bool matches = true;
318     for (size_t j = 1; j < pattern_.length(); j++) {
319       if (pattern_[j] != subject[i + j]) {
320         matches = false;
321         break;
322       }
323     }
324     if (matches) {
325       return i;
326     }
327   }
328   return subject.length();
329 }
330 
331 //---------------------------------------------------------------------
332 // Boyer-Moore string search
333 //---------------------------------------------------------------------
334 
335 template <typename Char>
BoyerMooreSearch(Vector subject,size_t start_index)336 size_t StringSearch<Char>::BoyerMooreSearch(
337     Vector subject,
338     size_t start_index) {
339   const size_t subject_length = subject.length();
340   const size_t pattern_length = pattern_.length();
341   // Only preprocess at most kBMMaxShift last characters of pattern.
342   size_t start = start_;
343 
344   int* bad_char_occurrence = bad_char_shift_table_;
345   int* good_suffix_shift = good_suffix_shift_table_ - start_;
346 
347   Char last_char = pattern_[pattern_length - 1];
348   size_t index = start_index;
349   // Continue search from i.
350   while (index <= subject_length - pattern_length) {
351     size_t j = pattern_length - 1;
352     int c;
353     while (last_char != (c = subject[index + j])) {
354       int shift = j - CharOccurrence(bad_char_occurrence, c);
355       index += shift;
356       if (index > subject_length - pattern_length) {
357         return subject.length();
358       }
359     }
360     while (pattern_[j] == (c = subject[index + j])) {
361       if (j == 0) {
362         return index;
363       }
364       j--;
365     }
366     if (j < start) {
367       // we have matched more than our tables allow us to be smart about.
368       // Fall back on BMH shift.
369       index += pattern_length - 1 -
370                CharOccurrence(bad_char_occurrence, last_char);
371     } else {
372       int gs_shift = good_suffix_shift[j + 1];
373       int bc_occ = CharOccurrence(bad_char_occurrence, c);
374       int shift = j - bc_occ;
375       if (gs_shift > shift) {
376         shift = gs_shift;
377       }
378       index += shift;
379     }
380   }
381 
382   return subject.length();
383 }
384 
385 template <typename Char>
PopulateBoyerMooreTable()386 void StringSearch<Char>::PopulateBoyerMooreTable() {
387   const size_t pattern_length = pattern_.length();
388   // Only look at the last kBMMaxShift characters of pattern (from start_
389   // to pattern_length).
390   const size_t start = start_;
391   const size_t length = pattern_length - start;
392 
393   // Biased tables so that we can use pattern indices as table indices,
394   // even if we only cover the part of the pattern from offset start.
395   int* shift_table = good_suffix_shift_table_ - start_;
396   int* suffix_table = suffix_table_ - start_;
397 
398   // Initialize table.
399   for (size_t i = start; i < pattern_length; i++) {
400     shift_table[i] = length;
401   }
402   shift_table[pattern_length] = 1;
403   suffix_table[pattern_length] = pattern_length + 1;
404 
405   if (pattern_length <= start) {
406     return;
407   }
408 
409   // Find suffixes.
410   Char last_char = pattern_[pattern_length - 1];
411   size_t suffix = pattern_length + 1;
412   {
413     size_t i = pattern_length;
414     while (i > start) {
415       Char c = pattern_[i - 1];
416       while (suffix <= pattern_length && c != pattern_[suffix - 1]) {
417         if (static_cast<size_t>(shift_table[suffix]) == length) {
418           shift_table[suffix] = suffix - i;
419         }
420         suffix = suffix_table[suffix];
421       }
422       suffix_table[--i] = --suffix;
423       if (suffix == pattern_length) {
424         // No suffix to extend, so we check against last_char only.
425         while ((i > start) && (pattern_[i - 1] != last_char)) {
426           if (static_cast<size_t>(shift_table[pattern_length]) == length) {
427             shift_table[pattern_length] = pattern_length - i;
428           }
429           suffix_table[--i] = pattern_length;
430         }
431         if (i > start) {
432           suffix_table[--i] = --suffix;
433         }
434       }
435     }
436   }
437   // Build shift table using suffixes.
438   if (suffix < pattern_length) {
439     for (size_t i = start; i <= pattern_length; i++) {
440       if (static_cast<size_t>(shift_table[i]) == length) {
441         shift_table[i] = suffix - start;
442       }
443       if (i == suffix) {
444         suffix = suffix_table[suffix];
445       }
446     }
447   }
448 }
449 
450 //---------------------------------------------------------------------
451 // Boyer-Moore-Horspool string search.
452 //---------------------------------------------------------------------
453 
454 template <typename Char>
BoyerMooreHorspoolSearch(Vector subject,size_t start_index)455 size_t StringSearch<Char>::BoyerMooreHorspoolSearch(
456     Vector subject,
457     size_t start_index) {
458   const size_t subject_length = subject.length();
459   const size_t pattern_length = pattern_.length();
460   int* char_occurrences = bad_char_shift_table_;
461   int64_t badness = -static_cast<int64_t>(pattern_length);
462 
463   // How bad we are doing without a good-suffix table.
464   Char last_char = pattern_[pattern_length - 1];
465   int last_char_shift =
466       pattern_length - 1 -
467       CharOccurrence(char_occurrences, last_char);
468 
469   // Perform search
470   size_t index = start_index;  // No matches found prior to this index.
471   while (index <= subject_length - pattern_length) {
472     size_t j = pattern_length - 1;
473     int subject_char;
474     while (last_char != (subject_char = subject[index + j])) {
475       int bc_occ = CharOccurrence(char_occurrences, subject_char);
476       int shift = j - bc_occ;
477       index += shift;
478       badness += 1 - shift;  // at most zero, so badness cannot increase.
479       if (index > subject_length - pattern_length) {
480         return subject_length;
481       }
482     }
483     j--;
484     while (pattern_[j] == (subject[index + j])) {
485       if (j == 0) {
486         return index;
487       }
488       j--;
489     }
490     index += last_char_shift;
491     // Badness increases by the number of characters we have
492     // checked, and decreases by the number of characters we
493     // can skip by shifting. It's a measure of how we are doing
494     // compared to reading each character exactly once.
495     badness += (pattern_length - j) - last_char_shift;
496     if (badness > 0) {
497       PopulateBoyerMooreTable();
498       strategy_ = SearchStrategy::kBoyerMoore;
499       return BoyerMooreSearch(subject, index);
500     }
501   }
502   return subject.length();
503 }
504 
505 template <typename Char>
PopulateBoyerMooreHorspoolTable()506 void StringSearch<Char>::PopulateBoyerMooreHorspoolTable() {
507   const size_t pattern_length = pattern_.length();
508 
509   int* bad_char_occurrence = bad_char_shift_table_;
510 
511   // Only preprocess at most kBMMaxShift last characters of pattern.
512   const size_t start = start_;
513   // Run forwards to populate bad_char_table, so that *last* instance
514   // of character equivalence class is the one registered.
515   // Notice: Doesn't include the last character.
516   const size_t table_size = AlphabetSize();
517   if (start == 0) {
518     // All patterns less than kBMMaxShift in length.
519     memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence));
520   } else {
521     for (size_t i = 0; i < table_size; i++) {
522       bad_char_occurrence[i] = start - 1;
523     }
524   }
525   for (size_t i = start; i < pattern_length - 1; i++) {
526     Char c = pattern_[i];
527     int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize();
528     bad_char_occurrence[bucket] = i;
529   }
530 }
531 
532 //---------------------------------------------------------------------
533 // Linear string search with bailout to BMH.
534 //---------------------------------------------------------------------
535 
536 // Simple linear search for short patterns, which bails out if the string
537 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
538 template <typename Char>
InitialSearch(Vector subject,size_t index)539 size_t StringSearch<Char>::InitialSearch(
540     Vector subject,
541     size_t index) {
542   const size_t pattern_length = pattern_.length();
543   // Badness is a count of how much work we have done.  When we have
544   // done enough work we decide it's probably worth switching to a better
545   // algorithm.
546   int64_t badness = -10 - (pattern_length << 2);
547 
548   // We know our pattern is at least 2 characters, we cache the first so
549   // the common case of the first character not matching is faster.
550   for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) {
551     badness++;
552     if (badness <= 0) {
553       i = FindFirstCharacter(pattern_, subject, i);
554       if (i == subject.length())
555         return subject.length();
556       CHECK_LE(i, n);
557       size_t j = 1;
558       do {
559         if (pattern_[j] != subject[i + j]) {
560           break;
561         }
562         j++;
563       } while (j < pattern_length);
564       if (j == pattern_length) {
565         return i;
566       }
567       badness += j;
568     } else {
569       PopulateBoyerMooreHorspoolTable();
570       strategy_ = SearchStrategy::kBoyerMooreHorspool;
571       return BoyerMooreHorspoolSearch(subject, i);
572     }
573   }
574   return subject.length();
575 }
576 
577 // Perform a single stand-alone search.
578 // If searching multiple times for the same pattern, a search
579 // object should be constructed once and the Search function then called
580 // for each search.
581 template <typename Char>
SearchString(Vector<const Char> subject,Vector<const Char> pattern,size_t start_index)582 size_t SearchString(Vector<const Char> subject,
583                     Vector<const Char> pattern,
584                     size_t start_index) {
585   StringSearch<Char> search(pattern);
586   return search.Search(subject, start_index);
587 }
588 }  // namespace stringsearch
589 }  // namespace node
590 
591 namespace node {
592 
593 template <typename Char>
SearchString(const Char * haystack,size_t haystack_length,const Char * needle,size_t needle_length,size_t start_index,bool is_forward)594 size_t SearchString(const Char* haystack,
595                     size_t haystack_length,
596                     const Char* needle,
597                     size_t needle_length,
598                     size_t start_index,
599                     bool is_forward) {
600   if (haystack_length < needle_length) return haystack_length;
601   // To do a reverse search (lastIndexOf instead of indexOf) without redundant
602   // code, create two vectors that are reversed views into the input strings.
603   // For example, v_needle[0] would return the *last* character of the needle.
604   // So we're searching for the first instance of rev(needle) in rev(haystack)
605   stringsearch::Vector<const Char> v_needle(needle, needle_length, is_forward);
606   stringsearch::Vector<const Char> v_haystack(
607       haystack, haystack_length, is_forward);
608   size_t diff = haystack_length - needle_length;
609   size_t relative_start_index;
610   if (is_forward) {
611     relative_start_index = start_index;
612   } else if (diff < start_index) {
613     relative_start_index = 0;
614   } else {
615     relative_start_index = diff - start_index;
616   }
617   size_t pos = node::stringsearch::SearchString(
618       v_haystack, v_needle, relative_start_index);
619   if (pos == haystack_length) {
620     // not found
621     return pos;
622   }
623   return is_forward ? pos : (haystack_length - needle_length - pos);
624 }
625 
626 template <size_t N>
SearchString(const char * haystack,size_t haystack_length,const char (& needle)[N])627 size_t SearchString(const char* haystack, size_t haystack_length,
628                     const char (&needle)[N]) {
629   return SearchString(
630       reinterpret_cast<const uint8_t*>(haystack), haystack_length,
631       reinterpret_cast<const uint8_t*>(needle), N - 1, 0, true);
632 }
633 
634 }  // namespace node
635 
636 #endif  // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
637 
638 #endif  // SRC_STRING_SEARCH_H_
639