1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_STRING_SEARCH_H_
6 #define V8_STRING_SEARCH_H_
7
8 #include "src/isolate.h"
9 #include "src/vector.h"
10
11 namespace v8 {
12 namespace internal {
13
14
15 //---------------------------------------------------------------------
16 // String Search object.
17 //---------------------------------------------------------------------
18
19 // Class holding constants and methods that apply to all string search variants,
20 // independently of subject and pattern char size.
21 class StringSearchBase {
22 protected:
23 // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
24 // limit, we can fix the size of tables. For a needle longer than this limit,
25 // search will not be optimal, since we only build tables for a suffix
26 // of the string, but it is a safe approximation.
27 static const int kBMMaxShift = Isolate::kBMMaxShift;
28
29 // Reduce alphabet to this size.
30 // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
31 // proportional to the input alphabet. We reduce the alphabet size by
32 // equating input characters modulo a smaller alphabet size. This gives
33 // a potentially less efficient searching, but is a safe approximation.
34 // For needles using only characters in the same Unicode 256-code point page,
35 // there is no search speed degradation.
36 static const int kLatin1AlphabetSize = 256;
37 static const int kUC16AlphabetSize = Isolate::kUC16AlphabetSize;
38
39 // Bad-char shift table stored in the state. It's length is the alphabet size.
40 // For patterns below this length, the skip length of Boyer-Moore is too short
41 // to compensate for the algorithmic overhead compared to simple brute force.
42 static const int kBMMinPatternLength = 7;
43
IsOneByteString(Vector<const uint8_t> string)44 static inline bool IsOneByteString(Vector<const uint8_t> string) {
45 return true;
46 }
47
IsOneByteString(Vector<const uc16> string)48 static inline bool IsOneByteString(Vector<const uc16> string) {
49 return String::IsOneByte(string.start(), string.length());
50 }
51
52 friend class Isolate;
53 };
54
55
56 template <typename PatternChar, typename SubjectChar>
57 class StringSearch : private StringSearchBase {
58 public:
StringSearch(Isolate * isolate,Vector<const PatternChar> pattern)59 StringSearch(Isolate* isolate, Vector<const PatternChar> pattern)
60 : isolate_(isolate),
61 pattern_(pattern),
62 start_(Max(0, pattern.length() - kBMMaxShift)) {
63 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
64 if (!IsOneByteString(pattern_)) {
65 strategy_ = &FailSearch;
66 return;
67 }
68 }
69 int pattern_length = pattern_.length();
70 if (pattern_length < kBMMinPatternLength) {
71 if (pattern_length == 1) {
72 strategy_ = &SingleCharSearch;
73 return;
74 }
75 strategy_ = &LinearSearch;
76 return;
77 }
78 strategy_ = &InitialSearch;
79 }
80
Search(Vector<const SubjectChar> subject,int index)81 int Search(Vector<const SubjectChar> subject, int index) {
82 return strategy_(this, subject, index);
83 }
84
AlphabetSize()85 static inline int AlphabetSize() {
86 if (sizeof(PatternChar) == 1) {
87 // Latin1 needle.
88 return kLatin1AlphabetSize;
89 } else {
90 DCHECK(sizeof(PatternChar) == 2);
91 // UC16 needle.
92 return kUC16AlphabetSize;
93 }
94 }
95
96 private:
97 typedef int (*SearchFunction)( // NOLINT - it's not a cast!
98 StringSearch<PatternChar, SubjectChar>*,
99 Vector<const SubjectChar>,
100 int);
101
FailSearch(StringSearch<PatternChar,SubjectChar> *,Vector<const SubjectChar>,int)102 static int FailSearch(StringSearch<PatternChar, SubjectChar>*,
103 Vector<const SubjectChar>,
104 int) {
105 return -1;
106 }
107
108 static int SingleCharSearch(StringSearch<PatternChar, SubjectChar>* search,
109 Vector<const SubjectChar> subject,
110 int start_index);
111
112 static int LinearSearch(StringSearch<PatternChar, SubjectChar>* search,
113 Vector<const SubjectChar> subject,
114 int start_index);
115
116 static int InitialSearch(StringSearch<PatternChar, SubjectChar>* search,
117 Vector<const SubjectChar> subject,
118 int start_index);
119
120 static int BoyerMooreHorspoolSearch(
121 StringSearch<PatternChar, SubjectChar>* search,
122 Vector<const SubjectChar> subject,
123 int start_index);
124
125 static int BoyerMooreSearch(StringSearch<PatternChar, SubjectChar>* search,
126 Vector<const SubjectChar> subject,
127 int start_index);
128
129 void PopulateBoyerMooreHorspoolTable();
130
131 void PopulateBoyerMooreTable();
132
exceedsOneByte(uint8_t c)133 static inline bool exceedsOneByte(uint8_t c) {
134 return false;
135 }
136
exceedsOneByte(uint16_t c)137 static inline bool exceedsOneByte(uint16_t c) {
138 return c > String::kMaxOneByteCharCodeU;
139 }
140
CharOccurrence(int * bad_char_occurrence,SubjectChar char_code)141 static inline int CharOccurrence(int* bad_char_occurrence,
142 SubjectChar char_code) {
143 if (sizeof(SubjectChar) == 1) {
144 return bad_char_occurrence[static_cast<int>(char_code)];
145 }
146 if (sizeof(PatternChar) == 1) {
147 if (exceedsOneByte(char_code)) {
148 return -1;
149 }
150 return bad_char_occurrence[static_cast<unsigned int>(char_code)];
151 }
152 // Both pattern and subject are UC16. Reduce character to equivalence class.
153 int equiv_class = char_code % kUC16AlphabetSize;
154 return bad_char_occurrence[equiv_class];
155 }
156
157 // The following tables are shared by all searches.
158 // TODO(lrn): Introduce a way for a pattern to keep its tables
159 // between searches (e.g., for an Atom RegExp).
160
161 // Store for the BoyerMoore(Horspool) bad char shift table.
162 // Return a table covering the last kBMMaxShift+1 positions of
163 // pattern.
bad_char_table()164 int* bad_char_table() {
165 return isolate_->bad_char_shift_table();
166 }
167
168 // Store for the BoyerMoore good suffix shift table.
good_suffix_shift_table()169 int* good_suffix_shift_table() {
170 // Return biased pointer that maps the range [start_..pattern_.length()
171 // to the kGoodSuffixShiftTable array.
172 return isolate_->good_suffix_shift_table() - start_;
173 }
174
175 // Table used temporarily while building the BoyerMoore good suffix
176 // shift table.
suffix_table()177 int* suffix_table() {
178 // Return biased pointer that maps the range [start_..pattern_.length()
179 // to the kSuffixTable array.
180 return isolate_->suffix_table() - start_;
181 }
182
183 Isolate* isolate_;
184 // The pattern to search for.
185 Vector<const PatternChar> pattern_;
186 // Pointer to implementation of the search.
187 SearchFunction strategy_;
188 // Cache value of Max(0, pattern_length() - kBMMaxShift)
189 int start_;
190 };
191
192
193 template <typename T, typename U>
AlignDown(T value,U alignment)194 inline T AlignDown(T value, U alignment) {
195 return reinterpret_cast<T>(
196 (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1)));
197 }
198
199
GetHighestValueByte(uc16 character)200 inline uint8_t GetHighestValueByte(uc16 character) {
201 return Max(static_cast<uint8_t>(character & 0xFF),
202 static_cast<uint8_t>(character >> 8));
203 }
204
205
GetHighestValueByte(uint8_t character)206 inline uint8_t GetHighestValueByte(uint8_t character) { return character; }
207
208
209 template <typename PatternChar, typename SubjectChar>
FindFirstCharacter(Vector<const PatternChar> pattern,Vector<const SubjectChar> subject,int index)210 inline int FindFirstCharacter(Vector<const PatternChar> pattern,
211 Vector<const SubjectChar> subject, int index) {
212 const PatternChar pattern_first_char = pattern[0];
213 const int max_n = (subject.length() - pattern.length() + 1);
214
215 const uint8_t search_byte = GetHighestValueByte(pattern_first_char);
216 const SubjectChar search_char = static_cast<SubjectChar>(pattern_first_char);
217 int pos = index;
218 do {
219 DCHECK_GE(max_n - pos, 0);
220 const SubjectChar* char_pos = reinterpret_cast<const SubjectChar*>(
221 memchr(subject.start() + pos, search_byte,
222 (max_n - pos) * sizeof(SubjectChar)));
223 if (char_pos == NULL) return -1;
224 char_pos = AlignDown(char_pos, sizeof(SubjectChar));
225 pos = static_cast<int>(char_pos - subject.start());
226 if (subject[pos] == search_char) return pos;
227 } while (++pos < max_n);
228
229 return -1;
230 }
231
232
233 //---------------------------------------------------------------------
234 // Single Character Pattern Search Strategy
235 //---------------------------------------------------------------------
236
237 template <typename PatternChar, typename SubjectChar>
SingleCharSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)238 int StringSearch<PatternChar, SubjectChar>::SingleCharSearch(
239 StringSearch<PatternChar, SubjectChar>* search,
240 Vector<const SubjectChar> subject,
241 int index) {
242 DCHECK_EQ(1, search->pattern_.length());
243 PatternChar pattern_first_char = search->pattern_[0];
244 if (sizeof(PatternChar) > sizeof(SubjectChar)) {
245 if (exceedsOneByte(pattern_first_char)) {
246 return -1;
247 }
248 }
249 return FindFirstCharacter(search->pattern_, subject, index);
250 }
251
252 //---------------------------------------------------------------------
253 // Linear Search Strategy
254 //---------------------------------------------------------------------
255
256
257 template <typename PatternChar, typename SubjectChar>
CharCompare(const PatternChar * pattern,const SubjectChar * subject,int length)258 inline bool CharCompare(const PatternChar* pattern,
259 const SubjectChar* subject,
260 int length) {
261 DCHECK(length > 0);
262 int pos = 0;
263 do {
264 if (pattern[pos] != subject[pos]) {
265 return false;
266 }
267 pos++;
268 } while (pos < length);
269 return true;
270 }
271
272
273 // Simple linear search for short patterns. Never bails out.
274 template <typename PatternChar, typename SubjectChar>
LinearSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)275 int StringSearch<PatternChar, SubjectChar>::LinearSearch(
276 StringSearch<PatternChar, SubjectChar>* search,
277 Vector<const SubjectChar> subject,
278 int index) {
279 Vector<const PatternChar> pattern = search->pattern_;
280 DCHECK(pattern.length() > 1);
281 int pattern_length = pattern.length();
282 int i = index;
283 int n = subject.length() - pattern_length;
284 while (i <= n) {
285 i = FindFirstCharacter(pattern, subject, i);
286 if (i == -1) return -1;
287 DCHECK_LE(i, n);
288 i++;
289 // Loop extracted to separate function to allow using return to do
290 // a deeper break.
291 if (CharCompare(pattern.start() + 1,
292 subject.start() + i,
293 pattern_length - 1)) {
294 return i - 1;
295 }
296 }
297 return -1;
298 }
299
300 //---------------------------------------------------------------------
301 // Boyer-Moore string search
302 //---------------------------------------------------------------------
303
304 template <typename PatternChar, typename SubjectChar>
BoyerMooreSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int start_index)305 int StringSearch<PatternChar, SubjectChar>::BoyerMooreSearch(
306 StringSearch<PatternChar, SubjectChar>* search,
307 Vector<const SubjectChar> subject,
308 int start_index) {
309 Vector<const PatternChar> pattern = search->pattern_;
310 int subject_length = subject.length();
311 int pattern_length = pattern.length();
312 // Only preprocess at most kBMMaxShift last characters of pattern.
313 int start = search->start_;
314
315 int* bad_char_occurence = search->bad_char_table();
316 int* good_suffix_shift = search->good_suffix_shift_table();
317
318 PatternChar last_char = pattern[pattern_length - 1];
319 int index = start_index;
320 // Continue search from i.
321 while (index <= subject_length - pattern_length) {
322 int j = pattern_length - 1;
323 int c;
324 while (last_char != (c = subject[index + j])) {
325 int shift =
326 j - CharOccurrence(bad_char_occurence, c);
327 index += shift;
328 if (index > subject_length - pattern_length) {
329 return -1;
330 }
331 }
332 while (j >= 0 && pattern[j] == (c = subject[index + j])) j--;
333 if (j < 0) {
334 return index;
335 } else if (j < start) {
336 // we have matched more than our tables allow us to be smart about.
337 // Fall back on BMH shift.
338 index += pattern_length - 1
339 - CharOccurrence(bad_char_occurence,
340 static_cast<SubjectChar>(last_char));
341 } else {
342 int gs_shift = good_suffix_shift[j + 1];
343 int bc_occ =
344 CharOccurrence(bad_char_occurence, c);
345 int shift = j - bc_occ;
346 if (gs_shift > shift) {
347 shift = gs_shift;
348 }
349 index += shift;
350 }
351 }
352
353 return -1;
354 }
355
356
357 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreTable()358 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreTable() {
359 int pattern_length = pattern_.length();
360 const PatternChar* pattern = pattern_.start();
361 // Only look at the last kBMMaxShift characters of pattern (from start_
362 // to pattern_length).
363 int start = start_;
364 int length = pattern_length - start;
365
366 // Biased tables so that we can use pattern indices as table indices,
367 // even if we only cover the part of the pattern from offset start.
368 int* shift_table = good_suffix_shift_table();
369 int* suffix_table = this->suffix_table();
370
371 // Initialize table.
372 for (int i = start; i < pattern_length; i++) {
373 shift_table[i] = length;
374 }
375 shift_table[pattern_length] = 1;
376 suffix_table[pattern_length] = pattern_length + 1;
377
378 if (pattern_length <= start) {
379 return;
380 }
381
382 // Find suffixes.
383 PatternChar last_char = pattern[pattern_length - 1];
384 int suffix = pattern_length + 1;
385 {
386 int i = pattern_length;
387 while (i > start) {
388 PatternChar c = pattern[i - 1];
389 while (suffix <= pattern_length && c != pattern[suffix - 1]) {
390 if (shift_table[suffix] == length) {
391 shift_table[suffix] = suffix - i;
392 }
393 suffix = suffix_table[suffix];
394 }
395 suffix_table[--i] = --suffix;
396 if (suffix == pattern_length) {
397 // No suffix to extend, so we check against last_char only.
398 while ((i > start) && (pattern[i - 1] != last_char)) {
399 if (shift_table[pattern_length] == length) {
400 shift_table[pattern_length] = pattern_length - i;
401 }
402 suffix_table[--i] = pattern_length;
403 }
404 if (i > start) {
405 suffix_table[--i] = --suffix;
406 }
407 }
408 }
409 }
410 // Build shift table using suffixes.
411 if (suffix < pattern_length) {
412 for (int i = start; i <= pattern_length; i++) {
413 if (shift_table[i] == length) {
414 shift_table[i] = suffix - start;
415 }
416 if (i == suffix) {
417 suffix = suffix_table[suffix];
418 }
419 }
420 }
421 }
422
423 //---------------------------------------------------------------------
424 // Boyer-Moore-Horspool string search.
425 //---------------------------------------------------------------------
426
427 template <typename PatternChar, typename SubjectChar>
BoyerMooreHorspoolSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int start_index)428 int StringSearch<PatternChar, SubjectChar>::BoyerMooreHorspoolSearch(
429 StringSearch<PatternChar, SubjectChar>* search,
430 Vector<const SubjectChar> subject,
431 int start_index) {
432 Vector<const PatternChar> pattern = search->pattern_;
433 int subject_length = subject.length();
434 int pattern_length = pattern.length();
435 int* char_occurrences = search->bad_char_table();
436 int badness = -pattern_length;
437
438 // How bad we are doing without a good-suffix table.
439 PatternChar last_char = pattern[pattern_length - 1];
440 int last_char_shift = pattern_length - 1 -
441 CharOccurrence(char_occurrences, static_cast<SubjectChar>(last_char));
442 // Perform search
443 int index = start_index; // No matches found prior to this index.
444 while (index <= subject_length - pattern_length) {
445 int j = pattern_length - 1;
446 int subject_char;
447 while (last_char != (subject_char = subject[index + j])) {
448 int bc_occ = CharOccurrence(char_occurrences, subject_char);
449 int shift = j - bc_occ;
450 index += shift;
451 badness += 1 - shift; // at most zero, so badness cannot increase.
452 if (index > subject_length - pattern_length) {
453 return -1;
454 }
455 }
456 j--;
457 while (j >= 0 && pattern[j] == (subject[index + j])) j--;
458 if (j < 0) {
459 return index;
460 } else {
461 index += last_char_shift;
462 // Badness increases by the number of characters we have
463 // checked, and decreases by the number of characters we
464 // can skip by shifting. It's a measure of how we are doing
465 // compared to reading each character exactly once.
466 badness += (pattern_length - j) - last_char_shift;
467 if (badness > 0) {
468 search->PopulateBoyerMooreTable();
469 search->strategy_ = &BoyerMooreSearch;
470 return BoyerMooreSearch(search, subject, index);
471 }
472 }
473 }
474 return -1;
475 }
476
477
478 template <typename PatternChar, typename SubjectChar>
PopulateBoyerMooreHorspoolTable()479 void StringSearch<PatternChar, SubjectChar>::PopulateBoyerMooreHorspoolTable() {
480 int pattern_length = pattern_.length();
481
482 int* bad_char_occurrence = bad_char_table();
483
484 // Only preprocess at most kBMMaxShift last characters of pattern.
485 int start = start_;
486 // Run forwards to populate bad_char_table, so that *last* instance
487 // of character equivalence class is the one registered.
488 // Notice: Doesn't include the last character.
489 int table_size = AlphabetSize();
490 if (start == 0) { // All patterns less than kBMMaxShift in length.
491 memset(bad_char_occurrence,
492 -1,
493 table_size * sizeof(*bad_char_occurrence));
494 } else {
495 for (int i = 0; i < table_size; i++) {
496 bad_char_occurrence[i] = start - 1;
497 }
498 }
499 for (int i = start; i < pattern_length - 1; i++) {
500 PatternChar c = pattern_[i];
501 int bucket = (sizeof(PatternChar) == 1) ? c : c % AlphabetSize();
502 bad_char_occurrence[bucket] = i;
503 }
504 }
505
506 //---------------------------------------------------------------------
507 // Linear string search with bailout to BMH.
508 //---------------------------------------------------------------------
509
510 // Simple linear search for short patterns, which bails out if the string
511 // isn't found very early in the subject. Upgrades to BoyerMooreHorspool.
512 template <typename PatternChar, typename SubjectChar>
InitialSearch(StringSearch<PatternChar,SubjectChar> * search,Vector<const SubjectChar> subject,int index)513 int StringSearch<PatternChar, SubjectChar>::InitialSearch(
514 StringSearch<PatternChar, SubjectChar>* search,
515 Vector<const SubjectChar> subject,
516 int index) {
517 Vector<const PatternChar> pattern = search->pattern_;
518 int pattern_length = pattern.length();
519 // Badness is a count of how much work we have done. When we have
520 // done enough work we decide it's probably worth switching to a better
521 // algorithm.
522 int badness = -10 - (pattern_length << 2);
523
524 // We know our pattern is at least 2 characters, we cache the first so
525 // the common case of the first character not matching is faster.
526 for (int i = index, n = subject.length() - pattern_length; i <= n; i++) {
527 badness++;
528 if (badness <= 0) {
529 i = FindFirstCharacter(pattern, subject, i);
530 if (i == -1) return -1;
531 DCHECK_LE(i, n);
532 int j = 1;
533 do {
534 if (pattern[j] != subject[i + j]) {
535 break;
536 }
537 j++;
538 } while (j < pattern_length);
539 if (j == pattern_length) {
540 return i;
541 }
542 badness += j;
543 } else {
544 search->PopulateBoyerMooreHorspoolTable();
545 search->strategy_ = &BoyerMooreHorspoolSearch;
546 return BoyerMooreHorspoolSearch(search, subject, i);
547 }
548 }
549 return -1;
550 }
551
552
553 // Perform a a single stand-alone search.
554 // If searching multiple times for the same pattern, a search
555 // object should be constructed once and the Search function then called
556 // for each search.
557 template <typename SubjectChar, typename PatternChar>
SearchString(Isolate * isolate,Vector<const SubjectChar> subject,Vector<const PatternChar> pattern,int start_index)558 int SearchString(Isolate* isolate,
559 Vector<const SubjectChar> subject,
560 Vector<const PatternChar> pattern,
561 int start_index) {
562 StringSearch<PatternChar, SubjectChar> search(isolate, pattern);
563 return search.Search(subject, start_index);
564 }
565
566 } // namespace internal
567 } // namespace v8
568
569 #endif // V8_STRING_SEARCH_H_
570