• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2020 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/experimental/experimental-interpreter.h"
6 
7 #include "src/base/optional.h"
8 #include "src/base/strings.h"
9 #include "src/common/assert-scope.h"
10 #include "src/objects/fixed-array-inl.h"
11 #include "src/objects/string-inl.h"
12 #include "src/regexp/experimental/experimental.h"
13 #include "src/strings/char-predicates-inl.h"
14 #include "src/zone/zone-allocator.h"
15 #include "src/zone/zone-list-inl.h"
16 
17 namespace v8 {
18 namespace internal {
19 
20 namespace {
21 
22 constexpr int kUndefinedRegisterValue = -1;
23 
24 template <class Character>
SatisfiesAssertion(RegExpAssertion::Type type,base::Vector<const Character> context,int position)25 bool SatisfiesAssertion(RegExpAssertion::Type type,
26                         base::Vector<const Character> context, int position) {
27   DCHECK_LE(position, context.length());
28   DCHECK_GE(position, 0);
29 
30   switch (type) {
31     case RegExpAssertion::Type::START_OF_INPUT:
32       return position == 0;
33     case RegExpAssertion::Type::END_OF_INPUT:
34       return position == context.length();
35     case RegExpAssertion::Type::START_OF_LINE:
36       if (position == 0) return true;
37       return unibrow::IsLineTerminator(context[position - 1]);
38     case RegExpAssertion::Type::END_OF_LINE:
39       if (position == context.length()) return true;
40       return unibrow::IsLineTerminator(context[position]);
41     case RegExpAssertion::Type::BOUNDARY:
42       if (context.length() == 0) {
43         return false;
44       } else if (position == 0) {
45         return IsRegExpWord(context[position]);
46       } else if (position == context.length()) {
47         return IsRegExpWord(context[position - 1]);
48       } else {
49         return IsRegExpWord(context[position - 1]) !=
50                IsRegExpWord(context[position]);
51       }
52     case RegExpAssertion::Type::NON_BOUNDARY:
53       return !SatisfiesAssertion(RegExpAssertion::Type::BOUNDARY, context,
54                                  position);
55   }
56 }
57 
ToInstructionVector(ByteArray raw_bytes,const DisallowGarbageCollection & no_gc)58 base::Vector<RegExpInstruction> ToInstructionVector(
59     ByteArray raw_bytes, const DisallowGarbageCollection& no_gc) {
60   RegExpInstruction* inst_begin =
61       reinterpret_cast<RegExpInstruction*>(raw_bytes.GetDataStartAddress());
62   int inst_num = raw_bytes.length() / sizeof(RegExpInstruction);
63   DCHECK_EQ(sizeof(RegExpInstruction) * inst_num, raw_bytes.length());
64   return base::Vector<RegExpInstruction>(inst_begin, inst_num);
65 }
66 
67 template <class Character>
68 base::Vector<const Character> ToCharacterVector(
69     String str, const DisallowGarbageCollection& no_gc);
70 
71 template <>
ToCharacterVector(String str,const DisallowGarbageCollection & no_gc)72 base::Vector<const uint8_t> ToCharacterVector<uint8_t>(
73     String str, const DisallowGarbageCollection& no_gc) {
74   DCHECK(str.IsFlat());
75   String::FlatContent content = str.GetFlatContent(no_gc);
76   DCHECK(content.IsOneByte());
77   return content.ToOneByteVector();
78 }
79 
80 template <>
ToCharacterVector(String str,const DisallowGarbageCollection & no_gc)81 base::Vector<const base::uc16> ToCharacterVector<base::uc16>(
82     String str, const DisallowGarbageCollection& no_gc) {
83   DCHECK(str.IsFlat());
84   String::FlatContent content = str.GetFlatContent(no_gc);
85   DCHECK(content.IsTwoByte());
86   return content.ToUC16Vector();
87 }
88 
89 template <class Character>
90 class NfaInterpreter {
91   // Executes a bytecode program in breadth-first mode, without backtracking.
92   // `Character` can be instantiated with `uint8_t` or `base::uc16` for one byte
93   // or two byte input strings.
94   //
95   // In contrast to the backtracking implementation, this has linear time
96   // complexity in the length of the input string. Breadth-first mode means
97   // that threads are executed in lockstep with respect to their input
98   // position, i.e. the threads share a common input index.  This is similar
99   // to breadth-first simulation of a non-deterministic finite automaton (nfa),
100   // hence the name of the class.
101   //
102   // To follow the semantics of a backtracking VM implementation, we have to be
103   // careful about whether we stop execution when a thread executes ACCEPT.
104   // For example, consider execution of the bytecode generated by the regexp
105   //
106   //   r = /abc|..|[a-c]{10,}/
107   //
108   // on input "abcccccccccccccc".  Clearly the three alternatives
109   // - /abc/
110   // - /../
111   // - /[a-c]{10,}/
112   // all match this input.  A backtracking implementation will report "abc" as
113   // match, because it explores the first alternative before the others.
114   //
115   // However, if we execute breadth first, then we execute the 3 threads
116   // - t1, which tries to match /abc/
117   // - t2, which tries to match /../
118   // - t3, which tries to match /[a-c]{10,}/
119   // in lockstep i.e. by iterating over the input and feeding all threads one
120   // character at a time.  t2 will execute an ACCEPT after two characters,
121   // while t1 will only execute ACCEPT after three characters. Thus we find a
122   // match for the second alternative before a match of the first alternative.
123   //
124   // This shows that we cannot always stop searching as soon as some thread t
125   // executes ACCEPT:  If there is a thread u with higher priority than t, then
126   // it must be finished first.  If u produces a match, then we can discard the
127   // match of t because matches produced by threads with higher priority are
128   // preferred over matches of threads with lower priority.  On the other hand,
129   // we are allowed to abort all threads with lower priority than t if t
130   // produces a match: Such threads can only produce worse matches.  In the
131   // example above, we can abort t3 after two characters because of t2's match.
132   //
133   // Thus the interpreter keeps track of a priority-ordered list of threads.
134   // If a thread ACCEPTs, all threads with lower priority are discarded, and
135   // the search continues with the threads with higher priority.  If no threads
136   // with high priority are left, we return the match that was produced by the
137   // ACCEPTing thread with highest priority.
138  public:
NfaInterpreter(Isolate * isolate,RegExp::CallOrigin call_origin,ByteArray bytecode,int register_count_per_match,String input,int32_t input_index,Zone * zone)139   NfaInterpreter(Isolate* isolate, RegExp::CallOrigin call_origin,
140                  ByteArray bytecode, int register_count_per_match, String input,
141                  int32_t input_index, Zone* zone)
142       : isolate_(isolate),
143         call_origin_(call_origin),
144         bytecode_object_(bytecode),
145         bytecode_(ToInstructionVector(bytecode, no_gc_)),
146         register_count_per_match_(register_count_per_match),
147         input_object_(input),
148         input_(ToCharacterVector<Character>(input, no_gc_)),
149         input_index_(input_index),
150         pc_last_input_index_(zone->NewArray<int>(bytecode.length()),
151                              bytecode.length()),
152         active_threads_(0, zone),
153         blocked_threads_(0, zone),
154         register_array_allocator_(zone),
155         best_match_registers_(base::nullopt),
156         zone_(zone) {
157     DCHECK(!bytecode_.empty());
158     DCHECK_GE(input_index_, 0);
159     DCHECK_LE(input_index_, input_.length());
160 
161     std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
162   }
163 
164   // Finds matches and writes their concatenated capture registers to
165   // `output_registers`.  `output_registers[i]` has to be valid for all i <
166   // output_register_count.  The search continues until all remaining matches
167   // have been found or there is no space left in `output_registers`.  Returns
168   // the number of matches found.
FindMatches(int32_t * output_registers,int output_register_count)169   int FindMatches(int32_t* output_registers, int output_register_count) {
170     const int max_match_num = output_register_count / register_count_per_match_;
171 
172     int match_num = 0;
173     while (match_num != max_match_num) {
174       int err_code = FindNextMatch();
175       if (err_code != RegExp::kInternalRegExpSuccess) return err_code;
176 
177       if (!FoundMatch()) break;
178 
179       base::Vector<int> registers = *best_match_registers_;
180       output_registers =
181           std::copy(registers.begin(), registers.end(), output_registers);
182 
183       ++match_num;
184 
185       const int match_begin = registers[0];
186       const int match_end = registers[1];
187       DCHECK_LE(match_begin, match_end);
188       const int match_length = match_end - match_begin;
189       if (match_length != 0) {
190         SetInputIndex(match_end);
191       } else if (match_end == input_.length()) {
192         // Zero-length match, input exhausted.
193         SetInputIndex(match_end);
194         break;
195       } else {
196         // Zero-length match, more input.  We don't want to report more matches
197         // here endlessly, so we advance by 1.
198         SetInputIndex(match_end + 1);
199 
200         // TODO(mbid,v8:10765): If we're in unicode mode, we have to advance to
201         // the next codepoint, not to the next code unit. See also
202         // `RegExpUtils::AdvanceStringIndex`.
203         STATIC_ASSERT(!ExperimentalRegExp::kSupportsUnicode);
204       }
205     }
206 
207     return match_num;
208   }
209 
210  private:
211   // The state of a "thread" executing experimental regexp bytecode.  (Not to
212   // be confused with an OS thread.)
213   struct InterpreterThread {
214     // This thread's program counter, i.e. the index within `bytecode_` of the
215     // next instruction to be executed.
216     int pc;
217     // Pointer to the array of registers, which is always size
218     // `register_count_per_match_`.  Should be deallocated with
219     // `register_array_allocator_`.
220     int* register_array_begin;
221   };
222 
223   // Handles pending interrupts if there are any.  Returns
224   // RegExp::kInternalRegExpSuccess if execution can continue, and an error
225   // code otherwise.
HandleInterrupts()226   int HandleInterrupts() {
227     StackLimitCheck check(isolate_);
228     if (call_origin_ == RegExp::CallOrigin::kFromJs) {
229       // Direct calls from JavaScript can be interrupted in two ways:
230       // 1. A real stack overflow, in which case we let the caller throw the
231       //    exception.
232       // 2. The stack guard was used to interrupt execution for another purpose,
233       //    forcing the call through the runtime system.
234       if (check.JsHasOverflowed()) {
235         return RegExp::kInternalRegExpException;
236       } else if (check.InterruptRequested()) {
237         return RegExp::kInternalRegExpRetry;
238       }
239     } else {
240       DCHECK(call_origin_ == RegExp::CallOrigin::kFromRuntime);
241       HandleScope handles(isolate_);
242       Handle<ByteArray> bytecode_handle(bytecode_object_, isolate_);
243       Handle<String> input_handle(input_object_, isolate_);
244 
245       if (check.JsHasOverflowed()) {
246         // We abort the interpreter now anyway, so gc can't invalidate any
247         // pointers.
248         AllowGarbageCollection yes_gc;
249         isolate_->StackOverflow();
250         return RegExp::kInternalRegExpException;
251       } else if (check.InterruptRequested()) {
252         // TODO(mbid): Is this really equivalent to whether the string is
253         // one-byte or two-byte? A comment at the declaration of
254         // IsOneByteRepresentationUnderneath says that this might fail for
255         // external strings.
256         const bool was_one_byte =
257             String::IsOneByteRepresentationUnderneath(input_object_);
258 
259         Object result;
260         {
261           AllowGarbageCollection yes_gc;
262           result = isolate_->stack_guard()->HandleInterrupts();
263         }
264         if (result.IsException(isolate_)) {
265           return RegExp::kInternalRegExpException;
266         }
267 
268         // If we changed between a LATIN1 and a UC16 string, we need to restart
269         // regexp matching with the appropriate template instantiation of
270         // RawMatch.
271         if (String::IsOneByteRepresentationUnderneath(*input_handle) !=
272             was_one_byte) {
273           return RegExp::kInternalRegExpRetry;
274         }
275 
276         // Update objects and pointers in case they have changed during gc.
277         bytecode_object_ = *bytecode_handle;
278         bytecode_ = ToInstructionVector(bytecode_object_, no_gc_);
279         input_object_ = *input_handle;
280         input_ = ToCharacterVector<Character>(input_object_, no_gc_);
281       }
282     }
283     return RegExp::kInternalRegExpSuccess;
284   }
285 
286   // Change the current input index for future calls to `FindNextMatch`.
SetInputIndex(int new_input_index)287   void SetInputIndex(int new_input_index) {
288     DCHECK_GE(input_index_, 0);
289     DCHECK_LE(input_index_, input_.length());
290 
291     input_index_ = new_input_index;
292   }
293 
294   // Find the next match and return the corresponding capture registers and
295   // write its capture registers to `best_match_registers_`.  The search starts
296   // at the current `input_index_`.  Returns RegExp::kInternalRegExpSuccess if
297   // execution could finish regularly (with or without a match) and an error
298   // code due to interrupt otherwise.
FindNextMatch()299   int FindNextMatch() {
300     DCHECK(active_threads_.is_empty());
301     // TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_`
302     // here? As long as
303     //
304     //   pc_last_input_index_[pc] < input_index_
305     //
306     // for all possible program counters pc that are reachable without input
307     // from pc = 0 and
308     //
309     //   pc_last_input_index_[k] <= input_index_
310     //
311     // for all k > 0 hold I think everything should be fine.  Maybe we can do
312     // something about this in `SetInputIndex`.
313     std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
314 
315     // Clean up left-over data from a previous call to FindNextMatch.
316     for (InterpreterThread t : blocked_threads_) {
317       DestroyThread(t);
318     }
319     blocked_threads_.DropAndClear();
320 
321     for (InterpreterThread t : active_threads_) {
322       DestroyThread(t);
323     }
324     active_threads_.DropAndClear();
325 
326     if (best_match_registers_.has_value()) {
327       FreeRegisterArray(best_match_registers_->begin());
328       best_match_registers_ = base::nullopt;
329     }
330 
331     // All threads start at bytecode 0.
332     active_threads_.Add(
333         InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)}, zone_);
334     // Run the initial thread, potentially forking new threads, until every
335     // thread is blocked without further input.
336     RunActiveThreads();
337 
338     // We stop if one of the following conditions hold:
339     // - We have exhausted the entire input.
340     // - We have found a match at some point, and there are no remaining
341     //   threads with higher priority than the thread that produced the match.
342     //   Threads with low priority have been aborted earlier, and the remaining
343     //   threads are blocked here, so the latter simply means that
344     //   `blocked_threads_` is empty.
345     while (input_index_ != input_.length() &&
346            !(FoundMatch() && blocked_threads_.is_empty())) {
347       DCHECK(active_threads_.is_empty());
348       base::uc16 input_char = input_[input_index_];
349       ++input_index_;
350 
351       static constexpr int kTicksBetweenInterruptHandling = 64;
352       if (input_index_ % kTicksBetweenInterruptHandling == 0) {
353         int err_code = HandleInterrupts();
354         if (err_code != RegExp::kInternalRegExpSuccess) return err_code;
355       }
356 
357       // We unblock all blocked_threads_ by feeding them the input char.
358       FlushBlockedThreads(input_char);
359 
360       // Run all threads until they block or accept.
361       RunActiveThreads();
362     }
363 
364     return RegExp::kInternalRegExpSuccess;
365   }
366 
367   // Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT
368   // instruction, or its PC value was already processed.
369   // - If processing of `t` can't continue because of CONSUME_RANGE, it is
370   //   pushed on `blocked_threads_`.
371   // - If `t` executes ACCEPT, set `best_match` according to `t.match_begin` and
372   //   the current input index. All remaining `active_threads_` are discarded.
RunActiveThread(InterpreterThread t)373   void RunActiveThread(InterpreterThread t) {
374     while (true) {
375       if (IsPcProcessed(t.pc)) return;
376       MarkPcProcessed(t.pc);
377 
378       RegExpInstruction inst = bytecode_[t.pc];
379       switch (inst.opcode) {
380         case RegExpInstruction::CONSUME_RANGE: {
381           blocked_threads_.Add(t, zone_);
382           return;
383         }
384         case RegExpInstruction::ASSERTION:
385           if (!SatisfiesAssertion(inst.payload.assertion_type, input_,
386                                   input_index_)) {
387             DestroyThread(t);
388             return;
389           }
390           ++t.pc;
391           break;
392         case RegExpInstruction::FORK: {
393           InterpreterThread fork{inst.payload.pc,
394                                  NewRegisterArrayUninitialized()};
395           base::Vector<int> fork_registers = GetRegisterArray(fork);
396           base::Vector<int> t_registers = GetRegisterArray(t);
397           DCHECK_EQ(fork_registers.length(), t_registers.length());
398           std::copy(t_registers.begin(), t_registers.end(),
399                     fork_registers.begin());
400           active_threads_.Add(fork, zone_);
401 
402           ++t.pc;
403           break;
404         }
405         case RegExpInstruction::JMP:
406           t.pc = inst.payload.pc;
407           break;
408         case RegExpInstruction::ACCEPT:
409           if (best_match_registers_.has_value()) {
410             FreeRegisterArray(best_match_registers_->begin());
411           }
412           best_match_registers_ = GetRegisterArray(t);
413 
414           for (InterpreterThread s : active_threads_) {
415             FreeRegisterArray(s.register_array_begin);
416           }
417           active_threads_.DropAndClear();
418           return;
419         case RegExpInstruction::SET_REGISTER_TO_CP:
420           GetRegisterArray(t)[inst.payload.register_index] = input_index_;
421           ++t.pc;
422           break;
423         case RegExpInstruction::CLEAR_REGISTER:
424           GetRegisterArray(t)[inst.payload.register_index] =
425               kUndefinedRegisterValue;
426           ++t.pc;
427           break;
428       }
429     }
430   }
431 
432   // Run each active thread until it can't continue without further input.
433   // `active_threads_` is empty afterwards.  `blocked_threads_` are sorted from
434   // low to high priority.
RunActiveThreads()435   void RunActiveThreads() {
436     while (!active_threads_.is_empty()) {
437       RunActiveThread(active_threads_.RemoveLast());
438     }
439   }
440 
441   // Unblock all blocked_threads_ by feeding them an `input_char`.  Should only
442   // be called with `input_index_` pointing to the character *after*
443   // `input_char` so that `pc_last_input_index_` is updated correctly.
FlushBlockedThreads(base::uc16 input_char)444   void FlushBlockedThreads(base::uc16 input_char) {
445     // The threads in blocked_threads_ are sorted from high to low priority,
446     // but active_threads_ needs to be sorted from low to high priority, so we
447     // need to activate blocked threads in reverse order.
448     for (int i = blocked_threads_.length() - 1; i >= 0; --i) {
449       InterpreterThread t = blocked_threads_[i];
450       RegExpInstruction inst = bytecode_[t.pc];
451       DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE);
452       RegExpInstruction::Uc16Range range = inst.payload.consume_range;
453       if (input_char >= range.min && input_char <= range.max) {
454         ++t.pc;
455         active_threads_.Add(t, zone_);
456       } else {
457         DestroyThread(t);
458       }
459     }
460     blocked_threads_.DropAndClear();
461   }
462 
FoundMatch() const463   bool FoundMatch() const { return best_match_registers_.has_value(); }
464 
GetRegisterArray(InterpreterThread t)465   base::Vector<int> GetRegisterArray(InterpreterThread t) {
466     return base::Vector<int>(t.register_array_begin, register_count_per_match_);
467   }
468 
NewRegisterArrayUninitialized()469   int* NewRegisterArrayUninitialized() {
470     return register_array_allocator_.allocate(register_count_per_match_);
471   }
472 
NewRegisterArray(int fill_value)473   int* NewRegisterArray(int fill_value) {
474     int* array_begin = NewRegisterArrayUninitialized();
475     int* array_end = array_begin + register_count_per_match_;
476     std::fill(array_begin, array_end, fill_value);
477     return array_begin;
478   }
479 
FreeRegisterArray(int * register_array_begin)480   void FreeRegisterArray(int* register_array_begin) {
481     register_array_allocator_.deallocate(register_array_begin,
482                                          register_count_per_match_);
483   }
484 
DestroyThread(InterpreterThread t)485   void DestroyThread(InterpreterThread t) {
486     FreeRegisterArray(t.register_array_begin);
487   }
488 
489   // It is redundant to have two threads t, t0 execute at the same PC value,
490   // because one of t, t0 matches iff the other does.  We can thus discard
491   // the one with lower priority.  We check whether a thread executed at some
492   // PC value by recording for every possible value of PC what the value of
493   // input_index_ was the last time a thread executed at PC. If a thread
494   // tries to continue execution at a PC value that we have seen before at
495   // the current input index, we abort it. (We execute threads with higher
496   // priority first, so the second thread is guaranteed to have lower
497   // priority.)
498   //
499   // Check whether we've seen an active thread with a given pc value since the
500   // last increment of `input_index_`.
IsPcProcessed(int pc)501   bool IsPcProcessed(int pc) {
502     DCHECK_LE(pc_last_input_index_[pc], input_index_);
503     return pc_last_input_index_[pc] == input_index_;
504   }
505 
506   // Mark a pc as having been processed since the last increment of
507   // `input_index_`.
MarkPcProcessed(int pc)508   void MarkPcProcessed(int pc) {
509     DCHECK_LE(pc_last_input_index_[pc], input_index_);
510     pc_last_input_index_[pc] = input_index_;
511   }
512 
513   Isolate* const isolate_;
514 
515   const RegExp::CallOrigin call_origin_;
516 
517   DisallowGarbageCollection no_gc_;
518 
519   ByteArray bytecode_object_;
520   base::Vector<const RegExpInstruction> bytecode_;
521 
522   // Number of registers used per thread.
523   const int register_count_per_match_;
524 
525   String input_object_;
526   base::Vector<const Character> input_;
527   int input_index_;
528 
529   // pc_last_input_index_[k] records the value of input_index_ the last
530   // time a thread t such that t.pc == k was activated, i.e. put on
531   // active_threads_.  Thus pc_last_input_index.size() == bytecode.size().  See
532   // also `RunActiveThread`.
533   base::Vector<int> pc_last_input_index_;
534 
535   // Active threads can potentially (but not necessarily) continue without
536   // input.  Sorted from low to high priority.
537   ZoneList<InterpreterThread> active_threads_;
538 
539   // The pc of a blocked thread points to an instruction that consumes a
540   // character. Sorted from high to low priority (so the opposite of
541   // `active_threads_`).
542   ZoneList<InterpreterThread> blocked_threads_;
543 
544   // RecyclingZoneAllocator maintains a linked list through freed allocations
545   // for reuse if possible.
546   RecyclingZoneAllocator<int> register_array_allocator_;
547 
548   // The register array of the best match found so far during the current
549   // search.  If several threads ACCEPTed, then this will be the register array
550   // of the accepting thread with highest priority.  Should be deallocated with
551   // `register_array_allocator_`.
552   base::Optional<base::Vector<int>> best_match_registers_;
553 
554   Zone* zone_;
555 };
556 
557 }  // namespace
558 
FindMatches(Isolate * isolate,RegExp::CallOrigin call_origin,ByteArray bytecode,int register_count_per_match,String input,int start_index,int32_t * output_registers,int output_register_count,Zone * zone)559 int ExperimentalRegExpInterpreter::FindMatches(
560     Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray bytecode,
561     int register_count_per_match, String input, int start_index,
562     int32_t* output_registers, int output_register_count, Zone* zone) {
563   DCHECK(input.IsFlat());
564   DisallowGarbageCollection no_gc;
565 
566   if (input.GetFlatContent(no_gc).IsOneByte()) {
567     NfaInterpreter<uint8_t> interpreter(isolate, call_origin, bytecode,
568                                         register_count_per_match, input,
569                                         start_index, zone);
570     return interpreter.FindMatches(output_registers, output_register_count);
571   } else {
572     DCHECK(input.GetFlatContent(no_gc).IsTwoByte());
573     NfaInterpreter<base::uc16> interpreter(isolate, call_origin, bytecode,
574                                            register_count_per_match, input,
575                                            start_index, zone);
576     return interpreter.FindMatches(output_registers, output_register_count);
577   }
578 }
579 
580 }  // namespace internal
581 }  // namespace v8
582