1 // Copyright 2020 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/experimental/experimental-interpreter.h"
6
7 #include "src/base/optional.h"
8 #include "src/base/strings.h"
9 #include "src/common/assert-scope.h"
10 #include "src/objects/fixed-array-inl.h"
11 #include "src/objects/string-inl.h"
12 #include "src/regexp/experimental/experimental.h"
13 #include "src/strings/char-predicates-inl.h"
14 #include "src/zone/zone-allocator.h"
15 #include "src/zone/zone-list-inl.h"
16
17 namespace v8 {
18 namespace internal {
19
20 namespace {
21
22 constexpr int kUndefinedRegisterValue = -1;
23
24 template <class Character>
SatisfiesAssertion(RegExpAssertion::Type type,base::Vector<const Character> context,int position)25 bool SatisfiesAssertion(RegExpAssertion::Type type,
26 base::Vector<const Character> context, int position) {
27 DCHECK_LE(position, context.length());
28 DCHECK_GE(position, 0);
29
30 switch (type) {
31 case RegExpAssertion::Type::START_OF_INPUT:
32 return position == 0;
33 case RegExpAssertion::Type::END_OF_INPUT:
34 return position == context.length();
35 case RegExpAssertion::Type::START_OF_LINE:
36 if (position == 0) return true;
37 return unibrow::IsLineTerminator(context[position - 1]);
38 case RegExpAssertion::Type::END_OF_LINE:
39 if (position == context.length()) return true;
40 return unibrow::IsLineTerminator(context[position]);
41 case RegExpAssertion::Type::BOUNDARY:
42 if (context.length() == 0) {
43 return false;
44 } else if (position == 0) {
45 return IsRegExpWord(context[position]);
46 } else if (position == context.length()) {
47 return IsRegExpWord(context[position - 1]);
48 } else {
49 return IsRegExpWord(context[position - 1]) !=
50 IsRegExpWord(context[position]);
51 }
52 case RegExpAssertion::Type::NON_BOUNDARY:
53 return !SatisfiesAssertion(RegExpAssertion::Type::BOUNDARY, context,
54 position);
55 }
56 }
57
ToInstructionVector(ByteArray raw_bytes,const DisallowGarbageCollection & no_gc)58 base::Vector<RegExpInstruction> ToInstructionVector(
59 ByteArray raw_bytes, const DisallowGarbageCollection& no_gc) {
60 RegExpInstruction* inst_begin =
61 reinterpret_cast<RegExpInstruction*>(raw_bytes.GetDataStartAddress());
62 int inst_num = raw_bytes.length() / sizeof(RegExpInstruction);
63 DCHECK_EQ(sizeof(RegExpInstruction) * inst_num, raw_bytes.length());
64 return base::Vector<RegExpInstruction>(inst_begin, inst_num);
65 }
66
67 template <class Character>
68 base::Vector<const Character> ToCharacterVector(
69 String str, const DisallowGarbageCollection& no_gc);
70
71 template <>
ToCharacterVector(String str,const DisallowGarbageCollection & no_gc)72 base::Vector<const uint8_t> ToCharacterVector<uint8_t>(
73 String str, const DisallowGarbageCollection& no_gc) {
74 DCHECK(str.IsFlat());
75 String::FlatContent content = str.GetFlatContent(no_gc);
76 DCHECK(content.IsOneByte());
77 return content.ToOneByteVector();
78 }
79
80 template <>
ToCharacterVector(String str,const DisallowGarbageCollection & no_gc)81 base::Vector<const base::uc16> ToCharacterVector<base::uc16>(
82 String str, const DisallowGarbageCollection& no_gc) {
83 DCHECK(str.IsFlat());
84 String::FlatContent content = str.GetFlatContent(no_gc);
85 DCHECK(content.IsTwoByte());
86 return content.ToUC16Vector();
87 }
88
89 template <class Character>
90 class NfaInterpreter {
91 // Executes a bytecode program in breadth-first mode, without backtracking.
92 // `Character` can be instantiated with `uint8_t` or `base::uc16` for one byte
93 // or two byte input strings.
94 //
95 // In contrast to the backtracking implementation, this has linear time
96 // complexity in the length of the input string. Breadth-first mode means
97 // that threads are executed in lockstep with respect to their input
98 // position, i.e. the threads share a common input index. This is similar
99 // to breadth-first simulation of a non-deterministic finite automaton (nfa),
100 // hence the name of the class.
101 //
102 // To follow the semantics of a backtracking VM implementation, we have to be
103 // careful about whether we stop execution when a thread executes ACCEPT.
104 // For example, consider execution of the bytecode generated by the regexp
105 //
106 // r = /abc|..|[a-c]{10,}/
107 //
108 // on input "abcccccccccccccc". Clearly the three alternatives
109 // - /abc/
110 // - /../
111 // - /[a-c]{10,}/
112 // all match this input. A backtracking implementation will report "abc" as
113 // match, because it explores the first alternative before the others.
114 //
115 // However, if we execute breadth first, then we execute the 3 threads
116 // - t1, which tries to match /abc/
117 // - t2, which tries to match /../
118 // - t3, which tries to match /[a-c]{10,}/
119 // in lockstep i.e. by iterating over the input and feeding all threads one
120 // character at a time. t2 will execute an ACCEPT after two characters,
121 // while t1 will only execute ACCEPT after three characters. Thus we find a
122 // match for the second alternative before a match of the first alternative.
123 //
124 // This shows that we cannot always stop searching as soon as some thread t
125 // executes ACCEPT: If there is a thread u with higher priority than t, then
126 // it must be finished first. If u produces a match, then we can discard the
127 // match of t because matches produced by threads with higher priority are
128 // preferred over matches of threads with lower priority. On the other hand,
129 // we are allowed to abort all threads with lower priority than t if t
130 // produces a match: Such threads can only produce worse matches. In the
131 // example above, we can abort t3 after two characters because of t2's match.
132 //
133 // Thus the interpreter keeps track of a priority-ordered list of threads.
134 // If a thread ACCEPTs, all threads with lower priority are discarded, and
135 // the search continues with the threads with higher priority. If no threads
136 // with high priority are left, we return the match that was produced by the
137 // ACCEPTing thread with highest priority.
138 public:
NfaInterpreter(Isolate * isolate,RegExp::CallOrigin call_origin,ByteArray bytecode,int register_count_per_match,String input,int32_t input_index,Zone * zone)139 NfaInterpreter(Isolate* isolate, RegExp::CallOrigin call_origin,
140 ByteArray bytecode, int register_count_per_match, String input,
141 int32_t input_index, Zone* zone)
142 : isolate_(isolate),
143 call_origin_(call_origin),
144 bytecode_object_(bytecode),
145 bytecode_(ToInstructionVector(bytecode, no_gc_)),
146 register_count_per_match_(register_count_per_match),
147 input_object_(input),
148 input_(ToCharacterVector<Character>(input, no_gc_)),
149 input_index_(input_index),
150 pc_last_input_index_(zone->NewArray<int>(bytecode.length()),
151 bytecode.length()),
152 active_threads_(0, zone),
153 blocked_threads_(0, zone),
154 register_array_allocator_(zone),
155 best_match_registers_(base::nullopt),
156 zone_(zone) {
157 DCHECK(!bytecode_.empty());
158 DCHECK_GE(input_index_, 0);
159 DCHECK_LE(input_index_, input_.length());
160
161 std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
162 }
163
164 // Finds matches and writes their concatenated capture registers to
165 // `output_registers`. `output_registers[i]` has to be valid for all i <
166 // output_register_count. The search continues until all remaining matches
167 // have been found or there is no space left in `output_registers`. Returns
168 // the number of matches found.
FindMatches(int32_t * output_registers,int output_register_count)169 int FindMatches(int32_t* output_registers, int output_register_count) {
170 const int max_match_num = output_register_count / register_count_per_match_;
171
172 int match_num = 0;
173 while (match_num != max_match_num) {
174 int err_code = FindNextMatch();
175 if (err_code != RegExp::kInternalRegExpSuccess) return err_code;
176
177 if (!FoundMatch()) break;
178
179 base::Vector<int> registers = *best_match_registers_;
180 output_registers =
181 std::copy(registers.begin(), registers.end(), output_registers);
182
183 ++match_num;
184
185 const int match_begin = registers[0];
186 const int match_end = registers[1];
187 DCHECK_LE(match_begin, match_end);
188 const int match_length = match_end - match_begin;
189 if (match_length != 0) {
190 SetInputIndex(match_end);
191 } else if (match_end == input_.length()) {
192 // Zero-length match, input exhausted.
193 SetInputIndex(match_end);
194 break;
195 } else {
196 // Zero-length match, more input. We don't want to report more matches
197 // here endlessly, so we advance by 1.
198 SetInputIndex(match_end + 1);
199
200 // TODO(mbid,v8:10765): If we're in unicode mode, we have to advance to
201 // the next codepoint, not to the next code unit. See also
202 // `RegExpUtils::AdvanceStringIndex`.
203 STATIC_ASSERT(!ExperimentalRegExp::kSupportsUnicode);
204 }
205 }
206
207 return match_num;
208 }
209
210 private:
211 // The state of a "thread" executing experimental regexp bytecode. (Not to
212 // be confused with an OS thread.)
213 struct InterpreterThread {
214 // This thread's program counter, i.e. the index within `bytecode_` of the
215 // next instruction to be executed.
216 int pc;
217 // Pointer to the array of registers, which is always size
218 // `register_count_per_match_`. Should be deallocated with
219 // `register_array_allocator_`.
220 int* register_array_begin;
221 };
222
223 // Handles pending interrupts if there are any. Returns
224 // RegExp::kInternalRegExpSuccess if execution can continue, and an error
225 // code otherwise.
HandleInterrupts()226 int HandleInterrupts() {
227 StackLimitCheck check(isolate_);
228 if (call_origin_ == RegExp::CallOrigin::kFromJs) {
229 // Direct calls from JavaScript can be interrupted in two ways:
230 // 1. A real stack overflow, in which case we let the caller throw the
231 // exception.
232 // 2. The stack guard was used to interrupt execution for another purpose,
233 // forcing the call through the runtime system.
234 if (check.JsHasOverflowed()) {
235 return RegExp::kInternalRegExpException;
236 } else if (check.InterruptRequested()) {
237 return RegExp::kInternalRegExpRetry;
238 }
239 } else {
240 DCHECK(call_origin_ == RegExp::CallOrigin::kFromRuntime);
241 HandleScope handles(isolate_);
242 Handle<ByteArray> bytecode_handle(bytecode_object_, isolate_);
243 Handle<String> input_handle(input_object_, isolate_);
244
245 if (check.JsHasOverflowed()) {
246 // We abort the interpreter now anyway, so gc can't invalidate any
247 // pointers.
248 AllowGarbageCollection yes_gc;
249 isolate_->StackOverflow();
250 return RegExp::kInternalRegExpException;
251 } else if (check.InterruptRequested()) {
252 // TODO(mbid): Is this really equivalent to whether the string is
253 // one-byte or two-byte? A comment at the declaration of
254 // IsOneByteRepresentationUnderneath says that this might fail for
255 // external strings.
256 const bool was_one_byte =
257 String::IsOneByteRepresentationUnderneath(input_object_);
258
259 Object result;
260 {
261 AllowGarbageCollection yes_gc;
262 result = isolate_->stack_guard()->HandleInterrupts();
263 }
264 if (result.IsException(isolate_)) {
265 return RegExp::kInternalRegExpException;
266 }
267
268 // If we changed between a LATIN1 and a UC16 string, we need to restart
269 // regexp matching with the appropriate template instantiation of
270 // RawMatch.
271 if (String::IsOneByteRepresentationUnderneath(*input_handle) !=
272 was_one_byte) {
273 return RegExp::kInternalRegExpRetry;
274 }
275
276 // Update objects and pointers in case they have changed during gc.
277 bytecode_object_ = *bytecode_handle;
278 bytecode_ = ToInstructionVector(bytecode_object_, no_gc_);
279 input_object_ = *input_handle;
280 input_ = ToCharacterVector<Character>(input_object_, no_gc_);
281 }
282 }
283 return RegExp::kInternalRegExpSuccess;
284 }
285
286 // Change the current input index for future calls to `FindNextMatch`.
SetInputIndex(int new_input_index)287 void SetInputIndex(int new_input_index) {
288 DCHECK_GE(input_index_, 0);
289 DCHECK_LE(input_index_, input_.length());
290
291 input_index_ = new_input_index;
292 }
293
294 // Find the next match and return the corresponding capture registers and
295 // write its capture registers to `best_match_registers_`. The search starts
296 // at the current `input_index_`. Returns RegExp::kInternalRegExpSuccess if
297 // execution could finish regularly (with or without a match) and an error
298 // code due to interrupt otherwise.
FindNextMatch()299 int FindNextMatch() {
300 DCHECK(active_threads_.is_empty());
301 // TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_`
302 // here? As long as
303 //
304 // pc_last_input_index_[pc] < input_index_
305 //
306 // for all possible program counters pc that are reachable without input
307 // from pc = 0 and
308 //
309 // pc_last_input_index_[k] <= input_index_
310 //
311 // for all k > 0 hold I think everything should be fine. Maybe we can do
312 // something about this in `SetInputIndex`.
313 std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(), -1);
314
315 // Clean up left-over data from a previous call to FindNextMatch.
316 for (InterpreterThread t : blocked_threads_) {
317 DestroyThread(t);
318 }
319 blocked_threads_.DropAndClear();
320
321 for (InterpreterThread t : active_threads_) {
322 DestroyThread(t);
323 }
324 active_threads_.DropAndClear();
325
326 if (best_match_registers_.has_value()) {
327 FreeRegisterArray(best_match_registers_->begin());
328 best_match_registers_ = base::nullopt;
329 }
330
331 // All threads start at bytecode 0.
332 active_threads_.Add(
333 InterpreterThread{0, NewRegisterArray(kUndefinedRegisterValue)}, zone_);
334 // Run the initial thread, potentially forking new threads, until every
335 // thread is blocked without further input.
336 RunActiveThreads();
337
338 // We stop if one of the following conditions hold:
339 // - We have exhausted the entire input.
340 // - We have found a match at some point, and there are no remaining
341 // threads with higher priority than the thread that produced the match.
342 // Threads with low priority have been aborted earlier, and the remaining
343 // threads are blocked here, so the latter simply means that
344 // `blocked_threads_` is empty.
345 while (input_index_ != input_.length() &&
346 !(FoundMatch() && blocked_threads_.is_empty())) {
347 DCHECK(active_threads_.is_empty());
348 base::uc16 input_char = input_[input_index_];
349 ++input_index_;
350
351 static constexpr int kTicksBetweenInterruptHandling = 64;
352 if (input_index_ % kTicksBetweenInterruptHandling == 0) {
353 int err_code = HandleInterrupts();
354 if (err_code != RegExp::kInternalRegExpSuccess) return err_code;
355 }
356
357 // We unblock all blocked_threads_ by feeding them the input char.
358 FlushBlockedThreads(input_char);
359
360 // Run all threads until they block or accept.
361 RunActiveThreads();
362 }
363
364 return RegExp::kInternalRegExpSuccess;
365 }
366
367 // Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT
368 // instruction, or its PC value was already processed.
369 // - If processing of `t` can't continue because of CONSUME_RANGE, it is
370 // pushed on `blocked_threads_`.
371 // - If `t` executes ACCEPT, set `best_match` according to `t.match_begin` and
372 // the current input index. All remaining `active_threads_` are discarded.
RunActiveThread(InterpreterThread t)373 void RunActiveThread(InterpreterThread t) {
374 while (true) {
375 if (IsPcProcessed(t.pc)) return;
376 MarkPcProcessed(t.pc);
377
378 RegExpInstruction inst = bytecode_[t.pc];
379 switch (inst.opcode) {
380 case RegExpInstruction::CONSUME_RANGE: {
381 blocked_threads_.Add(t, zone_);
382 return;
383 }
384 case RegExpInstruction::ASSERTION:
385 if (!SatisfiesAssertion(inst.payload.assertion_type, input_,
386 input_index_)) {
387 DestroyThread(t);
388 return;
389 }
390 ++t.pc;
391 break;
392 case RegExpInstruction::FORK: {
393 InterpreterThread fork{inst.payload.pc,
394 NewRegisterArrayUninitialized()};
395 base::Vector<int> fork_registers = GetRegisterArray(fork);
396 base::Vector<int> t_registers = GetRegisterArray(t);
397 DCHECK_EQ(fork_registers.length(), t_registers.length());
398 std::copy(t_registers.begin(), t_registers.end(),
399 fork_registers.begin());
400 active_threads_.Add(fork, zone_);
401
402 ++t.pc;
403 break;
404 }
405 case RegExpInstruction::JMP:
406 t.pc = inst.payload.pc;
407 break;
408 case RegExpInstruction::ACCEPT:
409 if (best_match_registers_.has_value()) {
410 FreeRegisterArray(best_match_registers_->begin());
411 }
412 best_match_registers_ = GetRegisterArray(t);
413
414 for (InterpreterThread s : active_threads_) {
415 FreeRegisterArray(s.register_array_begin);
416 }
417 active_threads_.DropAndClear();
418 return;
419 case RegExpInstruction::SET_REGISTER_TO_CP:
420 GetRegisterArray(t)[inst.payload.register_index] = input_index_;
421 ++t.pc;
422 break;
423 case RegExpInstruction::CLEAR_REGISTER:
424 GetRegisterArray(t)[inst.payload.register_index] =
425 kUndefinedRegisterValue;
426 ++t.pc;
427 break;
428 }
429 }
430 }
431
432 // Run each active thread until it can't continue without further input.
433 // `active_threads_` is empty afterwards. `blocked_threads_` are sorted from
434 // low to high priority.
RunActiveThreads()435 void RunActiveThreads() {
436 while (!active_threads_.is_empty()) {
437 RunActiveThread(active_threads_.RemoveLast());
438 }
439 }
440
441 // Unblock all blocked_threads_ by feeding them an `input_char`. Should only
442 // be called with `input_index_` pointing to the character *after*
443 // `input_char` so that `pc_last_input_index_` is updated correctly.
FlushBlockedThreads(base::uc16 input_char)444 void FlushBlockedThreads(base::uc16 input_char) {
445 // The threads in blocked_threads_ are sorted from high to low priority,
446 // but active_threads_ needs to be sorted from low to high priority, so we
447 // need to activate blocked threads in reverse order.
448 for (int i = blocked_threads_.length() - 1; i >= 0; --i) {
449 InterpreterThread t = blocked_threads_[i];
450 RegExpInstruction inst = bytecode_[t.pc];
451 DCHECK_EQ(inst.opcode, RegExpInstruction::CONSUME_RANGE);
452 RegExpInstruction::Uc16Range range = inst.payload.consume_range;
453 if (input_char >= range.min && input_char <= range.max) {
454 ++t.pc;
455 active_threads_.Add(t, zone_);
456 } else {
457 DestroyThread(t);
458 }
459 }
460 blocked_threads_.DropAndClear();
461 }
462
FoundMatch() const463 bool FoundMatch() const { return best_match_registers_.has_value(); }
464
GetRegisterArray(InterpreterThread t)465 base::Vector<int> GetRegisterArray(InterpreterThread t) {
466 return base::Vector<int>(t.register_array_begin, register_count_per_match_);
467 }
468
NewRegisterArrayUninitialized()469 int* NewRegisterArrayUninitialized() {
470 return register_array_allocator_.allocate(register_count_per_match_);
471 }
472
NewRegisterArray(int fill_value)473 int* NewRegisterArray(int fill_value) {
474 int* array_begin = NewRegisterArrayUninitialized();
475 int* array_end = array_begin + register_count_per_match_;
476 std::fill(array_begin, array_end, fill_value);
477 return array_begin;
478 }
479
FreeRegisterArray(int * register_array_begin)480 void FreeRegisterArray(int* register_array_begin) {
481 register_array_allocator_.deallocate(register_array_begin,
482 register_count_per_match_);
483 }
484
DestroyThread(InterpreterThread t)485 void DestroyThread(InterpreterThread t) {
486 FreeRegisterArray(t.register_array_begin);
487 }
488
489 // It is redundant to have two threads t, t0 execute at the same PC value,
490 // because one of t, t0 matches iff the other does. We can thus discard
491 // the one with lower priority. We check whether a thread executed at some
492 // PC value by recording for every possible value of PC what the value of
493 // input_index_ was the last time a thread executed at PC. If a thread
494 // tries to continue execution at a PC value that we have seen before at
495 // the current input index, we abort it. (We execute threads with higher
496 // priority first, so the second thread is guaranteed to have lower
497 // priority.)
498 //
499 // Check whether we've seen an active thread with a given pc value since the
500 // last increment of `input_index_`.
IsPcProcessed(int pc)501 bool IsPcProcessed(int pc) {
502 DCHECK_LE(pc_last_input_index_[pc], input_index_);
503 return pc_last_input_index_[pc] == input_index_;
504 }
505
506 // Mark a pc as having been processed since the last increment of
507 // `input_index_`.
MarkPcProcessed(int pc)508 void MarkPcProcessed(int pc) {
509 DCHECK_LE(pc_last_input_index_[pc], input_index_);
510 pc_last_input_index_[pc] = input_index_;
511 }
512
513 Isolate* const isolate_;
514
515 const RegExp::CallOrigin call_origin_;
516
517 DisallowGarbageCollection no_gc_;
518
519 ByteArray bytecode_object_;
520 base::Vector<const RegExpInstruction> bytecode_;
521
522 // Number of registers used per thread.
523 const int register_count_per_match_;
524
525 String input_object_;
526 base::Vector<const Character> input_;
527 int input_index_;
528
529 // pc_last_input_index_[k] records the value of input_index_ the last
530 // time a thread t such that t.pc == k was activated, i.e. put on
531 // active_threads_. Thus pc_last_input_index.size() == bytecode.size(). See
532 // also `RunActiveThread`.
533 base::Vector<int> pc_last_input_index_;
534
535 // Active threads can potentially (but not necessarily) continue without
536 // input. Sorted from low to high priority.
537 ZoneList<InterpreterThread> active_threads_;
538
539 // The pc of a blocked thread points to an instruction that consumes a
540 // character. Sorted from high to low priority (so the opposite of
541 // `active_threads_`).
542 ZoneList<InterpreterThread> blocked_threads_;
543
544 // RecyclingZoneAllocator maintains a linked list through freed allocations
545 // for reuse if possible.
546 RecyclingZoneAllocator<int> register_array_allocator_;
547
548 // The register array of the best match found so far during the current
549 // search. If several threads ACCEPTed, then this will be the register array
550 // of the accepting thread with highest priority. Should be deallocated with
551 // `register_array_allocator_`.
552 base::Optional<base::Vector<int>> best_match_registers_;
553
554 Zone* zone_;
555 };
556
557 } // namespace
558
FindMatches(Isolate * isolate,RegExp::CallOrigin call_origin,ByteArray bytecode,int register_count_per_match,String input,int start_index,int32_t * output_registers,int output_register_count,Zone * zone)559 int ExperimentalRegExpInterpreter::FindMatches(
560 Isolate* isolate, RegExp::CallOrigin call_origin, ByteArray bytecode,
561 int register_count_per_match, String input, int start_index,
562 int32_t* output_registers, int output_register_count, Zone* zone) {
563 DCHECK(input.IsFlat());
564 DisallowGarbageCollection no_gc;
565
566 if (input.GetFlatContent(no_gc).IsOneByte()) {
567 NfaInterpreter<uint8_t> interpreter(isolate, call_origin, bytecode,
568 register_count_per_match, input,
569 start_index, zone);
570 return interpreter.FindMatches(output_registers, output_register_count);
571 } else {
572 DCHECK(input.GetFlatContent(no_gc).IsTwoByte());
573 NfaInterpreter<base::uc16> interpreter(isolate, call_origin, bytecode,
574 register_count_per_match, input,
575 start_index, zone);
576 return interpreter.FindMatches(output_registers, output_register_count);
577 }
578 }
579
580 } // namespace internal
581 } // namespace v8
582