• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp.h"
6 
7 #include "src/codegen/compilation-cache.h"
8 #include "src/diagnostics/code-tracer.h"
9 #include "src/heap/heap-inl.h"
10 #include "src/objects/js-regexp-inl.h"
11 #include "src/regexp/experimental/experimental.h"
12 #include "src/regexp/regexp-bytecode-generator.h"
13 #include "src/regexp/regexp-bytecodes.h"
14 #include "src/regexp/regexp-compiler.h"
15 #include "src/regexp/regexp-dotprinter.h"
16 #include "src/regexp/regexp-interpreter.h"
17 #include "src/regexp/regexp-macro-assembler-arch.h"
18 #include "src/regexp/regexp-macro-assembler-tracer.h"
19 #include "src/regexp/regexp-parser.h"
20 #include "src/regexp/regexp-utils.h"
21 #include "src/strings/string-search.h"
22 #include "src/utils/ostreams.h"
23 
24 namespace v8 {
25 namespace internal {
26 
27 using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
28 
29 class RegExpImpl final : public AllStatic {
30  public:
31   // Returns a string representation of a regular expression.
32   // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
33   // This function calls the garbage collector if necessary.
34   static Handle<String> ToString(Handle<Object> value);
35 
36   // Prepares a JSRegExp object with Irregexp-specific data.
37   static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
38                                  Handle<String> pattern, JSRegExp::Flags flags,
39                                  int capture_count, uint32_t backtrack_limit);
40 
41   // Prepare a RegExp for being executed one or more times (using
42   // IrregexpExecOnce) on the subject.
43   // This ensures that the regexp is compiled for the subject, and that
44   // the subject is flat.
45   // Returns the number of integer spaces required by IrregexpExecOnce
46   // as its "registers" argument.  If the regexp cannot be compiled,
47   // an exception is set as pending, and this function returns negative.
48   static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
49                              Handle<String> subject);
50 
51   static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
52                           Handle<String> pattern, JSRegExp::Flags flags,
53                           Handle<String> match_pattern);
54 
55   static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
56                          Handle<String> subject, int index, int32_t* output,
57                          int output_size);
58 
59   static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
60                                  Handle<String> subject, int index,
61                                  Handle<RegExpMatchInfo> last_match_info);
62 
63   // Execute a regular expression on the subject, starting from index.
64   // If matching succeeds, return the number of matches.  This can be larger
65   // than one in the case of global regular expressions.
66   // The captures and subcaptures are stored into the registers vector.
67   // If matching fails, returns RE_FAILURE.
68   // If execution fails, sets a pending exception and returns RE_EXCEPTION.
69   static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
70                              Handle<String> subject, int index, int32_t* output,
71                              int output_size);
72 
73   // Execute an Irregexp bytecode pattern.
74   // On a successful match, the result is a JSArray containing
75   // captured positions.  On a failure, the result is the null value.
76   // Returns an empty handle in case of an exception.
77   V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
78       Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
79       int index, Handle<RegExpMatchInfo> last_match_info);
80 
81   static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
82                               Handle<String> sample_subject, bool is_one_byte);
83   static inline bool EnsureCompiledIrregexp(Isolate* isolate,
84                                             Handle<JSRegExp> re,
85                                             Handle<String> sample_subject,
86                                             bool is_one_byte);
87 
88   // Returns true on success, false on failure.
89   static bool Compile(Isolate* isolate, Zone* zone, RegExpCompileData* input,
90                       JSRegExp::Flags flags, Handle<String> pattern,
91                       Handle<String> sample_subject, bool is_one_byte,
92                       uint32_t& backtrack_limit);
93 
94   // For acting on the JSRegExp data FixedArray.
95   static int IrregexpMaxRegisterCount(FixedArray re);
96   static void SetIrregexpMaxRegisterCount(FixedArray re, int value);
97   static int IrregexpNumberOfCaptures(FixedArray re);
98   static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
99   static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
100 };
101 
ThrowRegExpException(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,RegExpError error)102 MaybeHandle<Object> RegExp::ThrowRegExpException(Isolate* isolate,
103                                                  Handle<JSRegExp> re,
104                                                  Handle<String> pattern,
105                                                  RegExpError error) {
106   Vector<const char> error_data = CStrVector(RegExpErrorString(error));
107   Handle<String> error_text =
108       isolate->factory()
109           ->NewStringFromOneByte(Vector<const uint8_t>::cast(error_data))
110           .ToHandleChecked();
111   THROW_NEW_ERROR(
112       isolate,
113       NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text),
114       Object);
115 }
116 
ThrowRegExpException(Isolate * isolate,Handle<JSRegExp> re,RegExpError error_text)117 void RegExp::ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
118                                   RegExpError error_text) {
119   USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
120                            error_text));
121 }
122 
IsUnmodifiedRegExp(Isolate * isolate,Handle<JSRegExp> regexp)123 bool RegExp::IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp) {
124   return RegExpUtils::IsUnmodifiedRegExp(isolate, regexp);
125 }
126 
127 // Identifies the sort of regexps where the regexp engine is faster
128 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)129 static bool HasFewDifferentCharacters(Handle<String> pattern) {
130   int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
131   if (length <= kPatternTooShortForBoyerMoore) return false;
132   const int kMod = 128;
133   bool character_found[kMod];
134   int different = 0;
135   memset(&character_found[0], 0, sizeof(character_found));
136   for (int i = 0; i < length; i++) {
137     int ch = (pattern->Get(i) & (kMod - 1));
138     if (!character_found[ch]) {
139       character_found[ch] = true;
140       different++;
141       // We declare a regexp low-alphabet if it has at least 3 times as many
142       // characters as it has different characters.
143       if (different * 3 > length) return false;
144     }
145   }
146   return true;
147 }
148 
149 // Generic RegExp methods. Dispatches to implementation specific methods.
150 
151 // static
Compile(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,uint32_t backtrack_limit)152 MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
153                                     Handle<String> pattern,
154                                     JSRegExp::Flags flags,
155                                     uint32_t backtrack_limit) {
156   DCHECK(pattern->IsFlat());
157 
158   // Caching is based only on the pattern and flags, but code also differs when
159   // a backtrack limit is set. A present backtrack limit is very much *not* the
160   // common case, so just skip the cache for these.
161   const bool is_compilation_cache_enabled =
162       (backtrack_limit == JSRegExp::kNoBacktrackLimit);
163 
164   Zone zone(isolate->allocator(), ZONE_NAME);
165   CompilationCache* compilation_cache = nullptr;
166   if (is_compilation_cache_enabled) {
167     compilation_cache = isolate->compilation_cache();
168     MaybeHandle<FixedArray> maybe_cached =
169         compilation_cache->LookupRegExp(pattern, flags);
170     Handle<FixedArray> cached;
171     if (maybe_cached.ToHandle(&cached)) {
172       re->set_data(*cached);
173       return re;
174     }
175   }
176 
177   PostponeInterruptsScope postpone(isolate);
178   RegExpCompileData parse_result;
179   FlatStringReader reader(isolate, pattern);
180   DCHECK(!isolate->has_pending_exception());
181   if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
182                                  &parse_result)) {
183     // Throw an exception if we fail to parse the pattern.
184     return RegExp::ThrowRegExpException(isolate, re, pattern,
185                                         parse_result.error);
186   }
187 
188   bool has_been_compiled = false;
189 
190   if (FLAG_default_to_experimental_regexp_engine &&
191       ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
192                                        parse_result.capture_count)) {
193     DCHECK(FLAG_enable_experimental_regexp_engine);
194     ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
195                                    parse_result.capture_count);
196     has_been_compiled = true;
197   } else if (flags & JSRegExp::kLinear) {
198     DCHECK(FLAG_enable_experimental_regexp_engine);
199     if (!ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
200                                           parse_result.capture_count)) {
201       // TODO(mbid): The error could provide a reason for why the regexp can't
202       // be executed in linear time (e.g. due to back references).
203       return RegExp::ThrowRegExpException(isolate, re, pattern,
204                                           RegExpError::kNotLinear);
205     }
206     ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
207                                    parse_result.capture_count);
208     has_been_compiled = true;
209   } else if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
210              !HasFewDifferentCharacters(pattern)) {
211     // Parse-tree is a single atom that is equal to the pattern.
212     RegExpImpl::AtomCompile(isolate, re, pattern, flags, pattern);
213     has_been_compiled = true;
214   } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
215              parse_result.capture_count == 0) {
216     RegExpAtom* atom = parse_result.tree->AsAtom();
217     // The pattern source might (?) contain escape sequences, but they're
218     // resolved in atom_string.
219     Vector<const uc16> atom_pattern = atom->data();
220     Handle<String> atom_string;
221     ASSIGN_RETURN_ON_EXCEPTION(
222         isolate, atom_string,
223         isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
224     if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) {
225       RegExpImpl::AtomCompile(isolate, re, pattern, flags, atom_string);
226       has_been_compiled = true;
227     }
228   }
229   if (!has_been_compiled) {
230     RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags,
231                                    parse_result.capture_count, backtrack_limit);
232   }
233   DCHECK(re->data().IsFixedArray());
234   // Compilation succeeded so the data is set on the regexp
235   // and we can store it in the cache.
236   Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
237   if (is_compilation_cache_enabled) {
238     compilation_cache->PutRegExp(pattern, flags, data);
239   }
240 
241   return re;
242 }
243 
244 // static
EnsureFullyCompiled(Isolate * isolate,Handle<JSRegExp> re,Handle<String> subject)245 bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re,
246                                  Handle<String> subject) {
247   switch (re->TypeTag()) {
248     case JSRegExp::NOT_COMPILED:
249       UNREACHABLE();
250     case JSRegExp::ATOM:
251       return true;
252     case JSRegExp::IRREGEXP:
253       if (RegExpImpl::IrregexpPrepare(isolate, re, subject) == -1) {
254         DCHECK(isolate->has_pending_exception());
255         return false;
256       }
257       return true;
258     case JSRegExp::EXPERIMENTAL:
259       if (!ExperimentalRegExp::IsCompiled(re, isolate) &&
260           !ExperimentalRegExp::Compile(isolate, re)) {
261         DCHECK(isolate->has_pending_exception());
262         return false;
263       }
264       return true;
265   }
266 }
267 
268 // static
ExperimentalOneshotExec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)269 MaybeHandle<Object> RegExp::ExperimentalOneshotExec(
270     Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
271     int index, Handle<RegExpMatchInfo> last_match_info) {
272   return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, index,
273                                          last_match_info);
274 }
275 
276 // static
Exec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)277 MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
278                                  Handle<String> subject, int index,
279                                  Handle<RegExpMatchInfo> last_match_info) {
280   switch (regexp->TypeTag()) {
281     case JSRegExp::NOT_COMPILED:
282       UNREACHABLE();
283     case JSRegExp::ATOM:
284       return RegExpImpl::AtomExec(isolate, regexp, subject, index,
285                                   last_match_info);
286     case JSRegExp::IRREGEXP:
287       return RegExpImpl::IrregexpExec(isolate, regexp, subject, index,
288                                       last_match_info);
289     case JSRegExp::EXPERIMENTAL:
290       return ExperimentalRegExp::Exec(isolate, regexp, subject, index,
291                                       last_match_info);
292   }
293 }
294 
295 // RegExp Atom implementation: Simple string search using indexOf.
296 
AtomCompile(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)297 void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
298                              Handle<String> pattern, JSRegExp::Flags flags,
299                              Handle<String> match_pattern) {
300   isolate->factory()->SetRegExpAtomData(re, pattern, flags, match_pattern);
301 }
302 
SetAtomLastCapture(Isolate * isolate,Handle<RegExpMatchInfo> last_match_info,String subject,int from,int to)303 static void SetAtomLastCapture(Isolate* isolate,
304                                Handle<RegExpMatchInfo> last_match_info,
305                                String subject, int from, int to) {
306   SealHandleScope shs(isolate);
307   last_match_info->SetNumberOfCaptureRegisters(2);
308   last_match_info->SetLastSubject(subject);
309   last_match_info->SetLastInput(subject);
310   last_match_info->SetCapture(0, from);
311   last_match_info->SetCapture(1, to);
312 }
313 
AtomExecRaw(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)314 int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
315                             Handle<String> subject, int index, int32_t* output,
316                             int output_size) {
317   DCHECK_LE(0, index);
318   DCHECK_LE(index, subject->length());
319 
320   subject = String::Flatten(isolate, subject);
321   DisallowHeapAllocation no_gc;  // ensure vectors stay valid
322 
323   String needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
324   int needle_len = needle.length();
325   DCHECK(needle.IsFlat());
326   DCHECK_LT(0, needle_len);
327 
328   if (index + needle_len > subject->length()) {
329     return RegExp::RE_FAILURE;
330   }
331 
332   for (int i = 0; i < output_size; i += 2) {
333     String::FlatContent needle_content = needle.GetFlatContent(no_gc);
334     String::FlatContent subject_content = subject->GetFlatContent(no_gc);
335     DCHECK(needle_content.IsFlat());
336     DCHECK(subject_content.IsFlat());
337     // dispatch on type of strings
338     index =
339         (needle_content.IsOneByte()
340              ? (subject_content.IsOneByte()
341                     ? SearchString(isolate, subject_content.ToOneByteVector(),
342                                    needle_content.ToOneByteVector(), index)
343                     : SearchString(isolate, subject_content.ToUC16Vector(),
344                                    needle_content.ToOneByteVector(), index))
345              : (subject_content.IsOneByte()
346                     ? SearchString(isolate, subject_content.ToOneByteVector(),
347                                    needle_content.ToUC16Vector(), index)
348                     : SearchString(isolate, subject_content.ToUC16Vector(),
349                                    needle_content.ToUC16Vector(), index)));
350     if (index == -1) {
351       return i / 2;  // Return number of matches.
352     } else {
353       output[i] = index;
354       output[i + 1] = index + needle_len;
355       index += needle_len;
356     }
357   }
358   return output_size / 2;
359 }
360 
AtomExec(Isolate * isolate,Handle<JSRegExp> re,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)361 Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
362                                     Handle<String> subject, int index,
363                                     Handle<RegExpMatchInfo> last_match_info) {
364   static const int kNumRegisters = 2;
365   STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
366   int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
367 
368   int res =
369       AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
370 
371   if (res == RegExp::RE_FAILURE) return isolate->factory()->null_value();
372 
373   DCHECK_EQ(res, RegExp::RE_SUCCESS);
374   SealHandleScope shs(isolate);
375   SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
376                      output_registers[1]);
377   return last_match_info;
378 }
379 
380 // Irregexp implementation.
381 
382 // Ensures that the regexp object contains a compiled version of the
383 // source for either one-byte or two-byte subject strings.
384 // If the compiled version doesn't already exist, it is compiled
385 // from the source pattern.
386 // If compilation fails, an exception is thrown and this function
387 // returns false.
EnsureCompiledIrregexp(Isolate * isolate,Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)388 bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
389                                         Handle<String> sample_subject,
390                                         bool is_one_byte) {
391   Object compiled_code = re->Code(is_one_byte);
392   Object bytecode = re->Bytecode(is_one_byte);
393   bool needs_initial_compilation =
394       compiled_code == Smi::FromInt(JSRegExp::kUninitializedValue);
395   // Recompile is needed when we're dealing with the first execution of the
396   // regexp after the decision to tier up has been made. If the tiering up
397   // strategy is not in use, this value is always false.
398   bool needs_tier_up_compilation =
399       re->MarkedForTierUp() && bytecode.IsByteArray();
400 
401   if (FLAG_trace_regexp_tier_up && needs_tier_up_compilation) {
402     PrintF("JSRegExp object %p needs tier-up compilation\n",
403            reinterpret_cast<void*>(re->ptr()));
404   }
405 
406   if (!needs_initial_compilation && !needs_tier_up_compilation) {
407     DCHECK(compiled_code.IsCode());
408     DCHECK_IMPLIES(FLAG_regexp_interpret_all, bytecode.IsByteArray());
409     return true;
410   }
411 
412   DCHECK_IMPLIES(needs_tier_up_compilation, bytecode.IsByteArray());
413 
414   return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
415 }
416 
417 #ifdef DEBUG
418 namespace {
419 
RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re,bool is_one_byte)420 bool RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re, bool is_one_byte) {
421   Object entry = re->Code(is_one_byte);
422   Object bytecode = re->Bytecode(is_one_byte);
423   // If we're not using the tier-up strategy, entry can only be a smi
424   // representing an uncompiled regexp here. If we're using the tier-up
425   // strategy, entry can still be a smi representing an uncompiled regexp, when
426   // compiling the regexp before the tier-up, or it can contain a trampoline to
427   // the regexp interpreter, in which case the bytecode field contains compiled
428   // bytecode, when recompiling the regexp after the tier-up. If the
429   // tier-up was forced, which happens for global replaces, entry is a smi
430   // representing an uncompiled regexp, even though we're "recompiling" after
431   // the tier-up.
432   if (re->ShouldProduceBytecode()) {
433     DCHECK(entry.IsSmi());
434     DCHECK(bytecode.IsSmi());
435     int entry_value = Smi::ToInt(entry);
436     int bytecode_value = Smi::ToInt(bytecode);
437     DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
438     DCHECK_EQ(JSRegExp::kUninitializedValue, bytecode_value);
439   } else {
440     DCHECK(entry.IsSmi() || (entry.IsCode() && bytecode.IsByteArray()));
441   }
442 
443   return true;
444 }
445 
446 }  // namespace
447 #endif
448 
CompileIrregexp(Isolate * isolate,Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)449 bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
450                                  Handle<String> sample_subject,
451                                  bool is_one_byte) {
452   // Compile the RegExp.
453   Zone zone(isolate->allocator(), ZONE_NAME);
454   PostponeInterruptsScope postpone(isolate);
455 
456   DCHECK(RegExpCodeIsValidForPreCompilation(re, is_one_byte));
457 
458   JSRegExp::Flags flags = re->GetFlags();
459 
460   Handle<String> pattern(re->Pattern(), isolate);
461   pattern = String::Flatten(isolate, pattern);
462   RegExpCompileData compile_data;
463   FlatStringReader reader(isolate, pattern);
464   if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
465                                  &compile_data)) {
466     // Throw an exception if we fail to parse the pattern.
467     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
468     USE(RegExp::ThrowRegExpException(isolate, re, pattern, compile_data.error));
469     return false;
470   }
471   // The compilation target is a kBytecode if we're interpreting all regexp
472   // objects, or if we're using the tier-up strategy but the tier-up hasn't
473   // happened yet. The compilation target is a kNative if we're using the
474   // tier-up strategy and we need to recompile to tier-up, or if we're producing
475   // native code for all regexp objects.
476   compile_data.compilation_target = re->ShouldProduceBytecode()
477                                         ? RegExpCompilationTarget::kBytecode
478                                         : RegExpCompilationTarget::kNative;
479   uint32_t backtrack_limit = re->BacktrackLimit();
480   const bool compilation_succeeded =
481       Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject,
482               is_one_byte, backtrack_limit);
483   if (!compilation_succeeded) {
484     DCHECK(compile_data.error != RegExpError::kNone);
485     RegExp::ThrowRegExpException(isolate, re, compile_data.error);
486     return false;
487   }
488 
489   Handle<FixedArray> data =
490       Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
491   if (compile_data.compilation_target == RegExpCompilationTarget::kNative) {
492     data->set(JSRegExp::code_index(is_one_byte), *compile_data.code);
493     // Reset bytecode to uninitialized. In case we use tier-up we know that
494     // tier-up has happened this way.
495     data->set(JSRegExp::bytecode_index(is_one_byte),
496               Smi::FromInt(JSRegExp::kUninitializedValue));
497   } else {
498     DCHECK_EQ(compile_data.compilation_target,
499               RegExpCompilationTarget::kBytecode);
500     // Store code generated by compiler in bytecode and trampoline to
501     // interpreter in code.
502     data->set(JSRegExp::bytecode_index(is_one_byte), *compile_data.code);
503     Handle<Code> trampoline =
504         BUILTIN_CODE(isolate, RegExpInterpreterTrampoline);
505     data->set(JSRegExp::code_index(is_one_byte), *trampoline);
506   }
507   re->SetCaptureNameMap(compile_data.capture_name_map);
508   int register_max = IrregexpMaxRegisterCount(*data);
509   if (compile_data.register_count > register_max) {
510     SetIrregexpMaxRegisterCount(*data, compile_data.register_count);
511   }
512   data->set(JSRegExp::kIrregexpBacktrackLimit, Smi::FromInt(backtrack_limit));
513 
514   if (FLAG_trace_regexp_tier_up) {
515     PrintF("JSRegExp object %p %s size: %d\n",
516            reinterpret_cast<void*>(re->ptr()),
517            re->ShouldProduceBytecode() ? "bytecode" : "native code",
518            re->ShouldProduceBytecode()
519                ? IrregexpByteCode(*data, is_one_byte).Size()
520                : IrregexpNativeCode(*data, is_one_byte).Size());
521   }
522 
523   return true;
524 }
525 
IrregexpMaxRegisterCount(FixedArray re)526 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) {
527   return Smi::ToInt(re.get(JSRegExp::kIrregexpMaxRegisterCountIndex));
528 }
529 
SetIrregexpMaxRegisterCount(FixedArray re,int value)530 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
531   re.set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
532 }
533 
IrregexpNumberOfCaptures(FixedArray re)534 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
535   return Smi::ToInt(re.get(JSRegExp::kIrregexpCaptureCountIndex));
536 }
537 
IrregexpByteCode(FixedArray re,bool is_one_byte)538 ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) {
539   return ByteArray::cast(re.get(JSRegExp::bytecode_index(is_one_byte)));
540 }
541 
IrregexpNativeCode(FixedArray re,bool is_one_byte)542 Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
543   return Code::cast(re.get(JSRegExp::code_index(is_one_byte)));
544 }
545 
IrregexpInitialize(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count,uint32_t backtrack_limit)546 void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
547                                     Handle<String> pattern,
548                                     JSRegExp::Flags flags, int capture_count,
549                                     uint32_t backtrack_limit) {
550   // Initialize compiled code entries to null.
551   isolate->factory()->SetRegExpIrregexpData(re, pattern, flags, capture_count,
552                                             backtrack_limit);
553 }
554 
555 // static
IrregexpPrepare(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject)556 int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
557                                 Handle<String> subject) {
558   DCHECK(subject->IsFlat());
559 
560   // Check representation of the underlying storage.
561   bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
562   if (!RegExpImpl::EnsureCompiledIrregexp(isolate, regexp, subject,
563                                           is_one_byte)) {
564     return -1;
565   }
566 
567   // Only reserve room for output captures. Internal registers are allocated by
568   // the engine.
569   return JSRegExp::RegistersForCaptureCount(regexp->CaptureCount());
570 }
571 
IrregexpExecRaw(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)572 int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
573                                 Handle<String> subject, int index,
574                                 int32_t* output, int output_size) {
575   DCHECK_LE(0, index);
576   DCHECK_LE(index, subject->length());
577   DCHECK(subject->IsFlat());
578   DCHECK_GE(output_size,
579             JSRegExp::RegistersForCaptureCount(regexp->CaptureCount()));
580 
581   bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
582 
583   if (!regexp->ShouldProduceBytecode()) {
584     do {
585       EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
586       // The stack is used to allocate registers for the compiled regexp code.
587       // This means that in case of failure, the output registers array is left
588       // untouched and contains the capture results from the previous successful
589       // match.  We can use that to set the last match info lazily.
590       int res = NativeRegExpMacroAssembler::Match(regexp, subject, output,
591                                                   output_size, index, isolate);
592       if (res != NativeRegExpMacroAssembler::RETRY) {
593         DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
594                isolate->has_pending_exception());
595         STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) ==
596                       RegExp::RE_SUCCESS);
597         STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) ==
598                       RegExp::RE_FAILURE);
599         STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) ==
600                       RegExp::RE_EXCEPTION);
601         return res;
602       }
603       // If result is RETRY, the string has changed representation, and we
604       // must restart from scratch.
605       // In this case, it means we must make sure we are prepared to handle
606       // the, potentially, different subject (the string can switch between
607       // being internal and external, and even between being Latin1 and UC16,
608       // but the characters are always the same).
609       is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
610     } while (true);
611     UNREACHABLE();
612   } else {
613     DCHECK(regexp->ShouldProduceBytecode());
614 
615     do {
616       IrregexpInterpreter::Result result =
617           IrregexpInterpreter::MatchForCallFromRuntime(
618               isolate, regexp, subject, output, output_size, index);
619       DCHECK_IMPLIES(result == IrregexpInterpreter::EXCEPTION,
620                      isolate->has_pending_exception());
621 
622       switch (result) {
623         case IrregexpInterpreter::SUCCESS:
624         case IrregexpInterpreter::EXCEPTION:
625         case IrregexpInterpreter::FAILURE:
626         case IrregexpInterpreter::FALLBACK_TO_EXPERIMENTAL:
627           return result;
628         case IrregexpInterpreter::RETRY:
629           // The string has changed representation, and we must restart the
630           // match.
631           // We need to reset the tier up to start over with compilation.
632           if (FLAG_regexp_tier_up) regexp->ResetLastTierUpTick();
633           is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
634           EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
635           break;
636       }
637     } while (true);
638     UNREACHABLE();
639   }
640 }
641 
IrregexpExec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<RegExpMatchInfo> last_match_info)642 MaybeHandle<Object> RegExpImpl::IrregexpExec(
643     Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
644     int previous_index, Handle<RegExpMatchInfo> last_match_info) {
645   DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
646 
647   subject = String::Flatten(isolate, subject);
648 
649 #ifdef DEBUG
650   if (FLAG_trace_regexp_bytecodes && regexp->ShouldProduceBytecode()) {
651     String pattern = regexp->Pattern();
652     PrintF("\n\nRegexp match:   /%s/\n\n", pattern.ToCString().get());
653     PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
654   }
655 #endif
656 
657   // For very long subject strings, the regexp interpreter is currently much
658   // slower than the jitted code execution. If the tier-up strategy is turned
659   // on, we want to avoid this performance penalty so we eagerly tier-up if the
660   // subject string length is equal or greater than the given heuristic value.
661   if (FLAG_regexp_tier_up &&
662       subject->length() >= JSRegExp::kTierUpForSubjectLengthValue) {
663     regexp->MarkTierUpForNextExec();
664     if (FLAG_trace_regexp_tier_up) {
665       PrintF(
666           "Forcing tier-up for very long strings in "
667           "RegExpImpl::IrregexpExec\n");
668     }
669   }
670 
671   // Prepare space for the return values.
672   int required_registers =
673       RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
674   if (required_registers < 0) {
675     // Compiling failed with an exception.
676     DCHECK(isolate->has_pending_exception());
677     return MaybeHandle<Object>();
678   }
679 
680   int32_t* output_registers = nullptr;
681   if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
682     output_registers = NewArray<int32_t>(required_registers);
683   }
684   std::unique_ptr<int32_t[]> auto_release(output_registers);
685   if (output_registers == nullptr) {
686     output_registers = isolate->jsregexp_static_offsets_vector();
687   }
688 
689   int res =
690       RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
691                                   output_registers, required_registers);
692 
693   if (res == RegExp::RE_SUCCESS) {
694     int capture_count = regexp->CaptureCount();
695     return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
696                                     capture_count, output_registers);
697   } else if (res == RegExp::RE_FALLBACK_TO_EXPERIMENTAL) {
698     return ExperimentalRegExp::OneshotExec(isolate, regexp, subject,
699                                            previous_index, last_match_info);
700   } else if (res == RegExp::RE_EXCEPTION) {
701     DCHECK(isolate->has_pending_exception());
702     return MaybeHandle<Object>();
703   } else {
704     DCHECK(res == RegExp::RE_FAILURE);
705     return isolate->factory()->null_value();
706   }
707 }
708 
709 // static
SetLastMatchInfo(Isolate * isolate,Handle<RegExpMatchInfo> last_match_info,Handle<String> subject,int capture_count,int32_t * match)710 Handle<RegExpMatchInfo> RegExp::SetLastMatchInfo(
711     Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
712     Handle<String> subject, int capture_count, int32_t* match) {
713   // This is the only place where match infos can grow. If, after executing the
714   // regexp, RegExpExecStub finds that the match info is too small, it restarts
715   // execution in RegExpImpl::Exec, which finally grows the match info right
716   // here.
717   Handle<RegExpMatchInfo> result =
718       RegExpMatchInfo::ReserveCaptures(isolate, last_match_info, capture_count);
719   if (*result != *last_match_info) {
720     if (*last_match_info == *isolate->regexp_last_match_info()) {
721       // This inner condition is only needed for special situations like the
722       // regexp fuzzer, where we pass our own custom RegExpMatchInfo to
723       // RegExpImpl::Exec; there actually want to bypass the Isolate's match
724       // info and execute the regexp without side effects.
725       isolate->native_context()->set_regexp_last_match_info(*result);
726     }
727   }
728 
729   int capture_register_count =
730       JSRegExp::RegistersForCaptureCount(capture_count);
731   DisallowHeapAllocation no_allocation;
732   if (match != nullptr) {
733     for (int i = 0; i < capture_register_count; i += 2) {
734       result->SetCapture(i, match[i]);
735       result->SetCapture(i + 1, match[i + 1]);
736     }
737   }
738   result->SetLastSubject(*subject);
739   result->SetLastInput(*subject);
740   return result;
741 }
742 
743 // static
DotPrintForTesting(const char * label,RegExpNode * node)744 void RegExp::DotPrintForTesting(const char* label, RegExpNode* node) {
745   DotPrinter::DotPrint(label, node);
746 }
747 
748 namespace {
749 
750 // Returns true if we've either generated too much irregex code within this
751 // isolate, or the pattern string is too long.
TooMuchRegExpCode(Isolate * isolate,Handle<String> pattern)752 bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
753   // Limit the space regexps take up on the heap.  In order to limit this we
754   // would like to keep track of the amount of regexp code on the heap.  This
755   // is not tracked, however.  As a conservative approximation we track the
756   // total regexp code compiled including code that has subsequently been freed
757   // and the total executable memory at any point.
758   static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB;
759   static constexpr size_t kRegExpCompiledLimit = 1 * MB;
760 
761   Heap* heap = isolate->heap();
762   if (pattern->length() > RegExp::kRegExpTooLargeToOptimize) return true;
763   return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit &&
764           heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit);
765 }
766 
767 }  // namespace
768 
769 // static
CompileForTesting(Isolate * isolate,Zone * zone,RegExpCompileData * data,JSRegExp::Flags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte)770 bool RegExp::CompileForTesting(Isolate* isolate, Zone* zone,
771                                RegExpCompileData* data, JSRegExp::Flags flags,
772                                Handle<String> pattern,
773                                Handle<String> sample_subject,
774                                bool is_one_byte) {
775   uint32_t backtrack_limit = JSRegExp::kNoBacktrackLimit;
776   return RegExpImpl::Compile(isolate, zone, data, flags, pattern,
777                              sample_subject, is_one_byte, backtrack_limit);
778 }
779 
Compile(Isolate * isolate,Zone * zone,RegExpCompileData * data,JSRegExp::Flags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte,uint32_t & backtrack_limit)780 bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
781                          JSRegExp::Flags flags, Handle<String> pattern,
782                          Handle<String> sample_subject, bool is_one_byte,
783                          uint32_t& backtrack_limit) {
784   if (JSRegExp::RegistersForCaptureCount(data->capture_count) >
785       RegExpMacroAssembler::kMaxRegisterCount) {
786     data->error = RegExpError::kTooLarge;
787     return false;
788   }
789 
790   RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
791 
792   if (compiler.optimize()) {
793     compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
794   }
795 
796   // Sample some characters from the middle of the string.
797   static const int kSampleSize = 128;
798 
799   sample_subject = String::Flatten(isolate, sample_subject);
800   int chars_sampled = 0;
801   int half_way = (sample_subject->length() - kSampleSize) / 2;
802   for (int i = Max(0, half_way);
803        i < sample_subject->length() && chars_sampled < kSampleSize;
804        i++, chars_sampled++) {
805     compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
806   }
807 
808   data->node = compiler.PreprocessRegExp(data, flags, is_one_byte);
809   data->error = AnalyzeRegExp(isolate, is_one_byte, data->node);
810   if (data->error != RegExpError::kNone) {
811     return false;
812   }
813 
814   // Create the correct assembler for the architecture.
815   std::unique_ptr<RegExpMacroAssembler> macro_assembler;
816   if (data->compilation_target == RegExpCompilationTarget::kNative) {
817     // Native regexp implementation.
818     DCHECK(!FLAG_jitless);
819 
820     NativeRegExpMacroAssembler::Mode mode =
821         is_one_byte ? NativeRegExpMacroAssembler::LATIN1
822                     : NativeRegExpMacroAssembler::UC16;
823 
824     const int output_register_count =
825         JSRegExp::RegistersForCaptureCount(data->capture_count);
826 #if V8_TARGET_ARCH_IA32
827     macro_assembler.reset(new RegExpMacroAssemblerIA32(isolate, zone, mode,
828                                                        output_register_count));
829 #elif V8_TARGET_ARCH_X64
830     macro_assembler.reset(new RegExpMacroAssemblerX64(isolate, zone, mode,
831                                                       output_register_count));
832 #elif V8_TARGET_ARCH_ARM
833     macro_assembler.reset(new RegExpMacroAssemblerARM(isolate, zone, mode,
834                                                       output_register_count));
835 #elif V8_TARGET_ARCH_ARM64
836     macro_assembler.reset(new RegExpMacroAssemblerARM64(isolate, zone, mode,
837                                                         output_register_count));
838 #elif V8_TARGET_ARCH_S390
839     macro_assembler.reset(new RegExpMacroAssemblerS390(isolate, zone, mode,
840                                                        output_register_count));
841 #elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64
842     macro_assembler.reset(new RegExpMacroAssemblerPPC(isolate, zone, mode,
843                                                       output_register_count));
844 #elif V8_TARGET_ARCH_MIPS
845     macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
846                                                        output_register_count));
847 #elif V8_TARGET_ARCH_MIPS64
848     macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
849                                                        output_register_count));
850 #else
851 #error "Unsupported architecture"
852 #endif
853   } else {
854     DCHECK_EQ(data->compilation_target, RegExpCompilationTarget::kBytecode);
855     // Interpreted regexp implementation.
856     macro_assembler.reset(new RegExpBytecodeGenerator(isolate, zone));
857   }
858 
859   macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern));
860   if (FLAG_enable_experimental_regexp_engine_on_excessive_backtracks &&
861       ExperimentalRegExp::CanBeHandled(data->tree, flags,
862                                        data->capture_count)) {
863     if (backtrack_limit == JSRegExp::kNoBacktrackLimit) {
864       backtrack_limit = FLAG_regexp_backtracks_before_fallback;
865     } else {
866       backtrack_limit =
867           std::min(backtrack_limit, FLAG_regexp_backtracks_before_fallback);
868     }
869     macro_assembler->set_backtrack_limit(backtrack_limit);
870     macro_assembler->set_can_fallback(true);
871   } else {
872     macro_assembler->set_backtrack_limit(backtrack_limit);
873     macro_assembler->set_can_fallback(false);
874   }
875 
876   // Inserted here, instead of in Assembler, because it depends on information
877   // in the AST that isn't replicated in the Node structure.
878   bool is_end_anchored = data->tree->IsAnchoredAtEnd();
879   bool is_start_anchored = data->tree->IsAnchoredAtStart();
880   int max_length = data->tree->max_match();
881   static const int kMaxBacksearchLimit = 1024;
882   if (is_end_anchored && !is_start_anchored && !IsSticky(flags) &&
883       max_length < kMaxBacksearchLimit) {
884     macro_assembler->SetCurrentPositionFromEnd(max_length);
885   }
886 
887   if (IsGlobal(flags)) {
888     RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
889     if (data->tree->min_match() > 0) {
890       mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
891     } else if (IsUnicode(flags)) {
892       mode = RegExpMacroAssembler::GLOBAL_UNICODE;
893     }
894     macro_assembler->set_global_mode(mode);
895   }
896 
897   RegExpMacroAssembler* macro_assembler_ptr = macro_assembler.get();
898 #ifdef DEBUG
899   std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler;
900   if (FLAG_trace_regexp_assembler) {
901     tracer_macro_assembler.reset(
902         new RegExpMacroAssemblerTracer(isolate, macro_assembler_ptr));
903     macro_assembler_ptr = tracer_macro_assembler.get();
904   }
905 #endif
906 
907   RegExpCompiler::CompilationResult result = compiler.Assemble(
908       isolate, macro_assembler_ptr, data->node, data->capture_count, pattern);
909 
910   // Code / bytecode printing.
911   {
912 #ifdef ENABLE_DISASSEMBLER
913     if (FLAG_print_regexp_code &&
914         data->compilation_target == RegExpCompilationTarget::kNative) {
915       CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
916       OFStream os(trace_scope.file());
917       Handle<Code> c = Handle<Code>::cast(result.code);
918       auto pattern_cstring = pattern->ToCString();
919       c->Disassemble(pattern_cstring.get(), os, isolate);
920     }
921 #endif
922     if (FLAG_print_regexp_bytecode &&
923         data->compilation_target == RegExpCompilationTarget::kBytecode) {
924       Handle<ByteArray> bytecode = Handle<ByteArray>::cast(result.code);
925       auto pattern_cstring = pattern->ToCString();
926       RegExpBytecodeDisassemble(bytecode->GetDataStartAddress(),
927                                 bytecode->length(), pattern_cstring.get());
928     }
929   }
930 
931   if (result.error != RegExpError::kNone) {
932     if (FLAG_correctness_fuzzer_suppressions &&
933         result.error == RegExpError::kStackOverflow) {
934       FATAL("Aborting on stack overflow");
935     }
936     data->error = result.error;
937   }
938 
939   data->code = result.code;
940   data->register_count = result.num_registers;
941 
942   return result.Succeeded();
943 }
944 
RegExpGlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,Isolate * isolate)945 RegExpGlobalCache::RegExpGlobalCache(Handle<JSRegExp> regexp,
946                                      Handle<String> subject, Isolate* isolate)
947     : register_array_(nullptr),
948       register_array_size_(0),
949       regexp_(regexp),
950       subject_(subject),
951       isolate_(isolate) {
952   DCHECK(IsGlobal(regexp->GetFlags()));
953 
954   switch (regexp_->TypeTag()) {
955     case JSRegExp::NOT_COMPILED:
956       UNREACHABLE();
957     case JSRegExp::ATOM: {
958       // ATOM regexps do not have a global loop, so we search for one match at
959       // a time.
960       static const int kAtomRegistersPerMatch = 2;
961       registers_per_match_ = kAtomRegistersPerMatch;
962       register_array_size_ = registers_per_match_;
963       break;
964     }
965     case JSRegExp::IRREGEXP: {
966       registers_per_match_ =
967           RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
968       if (registers_per_match_ < 0) {
969         num_matches_ = -1;  // Signal exception.
970         return;
971       }
972       if (regexp->ShouldProduceBytecode()) {
973         // Global loop in interpreted regexp is not implemented.  We choose the
974         // size of the offsets vector so that it can only store one match.
975         register_array_size_ = registers_per_match_;
976         max_matches_ = 1;
977       } else {
978         register_array_size_ = Max(registers_per_match_,
979                                    Isolate::kJSRegexpStaticOffsetsVectorSize);
980       }
981       break;
982     }
983     case JSRegExp::EXPERIMENTAL: {
984       if (!ExperimentalRegExp::IsCompiled(regexp, isolate_) &&
985           !ExperimentalRegExp::Compile(isolate_, regexp)) {
986         DCHECK(isolate->has_pending_exception());
987         num_matches_ = -1;  // Signal exception.
988         return;
989       }
990       registers_per_match_ =
991           JSRegExp::RegistersForCaptureCount(regexp->CaptureCount());
992       register_array_size_ =
993           Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
994       break;
995     }
996   }
997 
998   max_matches_ = register_array_size_ / registers_per_match_;
999 
1000   if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
1001     register_array_ = NewArray<int32_t>(register_array_size_);
1002   } else {
1003     register_array_ = isolate->jsregexp_static_offsets_vector();
1004   }
1005 
1006   // Set state so that fetching the results the first time triggers a call
1007   // to the compiled regexp.
1008   current_match_index_ = max_matches_ - 1;
1009   num_matches_ = max_matches_;
1010   DCHECK_LE(2, registers_per_match_);  // Each match has at least one capture.
1011   DCHECK_GE(register_array_size_, registers_per_match_);
1012   int32_t* last_match =
1013       &register_array_[current_match_index_ * registers_per_match_];
1014   last_match[0] = -1;
1015   last_match[1] = 0;
1016 }
1017 
~RegExpGlobalCache()1018 RegExpGlobalCache::~RegExpGlobalCache() {
1019   // Deallocate the register array if we allocated it in the constructor
1020   // (as opposed to using the existing jsregexp_static_offsets_vector).
1021   if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
1022     DeleteArray(register_array_);
1023   }
1024 }
1025 
AdvanceZeroLength(int last_index)1026 int RegExpGlobalCache::AdvanceZeroLength(int last_index) {
1027   if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
1028       unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
1029       unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
1030     // Advance over the surrogate pair.
1031     return last_index + 2;
1032   }
1033   return last_index + 1;
1034 }
1035 
FetchNext()1036 int32_t* RegExpGlobalCache::FetchNext() {
1037   current_match_index_++;
1038 
1039   if (current_match_index_ >= num_matches_) {
1040     // Current batch of results exhausted.
1041     // Fail if last batch was not even fully filled.
1042     if (num_matches_ < max_matches_) {
1043       num_matches_ = 0;  // Signal failed match.
1044       return nullptr;
1045     }
1046 
1047     int32_t* last_match =
1048         &register_array_[(current_match_index_ - 1) * registers_per_match_];
1049     int last_end_index = last_match[1];
1050 
1051     switch (regexp_->TypeTag()) {
1052       case JSRegExp::NOT_COMPILED:
1053         UNREACHABLE();
1054       case JSRegExp::ATOM:
1055         num_matches_ =
1056             RegExpImpl::AtomExecRaw(isolate_, regexp_, subject_, last_end_index,
1057                                     register_array_, register_array_size_);
1058         break;
1059       case JSRegExp::EXPERIMENTAL: {
1060         DCHECK(ExperimentalRegExp::IsCompiled(regexp_, isolate_));
1061         DisallowHeapAllocation no_gc;
1062         num_matches_ = ExperimentalRegExp::ExecRaw(
1063             isolate_, RegExp::kFromRuntime, *regexp_, *subject_,
1064             register_array_, register_array_size_, last_end_index);
1065         break;
1066       }
1067       case JSRegExp::IRREGEXP: {
1068         int last_start_index = last_match[0];
1069         if (last_start_index == last_end_index) {
1070           // Zero-length match. Advance by one code point.
1071           last_end_index = AdvanceZeroLength(last_end_index);
1072         }
1073         if (last_end_index > subject_->length()) {
1074           num_matches_ = 0;  // Signal failed match.
1075           return nullptr;
1076         }
1077         num_matches_ = RegExpImpl::IrregexpExecRaw(
1078             isolate_, regexp_, subject_, last_end_index, register_array_,
1079             register_array_size_);
1080         break;
1081       }
1082     }
1083 
1084     // Fall back to experimental engine if needed and possible.
1085     if (num_matches_ == RegExp::kInternalRegExpFallbackToExperimental) {
1086       num_matches_ = ExperimentalRegExp::OneshotExecRaw(
1087           isolate_, regexp_, subject_, register_array_, register_array_size_,
1088           last_end_index);
1089     }
1090 
1091     if (num_matches_ <= 0) {
1092       return nullptr;
1093     }
1094     current_match_index_ = 0;
1095     return register_array_;
1096   } else {
1097     return &register_array_[current_match_index_ * registers_per_match_];
1098   }
1099 }
1100 
LastSuccessfulMatch()1101 int32_t* RegExpGlobalCache::LastSuccessfulMatch() {
1102   int index = current_match_index_ * registers_per_match_;
1103   if (num_matches_ == 0) {
1104     // After a failed match we shift back by one result.
1105     index -= registers_per_match_;
1106   }
1107   return &register_array_[index];
1108 }
1109 
Lookup(Heap * heap,String key_string,Object key_pattern,FixedArray * last_match_cache,ResultsCacheType type)1110 Object RegExpResultsCache::Lookup(Heap* heap, String key_string,
1111                                   Object key_pattern,
1112                                   FixedArray* last_match_cache,
1113                                   ResultsCacheType type) {
1114   FixedArray cache;
1115   if (!key_string.IsInternalizedString()) return Smi::zero();
1116   if (type == STRING_SPLIT_SUBSTRINGS) {
1117     DCHECK(key_pattern.IsString());
1118     if (!key_pattern.IsInternalizedString()) return Smi::zero();
1119     cache = heap->string_split_cache();
1120   } else {
1121     DCHECK(type == REGEXP_MULTIPLE_INDICES);
1122     DCHECK(key_pattern.IsFixedArray());
1123     cache = heap->regexp_multiple_cache();
1124   }
1125 
1126   uint32_t hash = key_string.Hash();
1127   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
1128                     ~(kArrayEntriesPerCacheEntry - 1));
1129   if (cache.get(index + kStringOffset) != key_string ||
1130       cache.get(index + kPatternOffset) != key_pattern) {
1131     index =
1132         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
1133     if (cache.get(index + kStringOffset) != key_string ||
1134         cache.get(index + kPatternOffset) != key_pattern) {
1135       return Smi::zero();
1136     }
1137   }
1138 
1139   *last_match_cache = FixedArray::cast(cache.get(index + kLastMatchOffset));
1140   return cache.get(index + kArrayOffset);
1141 }
1142 
Enter(Isolate * isolate,Handle<String> key_string,Handle<Object> key_pattern,Handle<FixedArray> value_array,Handle<FixedArray> last_match_cache,ResultsCacheType type)1143 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
1144                                Handle<Object> key_pattern,
1145                                Handle<FixedArray> value_array,
1146                                Handle<FixedArray> last_match_cache,
1147                                ResultsCacheType type) {
1148   Factory* factory = isolate->factory();
1149   Handle<FixedArray> cache;
1150   if (!key_string->IsInternalizedString()) return;
1151   if (type == STRING_SPLIT_SUBSTRINGS) {
1152     DCHECK(key_pattern->IsString());
1153     if (!key_pattern->IsInternalizedString()) return;
1154     cache = factory->string_split_cache();
1155   } else {
1156     DCHECK(type == REGEXP_MULTIPLE_INDICES);
1157     DCHECK(key_pattern->IsFixedArray());
1158     cache = factory->regexp_multiple_cache();
1159   }
1160 
1161   uint32_t hash = key_string->Hash();
1162   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
1163                     ~(kArrayEntriesPerCacheEntry - 1));
1164   if (cache->get(index + kStringOffset) == Smi::zero()) {
1165     cache->set(index + kStringOffset, *key_string);
1166     cache->set(index + kPatternOffset, *key_pattern);
1167     cache->set(index + kArrayOffset, *value_array);
1168     cache->set(index + kLastMatchOffset, *last_match_cache);
1169   } else {
1170     uint32_t index2 =
1171         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
1172     if (cache->get(index2 + kStringOffset) == Smi::zero()) {
1173       cache->set(index2 + kStringOffset, *key_string);
1174       cache->set(index2 + kPatternOffset, *key_pattern);
1175       cache->set(index2 + kArrayOffset, *value_array);
1176       cache->set(index2 + kLastMatchOffset, *last_match_cache);
1177     } else {
1178       cache->set(index2 + kStringOffset, Smi::zero());
1179       cache->set(index2 + kPatternOffset, Smi::zero());
1180       cache->set(index2 + kArrayOffset, Smi::zero());
1181       cache->set(index2 + kLastMatchOffset, Smi::zero());
1182       cache->set(index + kStringOffset, *key_string);
1183       cache->set(index + kPatternOffset, *key_pattern);
1184       cache->set(index + kArrayOffset, *value_array);
1185       cache->set(index + kLastMatchOffset, *last_match_cache);
1186     }
1187   }
1188   // If the array is a reasonably short list of substrings, convert it into a
1189   // list of internalized strings.
1190   if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
1191     for (int i = 0; i < value_array->length(); i++) {
1192       Handle<String> str(String::cast(value_array->get(i)), isolate);
1193       Handle<String> internalized_str = factory->InternalizeString(str);
1194       value_array->set(i, *internalized_str);
1195     }
1196   }
1197   // Convert backing store to a copy-on-write array.
1198   value_array->set_map_no_write_barrier(
1199       ReadOnlyRoots(isolate).fixed_cow_array_map());
1200 }
1201 
Clear(FixedArray cache)1202 void RegExpResultsCache::Clear(FixedArray cache) {
1203   for (int i = 0; i < kRegExpResultsCacheSize; i++) {
1204     cache.set(i, Smi::zero());
1205   }
1206 }
1207 
1208 }  // namespace internal
1209 }  // namespace v8
1210