• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp.h"
6 
7 #include "src/base/strings.h"
8 #include "src/codegen/compilation-cache.h"
9 #include "src/diagnostics/code-tracer.h"
10 #include "src/execution/interrupts-scope.h"
11 #include "src/heap/heap-inl.h"
12 #include "src/objects/js-regexp-inl.h"
13 #include "src/regexp/experimental/experimental.h"
14 #include "src/regexp/regexp-bytecode-generator.h"
15 #include "src/regexp/regexp-bytecodes.h"
16 #include "src/regexp/regexp-compiler.h"
17 #include "src/regexp/regexp-dotprinter.h"
18 #include "src/regexp/regexp-interpreter.h"
19 #include "src/regexp/regexp-macro-assembler-arch.h"
20 #include "src/regexp/regexp-macro-assembler-tracer.h"
21 #include "src/regexp/regexp-parser.h"
22 #include "src/regexp/regexp-utils.h"
23 #include "src/strings/string-search.h"
24 #include "src/utils/ostreams.h"
25 
26 namespace v8 {
27 namespace internal {
28 
29 using namespace regexp_compiler_constants;  // NOLINT(build/namespaces)
30 
31 class RegExpImpl final : public AllStatic {
32  public:
33   // Returns a string representation of a regular expression.
34   // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
35   // This function calls the garbage collector if necessary.
36   static Handle<String> ToString(Handle<Object> value);
37 
38   // Prepares a JSRegExp object with Irregexp-specific data.
39   static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
40                                  Handle<String> pattern, RegExpFlags flags,
41                                  int capture_count, uint32_t backtrack_limit);
42 
43   // Prepare a RegExp for being executed one or more times (using
44   // IrregexpExecOnce) on the subject.
45   // This ensures that the regexp is compiled for the subject, and that
46   // the subject is flat.
47   // Returns the number of integer spaces required by IrregexpExecOnce
48   // as its "registers" argument.  If the regexp cannot be compiled,
49   // an exception is set as pending, and this function returns negative.
50   static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
51                              Handle<String> subject);
52 
53   static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
54                           Handle<String> pattern, RegExpFlags flags,
55                           Handle<String> match_pattern);
56 
57   static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
58                          Handle<String> subject, int index, int32_t* output,
59                          int output_size);
60 
61   static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
62                                  Handle<String> subject, int index,
63                                  Handle<RegExpMatchInfo> last_match_info);
64 
65   // Execute a regular expression on the subject, starting from index.
66   // If matching succeeds, return the number of matches.  This can be larger
67   // than one in the case of global regular expressions.
68   // The captures and subcaptures are stored into the registers vector.
69   // If matching fails, returns RE_FAILURE.
70   // If execution fails, sets a pending exception and returns RE_EXCEPTION.
71   static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
72                              Handle<String> subject, int index, int32_t* output,
73                              int output_size);
74 
75   // Execute an Irregexp bytecode pattern.
76   // On a successful match, the result is a JSArray containing
77   // captured positions.  On a failure, the result is the null value.
78   // Returns an empty handle in case of an exception.
79   V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
80       Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
81       int index, Handle<RegExpMatchInfo> last_match_info,
82       RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone);
83 
84   static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
85                               Handle<String> sample_subject, bool is_one_byte);
86   static inline bool EnsureCompiledIrregexp(Isolate* isolate,
87                                             Handle<JSRegExp> re,
88                                             Handle<String> sample_subject,
89                                             bool is_one_byte);
90 
91   // Returns true on success, false on failure.
92   static bool Compile(Isolate* isolate, Zone* zone, RegExpCompileData* input,
93                       RegExpFlags flags, Handle<String> pattern,
94                       Handle<String> sample_subject, bool is_one_byte,
95                       uint32_t& backtrack_limit);
96 
97   // For acting on the JSRegExp data FixedArray.
98   static int IrregexpMaxRegisterCount(FixedArray re);
99   static void SetIrregexpMaxRegisterCount(FixedArray re, int value);
100   static int IrregexpNumberOfCaptures(FixedArray re);
101   static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
102   static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
103 };
104 
105 // static
CanGenerateBytecode()106 bool RegExp::CanGenerateBytecode() {
107   return FLAG_regexp_interpret_all || FLAG_regexp_tier_up;
108 }
109 
110 // static
111 template <class CharT>
VerifySyntax(Zone * zone,uintptr_t stack_limit,const CharT * input,int input_length,RegExpFlags flags,RegExpError * regexp_error_out,const DisallowGarbageCollection & no_gc)112 bool RegExp::VerifySyntax(Zone* zone, uintptr_t stack_limit, const CharT* input,
113                           int input_length, RegExpFlags flags,
114                           RegExpError* regexp_error_out,
115                           const DisallowGarbageCollection& no_gc) {
116   RegExpCompileData data;
117   bool pattern_is_valid = RegExpParser::VerifyRegExpSyntax(
118       zone, stack_limit, input, input_length, flags, &data, no_gc);
119   *regexp_error_out = data.error;
120   return pattern_is_valid;
121 }
122 
123 template bool RegExp::VerifySyntax<uint8_t>(Zone*, uintptr_t, const uint8_t*,
124                                             int, RegExpFlags,
125                                             RegExpError* regexp_error_out,
126                                             const DisallowGarbageCollection&);
127 template bool RegExp::VerifySyntax<base::uc16>(
128     Zone*, uintptr_t, const base::uc16*, int, RegExpFlags,
129     RegExpError* regexp_error_out, const DisallowGarbageCollection&);
130 
ThrowRegExpException(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,RegExpError error)131 MaybeHandle<Object> RegExp::ThrowRegExpException(Isolate* isolate,
132                                                  Handle<JSRegExp> re,
133                                                  Handle<String> pattern,
134                                                  RegExpError error) {
135   base::Vector<const char> error_data =
136       base::CStrVector(RegExpErrorString(error));
137   Handle<String> error_text =
138       isolate->factory()
139           ->NewStringFromOneByte(base::Vector<const uint8_t>::cast(error_data))
140           .ToHandleChecked();
141   THROW_NEW_ERROR(
142       isolate,
143       NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text),
144       Object);
145 }
146 
ThrowRegExpException(Isolate * isolate,Handle<JSRegExp> re,RegExpError error_text)147 void RegExp::ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
148                                   RegExpError error_text) {
149   USE(ThrowRegExpException(isolate, re, Handle<String>(re->source(), isolate),
150                            error_text));
151 }
152 
IsUnmodifiedRegExp(Isolate * isolate,Handle<JSRegExp> regexp)153 bool RegExp::IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp) {
154   return RegExpUtils::IsUnmodifiedRegExp(isolate, regexp);
155 }
156 
157 namespace {
158 
159 // Identifies the sort of regexps where the regexp engine is faster
160 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)161 bool HasFewDifferentCharacters(Handle<String> pattern) {
162   int length = std::min(kMaxLookaheadForBoyerMoore, pattern->length());
163   if (length <= kPatternTooShortForBoyerMoore) return false;
164   const int kMod = 128;
165   bool character_found[kMod];
166   int different = 0;
167   memset(&character_found[0], 0, sizeof(character_found));
168   for (int i = 0; i < length; i++) {
169     int ch = (pattern->Get(i) & (kMod - 1));
170     if (!character_found[ch]) {
171       character_found[ch] = true;
172       different++;
173       // We declare a regexp low-alphabet if it has at least 3 times as many
174       // characters as it has different characters.
175       if (different * 3 > length) return false;
176     }
177   }
178   return true;
179 }
180 
181 }  // namespace
182 
183 // Generic RegExp methods. Dispatches to implementation specific methods.
184 
185 // static
Compile(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,RegExpFlags flags,uint32_t backtrack_limit)186 MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
187                                     Handle<String> pattern, RegExpFlags flags,
188                                     uint32_t backtrack_limit) {
189   DCHECK(pattern->IsFlat());
190 
191   // Caching is based only on the pattern and flags, but code also differs when
192   // a backtrack limit is set. A present backtrack limit is very much *not* the
193   // common case, so just skip the cache for these.
194   const bool is_compilation_cache_enabled =
195       (backtrack_limit == JSRegExp::kNoBacktrackLimit);
196 
197   Zone zone(isolate->allocator(), ZONE_NAME);
198   CompilationCache* compilation_cache = nullptr;
199   if (is_compilation_cache_enabled) {
200     compilation_cache = isolate->compilation_cache();
201     MaybeHandle<FixedArray> maybe_cached = compilation_cache->LookupRegExp(
202         pattern, JSRegExp::AsJSRegExpFlags(flags));
203     Handle<FixedArray> cached;
204     if (maybe_cached.ToHandle(&cached)) {
205       re->set_data(*cached);
206       return re;
207     }
208   }
209 
210   PostponeInterruptsScope postpone(isolate);
211   RegExpCompileData parse_result;
212   DCHECK(!isolate->has_pending_exception());
213   if (!RegExpParser::ParseRegExpFromHeapString(isolate, &zone, pattern, flags,
214                                                &parse_result)) {
215     // Throw an exception if we fail to parse the pattern.
216     return RegExp::ThrowRegExpException(isolate, re, pattern,
217                                         parse_result.error);
218   }
219 
220   bool has_been_compiled = false;
221 
222   if (FLAG_default_to_experimental_regexp_engine &&
223       ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
224                                        parse_result.capture_count)) {
225     DCHECK(FLAG_enable_experimental_regexp_engine);
226     ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
227                                    parse_result.capture_count);
228     has_been_compiled = true;
229   } else if (flags & JSRegExp::kLinear) {
230     DCHECK(FLAG_enable_experimental_regexp_engine);
231     if (!ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
232                                           parse_result.capture_count)) {
233       // TODO(mbid): The error could provide a reason for why the regexp can't
234       // be executed in linear time (e.g. due to back references).
235       return RegExp::ThrowRegExpException(isolate, re, pattern,
236                                           RegExpError::kNotLinear);
237     }
238     ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
239                                    parse_result.capture_count);
240     has_been_compiled = true;
241   } else if (parse_result.simple && !IsIgnoreCase(flags) && !IsSticky(flags) &&
242              !HasFewDifferentCharacters(pattern)) {
243     // Parse-tree is a single atom that is equal to the pattern.
244     RegExpImpl::AtomCompile(isolate, re, pattern, flags, pattern);
245     has_been_compiled = true;
246   } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
247              parse_result.capture_count == 0) {
248     RegExpAtom* atom = parse_result.tree->AsAtom();
249     // The pattern source might (?) contain escape sequences, but they're
250     // resolved in atom_string.
251     base::Vector<const base::uc16> atom_pattern = atom->data();
252     Handle<String> atom_string;
253     ASSIGN_RETURN_ON_EXCEPTION(
254         isolate, atom_string,
255         isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
256     if (!IsIgnoreCase(flags) && !HasFewDifferentCharacters(atom_string)) {
257       RegExpImpl::AtomCompile(isolate, re, pattern, flags, atom_string);
258       has_been_compiled = true;
259     }
260   }
261   if (!has_been_compiled) {
262     RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags,
263                                    parse_result.capture_count, backtrack_limit);
264   }
265   DCHECK(re->data().IsFixedArray());
266   // Compilation succeeded so the data is set on the regexp
267   // and we can store it in the cache.
268   Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
269   if (is_compilation_cache_enabled) {
270     compilation_cache->PutRegExp(pattern, JSRegExp::AsJSRegExpFlags(flags),
271                                  data);
272   }
273 
274   return re;
275 }
276 
277 // static
EnsureFullyCompiled(Isolate * isolate,Handle<JSRegExp> re,Handle<String> subject)278 bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re,
279                                  Handle<String> subject) {
280   switch (re->type_tag()) {
281     case JSRegExp::NOT_COMPILED:
282       UNREACHABLE();
283     case JSRegExp::ATOM:
284       return true;
285     case JSRegExp::IRREGEXP:
286       if (RegExpImpl::IrregexpPrepare(isolate, re, subject) == -1) {
287         DCHECK(isolate->has_pending_exception());
288         return false;
289       }
290       return true;
291     case JSRegExp::EXPERIMENTAL:
292       if (!ExperimentalRegExp::IsCompiled(re, isolate) &&
293           !ExperimentalRegExp::Compile(isolate, re)) {
294         DCHECK(isolate->has_pending_exception());
295         return false;
296       }
297       return true;
298   }
299 }
300 
301 // static
ExperimentalOneshotExec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info,RegExp::ExecQuirks exec_quirks)302 MaybeHandle<Object> RegExp::ExperimentalOneshotExec(
303     Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
304     int index, Handle<RegExpMatchInfo> last_match_info,
305     RegExp::ExecQuirks exec_quirks) {
306   return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, index,
307                                          last_match_info, exec_quirks);
308 }
309 
310 // static
Exec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info,ExecQuirks exec_quirks)311 MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
312                                  Handle<String> subject, int index,
313                                  Handle<RegExpMatchInfo> last_match_info,
314                                  ExecQuirks exec_quirks) {
315   switch (regexp->type_tag()) {
316     case JSRegExp::NOT_COMPILED:
317       UNREACHABLE();
318     case JSRegExp::ATOM:
319       return RegExpImpl::AtomExec(isolate, regexp, subject, index,
320                                   last_match_info);
321     case JSRegExp::IRREGEXP:
322       return RegExpImpl::IrregexpExec(isolate, regexp, subject, index,
323                                       last_match_info, exec_quirks);
324     case JSRegExp::EXPERIMENTAL:
325       return ExperimentalRegExp::Exec(isolate, regexp, subject, index,
326                                       last_match_info, exec_quirks);
327   }
328 }
329 
330 // RegExp Atom implementation: Simple string search using indexOf.
331 
AtomCompile(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,RegExpFlags flags,Handle<String> match_pattern)332 void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
333                              Handle<String> pattern, RegExpFlags flags,
334                              Handle<String> match_pattern) {
335   isolate->factory()->SetRegExpAtomData(
336       re, pattern, JSRegExp::AsJSRegExpFlags(flags), match_pattern);
337 }
338 
339 namespace {
340 
SetAtomLastCapture(Isolate * isolate,Handle<RegExpMatchInfo> last_match_info,String subject,int from,int to)341 void SetAtomLastCapture(Isolate* isolate,
342                         Handle<RegExpMatchInfo> last_match_info, String subject,
343                         int from, int to) {
344   SealHandleScope shs(isolate);
345   last_match_info->SetNumberOfCaptureRegisters(2);
346   last_match_info->SetLastSubject(subject);
347   last_match_info->SetLastInput(subject);
348   last_match_info->SetCapture(0, from);
349   last_match_info->SetCapture(1, to);
350 }
351 
352 }  // namespace
353 
AtomExecRaw(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)354 int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
355                             Handle<String> subject, int index, int32_t* output,
356                             int output_size) {
357   DCHECK_LE(0, index);
358   DCHECK_LE(index, subject->length());
359 
360   subject = String::Flatten(isolate, subject);
361   DisallowGarbageCollection no_gc;  // ensure vectors stay valid
362 
363   String needle = regexp->atom_pattern();
364   int needle_len = needle.length();
365   DCHECK(needle.IsFlat());
366   DCHECK_LT(0, needle_len);
367 
368   if (index + needle_len > subject->length()) {
369     return RegExp::RE_FAILURE;
370   }
371 
372   for (int i = 0; i < output_size; i += 2) {
373     String::FlatContent needle_content = needle.GetFlatContent(no_gc);
374     String::FlatContent subject_content = subject->GetFlatContent(no_gc);
375     DCHECK(needle_content.IsFlat());
376     DCHECK(subject_content.IsFlat());
377     // dispatch on type of strings
378     index =
379         (needle_content.IsOneByte()
380              ? (subject_content.IsOneByte()
381                     ? SearchString(isolate, subject_content.ToOneByteVector(),
382                                    needle_content.ToOneByteVector(), index)
383                     : SearchString(isolate, subject_content.ToUC16Vector(),
384                                    needle_content.ToOneByteVector(), index))
385              : (subject_content.IsOneByte()
386                     ? SearchString(isolate, subject_content.ToOneByteVector(),
387                                    needle_content.ToUC16Vector(), index)
388                     : SearchString(isolate, subject_content.ToUC16Vector(),
389                                    needle_content.ToUC16Vector(), index)));
390     if (index == -1) {
391       return i / 2;  // Return number of matches.
392     } else {
393       output[i] = index;
394       output[i + 1] = index + needle_len;
395       index += needle_len;
396     }
397   }
398   return output_size / 2;
399 }
400 
AtomExec(Isolate * isolate,Handle<JSRegExp> re,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)401 Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
402                                     Handle<String> subject, int index,
403                                     Handle<RegExpMatchInfo> last_match_info) {
404   static const int kNumRegisters = 2;
405   STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
406   int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
407 
408   int res =
409       AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
410 
411   if (res == RegExp::RE_FAILURE) return isolate->factory()->null_value();
412 
413   DCHECK_EQ(res, RegExp::RE_SUCCESS);
414   SealHandleScope shs(isolate);
415   SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
416                      output_registers[1]);
417   return last_match_info;
418 }
419 
420 // Irregexp implementation.
421 
422 // Ensures that the regexp object contains a compiled version of the
423 // source for either one-byte or two-byte subject strings.
424 // If the compiled version doesn't already exist, it is compiled
425 // from the source pattern.
426 // If compilation fails, an exception is thrown and this function
427 // returns false.
EnsureCompiledIrregexp(Isolate * isolate,Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)428 bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
429                                         Handle<String> sample_subject,
430                                         bool is_one_byte) {
431   Object compiled_code = re->code(is_one_byte);
432   Object bytecode = re->bytecode(is_one_byte);
433   bool needs_initial_compilation =
434       compiled_code == Smi::FromInt(JSRegExp::kUninitializedValue);
435   // Recompile is needed when we're dealing with the first execution of the
436   // regexp after the decision to tier up has been made. If the tiering up
437   // strategy is not in use, this value is always false.
438   bool needs_tier_up_compilation =
439       re->MarkedForTierUp() && bytecode.IsByteArray();
440 
441   if (FLAG_trace_regexp_tier_up && needs_tier_up_compilation) {
442     PrintF("JSRegExp object %p needs tier-up compilation\n",
443            reinterpret_cast<void*>(re->ptr()));
444   }
445 
446   if (!needs_initial_compilation && !needs_tier_up_compilation) {
447     DCHECK(compiled_code.IsCodeT());
448     DCHECK_IMPLIES(FLAG_regexp_interpret_all, bytecode.IsByteArray());
449     return true;
450   }
451 
452   DCHECK_IMPLIES(needs_tier_up_compilation, bytecode.IsByteArray());
453 
454   return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
455 }
456 
457 namespace {
458 
459 #ifdef DEBUG
RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re,bool is_one_byte)460 bool RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re, bool is_one_byte) {
461   Object entry = re->code(is_one_byte);
462   Object bytecode = re->bytecode(is_one_byte);
463   // If we're not using the tier-up strategy, entry can only be a smi
464   // representing an uncompiled regexp here. If we're using the tier-up
465   // strategy, entry can still be a smi representing an uncompiled regexp, when
466   // compiling the regexp before the tier-up, or it can contain a trampoline to
467   // the regexp interpreter, in which case the bytecode field contains compiled
468   // bytecode, when recompiling the regexp after the tier-up. If the
469   // tier-up was forced, which happens for global replaces, entry is a smi
470   // representing an uncompiled regexp, even though we're "recompiling" after
471   // the tier-up.
472   if (re->ShouldProduceBytecode()) {
473     DCHECK(entry.IsSmi());
474     DCHECK(bytecode.IsSmi());
475     int entry_value = Smi::ToInt(entry);
476     int bytecode_value = Smi::ToInt(bytecode);
477     DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
478     DCHECK_EQ(JSRegExp::kUninitializedValue, bytecode_value);
479   } else {
480     DCHECK(entry.IsSmi() || (entry.IsCodeT() && bytecode.IsByteArray()));
481   }
482 
483   return true;
484 }
485 #endif
486 
487 struct RegExpCaptureIndexLess {
operator ()v8::internal::__anonc9c892ec0311::RegExpCaptureIndexLess488   bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
489     DCHECK_NOT_NULL(lhs);
490     DCHECK_NOT_NULL(rhs);
491     return lhs->index() < rhs->index();
492   }
493 };
494 
495 }  // namespace
496 
497 // static
CreateCaptureNameMap(Isolate * isolate,ZoneVector<RegExpCapture * > * named_captures)498 Handle<FixedArray> RegExp::CreateCaptureNameMap(
499     Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures) {
500   if (named_captures == nullptr) return Handle<FixedArray>();
501 
502   DCHECK(!named_captures->empty());
503 
504   // Named captures are sorted by name (because the set is used to ensure
505   // name uniqueness). But the capture name map must to be sorted by index.
506 
507   std::sort(named_captures->begin(), named_captures->end(),
508             RegExpCaptureIndexLess{});
509 
510   int len = static_cast<int>(named_captures->size()) * 2;
511   Handle<FixedArray> array = isolate->factory()->NewFixedArray(len);
512 
513   int i = 0;
514   for (const RegExpCapture* capture : *named_captures) {
515     base::Vector<const base::uc16> capture_name(capture->name()->data(),
516                                                 capture->name()->size());
517     // CSA code in ConstructNewResultFromMatchInfo requires these strings to be
518     // internalized so they can be used as property names in the 'exec' results.
519     Handle<String> name = isolate->factory()->InternalizeString(capture_name);
520     array->set(i * 2, *name);
521     array->set(i * 2 + 1, Smi::FromInt(capture->index()));
522 
523     i++;
524   }
525   DCHECK_EQ(i * 2, len);
526 
527   return array;
528 }
529 
CompileIrregexp(Isolate * isolate,Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)530 bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
531                                  Handle<String> sample_subject,
532                                  bool is_one_byte) {
533   // Compile the RegExp.
534   Zone zone(isolate->allocator(), ZONE_NAME);
535   PostponeInterruptsScope postpone(isolate);
536 
537   DCHECK(RegExpCodeIsValidForPreCompilation(re, is_one_byte));
538 
539   RegExpFlags flags = JSRegExp::AsRegExpFlags(re->flags());
540 
541   Handle<String> pattern(re->source(), isolate);
542   pattern = String::Flatten(isolate, pattern);
543   RegExpCompileData compile_data;
544   if (!RegExpParser::ParseRegExpFromHeapString(isolate, &zone, pattern, flags,
545                                                &compile_data)) {
546     // Throw an exception if we fail to parse the pattern.
547     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
548     USE(RegExp::ThrowRegExpException(isolate, re, pattern, compile_data.error));
549     return false;
550   }
551   // The compilation target is a kBytecode if we're interpreting all regexp
552   // objects, or if we're using the tier-up strategy but the tier-up hasn't
553   // happened yet. The compilation target is a kNative if we're using the
554   // tier-up strategy and we need to recompile to tier-up, or if we're producing
555   // native code for all regexp objects.
556   compile_data.compilation_target = re->ShouldProduceBytecode()
557                                         ? RegExpCompilationTarget::kBytecode
558                                         : RegExpCompilationTarget::kNative;
559   uint32_t backtrack_limit = re->backtrack_limit();
560   const bool compilation_succeeded =
561       Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject,
562               is_one_byte, backtrack_limit);
563   if (!compilation_succeeded) {
564     DCHECK(compile_data.error != RegExpError::kNone);
565     RegExp::ThrowRegExpException(isolate, re, compile_data.error);
566     return false;
567   }
568 
569   Handle<FixedArray> data =
570       Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
571   if (compile_data.compilation_target == RegExpCompilationTarget::kNative) {
572     Code code = Code::cast(*compile_data.code);
573     data->set(JSRegExp::code_index(is_one_byte), ToCodeT(code));
574 
575     // Reset bytecode to uninitialized. In case we use tier-up we know that
576     // tier-up has happened this way.
577     data->set(JSRegExp::bytecode_index(is_one_byte),
578               Smi::FromInt(JSRegExp::kUninitializedValue));
579   } else {
580     DCHECK_EQ(compile_data.compilation_target,
581               RegExpCompilationTarget::kBytecode);
582     // Store code generated by compiler in bytecode and trampoline to
583     // interpreter in code.
584     data->set(JSRegExp::bytecode_index(is_one_byte), *compile_data.code);
585     Handle<CodeT> trampoline =
586         BUILTIN_CODE(isolate, RegExpInterpreterTrampoline);
587     data->set(JSRegExp::code_index(is_one_byte), *trampoline);
588   }
589   Handle<FixedArray> capture_name_map =
590       RegExp::CreateCaptureNameMap(isolate, compile_data.named_captures);
591   re->set_capture_name_map(capture_name_map);
592   int register_max = IrregexpMaxRegisterCount(*data);
593   if (compile_data.register_count > register_max) {
594     SetIrregexpMaxRegisterCount(*data, compile_data.register_count);
595   }
596   data->set(JSRegExp::kIrregexpBacktrackLimit, Smi::FromInt(backtrack_limit));
597 
598   if (FLAG_trace_regexp_tier_up) {
599     PrintF("JSRegExp object %p %s size: %d\n",
600            reinterpret_cast<void*>(re->ptr()),
601            re->ShouldProduceBytecode() ? "bytecode" : "native code",
602            re->ShouldProduceBytecode()
603                ? IrregexpByteCode(*data, is_one_byte).Size()
604                : IrregexpNativeCode(*data, is_one_byte).Size());
605   }
606 
607   return true;
608 }
609 
IrregexpMaxRegisterCount(FixedArray re)610 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) {
611   return Smi::ToInt(re.get(JSRegExp::kIrregexpMaxRegisterCountIndex));
612 }
613 
SetIrregexpMaxRegisterCount(FixedArray re,int value)614 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
615   re.set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
616 }
617 
IrregexpNumberOfCaptures(FixedArray re)618 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
619   return Smi::ToInt(re.get(JSRegExp::kIrregexpCaptureCountIndex));
620 }
621 
IrregexpByteCode(FixedArray re,bool is_one_byte)622 ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) {
623   return ByteArray::cast(re.get(JSRegExp::bytecode_index(is_one_byte)));
624 }
625 
IrregexpNativeCode(FixedArray re,bool is_one_byte)626 Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
627   return Code::cast(re.get(JSRegExp::code_index(is_one_byte)));
628 }
629 
IrregexpInitialize(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,RegExpFlags flags,int capture_count,uint32_t backtrack_limit)630 void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
631                                     Handle<String> pattern, RegExpFlags flags,
632                                     int capture_count,
633                                     uint32_t backtrack_limit) {
634   // Initialize compiled code entries to null.
635   isolate->factory()->SetRegExpIrregexpData(re, pattern,
636                                             JSRegExp::AsJSRegExpFlags(flags),
637                                             capture_count, backtrack_limit);
638 }
639 
640 // static
IrregexpPrepare(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject)641 int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
642                                 Handle<String> subject) {
643   DCHECK(subject->IsFlat());
644 
645   // Check representation of the underlying storage.
646   bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
647   if (!RegExpImpl::EnsureCompiledIrregexp(isolate, regexp, subject,
648                                           is_one_byte)) {
649     return -1;
650   }
651 
652   // Only reserve room for output captures. Internal registers are allocated by
653   // the engine.
654   return JSRegExp::RegistersForCaptureCount(regexp->capture_count());
655 }
656 
IrregexpExecRaw(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)657 int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
658                                 Handle<String> subject, int index,
659                                 int32_t* output, int output_size) {
660   DCHECK_LE(0, index);
661   DCHECK_LE(index, subject->length());
662   DCHECK(subject->IsFlat());
663   DCHECK_GE(output_size,
664             JSRegExp::RegistersForCaptureCount(regexp->capture_count()));
665 
666   bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
667 
668   if (!regexp->ShouldProduceBytecode()) {
669     do {
670       EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
671       // The stack is used to allocate registers for the compiled regexp code.
672       // This means that in case of failure, the output registers array is left
673       // untouched and contains the capture results from the previous successful
674       // match.  We can use that to set the last match info lazily.
675       int res = NativeRegExpMacroAssembler::Match(regexp, subject, output,
676                                                   output_size, index, isolate);
677       if (res != NativeRegExpMacroAssembler::RETRY) {
678         DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
679                isolate->has_pending_exception());
680         STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) ==
681                       RegExp::RE_SUCCESS);
682         STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) ==
683                       RegExp::RE_FAILURE);
684         STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) ==
685                       RegExp::RE_EXCEPTION);
686         return res;
687       }
688       // If result is RETRY, the string has changed representation, and we
689       // must restart from scratch.
690       // In this case, it means we must make sure we are prepared to handle
691       // the, potentially, different subject (the string can switch between
692       // being internal and external, and even between being Latin1 and
693       // UC16, but the characters are always the same).
694       is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
695     } while (true);
696     UNREACHABLE();
697   } else {
698     DCHECK(regexp->ShouldProduceBytecode());
699 
700     do {
701       IrregexpInterpreter::Result result =
702           IrregexpInterpreter::MatchForCallFromRuntime(
703               isolate, regexp, subject, output, output_size, index);
704       DCHECK_IMPLIES(result == IrregexpInterpreter::EXCEPTION,
705                      isolate->has_pending_exception());
706 
707       switch (result) {
708         case IrregexpInterpreter::SUCCESS:
709         case IrregexpInterpreter::EXCEPTION:
710         case IrregexpInterpreter::FAILURE:
711         case IrregexpInterpreter::FALLBACK_TO_EXPERIMENTAL:
712           return result;
713         case IrregexpInterpreter::RETRY:
714           // The string has changed representation, and we must restart the
715           // match.
716           // We need to reset the tier up to start over with compilation.
717           if (FLAG_regexp_tier_up) regexp->ResetLastTierUpTick();
718           is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
719           EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
720           break;
721       }
722     } while (true);
723     UNREACHABLE();
724   }
725 }
726 
IrregexpExec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<RegExpMatchInfo> last_match_info,RegExp::ExecQuirks exec_quirks)727 MaybeHandle<Object> RegExpImpl::IrregexpExec(
728     Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
729     int previous_index, Handle<RegExpMatchInfo> last_match_info,
730     RegExp::ExecQuirks exec_quirks) {
731   DCHECK_EQ(regexp->type_tag(), JSRegExp::IRREGEXP);
732 
733   subject = String::Flatten(isolate, subject);
734 
735 #ifdef DEBUG
736   if (FLAG_trace_regexp_bytecodes && regexp->ShouldProduceBytecode()) {
737     PrintF("\n\nRegexp match:   /%s/\n\n", regexp->source().ToCString().get());
738     PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
739   }
740 #endif
741 
742   // For very long subject strings, the regexp interpreter is currently much
743   // slower than the jitted code execution. If the tier-up strategy is turned
744   // on, we want to avoid this performance penalty so we eagerly tier-up if the
745   // subject string length is equal or greater than the given heuristic value.
746   if (FLAG_regexp_tier_up &&
747       subject->length() >= JSRegExp::kTierUpForSubjectLengthValue) {
748     regexp->MarkTierUpForNextExec();
749     if (FLAG_trace_regexp_tier_up) {
750       PrintF(
751           "Forcing tier-up for very long strings in "
752           "RegExpImpl::IrregexpExec\n");
753     }
754   }
755 
756   // Prepare space for the return values.
757   int required_registers =
758       RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
759   if (required_registers < 0) {
760     // Compiling failed with an exception.
761     DCHECK(isolate->has_pending_exception());
762     return MaybeHandle<Object>();
763   }
764 
765   int32_t* output_registers = nullptr;
766   if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
767     output_registers = NewArray<int32_t>(required_registers);
768   }
769   std::unique_ptr<int32_t[]> auto_release(output_registers);
770   if (output_registers == nullptr) {
771     output_registers = isolate->jsregexp_static_offsets_vector();
772   }
773 
774   int res =
775       RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
776                                   output_registers, required_registers);
777 
778   if (res == RegExp::RE_SUCCESS) {
779     if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) {
780       if (output_registers[0] >= subject->length()) {
781         return isolate->factory()->null_value();
782       }
783     }
784     int capture_count = regexp->capture_count();
785     return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
786                                     capture_count, output_registers);
787   } else if (res == RegExp::RE_FALLBACK_TO_EXPERIMENTAL) {
788     return ExperimentalRegExp::OneshotExec(isolate, regexp, subject,
789                                            previous_index, last_match_info);
790   } else if (res == RegExp::RE_EXCEPTION) {
791     DCHECK(isolate->has_pending_exception());
792     return MaybeHandle<Object>();
793   } else {
794     DCHECK(res == RegExp::RE_FAILURE);
795     return isolate->factory()->null_value();
796   }
797 }
798 
799 // static
SetLastMatchInfo(Isolate * isolate,Handle<RegExpMatchInfo> last_match_info,Handle<String> subject,int capture_count,int32_t * match)800 Handle<RegExpMatchInfo> RegExp::SetLastMatchInfo(
801     Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
802     Handle<String> subject, int capture_count, int32_t* match) {
803   // This is the only place where match infos can grow. If, after executing the
804   // regexp, RegExpExecStub finds that the match info is too small, it restarts
805   // execution in RegExpImpl::Exec, which finally grows the match info right
806   // here.
807   Handle<RegExpMatchInfo> result =
808       RegExpMatchInfo::ReserveCaptures(isolate, last_match_info, capture_count);
809   if (*result != *last_match_info) {
810     if (*last_match_info == *isolate->regexp_last_match_info()) {
811       // This inner condition is only needed for special situations like the
812       // regexp fuzzer, where we pass our own custom RegExpMatchInfo to
813       // RegExpImpl::Exec; there actually want to bypass the Isolate's match
814       // info and execute the regexp without side effects.
815       isolate->native_context()->set_regexp_last_match_info(*result);
816     }
817   }
818 
819   int capture_register_count =
820       JSRegExp::RegistersForCaptureCount(capture_count);
821   DisallowGarbageCollection no_gc;
822   if (match != nullptr) {
823     for (int i = 0; i < capture_register_count; i += 2) {
824       result->SetCapture(i, match[i]);
825       result->SetCapture(i + 1, match[i + 1]);
826     }
827   }
828   result->SetLastSubject(*subject);
829   result->SetLastInput(*subject);
830   return result;
831 }
832 
833 // static
DotPrintForTesting(const char * label,RegExpNode * node)834 void RegExp::DotPrintForTesting(const char* label, RegExpNode* node) {
835   DotPrinter::DotPrint(label, node);
836 }
837 
838 namespace {
839 
840 // Returns true if we've either generated too much irregex code within this
841 // isolate, or the pattern string is too long.
TooMuchRegExpCode(Isolate * isolate,Handle<String> pattern)842 bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
843   // Limit the space regexps take up on the heap.  In order to limit this we
844   // would like to keep track of the amount of regexp code on the heap.  This
845   // is not tracked, however.  As a conservative approximation we track the
846   // total regexp code compiled including code that has subsequently been freed
847   // and the total executable memory at any point.
848   static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB;
849   static constexpr size_t kRegExpCompiledLimit = 1 * MB;
850 
851   Heap* heap = isolate->heap();
852   if (pattern->length() > RegExp::kRegExpTooLargeToOptimize) return true;
853   return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit &&
854           heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit);
855 }
856 
857 }  // namespace
858 
859 // static
CompileForTesting(Isolate * isolate,Zone * zone,RegExpCompileData * data,RegExpFlags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte)860 bool RegExp::CompileForTesting(Isolate* isolate, Zone* zone,
861                                RegExpCompileData* data, RegExpFlags flags,
862                                Handle<String> pattern,
863                                Handle<String> sample_subject,
864                                bool is_one_byte) {
865   uint32_t backtrack_limit = JSRegExp::kNoBacktrackLimit;
866   return RegExpImpl::Compile(isolate, zone, data, flags, pattern,
867                              sample_subject, is_one_byte, backtrack_limit);
868 }
869 
Compile(Isolate * isolate,Zone * zone,RegExpCompileData * data,RegExpFlags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte,uint32_t & backtrack_limit)870 bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
871                          RegExpFlags flags, Handle<String> pattern,
872                          Handle<String> sample_subject, bool is_one_byte,
873                          uint32_t& backtrack_limit) {
874   if (JSRegExp::RegistersForCaptureCount(data->capture_count) >
875       RegExpMacroAssembler::kMaxRegisterCount) {
876     data->error = RegExpError::kTooLarge;
877     return false;
878   }
879 
880   RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
881                           is_one_byte);
882 
883   if (compiler.optimize()) {
884     compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
885   }
886 
887   // Sample some characters from the middle of the string.
888   static const int kSampleSize = 128;
889 
890   sample_subject = String::Flatten(isolate, sample_subject);
891   int chars_sampled = 0;
892   int half_way = (sample_subject->length() - kSampleSize) / 2;
893   for (int i = std::max(0, half_way);
894        i < sample_subject->length() && chars_sampled < kSampleSize;
895        i++, chars_sampled++) {
896     compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
897   }
898 
899   data->node = compiler.PreprocessRegExp(data, flags, is_one_byte);
900   data->error = AnalyzeRegExp(isolate, is_one_byte, flags, data->node);
901   if (data->error != RegExpError::kNone) {
902     return false;
903   }
904 
905   if (FLAG_trace_regexp_graph) DotPrinter::DotPrint("Start", data->node);
906 
907   // Create the correct assembler for the architecture.
908   std::unique_ptr<RegExpMacroAssembler> macro_assembler;
909   if (data->compilation_target == RegExpCompilationTarget::kNative) {
910     // Native regexp implementation.
911     DCHECK(!FLAG_jitless);
912 
913     NativeRegExpMacroAssembler::Mode mode =
914         is_one_byte ? NativeRegExpMacroAssembler::LATIN1
915                     : NativeRegExpMacroAssembler::UC16;
916 
917     const int output_register_count =
918         JSRegExp::RegistersForCaptureCount(data->capture_count);
919 #if V8_TARGET_ARCH_IA32
920     macro_assembler.reset(new RegExpMacroAssemblerIA32(isolate, zone, mode,
921                                                        output_register_count));
922 #elif V8_TARGET_ARCH_X64
923     macro_assembler.reset(new RegExpMacroAssemblerX64(isolate, zone, mode,
924                                                       output_register_count));
925 #elif V8_TARGET_ARCH_ARM
926     macro_assembler.reset(new RegExpMacroAssemblerARM(isolate, zone, mode,
927                                                       output_register_count));
928 #elif V8_TARGET_ARCH_ARM64
929     macro_assembler.reset(new RegExpMacroAssemblerARM64(isolate, zone, mode,
930                                                         output_register_count));
931 #elif V8_TARGET_ARCH_S390
932     macro_assembler.reset(new RegExpMacroAssemblerS390(isolate, zone, mode,
933                                                        output_register_count));
934 #elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64
935     macro_assembler.reset(new RegExpMacroAssemblerPPC(isolate, zone, mode,
936                                                       output_register_count));
937 #elif V8_TARGET_ARCH_MIPS
938     macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
939                                                        output_register_count));
940 #elif V8_TARGET_ARCH_MIPS64
941     macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
942                                                        output_register_count));
943 #elif V8_TARGET_ARCH_RISCV64
944     macro_assembler.reset(new RegExpMacroAssemblerRISCV(isolate, zone, mode,
945                                                         output_register_count));
946 #elif V8_TARGET_ARCH_LOONG64
947     macro_assembler.reset(new RegExpMacroAssemblerLOONG64(
948         isolate, zone, mode, output_register_count));
949 #else
950 #error "Unsupported architecture"
951 #endif
952   } else {
953     DCHECK_EQ(data->compilation_target, RegExpCompilationTarget::kBytecode);
954     // Interpreted regexp implementation.
955     macro_assembler.reset(new RegExpBytecodeGenerator(isolate, zone));
956   }
957 
958   macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern));
959   if (FLAG_enable_experimental_regexp_engine_on_excessive_backtracks &&
960       ExperimentalRegExp::CanBeHandled(data->tree, flags,
961                                        data->capture_count)) {
962     if (backtrack_limit == JSRegExp::kNoBacktrackLimit) {
963       backtrack_limit = FLAG_regexp_backtracks_before_fallback;
964     } else {
965       backtrack_limit =
966           std::min(backtrack_limit, FLAG_regexp_backtracks_before_fallback);
967     }
968     macro_assembler->set_backtrack_limit(backtrack_limit);
969     macro_assembler->set_can_fallback(true);
970   } else {
971     macro_assembler->set_backtrack_limit(backtrack_limit);
972     macro_assembler->set_can_fallback(false);
973   }
974 
975   // Inserted here, instead of in Assembler, because it depends on information
976   // in the AST that isn't replicated in the Node structure.
977   bool is_end_anchored = data->tree->IsAnchoredAtEnd();
978   bool is_start_anchored = data->tree->IsAnchoredAtStart();
979   int max_length = data->tree->max_match();
980   static const int kMaxBacksearchLimit = 1024;
981   if (is_end_anchored && !is_start_anchored && !IsSticky(flags) &&
982       max_length < kMaxBacksearchLimit) {
983     macro_assembler->SetCurrentPositionFromEnd(max_length);
984   }
985 
986   if (IsGlobal(flags)) {
987     RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
988     if (data->tree->min_match() > 0) {
989       mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
990     } else if (IsUnicode(flags)) {
991       mode = RegExpMacroAssembler::GLOBAL_UNICODE;
992     }
993     macro_assembler->set_global_mode(mode);
994   }
995 
996   RegExpMacroAssembler* macro_assembler_ptr = macro_assembler.get();
997 #ifdef DEBUG
998   std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler;
999   if (FLAG_trace_regexp_assembler) {
1000     tracer_macro_assembler.reset(
1001         new RegExpMacroAssemblerTracer(isolate, macro_assembler_ptr));
1002     macro_assembler_ptr = tracer_macro_assembler.get();
1003   }
1004 #endif
1005 
1006   RegExpCompiler::CompilationResult result = compiler.Assemble(
1007       isolate, macro_assembler_ptr, data->node, data->capture_count, pattern);
1008 
1009   // Code / bytecode printing.
1010   {
1011 #ifdef ENABLE_DISASSEMBLER
1012     if (FLAG_print_regexp_code &&
1013         data->compilation_target == RegExpCompilationTarget::kNative) {
1014       CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1015       OFStream os(trace_scope.file());
1016       Handle<Code> c = Handle<Code>::cast(result.code);
1017       auto pattern_cstring = pattern->ToCString();
1018       c->Disassemble(pattern_cstring.get(), os, isolate);
1019     }
1020 #endif
1021     if (FLAG_print_regexp_bytecode &&
1022         data->compilation_target == RegExpCompilationTarget::kBytecode) {
1023       Handle<ByteArray> bytecode = Handle<ByteArray>::cast(result.code);
1024       auto pattern_cstring = pattern->ToCString();
1025       RegExpBytecodeDisassemble(bytecode->GetDataStartAddress(),
1026                                 bytecode->length(), pattern_cstring.get());
1027     }
1028   }
1029 
1030   if (result.error != RegExpError::kNone) {
1031     if (FLAG_correctness_fuzzer_suppressions &&
1032         result.error == RegExpError::kStackOverflow) {
1033       FATAL("Aborting on stack overflow");
1034     }
1035     data->error = result.error;
1036   }
1037 
1038   data->code = result.code;
1039   data->register_count = result.num_registers;
1040 
1041   return result.Succeeded();
1042 }
1043 
RegExpGlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,Isolate * isolate)1044 RegExpGlobalCache::RegExpGlobalCache(Handle<JSRegExp> regexp,
1045                                      Handle<String> subject, Isolate* isolate)
1046     : register_array_(nullptr),
1047       register_array_size_(0),
1048       regexp_(regexp),
1049       subject_(subject),
1050       isolate_(isolate) {
1051   DCHECK(IsGlobal(JSRegExp::AsRegExpFlags(regexp->flags())));
1052 
1053   switch (regexp_->type_tag()) {
1054     case JSRegExp::NOT_COMPILED:
1055       UNREACHABLE();
1056     case JSRegExp::ATOM: {
1057       // ATOM regexps do not have a global loop, so we search for one match at
1058       // a time.
1059       static const int kAtomRegistersPerMatch = 2;
1060       registers_per_match_ = kAtomRegistersPerMatch;
1061       register_array_size_ = registers_per_match_;
1062       break;
1063     }
1064     case JSRegExp::IRREGEXP: {
1065       registers_per_match_ =
1066           RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
1067       if (registers_per_match_ < 0) {
1068         num_matches_ = -1;  // Signal exception.
1069         return;
1070       }
1071       if (regexp->ShouldProduceBytecode()) {
1072         // Global loop in interpreted regexp is not implemented.  We choose the
1073         // size of the offsets vector so that it can only store one match.
1074         register_array_size_ = registers_per_match_;
1075         max_matches_ = 1;
1076       } else {
1077         register_array_size_ = std::max(
1078             {registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize});
1079       }
1080       break;
1081     }
1082     case JSRegExp::EXPERIMENTAL: {
1083       if (!ExperimentalRegExp::IsCompiled(regexp, isolate_) &&
1084           !ExperimentalRegExp::Compile(isolate_, regexp)) {
1085         DCHECK(isolate->has_pending_exception());
1086         num_matches_ = -1;  // Signal exception.
1087         return;
1088       }
1089       registers_per_match_ =
1090           JSRegExp::RegistersForCaptureCount(regexp->capture_count());
1091       register_array_size_ = std::max(
1092           {registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize});
1093       break;
1094     }
1095   }
1096 
1097   max_matches_ = register_array_size_ / registers_per_match_;
1098 
1099   if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
1100     register_array_ = NewArray<int32_t>(register_array_size_);
1101   } else {
1102     register_array_ = isolate->jsregexp_static_offsets_vector();
1103   }
1104 
1105   // Set state so that fetching the results the first time triggers a call
1106   // to the compiled regexp.
1107   current_match_index_ = max_matches_ - 1;
1108   num_matches_ = max_matches_;
1109   DCHECK_LE(2, registers_per_match_);  // Each match has at least one capture.
1110   DCHECK_GE(register_array_size_, registers_per_match_);
1111   int32_t* last_match =
1112       &register_array_[current_match_index_ * registers_per_match_];
1113   last_match[0] = -1;
1114   last_match[1] = 0;
1115 }
1116 
~RegExpGlobalCache()1117 RegExpGlobalCache::~RegExpGlobalCache() {
1118   // Deallocate the register array if we allocated it in the constructor
1119   // (as opposed to using the existing jsregexp_static_offsets_vector).
1120   if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
1121     DeleteArray(register_array_);
1122   }
1123 }
1124 
AdvanceZeroLength(int last_index)1125 int RegExpGlobalCache::AdvanceZeroLength(int last_index) {
1126   if (IsUnicode(JSRegExp::AsRegExpFlags(regexp_->flags())) &&
1127       last_index + 1 < subject_->length() &&
1128       unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
1129       unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
1130     // Advance over the surrogate pair.
1131     return last_index + 2;
1132   }
1133   return last_index + 1;
1134 }
1135 
FetchNext()1136 int32_t* RegExpGlobalCache::FetchNext() {
1137   current_match_index_++;
1138 
1139   if (current_match_index_ >= num_matches_) {
1140     // Current batch of results exhausted.
1141     // Fail if last batch was not even fully filled.
1142     if (num_matches_ < max_matches_) {
1143       num_matches_ = 0;  // Signal failed match.
1144       return nullptr;
1145     }
1146 
1147     int32_t* last_match =
1148         &register_array_[(current_match_index_ - 1) * registers_per_match_];
1149     int last_end_index = last_match[1];
1150 
1151     switch (regexp_->type_tag()) {
1152       case JSRegExp::NOT_COMPILED:
1153         UNREACHABLE();
1154       case JSRegExp::ATOM:
1155         num_matches_ =
1156             RegExpImpl::AtomExecRaw(isolate_, regexp_, subject_, last_end_index,
1157                                     register_array_, register_array_size_);
1158         break;
1159       case JSRegExp::EXPERIMENTAL: {
1160         DCHECK(ExperimentalRegExp::IsCompiled(regexp_, isolate_));
1161         DisallowGarbageCollection no_gc;
1162         num_matches_ = ExperimentalRegExp::ExecRaw(
1163             isolate_, RegExp::kFromRuntime, *regexp_, *subject_,
1164             register_array_, register_array_size_, last_end_index);
1165         break;
1166       }
1167       case JSRegExp::IRREGEXP: {
1168         int last_start_index = last_match[0];
1169         if (last_start_index == last_end_index) {
1170           // Zero-length match. Advance by one code point.
1171           last_end_index = AdvanceZeroLength(last_end_index);
1172         }
1173         if (last_end_index > subject_->length()) {
1174           num_matches_ = 0;  // Signal failed match.
1175           return nullptr;
1176         }
1177         num_matches_ = RegExpImpl::IrregexpExecRaw(
1178             isolate_, regexp_, subject_, last_end_index, register_array_,
1179             register_array_size_);
1180         break;
1181       }
1182     }
1183 
1184     // Fall back to experimental engine if needed and possible.
1185     if (num_matches_ == RegExp::kInternalRegExpFallbackToExperimental) {
1186       num_matches_ = ExperimentalRegExp::OneshotExecRaw(
1187           isolate_, regexp_, subject_, register_array_, register_array_size_,
1188           last_end_index);
1189     }
1190 
1191     if (num_matches_ <= 0) {
1192       return nullptr;
1193     }
1194     current_match_index_ = 0;
1195     return register_array_;
1196   } else {
1197     return &register_array_[current_match_index_ * registers_per_match_];
1198   }
1199 }
1200 
LastSuccessfulMatch()1201 int32_t* RegExpGlobalCache::LastSuccessfulMatch() {
1202   int index = current_match_index_ * registers_per_match_;
1203   if (num_matches_ == 0) {
1204     // After a failed match we shift back by one result.
1205     index -= registers_per_match_;
1206   }
1207   return &register_array_[index];
1208 }
1209 
Lookup(Heap * heap,String key_string,Object key_pattern,FixedArray * last_match_cache,ResultsCacheType type)1210 Object RegExpResultsCache::Lookup(Heap* heap, String key_string,
1211                                   Object key_pattern,
1212                                   FixedArray* last_match_cache,
1213                                   ResultsCacheType type) {
1214   FixedArray cache;
1215   if (!key_string.IsInternalizedString()) return Smi::zero();
1216   if (type == STRING_SPLIT_SUBSTRINGS) {
1217     DCHECK(key_pattern.IsString());
1218     if (!key_pattern.IsInternalizedString()) return Smi::zero();
1219     cache = heap->string_split_cache();
1220   } else {
1221     DCHECK(type == REGEXP_MULTIPLE_INDICES);
1222     DCHECK(key_pattern.IsFixedArray());
1223     cache = heap->regexp_multiple_cache();
1224   }
1225 
1226   uint32_t hash = key_string.hash();
1227   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
1228                     ~(kArrayEntriesPerCacheEntry - 1));
1229   if (cache.get(index + kStringOffset) != key_string ||
1230       cache.get(index + kPatternOffset) != key_pattern) {
1231     index =
1232         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
1233     if (cache.get(index + kStringOffset) != key_string ||
1234         cache.get(index + kPatternOffset) != key_pattern) {
1235       return Smi::zero();
1236     }
1237   }
1238 
1239   *last_match_cache = FixedArray::cast(cache.get(index + kLastMatchOffset));
1240   return cache.get(index + kArrayOffset);
1241 }
1242 
Enter(Isolate * isolate,Handle<String> key_string,Handle<Object> key_pattern,Handle<FixedArray> value_array,Handle<FixedArray> last_match_cache,ResultsCacheType type)1243 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
1244                                Handle<Object> key_pattern,
1245                                Handle<FixedArray> value_array,
1246                                Handle<FixedArray> last_match_cache,
1247                                ResultsCacheType type) {
1248   Factory* factory = isolate->factory();
1249   Handle<FixedArray> cache;
1250   if (!key_string->IsInternalizedString()) return;
1251   if (type == STRING_SPLIT_SUBSTRINGS) {
1252     DCHECK(key_pattern->IsString());
1253     if (!key_pattern->IsInternalizedString()) return;
1254     cache = factory->string_split_cache();
1255   } else {
1256     DCHECK(type == REGEXP_MULTIPLE_INDICES);
1257     DCHECK(key_pattern->IsFixedArray());
1258     cache = factory->regexp_multiple_cache();
1259   }
1260 
1261   uint32_t hash = key_string->hash();
1262   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
1263                     ~(kArrayEntriesPerCacheEntry - 1));
1264   if (cache->get(index + kStringOffset) == Smi::zero()) {
1265     cache->set(index + kStringOffset, *key_string);
1266     cache->set(index + kPatternOffset, *key_pattern);
1267     cache->set(index + kArrayOffset, *value_array);
1268     cache->set(index + kLastMatchOffset, *last_match_cache);
1269   } else {
1270     uint32_t index2 =
1271         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
1272     if (cache->get(index2 + kStringOffset) == Smi::zero()) {
1273       cache->set(index2 + kStringOffset, *key_string);
1274       cache->set(index2 + kPatternOffset, *key_pattern);
1275       cache->set(index2 + kArrayOffset, *value_array);
1276       cache->set(index2 + kLastMatchOffset, *last_match_cache);
1277     } else {
1278       cache->set(index2 + kStringOffset, Smi::zero());
1279       cache->set(index2 + kPatternOffset, Smi::zero());
1280       cache->set(index2 + kArrayOffset, Smi::zero());
1281       cache->set(index2 + kLastMatchOffset, Smi::zero());
1282       cache->set(index + kStringOffset, *key_string);
1283       cache->set(index + kPatternOffset, *key_pattern);
1284       cache->set(index + kArrayOffset, *value_array);
1285       cache->set(index + kLastMatchOffset, *last_match_cache);
1286     }
1287   }
1288   // If the array is a reasonably short list of substrings, convert it into a
1289   // list of internalized strings.
1290   if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
1291     for (int i = 0; i < value_array->length(); i++) {
1292       Handle<String> str(String::cast(value_array->get(i)), isolate);
1293       Handle<String> internalized_str = factory->InternalizeString(str);
1294       value_array->set(i, *internalized_str);
1295     }
1296   }
1297   // Convert backing store to a copy-on-write array.
1298   value_array->set_map_no_write_barrier(
1299       ReadOnlyRoots(isolate).fixed_cow_array_map());
1300 }
1301 
Clear(FixedArray cache)1302 void RegExpResultsCache::Clear(FixedArray cache) {
1303   for (int i = 0; i < kRegExpResultsCacheSize; i++) {
1304     cache.set(i, Smi::zero());
1305   }
1306 }
1307 
1308 }  // namespace internal
1309 }  // namespace v8
1310