1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp.h"
6
7 #include "src/base/strings.h"
8 #include "src/codegen/compilation-cache.h"
9 #include "src/diagnostics/code-tracer.h"
10 #include "src/execution/interrupts-scope.h"
11 #include "src/heap/heap-inl.h"
12 #include "src/objects/js-regexp-inl.h"
13 #include "src/regexp/experimental/experimental.h"
14 #include "src/regexp/regexp-bytecode-generator.h"
15 #include "src/regexp/regexp-bytecodes.h"
16 #include "src/regexp/regexp-compiler.h"
17 #include "src/regexp/regexp-dotprinter.h"
18 #include "src/regexp/regexp-interpreter.h"
19 #include "src/regexp/regexp-macro-assembler-arch.h"
20 #include "src/regexp/regexp-macro-assembler-tracer.h"
21 #include "src/regexp/regexp-parser.h"
22 #include "src/regexp/regexp-utils.h"
23 #include "src/strings/string-search.h"
24 #include "src/utils/ostreams.h"
25
26 namespace v8 {
27 namespace internal {
28
29 using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
30
31 class RegExpImpl final : public AllStatic {
32 public:
33 // Returns a string representation of a regular expression.
34 // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
35 // This function calls the garbage collector if necessary.
36 static Handle<String> ToString(Handle<Object> value);
37
38 // Prepares a JSRegExp object with Irregexp-specific data.
39 static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
40 Handle<String> pattern, RegExpFlags flags,
41 int capture_count, uint32_t backtrack_limit);
42
43 // Prepare a RegExp for being executed one or more times (using
44 // IrregexpExecOnce) on the subject.
45 // This ensures that the regexp is compiled for the subject, and that
46 // the subject is flat.
47 // Returns the number of integer spaces required by IrregexpExecOnce
48 // as its "registers" argument. If the regexp cannot be compiled,
49 // an exception is set as pending, and this function returns negative.
50 static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
51 Handle<String> subject);
52
53 static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
54 Handle<String> pattern, RegExpFlags flags,
55 Handle<String> match_pattern);
56
57 static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
58 Handle<String> subject, int index, int32_t* output,
59 int output_size);
60
61 static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
62 Handle<String> subject, int index,
63 Handle<RegExpMatchInfo> last_match_info);
64
65 // Execute a regular expression on the subject, starting from index.
66 // If matching succeeds, return the number of matches. This can be larger
67 // than one in the case of global regular expressions.
68 // The captures and subcaptures are stored into the registers vector.
69 // If matching fails, returns RE_FAILURE.
70 // If execution fails, sets a pending exception and returns RE_EXCEPTION.
71 static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
72 Handle<String> subject, int index, int32_t* output,
73 int output_size);
74
75 // Execute an Irregexp bytecode pattern.
76 // On a successful match, the result is a JSArray containing
77 // captured positions. On a failure, the result is the null value.
78 // Returns an empty handle in case of an exception.
79 V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
80 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
81 int index, Handle<RegExpMatchInfo> last_match_info,
82 RegExp::ExecQuirks exec_quirks = RegExp::ExecQuirks::kNone);
83
84 static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
85 Handle<String> sample_subject, bool is_one_byte);
86 static inline bool EnsureCompiledIrregexp(Isolate* isolate,
87 Handle<JSRegExp> re,
88 Handle<String> sample_subject,
89 bool is_one_byte);
90
91 // Returns true on success, false on failure.
92 static bool Compile(Isolate* isolate, Zone* zone, RegExpCompileData* input,
93 RegExpFlags flags, Handle<String> pattern,
94 Handle<String> sample_subject, bool is_one_byte,
95 uint32_t& backtrack_limit);
96
97 // For acting on the JSRegExp data FixedArray.
98 static int IrregexpMaxRegisterCount(FixedArray re);
99 static void SetIrregexpMaxRegisterCount(FixedArray re, int value);
100 static int IrregexpNumberOfCaptures(FixedArray re);
101 static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
102 static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
103 };
104
105 // static
CanGenerateBytecode()106 bool RegExp::CanGenerateBytecode() {
107 return FLAG_regexp_interpret_all || FLAG_regexp_tier_up;
108 }
109
110 // static
111 template <class CharT>
VerifySyntax(Zone * zone,uintptr_t stack_limit,const CharT * input,int input_length,RegExpFlags flags,RegExpError * regexp_error_out,const DisallowGarbageCollection & no_gc)112 bool RegExp::VerifySyntax(Zone* zone, uintptr_t stack_limit, const CharT* input,
113 int input_length, RegExpFlags flags,
114 RegExpError* regexp_error_out,
115 const DisallowGarbageCollection& no_gc) {
116 RegExpCompileData data;
117 bool pattern_is_valid = RegExpParser::VerifyRegExpSyntax(
118 zone, stack_limit, input, input_length, flags, &data, no_gc);
119 *regexp_error_out = data.error;
120 return pattern_is_valid;
121 }
122
123 template bool RegExp::VerifySyntax<uint8_t>(Zone*, uintptr_t, const uint8_t*,
124 int, RegExpFlags,
125 RegExpError* regexp_error_out,
126 const DisallowGarbageCollection&);
127 template bool RegExp::VerifySyntax<base::uc16>(
128 Zone*, uintptr_t, const base::uc16*, int, RegExpFlags,
129 RegExpError* regexp_error_out, const DisallowGarbageCollection&);
130
ThrowRegExpException(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,RegExpError error)131 MaybeHandle<Object> RegExp::ThrowRegExpException(Isolate* isolate,
132 Handle<JSRegExp> re,
133 Handle<String> pattern,
134 RegExpError error) {
135 base::Vector<const char> error_data =
136 base::CStrVector(RegExpErrorString(error));
137 Handle<String> error_text =
138 isolate->factory()
139 ->NewStringFromOneByte(base::Vector<const uint8_t>::cast(error_data))
140 .ToHandleChecked();
141 THROW_NEW_ERROR(
142 isolate,
143 NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text),
144 Object);
145 }
146
ThrowRegExpException(Isolate * isolate,Handle<JSRegExp> re,RegExpError error_text)147 void RegExp::ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
148 RegExpError error_text) {
149 USE(ThrowRegExpException(isolate, re, Handle<String>(re->source(), isolate),
150 error_text));
151 }
152
IsUnmodifiedRegExp(Isolate * isolate,Handle<JSRegExp> regexp)153 bool RegExp::IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp) {
154 return RegExpUtils::IsUnmodifiedRegExp(isolate, regexp);
155 }
156
157 namespace {
158
159 // Identifies the sort of regexps where the regexp engine is faster
160 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)161 bool HasFewDifferentCharacters(Handle<String> pattern) {
162 int length = std::min(kMaxLookaheadForBoyerMoore, pattern->length());
163 if (length <= kPatternTooShortForBoyerMoore) return false;
164 const int kMod = 128;
165 bool character_found[kMod];
166 int different = 0;
167 memset(&character_found[0], 0, sizeof(character_found));
168 for (int i = 0; i < length; i++) {
169 int ch = (pattern->Get(i) & (kMod - 1));
170 if (!character_found[ch]) {
171 character_found[ch] = true;
172 different++;
173 // We declare a regexp low-alphabet if it has at least 3 times as many
174 // characters as it has different characters.
175 if (different * 3 > length) return false;
176 }
177 }
178 return true;
179 }
180
181 } // namespace
182
183 // Generic RegExp methods. Dispatches to implementation specific methods.
184
185 // static
Compile(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,RegExpFlags flags,uint32_t backtrack_limit)186 MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
187 Handle<String> pattern, RegExpFlags flags,
188 uint32_t backtrack_limit) {
189 DCHECK(pattern->IsFlat());
190
191 // Caching is based only on the pattern and flags, but code also differs when
192 // a backtrack limit is set. A present backtrack limit is very much *not* the
193 // common case, so just skip the cache for these.
194 const bool is_compilation_cache_enabled =
195 (backtrack_limit == JSRegExp::kNoBacktrackLimit);
196
197 Zone zone(isolate->allocator(), ZONE_NAME);
198 CompilationCache* compilation_cache = nullptr;
199 if (is_compilation_cache_enabled) {
200 compilation_cache = isolate->compilation_cache();
201 MaybeHandle<FixedArray> maybe_cached = compilation_cache->LookupRegExp(
202 pattern, JSRegExp::AsJSRegExpFlags(flags));
203 Handle<FixedArray> cached;
204 if (maybe_cached.ToHandle(&cached)) {
205 re->set_data(*cached);
206 return re;
207 }
208 }
209
210 PostponeInterruptsScope postpone(isolate);
211 RegExpCompileData parse_result;
212 DCHECK(!isolate->has_pending_exception());
213 if (!RegExpParser::ParseRegExpFromHeapString(isolate, &zone, pattern, flags,
214 &parse_result)) {
215 // Throw an exception if we fail to parse the pattern.
216 return RegExp::ThrowRegExpException(isolate, re, pattern,
217 parse_result.error);
218 }
219
220 bool has_been_compiled = false;
221
222 if (FLAG_default_to_experimental_regexp_engine &&
223 ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
224 parse_result.capture_count)) {
225 DCHECK(FLAG_enable_experimental_regexp_engine);
226 ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
227 parse_result.capture_count);
228 has_been_compiled = true;
229 } else if (flags & JSRegExp::kLinear) {
230 DCHECK(FLAG_enable_experimental_regexp_engine);
231 if (!ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
232 parse_result.capture_count)) {
233 // TODO(mbid): The error could provide a reason for why the regexp can't
234 // be executed in linear time (e.g. due to back references).
235 return RegExp::ThrowRegExpException(isolate, re, pattern,
236 RegExpError::kNotLinear);
237 }
238 ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
239 parse_result.capture_count);
240 has_been_compiled = true;
241 } else if (parse_result.simple && !IsIgnoreCase(flags) && !IsSticky(flags) &&
242 !HasFewDifferentCharacters(pattern)) {
243 // Parse-tree is a single atom that is equal to the pattern.
244 RegExpImpl::AtomCompile(isolate, re, pattern, flags, pattern);
245 has_been_compiled = true;
246 } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
247 parse_result.capture_count == 0) {
248 RegExpAtom* atom = parse_result.tree->AsAtom();
249 // The pattern source might (?) contain escape sequences, but they're
250 // resolved in atom_string.
251 base::Vector<const base::uc16> atom_pattern = atom->data();
252 Handle<String> atom_string;
253 ASSIGN_RETURN_ON_EXCEPTION(
254 isolate, atom_string,
255 isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
256 if (!IsIgnoreCase(flags) && !HasFewDifferentCharacters(atom_string)) {
257 RegExpImpl::AtomCompile(isolate, re, pattern, flags, atom_string);
258 has_been_compiled = true;
259 }
260 }
261 if (!has_been_compiled) {
262 RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags,
263 parse_result.capture_count, backtrack_limit);
264 }
265 DCHECK(re->data().IsFixedArray());
266 // Compilation succeeded so the data is set on the regexp
267 // and we can store it in the cache.
268 Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
269 if (is_compilation_cache_enabled) {
270 compilation_cache->PutRegExp(pattern, JSRegExp::AsJSRegExpFlags(flags),
271 data);
272 }
273
274 return re;
275 }
276
277 // static
EnsureFullyCompiled(Isolate * isolate,Handle<JSRegExp> re,Handle<String> subject)278 bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re,
279 Handle<String> subject) {
280 switch (re->type_tag()) {
281 case JSRegExp::NOT_COMPILED:
282 UNREACHABLE();
283 case JSRegExp::ATOM:
284 return true;
285 case JSRegExp::IRREGEXP:
286 if (RegExpImpl::IrregexpPrepare(isolate, re, subject) == -1) {
287 DCHECK(isolate->has_pending_exception());
288 return false;
289 }
290 return true;
291 case JSRegExp::EXPERIMENTAL:
292 if (!ExperimentalRegExp::IsCompiled(re, isolate) &&
293 !ExperimentalRegExp::Compile(isolate, re)) {
294 DCHECK(isolate->has_pending_exception());
295 return false;
296 }
297 return true;
298 }
299 }
300
301 // static
ExperimentalOneshotExec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info,RegExp::ExecQuirks exec_quirks)302 MaybeHandle<Object> RegExp::ExperimentalOneshotExec(
303 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
304 int index, Handle<RegExpMatchInfo> last_match_info,
305 RegExp::ExecQuirks exec_quirks) {
306 return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, index,
307 last_match_info, exec_quirks);
308 }
309
310 // static
Exec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info,ExecQuirks exec_quirks)311 MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
312 Handle<String> subject, int index,
313 Handle<RegExpMatchInfo> last_match_info,
314 ExecQuirks exec_quirks) {
315 switch (regexp->type_tag()) {
316 case JSRegExp::NOT_COMPILED:
317 UNREACHABLE();
318 case JSRegExp::ATOM:
319 return RegExpImpl::AtomExec(isolate, regexp, subject, index,
320 last_match_info);
321 case JSRegExp::IRREGEXP:
322 return RegExpImpl::IrregexpExec(isolate, regexp, subject, index,
323 last_match_info, exec_quirks);
324 case JSRegExp::EXPERIMENTAL:
325 return ExperimentalRegExp::Exec(isolate, regexp, subject, index,
326 last_match_info, exec_quirks);
327 }
328 }
329
330 // RegExp Atom implementation: Simple string search using indexOf.
331
AtomCompile(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,RegExpFlags flags,Handle<String> match_pattern)332 void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
333 Handle<String> pattern, RegExpFlags flags,
334 Handle<String> match_pattern) {
335 isolate->factory()->SetRegExpAtomData(
336 re, pattern, JSRegExp::AsJSRegExpFlags(flags), match_pattern);
337 }
338
339 namespace {
340
SetAtomLastCapture(Isolate * isolate,Handle<RegExpMatchInfo> last_match_info,String subject,int from,int to)341 void SetAtomLastCapture(Isolate* isolate,
342 Handle<RegExpMatchInfo> last_match_info, String subject,
343 int from, int to) {
344 SealHandleScope shs(isolate);
345 last_match_info->SetNumberOfCaptureRegisters(2);
346 last_match_info->SetLastSubject(subject);
347 last_match_info->SetLastInput(subject);
348 last_match_info->SetCapture(0, from);
349 last_match_info->SetCapture(1, to);
350 }
351
352 } // namespace
353
AtomExecRaw(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)354 int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
355 Handle<String> subject, int index, int32_t* output,
356 int output_size) {
357 DCHECK_LE(0, index);
358 DCHECK_LE(index, subject->length());
359
360 subject = String::Flatten(isolate, subject);
361 DisallowGarbageCollection no_gc; // ensure vectors stay valid
362
363 String needle = regexp->atom_pattern();
364 int needle_len = needle.length();
365 DCHECK(needle.IsFlat());
366 DCHECK_LT(0, needle_len);
367
368 if (index + needle_len > subject->length()) {
369 return RegExp::RE_FAILURE;
370 }
371
372 for (int i = 0; i < output_size; i += 2) {
373 String::FlatContent needle_content = needle.GetFlatContent(no_gc);
374 String::FlatContent subject_content = subject->GetFlatContent(no_gc);
375 DCHECK(needle_content.IsFlat());
376 DCHECK(subject_content.IsFlat());
377 // dispatch on type of strings
378 index =
379 (needle_content.IsOneByte()
380 ? (subject_content.IsOneByte()
381 ? SearchString(isolate, subject_content.ToOneByteVector(),
382 needle_content.ToOneByteVector(), index)
383 : SearchString(isolate, subject_content.ToUC16Vector(),
384 needle_content.ToOneByteVector(), index))
385 : (subject_content.IsOneByte()
386 ? SearchString(isolate, subject_content.ToOneByteVector(),
387 needle_content.ToUC16Vector(), index)
388 : SearchString(isolate, subject_content.ToUC16Vector(),
389 needle_content.ToUC16Vector(), index)));
390 if (index == -1) {
391 return i / 2; // Return number of matches.
392 } else {
393 output[i] = index;
394 output[i + 1] = index + needle_len;
395 index += needle_len;
396 }
397 }
398 return output_size / 2;
399 }
400
AtomExec(Isolate * isolate,Handle<JSRegExp> re,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)401 Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
402 Handle<String> subject, int index,
403 Handle<RegExpMatchInfo> last_match_info) {
404 static const int kNumRegisters = 2;
405 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
406 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
407
408 int res =
409 AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
410
411 if (res == RegExp::RE_FAILURE) return isolate->factory()->null_value();
412
413 DCHECK_EQ(res, RegExp::RE_SUCCESS);
414 SealHandleScope shs(isolate);
415 SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
416 output_registers[1]);
417 return last_match_info;
418 }
419
420 // Irregexp implementation.
421
422 // Ensures that the regexp object contains a compiled version of the
423 // source for either one-byte or two-byte subject strings.
424 // If the compiled version doesn't already exist, it is compiled
425 // from the source pattern.
426 // If compilation fails, an exception is thrown and this function
427 // returns false.
EnsureCompiledIrregexp(Isolate * isolate,Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)428 bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
429 Handle<String> sample_subject,
430 bool is_one_byte) {
431 Object compiled_code = re->code(is_one_byte);
432 Object bytecode = re->bytecode(is_one_byte);
433 bool needs_initial_compilation =
434 compiled_code == Smi::FromInt(JSRegExp::kUninitializedValue);
435 // Recompile is needed when we're dealing with the first execution of the
436 // regexp after the decision to tier up has been made. If the tiering up
437 // strategy is not in use, this value is always false.
438 bool needs_tier_up_compilation =
439 re->MarkedForTierUp() && bytecode.IsByteArray();
440
441 if (FLAG_trace_regexp_tier_up && needs_tier_up_compilation) {
442 PrintF("JSRegExp object %p needs tier-up compilation\n",
443 reinterpret_cast<void*>(re->ptr()));
444 }
445
446 if (!needs_initial_compilation && !needs_tier_up_compilation) {
447 DCHECK(compiled_code.IsCodeT());
448 DCHECK_IMPLIES(FLAG_regexp_interpret_all, bytecode.IsByteArray());
449 return true;
450 }
451
452 DCHECK_IMPLIES(needs_tier_up_compilation, bytecode.IsByteArray());
453
454 return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
455 }
456
457 namespace {
458
459 #ifdef DEBUG
RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re,bool is_one_byte)460 bool RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re, bool is_one_byte) {
461 Object entry = re->code(is_one_byte);
462 Object bytecode = re->bytecode(is_one_byte);
463 // If we're not using the tier-up strategy, entry can only be a smi
464 // representing an uncompiled regexp here. If we're using the tier-up
465 // strategy, entry can still be a smi representing an uncompiled regexp, when
466 // compiling the regexp before the tier-up, or it can contain a trampoline to
467 // the regexp interpreter, in which case the bytecode field contains compiled
468 // bytecode, when recompiling the regexp after the tier-up. If the
469 // tier-up was forced, which happens for global replaces, entry is a smi
470 // representing an uncompiled regexp, even though we're "recompiling" after
471 // the tier-up.
472 if (re->ShouldProduceBytecode()) {
473 DCHECK(entry.IsSmi());
474 DCHECK(bytecode.IsSmi());
475 int entry_value = Smi::ToInt(entry);
476 int bytecode_value = Smi::ToInt(bytecode);
477 DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
478 DCHECK_EQ(JSRegExp::kUninitializedValue, bytecode_value);
479 } else {
480 DCHECK(entry.IsSmi() || (entry.IsCodeT() && bytecode.IsByteArray()));
481 }
482
483 return true;
484 }
485 #endif
486
487 struct RegExpCaptureIndexLess {
operator ()v8::internal::__anonc9c892ec0311::RegExpCaptureIndexLess488 bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
489 DCHECK_NOT_NULL(lhs);
490 DCHECK_NOT_NULL(rhs);
491 return lhs->index() < rhs->index();
492 }
493 };
494
495 } // namespace
496
497 // static
CreateCaptureNameMap(Isolate * isolate,ZoneVector<RegExpCapture * > * named_captures)498 Handle<FixedArray> RegExp::CreateCaptureNameMap(
499 Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures) {
500 if (named_captures == nullptr) return Handle<FixedArray>();
501
502 DCHECK(!named_captures->empty());
503
504 // Named captures are sorted by name (because the set is used to ensure
505 // name uniqueness). But the capture name map must to be sorted by index.
506
507 std::sort(named_captures->begin(), named_captures->end(),
508 RegExpCaptureIndexLess{});
509
510 int len = static_cast<int>(named_captures->size()) * 2;
511 Handle<FixedArray> array = isolate->factory()->NewFixedArray(len);
512
513 int i = 0;
514 for (const RegExpCapture* capture : *named_captures) {
515 base::Vector<const base::uc16> capture_name(capture->name()->data(),
516 capture->name()->size());
517 // CSA code in ConstructNewResultFromMatchInfo requires these strings to be
518 // internalized so they can be used as property names in the 'exec' results.
519 Handle<String> name = isolate->factory()->InternalizeString(capture_name);
520 array->set(i * 2, *name);
521 array->set(i * 2 + 1, Smi::FromInt(capture->index()));
522
523 i++;
524 }
525 DCHECK_EQ(i * 2, len);
526
527 return array;
528 }
529
CompileIrregexp(Isolate * isolate,Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)530 bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
531 Handle<String> sample_subject,
532 bool is_one_byte) {
533 // Compile the RegExp.
534 Zone zone(isolate->allocator(), ZONE_NAME);
535 PostponeInterruptsScope postpone(isolate);
536
537 DCHECK(RegExpCodeIsValidForPreCompilation(re, is_one_byte));
538
539 RegExpFlags flags = JSRegExp::AsRegExpFlags(re->flags());
540
541 Handle<String> pattern(re->source(), isolate);
542 pattern = String::Flatten(isolate, pattern);
543 RegExpCompileData compile_data;
544 if (!RegExpParser::ParseRegExpFromHeapString(isolate, &zone, pattern, flags,
545 &compile_data)) {
546 // Throw an exception if we fail to parse the pattern.
547 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
548 USE(RegExp::ThrowRegExpException(isolate, re, pattern, compile_data.error));
549 return false;
550 }
551 // The compilation target is a kBytecode if we're interpreting all regexp
552 // objects, or if we're using the tier-up strategy but the tier-up hasn't
553 // happened yet. The compilation target is a kNative if we're using the
554 // tier-up strategy and we need to recompile to tier-up, or if we're producing
555 // native code for all regexp objects.
556 compile_data.compilation_target = re->ShouldProduceBytecode()
557 ? RegExpCompilationTarget::kBytecode
558 : RegExpCompilationTarget::kNative;
559 uint32_t backtrack_limit = re->backtrack_limit();
560 const bool compilation_succeeded =
561 Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject,
562 is_one_byte, backtrack_limit);
563 if (!compilation_succeeded) {
564 DCHECK(compile_data.error != RegExpError::kNone);
565 RegExp::ThrowRegExpException(isolate, re, compile_data.error);
566 return false;
567 }
568
569 Handle<FixedArray> data =
570 Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
571 if (compile_data.compilation_target == RegExpCompilationTarget::kNative) {
572 Code code = Code::cast(*compile_data.code);
573 data->set(JSRegExp::code_index(is_one_byte), ToCodeT(code));
574
575 // Reset bytecode to uninitialized. In case we use tier-up we know that
576 // tier-up has happened this way.
577 data->set(JSRegExp::bytecode_index(is_one_byte),
578 Smi::FromInt(JSRegExp::kUninitializedValue));
579 } else {
580 DCHECK_EQ(compile_data.compilation_target,
581 RegExpCompilationTarget::kBytecode);
582 // Store code generated by compiler in bytecode and trampoline to
583 // interpreter in code.
584 data->set(JSRegExp::bytecode_index(is_one_byte), *compile_data.code);
585 Handle<CodeT> trampoline =
586 BUILTIN_CODE(isolate, RegExpInterpreterTrampoline);
587 data->set(JSRegExp::code_index(is_one_byte), *trampoline);
588 }
589 Handle<FixedArray> capture_name_map =
590 RegExp::CreateCaptureNameMap(isolate, compile_data.named_captures);
591 re->set_capture_name_map(capture_name_map);
592 int register_max = IrregexpMaxRegisterCount(*data);
593 if (compile_data.register_count > register_max) {
594 SetIrregexpMaxRegisterCount(*data, compile_data.register_count);
595 }
596 data->set(JSRegExp::kIrregexpBacktrackLimit, Smi::FromInt(backtrack_limit));
597
598 if (FLAG_trace_regexp_tier_up) {
599 PrintF("JSRegExp object %p %s size: %d\n",
600 reinterpret_cast<void*>(re->ptr()),
601 re->ShouldProduceBytecode() ? "bytecode" : "native code",
602 re->ShouldProduceBytecode()
603 ? IrregexpByteCode(*data, is_one_byte).Size()
604 : IrregexpNativeCode(*data, is_one_byte).Size());
605 }
606
607 return true;
608 }
609
IrregexpMaxRegisterCount(FixedArray re)610 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) {
611 return Smi::ToInt(re.get(JSRegExp::kIrregexpMaxRegisterCountIndex));
612 }
613
SetIrregexpMaxRegisterCount(FixedArray re,int value)614 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
615 re.set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
616 }
617
IrregexpNumberOfCaptures(FixedArray re)618 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
619 return Smi::ToInt(re.get(JSRegExp::kIrregexpCaptureCountIndex));
620 }
621
IrregexpByteCode(FixedArray re,bool is_one_byte)622 ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) {
623 return ByteArray::cast(re.get(JSRegExp::bytecode_index(is_one_byte)));
624 }
625
IrregexpNativeCode(FixedArray re,bool is_one_byte)626 Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
627 return Code::cast(re.get(JSRegExp::code_index(is_one_byte)));
628 }
629
IrregexpInitialize(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,RegExpFlags flags,int capture_count,uint32_t backtrack_limit)630 void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
631 Handle<String> pattern, RegExpFlags flags,
632 int capture_count,
633 uint32_t backtrack_limit) {
634 // Initialize compiled code entries to null.
635 isolate->factory()->SetRegExpIrregexpData(re, pattern,
636 JSRegExp::AsJSRegExpFlags(flags),
637 capture_count, backtrack_limit);
638 }
639
640 // static
IrregexpPrepare(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject)641 int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
642 Handle<String> subject) {
643 DCHECK(subject->IsFlat());
644
645 // Check representation of the underlying storage.
646 bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
647 if (!RegExpImpl::EnsureCompiledIrregexp(isolate, regexp, subject,
648 is_one_byte)) {
649 return -1;
650 }
651
652 // Only reserve room for output captures. Internal registers are allocated by
653 // the engine.
654 return JSRegExp::RegistersForCaptureCount(regexp->capture_count());
655 }
656
IrregexpExecRaw(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)657 int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
658 Handle<String> subject, int index,
659 int32_t* output, int output_size) {
660 DCHECK_LE(0, index);
661 DCHECK_LE(index, subject->length());
662 DCHECK(subject->IsFlat());
663 DCHECK_GE(output_size,
664 JSRegExp::RegistersForCaptureCount(regexp->capture_count()));
665
666 bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
667
668 if (!regexp->ShouldProduceBytecode()) {
669 do {
670 EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
671 // The stack is used to allocate registers for the compiled regexp code.
672 // This means that in case of failure, the output registers array is left
673 // untouched and contains the capture results from the previous successful
674 // match. We can use that to set the last match info lazily.
675 int res = NativeRegExpMacroAssembler::Match(regexp, subject, output,
676 output_size, index, isolate);
677 if (res != NativeRegExpMacroAssembler::RETRY) {
678 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
679 isolate->has_pending_exception());
680 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) ==
681 RegExp::RE_SUCCESS);
682 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) ==
683 RegExp::RE_FAILURE);
684 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) ==
685 RegExp::RE_EXCEPTION);
686 return res;
687 }
688 // If result is RETRY, the string has changed representation, and we
689 // must restart from scratch.
690 // In this case, it means we must make sure we are prepared to handle
691 // the, potentially, different subject (the string can switch between
692 // being internal and external, and even between being Latin1 and
693 // UC16, but the characters are always the same).
694 is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
695 } while (true);
696 UNREACHABLE();
697 } else {
698 DCHECK(regexp->ShouldProduceBytecode());
699
700 do {
701 IrregexpInterpreter::Result result =
702 IrregexpInterpreter::MatchForCallFromRuntime(
703 isolate, regexp, subject, output, output_size, index);
704 DCHECK_IMPLIES(result == IrregexpInterpreter::EXCEPTION,
705 isolate->has_pending_exception());
706
707 switch (result) {
708 case IrregexpInterpreter::SUCCESS:
709 case IrregexpInterpreter::EXCEPTION:
710 case IrregexpInterpreter::FAILURE:
711 case IrregexpInterpreter::FALLBACK_TO_EXPERIMENTAL:
712 return result;
713 case IrregexpInterpreter::RETRY:
714 // The string has changed representation, and we must restart the
715 // match.
716 // We need to reset the tier up to start over with compilation.
717 if (FLAG_regexp_tier_up) regexp->ResetLastTierUpTick();
718 is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
719 EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
720 break;
721 }
722 } while (true);
723 UNREACHABLE();
724 }
725 }
726
IrregexpExec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<RegExpMatchInfo> last_match_info,RegExp::ExecQuirks exec_quirks)727 MaybeHandle<Object> RegExpImpl::IrregexpExec(
728 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
729 int previous_index, Handle<RegExpMatchInfo> last_match_info,
730 RegExp::ExecQuirks exec_quirks) {
731 DCHECK_EQ(regexp->type_tag(), JSRegExp::IRREGEXP);
732
733 subject = String::Flatten(isolate, subject);
734
735 #ifdef DEBUG
736 if (FLAG_trace_regexp_bytecodes && regexp->ShouldProduceBytecode()) {
737 PrintF("\n\nRegexp match: /%s/\n\n", regexp->source().ToCString().get());
738 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
739 }
740 #endif
741
742 // For very long subject strings, the regexp interpreter is currently much
743 // slower than the jitted code execution. If the tier-up strategy is turned
744 // on, we want to avoid this performance penalty so we eagerly tier-up if the
745 // subject string length is equal or greater than the given heuristic value.
746 if (FLAG_regexp_tier_up &&
747 subject->length() >= JSRegExp::kTierUpForSubjectLengthValue) {
748 regexp->MarkTierUpForNextExec();
749 if (FLAG_trace_regexp_tier_up) {
750 PrintF(
751 "Forcing tier-up for very long strings in "
752 "RegExpImpl::IrregexpExec\n");
753 }
754 }
755
756 // Prepare space for the return values.
757 int required_registers =
758 RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
759 if (required_registers < 0) {
760 // Compiling failed with an exception.
761 DCHECK(isolate->has_pending_exception());
762 return MaybeHandle<Object>();
763 }
764
765 int32_t* output_registers = nullptr;
766 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
767 output_registers = NewArray<int32_t>(required_registers);
768 }
769 std::unique_ptr<int32_t[]> auto_release(output_registers);
770 if (output_registers == nullptr) {
771 output_registers = isolate->jsregexp_static_offsets_vector();
772 }
773
774 int res =
775 RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
776 output_registers, required_registers);
777
778 if (res == RegExp::RE_SUCCESS) {
779 if (exec_quirks == RegExp::ExecQuirks::kTreatMatchAtEndAsFailure) {
780 if (output_registers[0] >= subject->length()) {
781 return isolate->factory()->null_value();
782 }
783 }
784 int capture_count = regexp->capture_count();
785 return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
786 capture_count, output_registers);
787 } else if (res == RegExp::RE_FALLBACK_TO_EXPERIMENTAL) {
788 return ExperimentalRegExp::OneshotExec(isolate, regexp, subject,
789 previous_index, last_match_info);
790 } else if (res == RegExp::RE_EXCEPTION) {
791 DCHECK(isolate->has_pending_exception());
792 return MaybeHandle<Object>();
793 } else {
794 DCHECK(res == RegExp::RE_FAILURE);
795 return isolate->factory()->null_value();
796 }
797 }
798
799 // static
SetLastMatchInfo(Isolate * isolate,Handle<RegExpMatchInfo> last_match_info,Handle<String> subject,int capture_count,int32_t * match)800 Handle<RegExpMatchInfo> RegExp::SetLastMatchInfo(
801 Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
802 Handle<String> subject, int capture_count, int32_t* match) {
803 // This is the only place where match infos can grow. If, after executing the
804 // regexp, RegExpExecStub finds that the match info is too small, it restarts
805 // execution in RegExpImpl::Exec, which finally grows the match info right
806 // here.
807 Handle<RegExpMatchInfo> result =
808 RegExpMatchInfo::ReserveCaptures(isolate, last_match_info, capture_count);
809 if (*result != *last_match_info) {
810 if (*last_match_info == *isolate->regexp_last_match_info()) {
811 // This inner condition is only needed for special situations like the
812 // regexp fuzzer, where we pass our own custom RegExpMatchInfo to
813 // RegExpImpl::Exec; there actually want to bypass the Isolate's match
814 // info and execute the regexp without side effects.
815 isolate->native_context()->set_regexp_last_match_info(*result);
816 }
817 }
818
819 int capture_register_count =
820 JSRegExp::RegistersForCaptureCount(capture_count);
821 DisallowGarbageCollection no_gc;
822 if (match != nullptr) {
823 for (int i = 0; i < capture_register_count; i += 2) {
824 result->SetCapture(i, match[i]);
825 result->SetCapture(i + 1, match[i + 1]);
826 }
827 }
828 result->SetLastSubject(*subject);
829 result->SetLastInput(*subject);
830 return result;
831 }
832
833 // static
DotPrintForTesting(const char * label,RegExpNode * node)834 void RegExp::DotPrintForTesting(const char* label, RegExpNode* node) {
835 DotPrinter::DotPrint(label, node);
836 }
837
838 namespace {
839
840 // Returns true if we've either generated too much irregex code within this
841 // isolate, or the pattern string is too long.
TooMuchRegExpCode(Isolate * isolate,Handle<String> pattern)842 bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
843 // Limit the space regexps take up on the heap. In order to limit this we
844 // would like to keep track of the amount of regexp code on the heap. This
845 // is not tracked, however. As a conservative approximation we track the
846 // total regexp code compiled including code that has subsequently been freed
847 // and the total executable memory at any point.
848 static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB;
849 static constexpr size_t kRegExpCompiledLimit = 1 * MB;
850
851 Heap* heap = isolate->heap();
852 if (pattern->length() > RegExp::kRegExpTooLargeToOptimize) return true;
853 return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit &&
854 heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit);
855 }
856
857 } // namespace
858
859 // static
CompileForTesting(Isolate * isolate,Zone * zone,RegExpCompileData * data,RegExpFlags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte)860 bool RegExp::CompileForTesting(Isolate* isolate, Zone* zone,
861 RegExpCompileData* data, RegExpFlags flags,
862 Handle<String> pattern,
863 Handle<String> sample_subject,
864 bool is_one_byte) {
865 uint32_t backtrack_limit = JSRegExp::kNoBacktrackLimit;
866 return RegExpImpl::Compile(isolate, zone, data, flags, pattern,
867 sample_subject, is_one_byte, backtrack_limit);
868 }
869
Compile(Isolate * isolate,Zone * zone,RegExpCompileData * data,RegExpFlags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte,uint32_t & backtrack_limit)870 bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
871 RegExpFlags flags, Handle<String> pattern,
872 Handle<String> sample_subject, bool is_one_byte,
873 uint32_t& backtrack_limit) {
874 if (JSRegExp::RegistersForCaptureCount(data->capture_count) >
875 RegExpMacroAssembler::kMaxRegisterCount) {
876 data->error = RegExpError::kTooLarge;
877 return false;
878 }
879
880 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
881 is_one_byte);
882
883 if (compiler.optimize()) {
884 compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
885 }
886
887 // Sample some characters from the middle of the string.
888 static const int kSampleSize = 128;
889
890 sample_subject = String::Flatten(isolate, sample_subject);
891 int chars_sampled = 0;
892 int half_way = (sample_subject->length() - kSampleSize) / 2;
893 for (int i = std::max(0, half_way);
894 i < sample_subject->length() && chars_sampled < kSampleSize;
895 i++, chars_sampled++) {
896 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
897 }
898
899 data->node = compiler.PreprocessRegExp(data, flags, is_one_byte);
900 data->error = AnalyzeRegExp(isolate, is_one_byte, flags, data->node);
901 if (data->error != RegExpError::kNone) {
902 return false;
903 }
904
905 if (FLAG_trace_regexp_graph) DotPrinter::DotPrint("Start", data->node);
906
907 // Create the correct assembler for the architecture.
908 std::unique_ptr<RegExpMacroAssembler> macro_assembler;
909 if (data->compilation_target == RegExpCompilationTarget::kNative) {
910 // Native regexp implementation.
911 DCHECK(!FLAG_jitless);
912
913 NativeRegExpMacroAssembler::Mode mode =
914 is_one_byte ? NativeRegExpMacroAssembler::LATIN1
915 : NativeRegExpMacroAssembler::UC16;
916
917 const int output_register_count =
918 JSRegExp::RegistersForCaptureCount(data->capture_count);
919 #if V8_TARGET_ARCH_IA32
920 macro_assembler.reset(new RegExpMacroAssemblerIA32(isolate, zone, mode,
921 output_register_count));
922 #elif V8_TARGET_ARCH_X64
923 macro_assembler.reset(new RegExpMacroAssemblerX64(isolate, zone, mode,
924 output_register_count));
925 #elif V8_TARGET_ARCH_ARM
926 macro_assembler.reset(new RegExpMacroAssemblerARM(isolate, zone, mode,
927 output_register_count));
928 #elif V8_TARGET_ARCH_ARM64
929 macro_assembler.reset(new RegExpMacroAssemblerARM64(isolate, zone, mode,
930 output_register_count));
931 #elif V8_TARGET_ARCH_S390
932 macro_assembler.reset(new RegExpMacroAssemblerS390(isolate, zone, mode,
933 output_register_count));
934 #elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64
935 macro_assembler.reset(new RegExpMacroAssemblerPPC(isolate, zone, mode,
936 output_register_count));
937 #elif V8_TARGET_ARCH_MIPS
938 macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
939 output_register_count));
940 #elif V8_TARGET_ARCH_MIPS64
941 macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
942 output_register_count));
943 #elif V8_TARGET_ARCH_RISCV64
944 macro_assembler.reset(new RegExpMacroAssemblerRISCV(isolate, zone, mode,
945 output_register_count));
946 #elif V8_TARGET_ARCH_LOONG64
947 macro_assembler.reset(new RegExpMacroAssemblerLOONG64(
948 isolate, zone, mode, output_register_count));
949 #else
950 #error "Unsupported architecture"
951 #endif
952 } else {
953 DCHECK_EQ(data->compilation_target, RegExpCompilationTarget::kBytecode);
954 // Interpreted regexp implementation.
955 macro_assembler.reset(new RegExpBytecodeGenerator(isolate, zone));
956 }
957
958 macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern));
959 if (FLAG_enable_experimental_regexp_engine_on_excessive_backtracks &&
960 ExperimentalRegExp::CanBeHandled(data->tree, flags,
961 data->capture_count)) {
962 if (backtrack_limit == JSRegExp::kNoBacktrackLimit) {
963 backtrack_limit = FLAG_regexp_backtracks_before_fallback;
964 } else {
965 backtrack_limit =
966 std::min(backtrack_limit, FLAG_regexp_backtracks_before_fallback);
967 }
968 macro_assembler->set_backtrack_limit(backtrack_limit);
969 macro_assembler->set_can_fallback(true);
970 } else {
971 macro_assembler->set_backtrack_limit(backtrack_limit);
972 macro_assembler->set_can_fallback(false);
973 }
974
975 // Inserted here, instead of in Assembler, because it depends on information
976 // in the AST that isn't replicated in the Node structure.
977 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
978 bool is_start_anchored = data->tree->IsAnchoredAtStart();
979 int max_length = data->tree->max_match();
980 static const int kMaxBacksearchLimit = 1024;
981 if (is_end_anchored && !is_start_anchored && !IsSticky(flags) &&
982 max_length < kMaxBacksearchLimit) {
983 macro_assembler->SetCurrentPositionFromEnd(max_length);
984 }
985
986 if (IsGlobal(flags)) {
987 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
988 if (data->tree->min_match() > 0) {
989 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
990 } else if (IsUnicode(flags)) {
991 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
992 }
993 macro_assembler->set_global_mode(mode);
994 }
995
996 RegExpMacroAssembler* macro_assembler_ptr = macro_assembler.get();
997 #ifdef DEBUG
998 std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler;
999 if (FLAG_trace_regexp_assembler) {
1000 tracer_macro_assembler.reset(
1001 new RegExpMacroAssemblerTracer(isolate, macro_assembler_ptr));
1002 macro_assembler_ptr = tracer_macro_assembler.get();
1003 }
1004 #endif
1005
1006 RegExpCompiler::CompilationResult result = compiler.Assemble(
1007 isolate, macro_assembler_ptr, data->node, data->capture_count, pattern);
1008
1009 // Code / bytecode printing.
1010 {
1011 #ifdef ENABLE_DISASSEMBLER
1012 if (FLAG_print_regexp_code &&
1013 data->compilation_target == RegExpCompilationTarget::kNative) {
1014 CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1015 OFStream os(trace_scope.file());
1016 Handle<Code> c = Handle<Code>::cast(result.code);
1017 auto pattern_cstring = pattern->ToCString();
1018 c->Disassemble(pattern_cstring.get(), os, isolate);
1019 }
1020 #endif
1021 if (FLAG_print_regexp_bytecode &&
1022 data->compilation_target == RegExpCompilationTarget::kBytecode) {
1023 Handle<ByteArray> bytecode = Handle<ByteArray>::cast(result.code);
1024 auto pattern_cstring = pattern->ToCString();
1025 RegExpBytecodeDisassemble(bytecode->GetDataStartAddress(),
1026 bytecode->length(), pattern_cstring.get());
1027 }
1028 }
1029
1030 if (result.error != RegExpError::kNone) {
1031 if (FLAG_correctness_fuzzer_suppressions &&
1032 result.error == RegExpError::kStackOverflow) {
1033 FATAL("Aborting on stack overflow");
1034 }
1035 data->error = result.error;
1036 }
1037
1038 data->code = result.code;
1039 data->register_count = result.num_registers;
1040
1041 return result.Succeeded();
1042 }
1043
RegExpGlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,Isolate * isolate)1044 RegExpGlobalCache::RegExpGlobalCache(Handle<JSRegExp> regexp,
1045 Handle<String> subject, Isolate* isolate)
1046 : register_array_(nullptr),
1047 register_array_size_(0),
1048 regexp_(regexp),
1049 subject_(subject),
1050 isolate_(isolate) {
1051 DCHECK(IsGlobal(JSRegExp::AsRegExpFlags(regexp->flags())));
1052
1053 switch (regexp_->type_tag()) {
1054 case JSRegExp::NOT_COMPILED:
1055 UNREACHABLE();
1056 case JSRegExp::ATOM: {
1057 // ATOM regexps do not have a global loop, so we search for one match at
1058 // a time.
1059 static const int kAtomRegistersPerMatch = 2;
1060 registers_per_match_ = kAtomRegistersPerMatch;
1061 register_array_size_ = registers_per_match_;
1062 break;
1063 }
1064 case JSRegExp::IRREGEXP: {
1065 registers_per_match_ =
1066 RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
1067 if (registers_per_match_ < 0) {
1068 num_matches_ = -1; // Signal exception.
1069 return;
1070 }
1071 if (regexp->ShouldProduceBytecode()) {
1072 // Global loop in interpreted regexp is not implemented. We choose the
1073 // size of the offsets vector so that it can only store one match.
1074 register_array_size_ = registers_per_match_;
1075 max_matches_ = 1;
1076 } else {
1077 register_array_size_ = std::max(
1078 {registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize});
1079 }
1080 break;
1081 }
1082 case JSRegExp::EXPERIMENTAL: {
1083 if (!ExperimentalRegExp::IsCompiled(regexp, isolate_) &&
1084 !ExperimentalRegExp::Compile(isolate_, regexp)) {
1085 DCHECK(isolate->has_pending_exception());
1086 num_matches_ = -1; // Signal exception.
1087 return;
1088 }
1089 registers_per_match_ =
1090 JSRegExp::RegistersForCaptureCount(regexp->capture_count());
1091 register_array_size_ = std::max(
1092 {registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize});
1093 break;
1094 }
1095 }
1096
1097 max_matches_ = register_array_size_ / registers_per_match_;
1098
1099 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
1100 register_array_ = NewArray<int32_t>(register_array_size_);
1101 } else {
1102 register_array_ = isolate->jsregexp_static_offsets_vector();
1103 }
1104
1105 // Set state so that fetching the results the first time triggers a call
1106 // to the compiled regexp.
1107 current_match_index_ = max_matches_ - 1;
1108 num_matches_ = max_matches_;
1109 DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
1110 DCHECK_GE(register_array_size_, registers_per_match_);
1111 int32_t* last_match =
1112 ®ister_array_[current_match_index_ * registers_per_match_];
1113 last_match[0] = -1;
1114 last_match[1] = 0;
1115 }
1116
~RegExpGlobalCache()1117 RegExpGlobalCache::~RegExpGlobalCache() {
1118 // Deallocate the register array if we allocated it in the constructor
1119 // (as opposed to using the existing jsregexp_static_offsets_vector).
1120 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
1121 DeleteArray(register_array_);
1122 }
1123 }
1124
AdvanceZeroLength(int last_index)1125 int RegExpGlobalCache::AdvanceZeroLength(int last_index) {
1126 if (IsUnicode(JSRegExp::AsRegExpFlags(regexp_->flags())) &&
1127 last_index + 1 < subject_->length() &&
1128 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
1129 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
1130 // Advance over the surrogate pair.
1131 return last_index + 2;
1132 }
1133 return last_index + 1;
1134 }
1135
FetchNext()1136 int32_t* RegExpGlobalCache::FetchNext() {
1137 current_match_index_++;
1138
1139 if (current_match_index_ >= num_matches_) {
1140 // Current batch of results exhausted.
1141 // Fail if last batch was not even fully filled.
1142 if (num_matches_ < max_matches_) {
1143 num_matches_ = 0; // Signal failed match.
1144 return nullptr;
1145 }
1146
1147 int32_t* last_match =
1148 ®ister_array_[(current_match_index_ - 1) * registers_per_match_];
1149 int last_end_index = last_match[1];
1150
1151 switch (regexp_->type_tag()) {
1152 case JSRegExp::NOT_COMPILED:
1153 UNREACHABLE();
1154 case JSRegExp::ATOM:
1155 num_matches_ =
1156 RegExpImpl::AtomExecRaw(isolate_, regexp_, subject_, last_end_index,
1157 register_array_, register_array_size_);
1158 break;
1159 case JSRegExp::EXPERIMENTAL: {
1160 DCHECK(ExperimentalRegExp::IsCompiled(regexp_, isolate_));
1161 DisallowGarbageCollection no_gc;
1162 num_matches_ = ExperimentalRegExp::ExecRaw(
1163 isolate_, RegExp::kFromRuntime, *regexp_, *subject_,
1164 register_array_, register_array_size_, last_end_index);
1165 break;
1166 }
1167 case JSRegExp::IRREGEXP: {
1168 int last_start_index = last_match[0];
1169 if (last_start_index == last_end_index) {
1170 // Zero-length match. Advance by one code point.
1171 last_end_index = AdvanceZeroLength(last_end_index);
1172 }
1173 if (last_end_index > subject_->length()) {
1174 num_matches_ = 0; // Signal failed match.
1175 return nullptr;
1176 }
1177 num_matches_ = RegExpImpl::IrregexpExecRaw(
1178 isolate_, regexp_, subject_, last_end_index, register_array_,
1179 register_array_size_);
1180 break;
1181 }
1182 }
1183
1184 // Fall back to experimental engine if needed and possible.
1185 if (num_matches_ == RegExp::kInternalRegExpFallbackToExperimental) {
1186 num_matches_ = ExperimentalRegExp::OneshotExecRaw(
1187 isolate_, regexp_, subject_, register_array_, register_array_size_,
1188 last_end_index);
1189 }
1190
1191 if (num_matches_ <= 0) {
1192 return nullptr;
1193 }
1194 current_match_index_ = 0;
1195 return register_array_;
1196 } else {
1197 return ®ister_array_[current_match_index_ * registers_per_match_];
1198 }
1199 }
1200
LastSuccessfulMatch()1201 int32_t* RegExpGlobalCache::LastSuccessfulMatch() {
1202 int index = current_match_index_ * registers_per_match_;
1203 if (num_matches_ == 0) {
1204 // After a failed match we shift back by one result.
1205 index -= registers_per_match_;
1206 }
1207 return ®ister_array_[index];
1208 }
1209
Lookup(Heap * heap,String key_string,Object key_pattern,FixedArray * last_match_cache,ResultsCacheType type)1210 Object RegExpResultsCache::Lookup(Heap* heap, String key_string,
1211 Object key_pattern,
1212 FixedArray* last_match_cache,
1213 ResultsCacheType type) {
1214 FixedArray cache;
1215 if (!key_string.IsInternalizedString()) return Smi::zero();
1216 if (type == STRING_SPLIT_SUBSTRINGS) {
1217 DCHECK(key_pattern.IsString());
1218 if (!key_pattern.IsInternalizedString()) return Smi::zero();
1219 cache = heap->string_split_cache();
1220 } else {
1221 DCHECK(type == REGEXP_MULTIPLE_INDICES);
1222 DCHECK(key_pattern.IsFixedArray());
1223 cache = heap->regexp_multiple_cache();
1224 }
1225
1226 uint32_t hash = key_string.hash();
1227 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
1228 ~(kArrayEntriesPerCacheEntry - 1));
1229 if (cache.get(index + kStringOffset) != key_string ||
1230 cache.get(index + kPatternOffset) != key_pattern) {
1231 index =
1232 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
1233 if (cache.get(index + kStringOffset) != key_string ||
1234 cache.get(index + kPatternOffset) != key_pattern) {
1235 return Smi::zero();
1236 }
1237 }
1238
1239 *last_match_cache = FixedArray::cast(cache.get(index + kLastMatchOffset));
1240 return cache.get(index + kArrayOffset);
1241 }
1242
Enter(Isolate * isolate,Handle<String> key_string,Handle<Object> key_pattern,Handle<FixedArray> value_array,Handle<FixedArray> last_match_cache,ResultsCacheType type)1243 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
1244 Handle<Object> key_pattern,
1245 Handle<FixedArray> value_array,
1246 Handle<FixedArray> last_match_cache,
1247 ResultsCacheType type) {
1248 Factory* factory = isolate->factory();
1249 Handle<FixedArray> cache;
1250 if (!key_string->IsInternalizedString()) return;
1251 if (type == STRING_SPLIT_SUBSTRINGS) {
1252 DCHECK(key_pattern->IsString());
1253 if (!key_pattern->IsInternalizedString()) return;
1254 cache = factory->string_split_cache();
1255 } else {
1256 DCHECK(type == REGEXP_MULTIPLE_INDICES);
1257 DCHECK(key_pattern->IsFixedArray());
1258 cache = factory->regexp_multiple_cache();
1259 }
1260
1261 uint32_t hash = key_string->hash();
1262 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
1263 ~(kArrayEntriesPerCacheEntry - 1));
1264 if (cache->get(index + kStringOffset) == Smi::zero()) {
1265 cache->set(index + kStringOffset, *key_string);
1266 cache->set(index + kPatternOffset, *key_pattern);
1267 cache->set(index + kArrayOffset, *value_array);
1268 cache->set(index + kLastMatchOffset, *last_match_cache);
1269 } else {
1270 uint32_t index2 =
1271 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
1272 if (cache->get(index2 + kStringOffset) == Smi::zero()) {
1273 cache->set(index2 + kStringOffset, *key_string);
1274 cache->set(index2 + kPatternOffset, *key_pattern);
1275 cache->set(index2 + kArrayOffset, *value_array);
1276 cache->set(index2 + kLastMatchOffset, *last_match_cache);
1277 } else {
1278 cache->set(index2 + kStringOffset, Smi::zero());
1279 cache->set(index2 + kPatternOffset, Smi::zero());
1280 cache->set(index2 + kArrayOffset, Smi::zero());
1281 cache->set(index2 + kLastMatchOffset, Smi::zero());
1282 cache->set(index + kStringOffset, *key_string);
1283 cache->set(index + kPatternOffset, *key_pattern);
1284 cache->set(index + kArrayOffset, *value_array);
1285 cache->set(index + kLastMatchOffset, *last_match_cache);
1286 }
1287 }
1288 // If the array is a reasonably short list of substrings, convert it into a
1289 // list of internalized strings.
1290 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
1291 for (int i = 0; i < value_array->length(); i++) {
1292 Handle<String> str(String::cast(value_array->get(i)), isolate);
1293 Handle<String> internalized_str = factory->InternalizeString(str);
1294 value_array->set(i, *internalized_str);
1295 }
1296 }
1297 // Convert backing store to a copy-on-write array.
1298 value_array->set_map_no_write_barrier(
1299 ReadOnlyRoots(isolate).fixed_cow_array_map());
1300 }
1301
Clear(FixedArray cache)1302 void RegExpResultsCache::Clear(FixedArray cache) {
1303 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
1304 cache.set(i, Smi::zero());
1305 }
1306 }
1307
1308 } // namespace internal
1309 } // namespace v8
1310