1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp.h"
6
7 #include "src/codegen/compilation-cache.h"
8 #include "src/diagnostics/code-tracer.h"
9 #include "src/heap/heap-inl.h"
10 #include "src/objects/js-regexp-inl.h"
11 #include "src/regexp/experimental/experimental.h"
12 #include "src/regexp/regexp-bytecode-generator.h"
13 #include "src/regexp/regexp-bytecodes.h"
14 #include "src/regexp/regexp-compiler.h"
15 #include "src/regexp/regexp-dotprinter.h"
16 #include "src/regexp/regexp-interpreter.h"
17 #include "src/regexp/regexp-macro-assembler-arch.h"
18 #include "src/regexp/regexp-macro-assembler-tracer.h"
19 #include "src/regexp/regexp-parser.h"
20 #include "src/regexp/regexp-utils.h"
21 #include "src/strings/string-search.h"
22 #include "src/utils/ostreams.h"
23
24 namespace v8 {
25 namespace internal {
26
27 using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
28
29 class RegExpImpl final : public AllStatic {
30 public:
31 // Returns a string representation of a regular expression.
32 // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
33 // This function calls the garbage collector if necessary.
34 static Handle<String> ToString(Handle<Object> value);
35
36 // Prepares a JSRegExp object with Irregexp-specific data.
37 static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
38 Handle<String> pattern, JSRegExp::Flags flags,
39 int capture_count, uint32_t backtrack_limit);
40
41 // Prepare a RegExp for being executed one or more times (using
42 // IrregexpExecOnce) on the subject.
43 // This ensures that the regexp is compiled for the subject, and that
44 // the subject is flat.
45 // Returns the number of integer spaces required by IrregexpExecOnce
46 // as its "registers" argument. If the regexp cannot be compiled,
47 // an exception is set as pending, and this function returns negative.
48 static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
49 Handle<String> subject);
50
51 static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
52 Handle<String> pattern, JSRegExp::Flags flags,
53 Handle<String> match_pattern);
54
55 static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
56 Handle<String> subject, int index, int32_t* output,
57 int output_size);
58
59 static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
60 Handle<String> subject, int index,
61 Handle<RegExpMatchInfo> last_match_info);
62
63 // Execute a regular expression on the subject, starting from index.
64 // If matching succeeds, return the number of matches. This can be larger
65 // than one in the case of global regular expressions.
66 // The captures and subcaptures are stored into the registers vector.
67 // If matching fails, returns RE_FAILURE.
68 // If execution fails, sets a pending exception and returns RE_EXCEPTION.
69 static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
70 Handle<String> subject, int index, int32_t* output,
71 int output_size);
72
73 // Execute an Irregexp bytecode pattern.
74 // On a successful match, the result is a JSArray containing
75 // captured positions. On a failure, the result is the null value.
76 // Returns an empty handle in case of an exception.
77 V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
78 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
79 int index, Handle<RegExpMatchInfo> last_match_info);
80
81 static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
82 Handle<String> sample_subject, bool is_one_byte);
83 static inline bool EnsureCompiledIrregexp(Isolate* isolate,
84 Handle<JSRegExp> re,
85 Handle<String> sample_subject,
86 bool is_one_byte);
87
88 // Returns true on success, false on failure.
89 static bool Compile(Isolate* isolate, Zone* zone, RegExpCompileData* input,
90 JSRegExp::Flags flags, Handle<String> pattern,
91 Handle<String> sample_subject, bool is_one_byte,
92 uint32_t& backtrack_limit);
93
94 // For acting on the JSRegExp data FixedArray.
95 static int IrregexpMaxRegisterCount(FixedArray re);
96 static void SetIrregexpMaxRegisterCount(FixedArray re, int value);
97 static int IrregexpNumberOfCaptures(FixedArray re);
98 static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
99 static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
100 };
101
ThrowRegExpException(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,RegExpError error)102 MaybeHandle<Object> RegExp::ThrowRegExpException(Isolate* isolate,
103 Handle<JSRegExp> re,
104 Handle<String> pattern,
105 RegExpError error) {
106 Vector<const char> error_data = CStrVector(RegExpErrorString(error));
107 Handle<String> error_text =
108 isolate->factory()
109 ->NewStringFromOneByte(Vector<const uint8_t>::cast(error_data))
110 .ToHandleChecked();
111 THROW_NEW_ERROR(
112 isolate,
113 NewSyntaxError(MessageTemplate::kMalformedRegExp, pattern, error_text),
114 Object);
115 }
116
ThrowRegExpException(Isolate * isolate,Handle<JSRegExp> re,RegExpError error_text)117 void RegExp::ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
118 RegExpError error_text) {
119 USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
120 error_text));
121 }
122
IsUnmodifiedRegExp(Isolate * isolate,Handle<JSRegExp> regexp)123 bool RegExp::IsUnmodifiedRegExp(Isolate* isolate, Handle<JSRegExp> regexp) {
124 return RegExpUtils::IsUnmodifiedRegExp(isolate, regexp);
125 }
126
127 // Identifies the sort of regexps where the regexp engine is faster
128 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)129 static bool HasFewDifferentCharacters(Handle<String> pattern) {
130 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
131 if (length <= kPatternTooShortForBoyerMoore) return false;
132 const int kMod = 128;
133 bool character_found[kMod];
134 int different = 0;
135 memset(&character_found[0], 0, sizeof(character_found));
136 for (int i = 0; i < length; i++) {
137 int ch = (pattern->Get(i) & (kMod - 1));
138 if (!character_found[ch]) {
139 character_found[ch] = true;
140 different++;
141 // We declare a regexp low-alphabet if it has at least 3 times as many
142 // characters as it has different characters.
143 if (different * 3 > length) return false;
144 }
145 }
146 return true;
147 }
148
149 // Generic RegExp methods. Dispatches to implementation specific methods.
150
151 // static
Compile(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,uint32_t backtrack_limit)152 MaybeHandle<Object> RegExp::Compile(Isolate* isolate, Handle<JSRegExp> re,
153 Handle<String> pattern,
154 JSRegExp::Flags flags,
155 uint32_t backtrack_limit) {
156 DCHECK(pattern->IsFlat());
157
158 // Caching is based only on the pattern and flags, but code also differs when
159 // a backtrack limit is set. A present backtrack limit is very much *not* the
160 // common case, so just skip the cache for these.
161 const bool is_compilation_cache_enabled =
162 (backtrack_limit == JSRegExp::kNoBacktrackLimit);
163
164 Zone zone(isolate->allocator(), ZONE_NAME);
165 CompilationCache* compilation_cache = nullptr;
166 if (is_compilation_cache_enabled) {
167 compilation_cache = isolate->compilation_cache();
168 MaybeHandle<FixedArray> maybe_cached =
169 compilation_cache->LookupRegExp(pattern, flags);
170 Handle<FixedArray> cached;
171 if (maybe_cached.ToHandle(&cached)) {
172 re->set_data(*cached);
173 return re;
174 }
175 }
176
177 PostponeInterruptsScope postpone(isolate);
178 RegExpCompileData parse_result;
179 FlatStringReader reader(isolate, pattern);
180 DCHECK(!isolate->has_pending_exception());
181 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
182 &parse_result)) {
183 // Throw an exception if we fail to parse the pattern.
184 return RegExp::ThrowRegExpException(isolate, re, pattern,
185 parse_result.error);
186 }
187
188 bool has_been_compiled = false;
189
190 if (FLAG_default_to_experimental_regexp_engine &&
191 ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
192 parse_result.capture_count)) {
193 DCHECK(FLAG_enable_experimental_regexp_engine);
194 ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
195 parse_result.capture_count);
196 has_been_compiled = true;
197 } else if (flags & JSRegExp::kLinear) {
198 DCHECK(FLAG_enable_experimental_regexp_engine);
199 if (!ExperimentalRegExp::CanBeHandled(parse_result.tree, flags,
200 parse_result.capture_count)) {
201 // TODO(mbid): The error could provide a reason for why the regexp can't
202 // be executed in linear time (e.g. due to back references).
203 return RegExp::ThrowRegExpException(isolate, re, pattern,
204 RegExpError::kNotLinear);
205 }
206 ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
207 parse_result.capture_count);
208 has_been_compiled = true;
209 } else if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
210 !HasFewDifferentCharacters(pattern)) {
211 // Parse-tree is a single atom that is equal to the pattern.
212 RegExpImpl::AtomCompile(isolate, re, pattern, flags, pattern);
213 has_been_compiled = true;
214 } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
215 parse_result.capture_count == 0) {
216 RegExpAtom* atom = parse_result.tree->AsAtom();
217 // The pattern source might (?) contain escape sequences, but they're
218 // resolved in atom_string.
219 Vector<const uc16> atom_pattern = atom->data();
220 Handle<String> atom_string;
221 ASSIGN_RETURN_ON_EXCEPTION(
222 isolate, atom_string,
223 isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
224 if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) {
225 RegExpImpl::AtomCompile(isolate, re, pattern, flags, atom_string);
226 has_been_compiled = true;
227 }
228 }
229 if (!has_been_compiled) {
230 RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags,
231 parse_result.capture_count, backtrack_limit);
232 }
233 DCHECK(re->data().IsFixedArray());
234 // Compilation succeeded so the data is set on the regexp
235 // and we can store it in the cache.
236 Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
237 if (is_compilation_cache_enabled) {
238 compilation_cache->PutRegExp(pattern, flags, data);
239 }
240
241 return re;
242 }
243
244 // static
EnsureFullyCompiled(Isolate * isolate,Handle<JSRegExp> re,Handle<String> subject)245 bool RegExp::EnsureFullyCompiled(Isolate* isolate, Handle<JSRegExp> re,
246 Handle<String> subject) {
247 switch (re->TypeTag()) {
248 case JSRegExp::NOT_COMPILED:
249 UNREACHABLE();
250 case JSRegExp::ATOM:
251 return true;
252 case JSRegExp::IRREGEXP:
253 if (RegExpImpl::IrregexpPrepare(isolate, re, subject) == -1) {
254 DCHECK(isolate->has_pending_exception());
255 return false;
256 }
257 return true;
258 case JSRegExp::EXPERIMENTAL:
259 if (!ExperimentalRegExp::IsCompiled(re, isolate) &&
260 !ExperimentalRegExp::Compile(isolate, re)) {
261 DCHECK(isolate->has_pending_exception());
262 return false;
263 }
264 return true;
265 }
266 }
267
268 // static
ExperimentalOneshotExec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)269 MaybeHandle<Object> RegExp::ExperimentalOneshotExec(
270 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
271 int index, Handle<RegExpMatchInfo> last_match_info) {
272 return ExperimentalRegExp::OneshotExec(isolate, regexp, subject, index,
273 last_match_info);
274 }
275
276 // static
Exec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)277 MaybeHandle<Object> RegExp::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
278 Handle<String> subject, int index,
279 Handle<RegExpMatchInfo> last_match_info) {
280 switch (regexp->TypeTag()) {
281 case JSRegExp::NOT_COMPILED:
282 UNREACHABLE();
283 case JSRegExp::ATOM:
284 return RegExpImpl::AtomExec(isolate, regexp, subject, index,
285 last_match_info);
286 case JSRegExp::IRREGEXP:
287 return RegExpImpl::IrregexpExec(isolate, regexp, subject, index,
288 last_match_info);
289 case JSRegExp::EXPERIMENTAL:
290 return ExperimentalRegExp::Exec(isolate, regexp, subject, index,
291 last_match_info);
292 }
293 }
294
295 // RegExp Atom implementation: Simple string search using indexOf.
296
AtomCompile(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)297 void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
298 Handle<String> pattern, JSRegExp::Flags flags,
299 Handle<String> match_pattern) {
300 isolate->factory()->SetRegExpAtomData(re, pattern, flags, match_pattern);
301 }
302
SetAtomLastCapture(Isolate * isolate,Handle<RegExpMatchInfo> last_match_info,String subject,int from,int to)303 static void SetAtomLastCapture(Isolate* isolate,
304 Handle<RegExpMatchInfo> last_match_info,
305 String subject, int from, int to) {
306 SealHandleScope shs(isolate);
307 last_match_info->SetNumberOfCaptureRegisters(2);
308 last_match_info->SetLastSubject(subject);
309 last_match_info->SetLastInput(subject);
310 last_match_info->SetCapture(0, from);
311 last_match_info->SetCapture(1, to);
312 }
313
AtomExecRaw(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)314 int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
315 Handle<String> subject, int index, int32_t* output,
316 int output_size) {
317 DCHECK_LE(0, index);
318 DCHECK_LE(index, subject->length());
319
320 subject = String::Flatten(isolate, subject);
321 DisallowHeapAllocation no_gc; // ensure vectors stay valid
322
323 String needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
324 int needle_len = needle.length();
325 DCHECK(needle.IsFlat());
326 DCHECK_LT(0, needle_len);
327
328 if (index + needle_len > subject->length()) {
329 return RegExp::RE_FAILURE;
330 }
331
332 for (int i = 0; i < output_size; i += 2) {
333 String::FlatContent needle_content = needle.GetFlatContent(no_gc);
334 String::FlatContent subject_content = subject->GetFlatContent(no_gc);
335 DCHECK(needle_content.IsFlat());
336 DCHECK(subject_content.IsFlat());
337 // dispatch on type of strings
338 index =
339 (needle_content.IsOneByte()
340 ? (subject_content.IsOneByte()
341 ? SearchString(isolate, subject_content.ToOneByteVector(),
342 needle_content.ToOneByteVector(), index)
343 : SearchString(isolate, subject_content.ToUC16Vector(),
344 needle_content.ToOneByteVector(), index))
345 : (subject_content.IsOneByte()
346 ? SearchString(isolate, subject_content.ToOneByteVector(),
347 needle_content.ToUC16Vector(), index)
348 : SearchString(isolate, subject_content.ToUC16Vector(),
349 needle_content.ToUC16Vector(), index)));
350 if (index == -1) {
351 return i / 2; // Return number of matches.
352 } else {
353 output[i] = index;
354 output[i + 1] = index + needle_len;
355 index += needle_len;
356 }
357 }
358 return output_size / 2;
359 }
360
AtomExec(Isolate * isolate,Handle<JSRegExp> re,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)361 Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
362 Handle<String> subject, int index,
363 Handle<RegExpMatchInfo> last_match_info) {
364 static const int kNumRegisters = 2;
365 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
366 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
367
368 int res =
369 AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
370
371 if (res == RegExp::RE_FAILURE) return isolate->factory()->null_value();
372
373 DCHECK_EQ(res, RegExp::RE_SUCCESS);
374 SealHandleScope shs(isolate);
375 SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
376 output_registers[1]);
377 return last_match_info;
378 }
379
380 // Irregexp implementation.
381
382 // Ensures that the regexp object contains a compiled version of the
383 // source for either one-byte or two-byte subject strings.
384 // If the compiled version doesn't already exist, it is compiled
385 // from the source pattern.
386 // If compilation fails, an exception is thrown and this function
387 // returns false.
EnsureCompiledIrregexp(Isolate * isolate,Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)388 bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
389 Handle<String> sample_subject,
390 bool is_one_byte) {
391 Object compiled_code = re->Code(is_one_byte);
392 Object bytecode = re->Bytecode(is_one_byte);
393 bool needs_initial_compilation =
394 compiled_code == Smi::FromInt(JSRegExp::kUninitializedValue);
395 // Recompile is needed when we're dealing with the first execution of the
396 // regexp after the decision to tier up has been made. If the tiering up
397 // strategy is not in use, this value is always false.
398 bool needs_tier_up_compilation =
399 re->MarkedForTierUp() && bytecode.IsByteArray();
400
401 if (FLAG_trace_regexp_tier_up && needs_tier_up_compilation) {
402 PrintF("JSRegExp object %p needs tier-up compilation\n",
403 reinterpret_cast<void*>(re->ptr()));
404 }
405
406 if (!needs_initial_compilation && !needs_tier_up_compilation) {
407 DCHECK(compiled_code.IsCode());
408 DCHECK_IMPLIES(FLAG_regexp_interpret_all, bytecode.IsByteArray());
409 return true;
410 }
411
412 DCHECK_IMPLIES(needs_tier_up_compilation, bytecode.IsByteArray());
413
414 return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
415 }
416
417 #ifdef DEBUG
418 namespace {
419
RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re,bool is_one_byte)420 bool RegExpCodeIsValidForPreCompilation(Handle<JSRegExp> re, bool is_one_byte) {
421 Object entry = re->Code(is_one_byte);
422 Object bytecode = re->Bytecode(is_one_byte);
423 // If we're not using the tier-up strategy, entry can only be a smi
424 // representing an uncompiled regexp here. If we're using the tier-up
425 // strategy, entry can still be a smi representing an uncompiled regexp, when
426 // compiling the regexp before the tier-up, or it can contain a trampoline to
427 // the regexp interpreter, in which case the bytecode field contains compiled
428 // bytecode, when recompiling the regexp after the tier-up. If the
429 // tier-up was forced, which happens for global replaces, entry is a smi
430 // representing an uncompiled regexp, even though we're "recompiling" after
431 // the tier-up.
432 if (re->ShouldProduceBytecode()) {
433 DCHECK(entry.IsSmi());
434 DCHECK(bytecode.IsSmi());
435 int entry_value = Smi::ToInt(entry);
436 int bytecode_value = Smi::ToInt(bytecode);
437 DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
438 DCHECK_EQ(JSRegExp::kUninitializedValue, bytecode_value);
439 } else {
440 DCHECK(entry.IsSmi() || (entry.IsCode() && bytecode.IsByteArray()));
441 }
442
443 return true;
444 }
445
446 } // namespace
447 #endif
448
CompileIrregexp(Isolate * isolate,Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)449 bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
450 Handle<String> sample_subject,
451 bool is_one_byte) {
452 // Compile the RegExp.
453 Zone zone(isolate->allocator(), ZONE_NAME);
454 PostponeInterruptsScope postpone(isolate);
455
456 DCHECK(RegExpCodeIsValidForPreCompilation(re, is_one_byte));
457
458 JSRegExp::Flags flags = re->GetFlags();
459
460 Handle<String> pattern(re->Pattern(), isolate);
461 pattern = String::Flatten(isolate, pattern);
462 RegExpCompileData compile_data;
463 FlatStringReader reader(isolate, pattern);
464 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
465 &compile_data)) {
466 // Throw an exception if we fail to parse the pattern.
467 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
468 USE(RegExp::ThrowRegExpException(isolate, re, pattern, compile_data.error));
469 return false;
470 }
471 // The compilation target is a kBytecode if we're interpreting all regexp
472 // objects, or if we're using the tier-up strategy but the tier-up hasn't
473 // happened yet. The compilation target is a kNative if we're using the
474 // tier-up strategy and we need to recompile to tier-up, or if we're producing
475 // native code for all regexp objects.
476 compile_data.compilation_target = re->ShouldProduceBytecode()
477 ? RegExpCompilationTarget::kBytecode
478 : RegExpCompilationTarget::kNative;
479 uint32_t backtrack_limit = re->BacktrackLimit();
480 const bool compilation_succeeded =
481 Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject,
482 is_one_byte, backtrack_limit);
483 if (!compilation_succeeded) {
484 DCHECK(compile_data.error != RegExpError::kNone);
485 RegExp::ThrowRegExpException(isolate, re, compile_data.error);
486 return false;
487 }
488
489 Handle<FixedArray> data =
490 Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
491 if (compile_data.compilation_target == RegExpCompilationTarget::kNative) {
492 data->set(JSRegExp::code_index(is_one_byte), *compile_data.code);
493 // Reset bytecode to uninitialized. In case we use tier-up we know that
494 // tier-up has happened this way.
495 data->set(JSRegExp::bytecode_index(is_one_byte),
496 Smi::FromInt(JSRegExp::kUninitializedValue));
497 } else {
498 DCHECK_EQ(compile_data.compilation_target,
499 RegExpCompilationTarget::kBytecode);
500 // Store code generated by compiler in bytecode and trampoline to
501 // interpreter in code.
502 data->set(JSRegExp::bytecode_index(is_one_byte), *compile_data.code);
503 Handle<Code> trampoline =
504 BUILTIN_CODE(isolate, RegExpInterpreterTrampoline);
505 data->set(JSRegExp::code_index(is_one_byte), *trampoline);
506 }
507 re->SetCaptureNameMap(compile_data.capture_name_map);
508 int register_max = IrregexpMaxRegisterCount(*data);
509 if (compile_data.register_count > register_max) {
510 SetIrregexpMaxRegisterCount(*data, compile_data.register_count);
511 }
512 data->set(JSRegExp::kIrregexpBacktrackLimit, Smi::FromInt(backtrack_limit));
513
514 if (FLAG_trace_regexp_tier_up) {
515 PrintF("JSRegExp object %p %s size: %d\n",
516 reinterpret_cast<void*>(re->ptr()),
517 re->ShouldProduceBytecode() ? "bytecode" : "native code",
518 re->ShouldProduceBytecode()
519 ? IrregexpByteCode(*data, is_one_byte).Size()
520 : IrregexpNativeCode(*data, is_one_byte).Size());
521 }
522
523 return true;
524 }
525
IrregexpMaxRegisterCount(FixedArray re)526 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) {
527 return Smi::ToInt(re.get(JSRegExp::kIrregexpMaxRegisterCountIndex));
528 }
529
SetIrregexpMaxRegisterCount(FixedArray re,int value)530 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
531 re.set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
532 }
533
IrregexpNumberOfCaptures(FixedArray re)534 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
535 return Smi::ToInt(re.get(JSRegExp::kIrregexpCaptureCountIndex));
536 }
537
IrregexpByteCode(FixedArray re,bool is_one_byte)538 ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) {
539 return ByteArray::cast(re.get(JSRegExp::bytecode_index(is_one_byte)));
540 }
541
IrregexpNativeCode(FixedArray re,bool is_one_byte)542 Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
543 return Code::cast(re.get(JSRegExp::code_index(is_one_byte)));
544 }
545
IrregexpInitialize(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count,uint32_t backtrack_limit)546 void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
547 Handle<String> pattern,
548 JSRegExp::Flags flags, int capture_count,
549 uint32_t backtrack_limit) {
550 // Initialize compiled code entries to null.
551 isolate->factory()->SetRegExpIrregexpData(re, pattern, flags, capture_count,
552 backtrack_limit);
553 }
554
555 // static
IrregexpPrepare(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject)556 int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
557 Handle<String> subject) {
558 DCHECK(subject->IsFlat());
559
560 // Check representation of the underlying storage.
561 bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
562 if (!RegExpImpl::EnsureCompiledIrregexp(isolate, regexp, subject,
563 is_one_byte)) {
564 return -1;
565 }
566
567 // Only reserve room for output captures. Internal registers are allocated by
568 // the engine.
569 return JSRegExp::RegistersForCaptureCount(regexp->CaptureCount());
570 }
571
IrregexpExecRaw(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)572 int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
573 Handle<String> subject, int index,
574 int32_t* output, int output_size) {
575 DCHECK_LE(0, index);
576 DCHECK_LE(index, subject->length());
577 DCHECK(subject->IsFlat());
578 DCHECK_GE(output_size,
579 JSRegExp::RegistersForCaptureCount(regexp->CaptureCount()));
580
581 bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
582
583 if (!regexp->ShouldProduceBytecode()) {
584 do {
585 EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
586 // The stack is used to allocate registers for the compiled regexp code.
587 // This means that in case of failure, the output registers array is left
588 // untouched and contains the capture results from the previous successful
589 // match. We can use that to set the last match info lazily.
590 int res = NativeRegExpMacroAssembler::Match(regexp, subject, output,
591 output_size, index, isolate);
592 if (res != NativeRegExpMacroAssembler::RETRY) {
593 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
594 isolate->has_pending_exception());
595 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) ==
596 RegExp::RE_SUCCESS);
597 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) ==
598 RegExp::RE_FAILURE);
599 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) ==
600 RegExp::RE_EXCEPTION);
601 return res;
602 }
603 // If result is RETRY, the string has changed representation, and we
604 // must restart from scratch.
605 // In this case, it means we must make sure we are prepared to handle
606 // the, potentially, different subject (the string can switch between
607 // being internal and external, and even between being Latin1 and UC16,
608 // but the characters are always the same).
609 is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
610 } while (true);
611 UNREACHABLE();
612 } else {
613 DCHECK(regexp->ShouldProduceBytecode());
614
615 do {
616 IrregexpInterpreter::Result result =
617 IrregexpInterpreter::MatchForCallFromRuntime(
618 isolate, regexp, subject, output, output_size, index);
619 DCHECK_IMPLIES(result == IrregexpInterpreter::EXCEPTION,
620 isolate->has_pending_exception());
621
622 switch (result) {
623 case IrregexpInterpreter::SUCCESS:
624 case IrregexpInterpreter::EXCEPTION:
625 case IrregexpInterpreter::FAILURE:
626 case IrregexpInterpreter::FALLBACK_TO_EXPERIMENTAL:
627 return result;
628 case IrregexpInterpreter::RETRY:
629 // The string has changed representation, and we must restart the
630 // match.
631 // We need to reset the tier up to start over with compilation.
632 if (FLAG_regexp_tier_up) regexp->ResetLastTierUpTick();
633 is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
634 EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
635 break;
636 }
637 } while (true);
638 UNREACHABLE();
639 }
640 }
641
IrregexpExec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<RegExpMatchInfo> last_match_info)642 MaybeHandle<Object> RegExpImpl::IrregexpExec(
643 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
644 int previous_index, Handle<RegExpMatchInfo> last_match_info) {
645 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
646
647 subject = String::Flatten(isolate, subject);
648
649 #ifdef DEBUG
650 if (FLAG_trace_regexp_bytecodes && regexp->ShouldProduceBytecode()) {
651 String pattern = regexp->Pattern();
652 PrintF("\n\nRegexp match: /%s/\n\n", pattern.ToCString().get());
653 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
654 }
655 #endif
656
657 // For very long subject strings, the regexp interpreter is currently much
658 // slower than the jitted code execution. If the tier-up strategy is turned
659 // on, we want to avoid this performance penalty so we eagerly tier-up if the
660 // subject string length is equal or greater than the given heuristic value.
661 if (FLAG_regexp_tier_up &&
662 subject->length() >= JSRegExp::kTierUpForSubjectLengthValue) {
663 regexp->MarkTierUpForNextExec();
664 if (FLAG_trace_regexp_tier_up) {
665 PrintF(
666 "Forcing tier-up for very long strings in "
667 "RegExpImpl::IrregexpExec\n");
668 }
669 }
670
671 // Prepare space for the return values.
672 int required_registers =
673 RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
674 if (required_registers < 0) {
675 // Compiling failed with an exception.
676 DCHECK(isolate->has_pending_exception());
677 return MaybeHandle<Object>();
678 }
679
680 int32_t* output_registers = nullptr;
681 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
682 output_registers = NewArray<int32_t>(required_registers);
683 }
684 std::unique_ptr<int32_t[]> auto_release(output_registers);
685 if (output_registers == nullptr) {
686 output_registers = isolate->jsregexp_static_offsets_vector();
687 }
688
689 int res =
690 RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
691 output_registers, required_registers);
692
693 if (res == RegExp::RE_SUCCESS) {
694 int capture_count = regexp->CaptureCount();
695 return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
696 capture_count, output_registers);
697 } else if (res == RegExp::RE_FALLBACK_TO_EXPERIMENTAL) {
698 return ExperimentalRegExp::OneshotExec(isolate, regexp, subject,
699 previous_index, last_match_info);
700 } else if (res == RegExp::RE_EXCEPTION) {
701 DCHECK(isolate->has_pending_exception());
702 return MaybeHandle<Object>();
703 } else {
704 DCHECK(res == RegExp::RE_FAILURE);
705 return isolate->factory()->null_value();
706 }
707 }
708
709 // static
SetLastMatchInfo(Isolate * isolate,Handle<RegExpMatchInfo> last_match_info,Handle<String> subject,int capture_count,int32_t * match)710 Handle<RegExpMatchInfo> RegExp::SetLastMatchInfo(
711 Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
712 Handle<String> subject, int capture_count, int32_t* match) {
713 // This is the only place where match infos can grow. If, after executing the
714 // regexp, RegExpExecStub finds that the match info is too small, it restarts
715 // execution in RegExpImpl::Exec, which finally grows the match info right
716 // here.
717 Handle<RegExpMatchInfo> result =
718 RegExpMatchInfo::ReserveCaptures(isolate, last_match_info, capture_count);
719 if (*result != *last_match_info) {
720 if (*last_match_info == *isolate->regexp_last_match_info()) {
721 // This inner condition is only needed for special situations like the
722 // regexp fuzzer, where we pass our own custom RegExpMatchInfo to
723 // RegExpImpl::Exec; there actually want to bypass the Isolate's match
724 // info and execute the regexp without side effects.
725 isolate->native_context()->set_regexp_last_match_info(*result);
726 }
727 }
728
729 int capture_register_count =
730 JSRegExp::RegistersForCaptureCount(capture_count);
731 DisallowHeapAllocation no_allocation;
732 if (match != nullptr) {
733 for (int i = 0; i < capture_register_count; i += 2) {
734 result->SetCapture(i, match[i]);
735 result->SetCapture(i + 1, match[i + 1]);
736 }
737 }
738 result->SetLastSubject(*subject);
739 result->SetLastInput(*subject);
740 return result;
741 }
742
743 // static
DotPrintForTesting(const char * label,RegExpNode * node)744 void RegExp::DotPrintForTesting(const char* label, RegExpNode* node) {
745 DotPrinter::DotPrint(label, node);
746 }
747
748 namespace {
749
750 // Returns true if we've either generated too much irregex code within this
751 // isolate, or the pattern string is too long.
TooMuchRegExpCode(Isolate * isolate,Handle<String> pattern)752 bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
753 // Limit the space regexps take up on the heap. In order to limit this we
754 // would like to keep track of the amount of regexp code on the heap. This
755 // is not tracked, however. As a conservative approximation we track the
756 // total regexp code compiled including code that has subsequently been freed
757 // and the total executable memory at any point.
758 static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB;
759 static constexpr size_t kRegExpCompiledLimit = 1 * MB;
760
761 Heap* heap = isolate->heap();
762 if (pattern->length() > RegExp::kRegExpTooLargeToOptimize) return true;
763 return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit &&
764 heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit);
765 }
766
767 } // namespace
768
769 // static
CompileForTesting(Isolate * isolate,Zone * zone,RegExpCompileData * data,JSRegExp::Flags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte)770 bool RegExp::CompileForTesting(Isolate* isolate, Zone* zone,
771 RegExpCompileData* data, JSRegExp::Flags flags,
772 Handle<String> pattern,
773 Handle<String> sample_subject,
774 bool is_one_byte) {
775 uint32_t backtrack_limit = JSRegExp::kNoBacktrackLimit;
776 return RegExpImpl::Compile(isolate, zone, data, flags, pattern,
777 sample_subject, is_one_byte, backtrack_limit);
778 }
779
Compile(Isolate * isolate,Zone * zone,RegExpCompileData * data,JSRegExp::Flags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte,uint32_t & backtrack_limit)780 bool RegExpImpl::Compile(Isolate* isolate, Zone* zone, RegExpCompileData* data,
781 JSRegExp::Flags flags, Handle<String> pattern,
782 Handle<String> sample_subject, bool is_one_byte,
783 uint32_t& backtrack_limit) {
784 if (JSRegExp::RegistersForCaptureCount(data->capture_count) >
785 RegExpMacroAssembler::kMaxRegisterCount) {
786 data->error = RegExpError::kTooLarge;
787 return false;
788 }
789
790 RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
791
792 if (compiler.optimize()) {
793 compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
794 }
795
796 // Sample some characters from the middle of the string.
797 static const int kSampleSize = 128;
798
799 sample_subject = String::Flatten(isolate, sample_subject);
800 int chars_sampled = 0;
801 int half_way = (sample_subject->length() - kSampleSize) / 2;
802 for (int i = Max(0, half_way);
803 i < sample_subject->length() && chars_sampled < kSampleSize;
804 i++, chars_sampled++) {
805 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
806 }
807
808 data->node = compiler.PreprocessRegExp(data, flags, is_one_byte);
809 data->error = AnalyzeRegExp(isolate, is_one_byte, data->node);
810 if (data->error != RegExpError::kNone) {
811 return false;
812 }
813
814 // Create the correct assembler for the architecture.
815 std::unique_ptr<RegExpMacroAssembler> macro_assembler;
816 if (data->compilation_target == RegExpCompilationTarget::kNative) {
817 // Native regexp implementation.
818 DCHECK(!FLAG_jitless);
819
820 NativeRegExpMacroAssembler::Mode mode =
821 is_one_byte ? NativeRegExpMacroAssembler::LATIN1
822 : NativeRegExpMacroAssembler::UC16;
823
824 const int output_register_count =
825 JSRegExp::RegistersForCaptureCount(data->capture_count);
826 #if V8_TARGET_ARCH_IA32
827 macro_assembler.reset(new RegExpMacroAssemblerIA32(isolate, zone, mode,
828 output_register_count));
829 #elif V8_TARGET_ARCH_X64
830 macro_assembler.reset(new RegExpMacroAssemblerX64(isolate, zone, mode,
831 output_register_count));
832 #elif V8_TARGET_ARCH_ARM
833 macro_assembler.reset(new RegExpMacroAssemblerARM(isolate, zone, mode,
834 output_register_count));
835 #elif V8_TARGET_ARCH_ARM64
836 macro_assembler.reset(new RegExpMacroAssemblerARM64(isolate, zone, mode,
837 output_register_count));
838 #elif V8_TARGET_ARCH_S390
839 macro_assembler.reset(new RegExpMacroAssemblerS390(isolate, zone, mode,
840 output_register_count));
841 #elif V8_TARGET_ARCH_PPC || V8_TARGET_ARCH_PPC64
842 macro_assembler.reset(new RegExpMacroAssemblerPPC(isolate, zone, mode,
843 output_register_count));
844 #elif V8_TARGET_ARCH_MIPS
845 macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
846 output_register_count));
847 #elif V8_TARGET_ARCH_MIPS64
848 macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
849 output_register_count));
850 #else
851 #error "Unsupported architecture"
852 #endif
853 } else {
854 DCHECK_EQ(data->compilation_target, RegExpCompilationTarget::kBytecode);
855 // Interpreted regexp implementation.
856 macro_assembler.reset(new RegExpBytecodeGenerator(isolate, zone));
857 }
858
859 macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern));
860 if (FLAG_enable_experimental_regexp_engine_on_excessive_backtracks &&
861 ExperimentalRegExp::CanBeHandled(data->tree, flags,
862 data->capture_count)) {
863 if (backtrack_limit == JSRegExp::kNoBacktrackLimit) {
864 backtrack_limit = FLAG_regexp_backtracks_before_fallback;
865 } else {
866 backtrack_limit =
867 std::min(backtrack_limit, FLAG_regexp_backtracks_before_fallback);
868 }
869 macro_assembler->set_backtrack_limit(backtrack_limit);
870 macro_assembler->set_can_fallback(true);
871 } else {
872 macro_assembler->set_backtrack_limit(backtrack_limit);
873 macro_assembler->set_can_fallback(false);
874 }
875
876 // Inserted here, instead of in Assembler, because it depends on information
877 // in the AST that isn't replicated in the Node structure.
878 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
879 bool is_start_anchored = data->tree->IsAnchoredAtStart();
880 int max_length = data->tree->max_match();
881 static const int kMaxBacksearchLimit = 1024;
882 if (is_end_anchored && !is_start_anchored && !IsSticky(flags) &&
883 max_length < kMaxBacksearchLimit) {
884 macro_assembler->SetCurrentPositionFromEnd(max_length);
885 }
886
887 if (IsGlobal(flags)) {
888 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
889 if (data->tree->min_match() > 0) {
890 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
891 } else if (IsUnicode(flags)) {
892 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
893 }
894 macro_assembler->set_global_mode(mode);
895 }
896
897 RegExpMacroAssembler* macro_assembler_ptr = macro_assembler.get();
898 #ifdef DEBUG
899 std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler;
900 if (FLAG_trace_regexp_assembler) {
901 tracer_macro_assembler.reset(
902 new RegExpMacroAssemblerTracer(isolate, macro_assembler_ptr));
903 macro_assembler_ptr = tracer_macro_assembler.get();
904 }
905 #endif
906
907 RegExpCompiler::CompilationResult result = compiler.Assemble(
908 isolate, macro_assembler_ptr, data->node, data->capture_count, pattern);
909
910 // Code / bytecode printing.
911 {
912 #ifdef ENABLE_DISASSEMBLER
913 if (FLAG_print_regexp_code &&
914 data->compilation_target == RegExpCompilationTarget::kNative) {
915 CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
916 OFStream os(trace_scope.file());
917 Handle<Code> c = Handle<Code>::cast(result.code);
918 auto pattern_cstring = pattern->ToCString();
919 c->Disassemble(pattern_cstring.get(), os, isolate);
920 }
921 #endif
922 if (FLAG_print_regexp_bytecode &&
923 data->compilation_target == RegExpCompilationTarget::kBytecode) {
924 Handle<ByteArray> bytecode = Handle<ByteArray>::cast(result.code);
925 auto pattern_cstring = pattern->ToCString();
926 RegExpBytecodeDisassemble(bytecode->GetDataStartAddress(),
927 bytecode->length(), pattern_cstring.get());
928 }
929 }
930
931 if (result.error != RegExpError::kNone) {
932 if (FLAG_correctness_fuzzer_suppressions &&
933 result.error == RegExpError::kStackOverflow) {
934 FATAL("Aborting on stack overflow");
935 }
936 data->error = result.error;
937 }
938
939 data->code = result.code;
940 data->register_count = result.num_registers;
941
942 return result.Succeeded();
943 }
944
RegExpGlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,Isolate * isolate)945 RegExpGlobalCache::RegExpGlobalCache(Handle<JSRegExp> regexp,
946 Handle<String> subject, Isolate* isolate)
947 : register_array_(nullptr),
948 register_array_size_(0),
949 regexp_(regexp),
950 subject_(subject),
951 isolate_(isolate) {
952 DCHECK(IsGlobal(regexp->GetFlags()));
953
954 switch (regexp_->TypeTag()) {
955 case JSRegExp::NOT_COMPILED:
956 UNREACHABLE();
957 case JSRegExp::ATOM: {
958 // ATOM regexps do not have a global loop, so we search for one match at
959 // a time.
960 static const int kAtomRegistersPerMatch = 2;
961 registers_per_match_ = kAtomRegistersPerMatch;
962 register_array_size_ = registers_per_match_;
963 break;
964 }
965 case JSRegExp::IRREGEXP: {
966 registers_per_match_ =
967 RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
968 if (registers_per_match_ < 0) {
969 num_matches_ = -1; // Signal exception.
970 return;
971 }
972 if (regexp->ShouldProduceBytecode()) {
973 // Global loop in interpreted regexp is not implemented. We choose the
974 // size of the offsets vector so that it can only store one match.
975 register_array_size_ = registers_per_match_;
976 max_matches_ = 1;
977 } else {
978 register_array_size_ = Max(registers_per_match_,
979 Isolate::kJSRegexpStaticOffsetsVectorSize);
980 }
981 break;
982 }
983 case JSRegExp::EXPERIMENTAL: {
984 if (!ExperimentalRegExp::IsCompiled(regexp, isolate_) &&
985 !ExperimentalRegExp::Compile(isolate_, regexp)) {
986 DCHECK(isolate->has_pending_exception());
987 num_matches_ = -1; // Signal exception.
988 return;
989 }
990 registers_per_match_ =
991 JSRegExp::RegistersForCaptureCount(regexp->CaptureCount());
992 register_array_size_ =
993 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
994 break;
995 }
996 }
997
998 max_matches_ = register_array_size_ / registers_per_match_;
999
1000 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
1001 register_array_ = NewArray<int32_t>(register_array_size_);
1002 } else {
1003 register_array_ = isolate->jsregexp_static_offsets_vector();
1004 }
1005
1006 // Set state so that fetching the results the first time triggers a call
1007 // to the compiled regexp.
1008 current_match_index_ = max_matches_ - 1;
1009 num_matches_ = max_matches_;
1010 DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
1011 DCHECK_GE(register_array_size_, registers_per_match_);
1012 int32_t* last_match =
1013 ®ister_array_[current_match_index_ * registers_per_match_];
1014 last_match[0] = -1;
1015 last_match[1] = 0;
1016 }
1017
~RegExpGlobalCache()1018 RegExpGlobalCache::~RegExpGlobalCache() {
1019 // Deallocate the register array if we allocated it in the constructor
1020 // (as opposed to using the existing jsregexp_static_offsets_vector).
1021 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
1022 DeleteArray(register_array_);
1023 }
1024 }
1025
AdvanceZeroLength(int last_index)1026 int RegExpGlobalCache::AdvanceZeroLength(int last_index) {
1027 if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
1028 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
1029 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
1030 // Advance over the surrogate pair.
1031 return last_index + 2;
1032 }
1033 return last_index + 1;
1034 }
1035
FetchNext()1036 int32_t* RegExpGlobalCache::FetchNext() {
1037 current_match_index_++;
1038
1039 if (current_match_index_ >= num_matches_) {
1040 // Current batch of results exhausted.
1041 // Fail if last batch was not even fully filled.
1042 if (num_matches_ < max_matches_) {
1043 num_matches_ = 0; // Signal failed match.
1044 return nullptr;
1045 }
1046
1047 int32_t* last_match =
1048 ®ister_array_[(current_match_index_ - 1) * registers_per_match_];
1049 int last_end_index = last_match[1];
1050
1051 switch (regexp_->TypeTag()) {
1052 case JSRegExp::NOT_COMPILED:
1053 UNREACHABLE();
1054 case JSRegExp::ATOM:
1055 num_matches_ =
1056 RegExpImpl::AtomExecRaw(isolate_, regexp_, subject_, last_end_index,
1057 register_array_, register_array_size_);
1058 break;
1059 case JSRegExp::EXPERIMENTAL: {
1060 DCHECK(ExperimentalRegExp::IsCompiled(regexp_, isolate_));
1061 DisallowHeapAllocation no_gc;
1062 num_matches_ = ExperimentalRegExp::ExecRaw(
1063 isolate_, RegExp::kFromRuntime, *regexp_, *subject_,
1064 register_array_, register_array_size_, last_end_index);
1065 break;
1066 }
1067 case JSRegExp::IRREGEXP: {
1068 int last_start_index = last_match[0];
1069 if (last_start_index == last_end_index) {
1070 // Zero-length match. Advance by one code point.
1071 last_end_index = AdvanceZeroLength(last_end_index);
1072 }
1073 if (last_end_index > subject_->length()) {
1074 num_matches_ = 0; // Signal failed match.
1075 return nullptr;
1076 }
1077 num_matches_ = RegExpImpl::IrregexpExecRaw(
1078 isolate_, regexp_, subject_, last_end_index, register_array_,
1079 register_array_size_);
1080 break;
1081 }
1082 }
1083
1084 // Fall back to experimental engine if needed and possible.
1085 if (num_matches_ == RegExp::kInternalRegExpFallbackToExperimental) {
1086 num_matches_ = ExperimentalRegExp::OneshotExecRaw(
1087 isolate_, regexp_, subject_, register_array_, register_array_size_,
1088 last_end_index);
1089 }
1090
1091 if (num_matches_ <= 0) {
1092 return nullptr;
1093 }
1094 current_match_index_ = 0;
1095 return register_array_;
1096 } else {
1097 return ®ister_array_[current_match_index_ * registers_per_match_];
1098 }
1099 }
1100
LastSuccessfulMatch()1101 int32_t* RegExpGlobalCache::LastSuccessfulMatch() {
1102 int index = current_match_index_ * registers_per_match_;
1103 if (num_matches_ == 0) {
1104 // After a failed match we shift back by one result.
1105 index -= registers_per_match_;
1106 }
1107 return ®ister_array_[index];
1108 }
1109
Lookup(Heap * heap,String key_string,Object key_pattern,FixedArray * last_match_cache,ResultsCacheType type)1110 Object RegExpResultsCache::Lookup(Heap* heap, String key_string,
1111 Object key_pattern,
1112 FixedArray* last_match_cache,
1113 ResultsCacheType type) {
1114 FixedArray cache;
1115 if (!key_string.IsInternalizedString()) return Smi::zero();
1116 if (type == STRING_SPLIT_SUBSTRINGS) {
1117 DCHECK(key_pattern.IsString());
1118 if (!key_pattern.IsInternalizedString()) return Smi::zero();
1119 cache = heap->string_split_cache();
1120 } else {
1121 DCHECK(type == REGEXP_MULTIPLE_INDICES);
1122 DCHECK(key_pattern.IsFixedArray());
1123 cache = heap->regexp_multiple_cache();
1124 }
1125
1126 uint32_t hash = key_string.Hash();
1127 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
1128 ~(kArrayEntriesPerCacheEntry - 1));
1129 if (cache.get(index + kStringOffset) != key_string ||
1130 cache.get(index + kPatternOffset) != key_pattern) {
1131 index =
1132 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
1133 if (cache.get(index + kStringOffset) != key_string ||
1134 cache.get(index + kPatternOffset) != key_pattern) {
1135 return Smi::zero();
1136 }
1137 }
1138
1139 *last_match_cache = FixedArray::cast(cache.get(index + kLastMatchOffset));
1140 return cache.get(index + kArrayOffset);
1141 }
1142
Enter(Isolate * isolate,Handle<String> key_string,Handle<Object> key_pattern,Handle<FixedArray> value_array,Handle<FixedArray> last_match_cache,ResultsCacheType type)1143 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
1144 Handle<Object> key_pattern,
1145 Handle<FixedArray> value_array,
1146 Handle<FixedArray> last_match_cache,
1147 ResultsCacheType type) {
1148 Factory* factory = isolate->factory();
1149 Handle<FixedArray> cache;
1150 if (!key_string->IsInternalizedString()) return;
1151 if (type == STRING_SPLIT_SUBSTRINGS) {
1152 DCHECK(key_pattern->IsString());
1153 if (!key_pattern->IsInternalizedString()) return;
1154 cache = factory->string_split_cache();
1155 } else {
1156 DCHECK(type == REGEXP_MULTIPLE_INDICES);
1157 DCHECK(key_pattern->IsFixedArray());
1158 cache = factory->regexp_multiple_cache();
1159 }
1160
1161 uint32_t hash = key_string->Hash();
1162 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
1163 ~(kArrayEntriesPerCacheEntry - 1));
1164 if (cache->get(index + kStringOffset) == Smi::zero()) {
1165 cache->set(index + kStringOffset, *key_string);
1166 cache->set(index + kPatternOffset, *key_pattern);
1167 cache->set(index + kArrayOffset, *value_array);
1168 cache->set(index + kLastMatchOffset, *last_match_cache);
1169 } else {
1170 uint32_t index2 =
1171 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
1172 if (cache->get(index2 + kStringOffset) == Smi::zero()) {
1173 cache->set(index2 + kStringOffset, *key_string);
1174 cache->set(index2 + kPatternOffset, *key_pattern);
1175 cache->set(index2 + kArrayOffset, *value_array);
1176 cache->set(index2 + kLastMatchOffset, *last_match_cache);
1177 } else {
1178 cache->set(index2 + kStringOffset, Smi::zero());
1179 cache->set(index2 + kPatternOffset, Smi::zero());
1180 cache->set(index2 + kArrayOffset, Smi::zero());
1181 cache->set(index2 + kLastMatchOffset, Smi::zero());
1182 cache->set(index + kStringOffset, *key_string);
1183 cache->set(index + kPatternOffset, *key_pattern);
1184 cache->set(index + kArrayOffset, *value_array);
1185 cache->set(index + kLastMatchOffset, *last_match_cache);
1186 }
1187 }
1188 // If the array is a reasonably short list of substrings, convert it into a
1189 // list of internalized strings.
1190 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
1191 for (int i = 0; i < value_array->length(); i++) {
1192 Handle<String> str(String::cast(value_array->get(i)), isolate);
1193 Handle<String> internalized_str = factory->InternalizeString(str);
1194 value_array->set(i, *internalized_str);
1195 }
1196 }
1197 // Convert backing store to a copy-on-write array.
1198 value_array->set_map_no_write_barrier(
1199 ReadOnlyRoots(isolate).fixed_cow_array_map());
1200 }
1201
Clear(FixedArray cache)1202 void RegExpResultsCache::Clear(FixedArray cache) {
1203 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
1204 cache.set(i, Smi::zero());
1205 }
1206 }
1207
1208 } // namespace internal
1209 } // namespace v8
1210