1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/jsregexp.h"
6
7 #include <memory>
8 #include <vector>
9
10 #include "src/base/platform/platform.h"
11 #include "src/code-tracer.h"
12 #include "src/compilation-cache.h"
13 #include "src/elements.h"
14 #include "src/execution.h"
15 #include "src/heap/factory.h"
16 #include "src/isolate-inl.h"
17 #include "src/messages.h"
18 #include "src/ostreams.h"
19 #include "src/regexp/interpreter-irregexp.h"
20 #include "src/regexp/jsregexp-inl.h"
21 #include "src/regexp/regexp-macro-assembler-irregexp.h"
22 #include "src/regexp/regexp-macro-assembler-tracer.h"
23 #include "src/regexp/regexp-macro-assembler.h"
24 #include "src/regexp/regexp-parser.h"
25 #include "src/regexp/regexp-stack.h"
26 #include "src/runtime/runtime.h"
27 #include "src/splay-tree-inl.h"
28 #include "src/string-search.h"
29 #include "src/unicode-decoder.h"
30 #include "src/unicode-inl.h"
31
32 #ifdef V8_INTL_SUPPORT
33 #include "unicode/uniset.h"
34 #include "unicode/utypes.h"
35 #endif // V8_INTL_SUPPORT
36
37 #ifndef V8_INTERPRETED_REGEXP
38 #if V8_TARGET_ARCH_IA32
39 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
40 #elif V8_TARGET_ARCH_X64
41 #include "src/regexp/x64/regexp-macro-assembler-x64.h"
42 #elif V8_TARGET_ARCH_ARM64
43 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
44 #elif V8_TARGET_ARCH_ARM
45 #include "src/regexp/arm/regexp-macro-assembler-arm.h"
46 #elif V8_TARGET_ARCH_PPC
47 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
48 #elif V8_TARGET_ARCH_S390
49 #include "src/regexp/s390/regexp-macro-assembler-s390.h"
50 #elif V8_TARGET_ARCH_MIPS
51 #include "src/regexp/mips/regexp-macro-assembler-mips.h"
52 #elif V8_TARGET_ARCH_MIPS64
53 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
54 #else
55 #error Unsupported target architecture.
56 #endif
57 #endif
58
59
60 namespace v8 {
61 namespace internal {
62
63 V8_WARN_UNUSED_RESULT
ThrowRegExpException(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,Handle<String> error_text)64 static inline MaybeHandle<Object> ThrowRegExpException(
65 Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
66 Handle<String> error_text) {
67 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
68 pattern, error_text),
69 Object);
70 }
71
ThrowRegExpException(Isolate * isolate,Handle<JSRegExp> re,Handle<String> error_text)72 inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
73 Handle<String> error_text) {
74 USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
75 error_text));
76 }
77
78
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)79 ContainedInLattice AddRange(ContainedInLattice containment,
80 const int* ranges,
81 int ranges_length,
82 Interval new_range) {
83 DCHECK_EQ(1, ranges_length & 1);
84 DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
85 if (containment == kLatticeUnknown) return containment;
86 bool inside = false;
87 int last = 0;
88 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
89 // Consider the range from last to ranges[i].
90 // We haven't got to the new range yet.
91 if (ranges[i] <= new_range.from()) continue;
92 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
93 // inclusive, but the values in ranges are not.
94 if (last <= new_range.from() && new_range.to() < ranges[i]) {
95 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
96 }
97 return kLatticeUnknown;
98 }
99 return containment;
100 }
101
102 // More makes code generation slower, less makes V8 benchmark score lower.
103 const int kMaxLookaheadForBoyerMoore = 8;
104 // In a 3-character pattern you can maximally step forwards 3 characters
105 // at a time, which is not always enough to pay for the extra logic.
106 const int kPatternTooShortForBoyerMoore = 2;
107
108 // Identifies the sort of regexps where the regexp engine is faster
109 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)110 static bool HasFewDifferentCharacters(Handle<String> pattern) {
111 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
112 if (length <= kPatternTooShortForBoyerMoore) return false;
113 const int kMod = 128;
114 bool character_found[kMod];
115 int different = 0;
116 memset(&character_found[0], 0, sizeof(character_found));
117 for (int i = 0; i < length; i++) {
118 int ch = (pattern->Get(i) & (kMod - 1));
119 if (!character_found[ch]) {
120 character_found[ch] = true;
121 different++;
122 // We declare a regexp low-alphabet if it has at least 3 times as many
123 // characters as it has different characters.
124 if (different * 3 > length) return false;
125 }
126 }
127 return true;
128 }
129
130 // Generic RegExp methods. Dispatches to implementation specific methods.
131
Compile(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags)132 MaybeHandle<Object> RegExpImpl::Compile(Isolate* isolate, Handle<JSRegExp> re,
133 Handle<String> pattern,
134 JSRegExp::Flags flags) {
135 DCHECK(pattern->IsFlat());
136
137 Zone zone(isolate->allocator(), ZONE_NAME);
138 CompilationCache* compilation_cache = isolate->compilation_cache();
139 MaybeHandle<FixedArray> maybe_cached =
140 compilation_cache->LookupRegExp(pattern, flags);
141 Handle<FixedArray> cached;
142 if (maybe_cached.ToHandle(&cached)) {
143 re->set_data(*cached);
144 return re;
145 }
146
147 PostponeInterruptsScope postpone(isolate);
148 RegExpCompileData parse_result;
149 FlatStringReader reader(isolate, pattern);
150 DCHECK(!isolate->has_pending_exception());
151 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
152 &parse_result)) {
153 // Throw an exception if we fail to parse the pattern.
154 return ThrowRegExpException(isolate, re, pattern, parse_result.error);
155 }
156
157 bool has_been_compiled = false;
158
159 if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
160 !HasFewDifferentCharacters(pattern)) {
161 // Parse-tree is a single atom that is equal to the pattern.
162 AtomCompile(isolate, re, pattern, flags, pattern);
163 has_been_compiled = true;
164 } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
165 parse_result.capture_count == 0) {
166 RegExpAtom* atom = parse_result.tree->AsAtom();
167 Vector<const uc16> atom_pattern = atom->data();
168 Handle<String> atom_string;
169 ASSIGN_RETURN_ON_EXCEPTION(
170 isolate, atom_string,
171 isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
172 if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) {
173 AtomCompile(isolate, re, pattern, flags, atom_string);
174 has_been_compiled = true;
175 }
176 }
177 if (!has_been_compiled) {
178 IrregexpInitialize(isolate, re, pattern, flags, parse_result.capture_count);
179 }
180 DCHECK(re->data()->IsFixedArray());
181 // Compilation succeeded so the data is set on the regexp
182 // and we can store it in the cache.
183 Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
184 compilation_cache->PutRegExp(pattern, flags, data);
185
186 return re;
187 }
188
Exec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)189 MaybeHandle<Object> RegExpImpl::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
190 Handle<String> subject, int index,
191 Handle<RegExpMatchInfo> last_match_info) {
192 switch (regexp->TypeTag()) {
193 case JSRegExp::ATOM:
194 return AtomExec(isolate, regexp, subject, index, last_match_info);
195 case JSRegExp::IRREGEXP: {
196 return IrregexpExec(isolate, regexp, subject, index, last_match_info);
197 }
198 default:
199 UNREACHABLE();
200 }
201 }
202
203
204 // RegExp Atom implementation: Simple string search using indexOf.
205
AtomCompile(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)206 void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
207 Handle<String> pattern, JSRegExp::Flags flags,
208 Handle<String> match_pattern) {
209 isolate->factory()->SetRegExpAtomData(re, JSRegExp::ATOM, pattern, flags,
210 match_pattern);
211 }
212
SetAtomLastCapture(Isolate * isolate,Handle<RegExpMatchInfo> last_match_info,String * subject,int from,int to)213 static void SetAtomLastCapture(Isolate* isolate,
214 Handle<RegExpMatchInfo> last_match_info,
215 String* subject, int from, int to) {
216 SealHandleScope shs(isolate);
217 last_match_info->SetNumberOfCaptureRegisters(2);
218 last_match_info->SetLastSubject(subject);
219 last_match_info->SetLastInput(subject);
220 last_match_info->SetCapture(0, from);
221 last_match_info->SetCapture(1, to);
222 }
223
AtomExecRaw(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)224 int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
225 Handle<String> subject, int index, int32_t* output,
226 int output_size) {
227 DCHECK_LE(0, index);
228 DCHECK_LE(index, subject->length());
229
230 subject = String::Flatten(isolate, subject);
231 DisallowHeapAllocation no_gc; // ensure vectors stay valid
232
233 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
234 int needle_len = needle->length();
235 DCHECK(needle->IsFlat());
236 DCHECK_LT(0, needle_len);
237
238 if (index + needle_len > subject->length()) {
239 return RegExpImpl::RE_FAILURE;
240 }
241
242 for (int i = 0; i < output_size; i += 2) {
243 String::FlatContent needle_content = needle->GetFlatContent();
244 String::FlatContent subject_content = subject->GetFlatContent();
245 DCHECK(needle_content.IsFlat());
246 DCHECK(subject_content.IsFlat());
247 // dispatch on type of strings
248 index =
249 (needle_content.IsOneByte()
250 ? (subject_content.IsOneByte()
251 ? SearchString(isolate, subject_content.ToOneByteVector(),
252 needle_content.ToOneByteVector(), index)
253 : SearchString(isolate, subject_content.ToUC16Vector(),
254 needle_content.ToOneByteVector(), index))
255 : (subject_content.IsOneByte()
256 ? SearchString(isolate, subject_content.ToOneByteVector(),
257 needle_content.ToUC16Vector(), index)
258 : SearchString(isolate, subject_content.ToUC16Vector(),
259 needle_content.ToUC16Vector(), index)));
260 if (index == -1) {
261 return i / 2; // Return number of matches.
262 } else {
263 output[i] = index;
264 output[i+1] = index + needle_len;
265 index += needle_len;
266 }
267 }
268 return output_size / 2;
269 }
270
AtomExec(Isolate * isolate,Handle<JSRegExp> re,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)271 Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
272 Handle<String> subject, int index,
273 Handle<RegExpMatchInfo> last_match_info) {
274 static const int kNumRegisters = 2;
275 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
276 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
277
278 int res =
279 AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
280
281 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
282
283 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
284 SealHandleScope shs(isolate);
285 SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
286 output_registers[1]);
287 return last_match_info;
288 }
289
290
291 // Irregexp implementation.
292
293 // Ensures that the regexp object contains a compiled version of the
294 // source for either one-byte or two-byte subject strings.
295 // If the compiled version doesn't already exist, it is compiled
296 // from the source pattern.
297 // If compilation fails, an exception is thrown and this function
298 // returns false.
EnsureCompiledIrregexp(Isolate * isolate,Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)299 bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
300 Handle<String> sample_subject,
301 bool is_one_byte) {
302 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
303 #ifdef V8_INTERPRETED_REGEXP
304 if (compiled_code->IsByteArray()) return true;
305 #else // V8_INTERPRETED_REGEXP (RegExp native code)
306 if (compiled_code->IsCode()) return true;
307 #endif
308 return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
309 }
310
CompileIrregexp(Isolate * isolate,Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)311 bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
312 Handle<String> sample_subject,
313 bool is_one_byte) {
314 // Compile the RegExp.
315 Zone zone(isolate->allocator(), ZONE_NAME);
316 PostponeInterruptsScope postpone(isolate);
317 #ifdef DEBUG
318 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
319 // When arriving here entry can only be a smi representing an uncompiled
320 // regexp.
321 DCHECK(entry->IsSmi());
322 int entry_value = Smi::ToInt(entry);
323 DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
324 #endif
325
326 JSRegExp::Flags flags = re->GetFlags();
327
328 Handle<String> pattern(re->Pattern(), isolate);
329 pattern = String::Flatten(isolate, pattern);
330 RegExpCompileData compile_data;
331 FlatStringReader reader(isolate, pattern);
332 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
333 &compile_data)) {
334 // Throw an exception if we fail to parse the pattern.
335 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
336 USE(ThrowRegExpException(isolate, re, pattern, compile_data.error));
337 return false;
338 }
339 RegExpEngine::CompilationResult result =
340 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
341 sample_subject, is_one_byte);
342 if (result.error_message != nullptr) {
343 // Unable to compile regexp.
344 if (FLAG_abort_on_stack_or_string_length_overflow &&
345 strncmp(result.error_message, "Stack overflow", 15) == 0) {
346 FATAL("Aborting on stack overflow");
347 }
348 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
349 CStrVector(result.error_message)).ToHandleChecked();
350 ThrowRegExpException(isolate, re, error_message);
351 return false;
352 }
353
354 Handle<FixedArray> data =
355 Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
356 data->set(JSRegExp::code_index(is_one_byte), result.code);
357 SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
358 int register_max = IrregexpMaxRegisterCount(*data);
359 if (result.num_registers > register_max) {
360 SetIrregexpMaxRegisterCount(*data, result.num_registers);
361 }
362
363 return true;
364 }
365
366
IrregexpMaxRegisterCount(FixedArray * re)367 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
368 return Smi::cast(
369 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
370 }
371
372
SetIrregexpMaxRegisterCount(FixedArray * re,int value)373 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
374 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
375 }
376
SetIrregexpCaptureNameMap(FixedArray * re,Handle<FixedArray> value)377 void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
378 Handle<FixedArray> value) {
379 if (value.is_null()) {
380 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
381 } else {
382 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
383 }
384 }
385
IrregexpNumberOfCaptures(FixedArray * re)386 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
387 return Smi::ToInt(re->get(JSRegExp::kIrregexpCaptureCountIndex));
388 }
389
390
IrregexpNumberOfRegisters(FixedArray * re)391 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
392 return Smi::ToInt(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex));
393 }
394
395
IrregexpByteCode(FixedArray * re,bool is_one_byte)396 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
397 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
398 }
399
400
IrregexpNativeCode(FixedArray * re,bool is_one_byte)401 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
402 return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
403 }
404
IrregexpInitialize(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count)405 void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
406 Handle<String> pattern,
407 JSRegExp::Flags flags, int capture_count) {
408 // Initialize compiled code entries to null.
409 isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern,
410 flags, capture_count);
411 }
412
IrregexpPrepare(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject)413 int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
414 Handle<String> subject) {
415 DCHECK(subject->IsFlat());
416
417 // Check representation of the underlying storage.
418 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
419 if (!EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte)) return -1;
420
421 #ifdef V8_INTERPRETED_REGEXP
422 // Byte-code regexp needs space allocated for all its registers.
423 // The result captures are copied to the start of the registers array
424 // if the match succeeds. This way those registers are not clobbered
425 // when we set the last match info from last successful match.
426 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
427 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
428 #else // V8_INTERPRETED_REGEXP
429 // Native regexp only needs room to output captures. Registers are handled
430 // internally.
431 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
432 #endif // V8_INTERPRETED_REGEXP
433 }
434
IrregexpExecRaw(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)435 int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
436 Handle<String> subject, int index,
437 int32_t* output, int output_size) {
438 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
439
440 DCHECK_LE(0, index);
441 DCHECK_LE(index, subject->length());
442 DCHECK(subject->IsFlat());
443
444 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
445
446 #ifndef V8_INTERPRETED_REGEXP
447 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
448 do {
449 EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
450 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
451 // The stack is used to allocate registers for the compiled regexp code.
452 // This means that in case of failure, the output registers array is left
453 // untouched and contains the capture results from the previous successful
454 // match. We can use that to set the last match info lazily.
455 NativeRegExpMacroAssembler::Result res =
456 NativeRegExpMacroAssembler::Match(code,
457 subject,
458 output,
459 output_size,
460 index,
461 isolate);
462 if (res != NativeRegExpMacroAssembler::RETRY) {
463 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
464 isolate->has_pending_exception());
465 STATIC_ASSERT(
466 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
467 STATIC_ASSERT(
468 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
469 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
470 == RE_EXCEPTION);
471 return static_cast<IrregexpResult>(res);
472 }
473 // If result is RETRY, the string has changed representation, and we
474 // must restart from scratch.
475 // In this case, it means we must make sure we are prepared to handle
476 // the, potentially, different subject (the string can switch between
477 // being internal and external, and even between being Latin1 and UC16,
478 // but the characters are always the same).
479 IrregexpPrepare(isolate, regexp, subject);
480 is_one_byte = subject->IsOneByteRepresentationUnderneath();
481 } while (true);
482 UNREACHABLE();
483 #else // V8_INTERPRETED_REGEXP
484
485 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
486 // We must have done EnsureCompiledIrregexp, so we can get the number of
487 // registers.
488 int number_of_capture_registers =
489 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
490 int32_t* raw_output = &output[number_of_capture_registers];
491 // We do not touch the actual capture result registers until we know there
492 // has been a match so that we can use those capture results to set the
493 // last match info.
494 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
495 raw_output[i] = -1;
496 }
497 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
498 isolate);
499
500 IrregexpResult result = IrregexpInterpreter::Match(isolate,
501 byte_codes,
502 subject,
503 raw_output,
504 index);
505 if (result == RE_SUCCESS) {
506 // Copy capture results to the start of the registers array.
507 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
508 }
509 if (result == RE_EXCEPTION) {
510 DCHECK(!isolate->has_pending_exception());
511 isolate->StackOverflow();
512 }
513 return result;
514 #endif // V8_INTERPRETED_REGEXP
515 }
516
IrregexpExec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<RegExpMatchInfo> last_match_info)517 MaybeHandle<Object> RegExpImpl::IrregexpExec(
518 Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
519 int previous_index, Handle<RegExpMatchInfo> last_match_info) {
520 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
521
522 subject = String::Flatten(isolate, subject);
523
524 // Prepare space for the return values.
525 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
526 if (FLAG_trace_regexp_bytecodes) {
527 String* pattern = regexp->Pattern();
528 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
529 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
530 }
531 #endif
532 int required_registers =
533 RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
534 if (required_registers < 0) {
535 // Compiling failed with an exception.
536 DCHECK(isolate->has_pending_exception());
537 return MaybeHandle<Object>();
538 }
539
540 int32_t* output_registers = nullptr;
541 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
542 output_registers = NewArray<int32_t>(required_registers);
543 }
544 std::unique_ptr<int32_t[]> auto_release(output_registers);
545 if (output_registers == nullptr) {
546 output_registers = isolate->jsregexp_static_offsets_vector();
547 }
548
549 int res =
550 RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
551 output_registers, required_registers);
552 if (res == RE_SUCCESS) {
553 int capture_count =
554 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
555 return SetLastMatchInfo(isolate, last_match_info, subject, capture_count,
556 output_registers);
557 }
558 if (res == RE_EXCEPTION) {
559 DCHECK(isolate->has_pending_exception());
560 return MaybeHandle<Object>();
561 }
562 DCHECK(res == RE_FAILURE);
563 return isolate->factory()->null_value();
564 }
565
SetLastMatchInfo(Isolate * isolate,Handle<RegExpMatchInfo> last_match_info,Handle<String> subject,int capture_count,int32_t * match)566 Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
567 Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
568 Handle<String> subject, int capture_count, int32_t* match) {
569 // This is the only place where match infos can grow. If, after executing the
570 // regexp, RegExpExecStub finds that the match info is too small, it restarts
571 // execution in RegExpImpl::Exec, which finally grows the match info right
572 // here.
573
574 int capture_register_count = (capture_count + 1) * 2;
575 Handle<RegExpMatchInfo> result = RegExpMatchInfo::ReserveCaptures(
576 isolate, last_match_info, capture_register_count);
577 result->SetNumberOfCaptureRegisters(capture_register_count);
578
579 if (*result != *last_match_info) {
580 // The match info has been reallocated, update the corresponding reference
581 // on the native context.
582 if (*last_match_info == *isolate->regexp_last_match_info()) {
583 isolate->native_context()->set_regexp_last_match_info(*result);
584 } else if (*last_match_info == *isolate->regexp_internal_match_info()) {
585 isolate->native_context()->set_regexp_internal_match_info(*result);
586 }
587 }
588
589 DisallowHeapAllocation no_allocation;
590 if (match != nullptr) {
591 for (int i = 0; i < capture_register_count; i += 2) {
592 result->SetCapture(i, match[i]);
593 result->SetCapture(i + 1, match[i + 1]);
594 }
595 }
596 result->SetLastSubject(*subject);
597 result->SetLastInput(*subject);
598 return result;
599 }
600
GlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,Isolate * isolate)601 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
602 Handle<String> subject, Isolate* isolate)
603 : register_array_(nullptr),
604 register_array_size_(0),
605 regexp_(regexp),
606 subject_(subject),
607 isolate_(isolate) {
608 #ifdef V8_INTERPRETED_REGEXP
609 bool interpreted = true;
610 #else
611 bool interpreted = false;
612 #endif // V8_INTERPRETED_REGEXP
613
614 if (regexp_->TypeTag() == JSRegExp::ATOM) {
615 static const int kAtomRegistersPerMatch = 2;
616 registers_per_match_ = kAtomRegistersPerMatch;
617 // There is no distinction between interpreted and native for atom regexps.
618 interpreted = false;
619 } else {
620 registers_per_match_ =
621 RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
622 if (registers_per_match_ < 0) {
623 num_matches_ = -1; // Signal exception.
624 return;
625 }
626 }
627
628 DCHECK(IsGlobal(regexp->GetFlags()));
629 if (!interpreted) {
630 register_array_size_ =
631 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
632 max_matches_ = register_array_size_ / registers_per_match_;
633 } else {
634 // Global loop in interpreted regexp is not implemented. We choose
635 // the size of the offsets vector so that it can only store one match.
636 register_array_size_ = registers_per_match_;
637 max_matches_ = 1;
638 }
639
640 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
641 register_array_ = NewArray<int32_t>(register_array_size_);
642 } else {
643 register_array_ = isolate->jsregexp_static_offsets_vector();
644 }
645
646 // Set state so that fetching the results the first time triggers a call
647 // to the compiled regexp.
648 current_match_index_ = max_matches_ - 1;
649 num_matches_ = max_matches_;
650 DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
651 DCHECK_GE(register_array_size_, registers_per_match_);
652 int32_t* last_match =
653 ®ister_array_[current_match_index_ * registers_per_match_];
654 last_match[0] = -1;
655 last_match[1] = 0;
656 }
657
AdvanceZeroLength(int last_index)658 int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
659 if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
660 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
661 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
662 // Advance over the surrogate pair.
663 return last_index + 2;
664 }
665 return last_index + 1;
666 }
667
668 // -------------------------------------------------------------------
669 // Implementation of the Irregexp regular expression engine.
670 //
671 // The Irregexp regular expression engine is intended to be a complete
672 // implementation of ECMAScript regular expressions. It generates either
673 // bytecodes or native code.
674
675 // The Irregexp regexp engine is structured in three steps.
676 // 1) The parser generates an abstract syntax tree. See ast.cc.
677 // 2) From the AST a node network is created. The nodes are all
678 // subclasses of RegExpNode. The nodes represent states when
679 // executing a regular expression. Several optimizations are
680 // performed on the node network.
681 // 3) From the nodes we generate either byte codes or native code
682 // that can actually execute the regular expression (perform
683 // the search). The code generation step is described in more
684 // detail below.
685
686 // Code generation.
687 //
688 // The nodes are divided into four main categories.
689 // * Choice nodes
690 // These represent places where the regular expression can
691 // match in more than one way. For example on entry to an
692 // alternation (foo|bar) or a repetition (*, +, ? or {}).
693 // * Action nodes
694 // These represent places where some action should be
695 // performed. Examples include recording the current position
696 // in the input string to a register (in order to implement
697 // captures) or other actions on register for example in order
698 // to implement the counters needed for {} repetitions.
699 // * Matching nodes
700 // These attempt to match some element part of the input string.
701 // Examples of elements include character classes, plain strings
702 // or back references.
703 // * End nodes
704 // These are used to implement the actions required on finding
705 // a successful match or failing to find a match.
706 //
707 // The code generated (whether as byte codes or native code) maintains
708 // some state as it runs. This consists of the following elements:
709 //
710 // * The capture registers. Used for string captures.
711 // * Other registers. Used for counters etc.
712 // * The current position.
713 // * The stack of backtracking information. Used when a matching node
714 // fails to find a match and needs to try an alternative.
715 //
716 // Conceptual regular expression execution model:
717 //
718 // There is a simple conceptual model of regular expression execution
719 // which will be presented first. The actual code generated is a more
720 // efficient simulation of the simple conceptual model:
721 //
722 // * Choice nodes are implemented as follows:
723 // For each choice except the last {
724 // push current position
725 // push backtrack code location
726 // <generate code to test for choice>
727 // backtrack code location:
728 // pop current position
729 // }
730 // <generate code to test for last choice>
731 //
732 // * Actions nodes are generated as follows
733 // <push affected registers on backtrack stack>
734 // <generate code to perform action>
735 // push backtrack code location
736 // <generate code to test for following nodes>
737 // backtrack code location:
738 // <pop affected registers to restore their state>
739 // <pop backtrack location from stack and go to it>
740 //
741 // * Matching nodes are generated as follows:
742 // if input string matches at current position
743 // update current position
744 // <generate code to test for following nodes>
745 // else
746 // <pop backtrack location from stack and go to it>
747 //
748 // Thus it can be seen that the current position is saved and restored
749 // by the choice nodes, whereas the registers are saved and restored by
750 // by the action nodes that manipulate them.
751 //
752 // The other interesting aspect of this model is that nodes are generated
753 // at the point where they are needed by a recursive call to Emit(). If
754 // the node has already been code generated then the Emit() call will
755 // generate a jump to the previously generated code instead. In order to
756 // limit recursion it is possible for the Emit() function to put the node
757 // on a work list for later generation and instead generate a jump. The
758 // destination of the jump is resolved later when the code is generated.
759 //
760 // Actual regular expression code generation.
761 //
762 // Code generation is actually more complicated than the above. In order
763 // to improve the efficiency of the generated code some optimizations are
764 // performed
765 //
766 // * Choice nodes have 1-character lookahead.
767 // A choice node looks at the following character and eliminates some of
768 // the choices immediately based on that character. This is not yet
769 // implemented.
770 // * Simple greedy loops store reduced backtracking information.
771 // A quantifier like /.*foo/m will greedily match the whole input. It will
772 // then need to backtrack to a point where it can match "foo". The naive
773 // implementation of this would push each character position onto the
774 // backtracking stack, then pop them off one by one. This would use space
775 // proportional to the length of the input string. However since the "."
776 // can only match in one way and always has a constant length (in this case
777 // of 1) it suffices to store the current position on the top of the stack
778 // once. Matching now becomes merely incrementing the current position and
779 // backtracking becomes decrementing the current position and checking the
780 // result against the stored current position. This is faster and saves
781 // space.
782 // * The current state is virtualized.
783 // This is used to defer expensive operations until it is clear that they
784 // are needed and to generate code for a node more than once, allowing
785 // specialized an efficient versions of the code to be created. This is
786 // explained in the section below.
787 //
788 // Execution state virtualization.
789 //
790 // Instead of emitting code, nodes that manipulate the state can record their
791 // manipulation in an object called the Trace. The Trace object can record a
792 // current position offset, an optional backtrack code location on the top of
793 // the virtualized backtrack stack and some register changes. When a node is
794 // to be emitted it can flush the Trace or update it. Flushing the Trace
795 // will emit code to bring the actual state into line with the virtual state.
796 // Avoiding flushing the state can postpone some work (e.g. updates of capture
797 // registers). Postponing work can save time when executing the regular
798 // expression since it may be found that the work never has to be done as a
799 // failure to match can occur. In addition it is much faster to jump to a
800 // known backtrack code location than it is to pop an unknown backtrack
801 // location from the stack and jump there.
802 //
803 // The virtual state found in the Trace affects code generation. For example
804 // the virtual state contains the difference between the actual current
805 // position and the virtual current position, and matching code needs to use
806 // this offset to attempt a match in the correct location of the input
807 // string. Therefore code generated for a non-trivial trace is specialized
808 // to that trace. The code generator therefore has the ability to generate
809 // code for each node several times. In order to limit the size of the
810 // generated code there is an arbitrary limit on how many specialized sets of
811 // code may be generated for a given node. If the limit is reached, the
812 // trace is flushed and a generic version of the code for a node is emitted.
813 // This is subsequently used for that node. The code emitted for non-generic
814 // trace is not recorded in the node and so it cannot currently be reused in
815 // the event that code generation is requested for an identical trace.
816
817
AppendToText(RegExpText * text,Zone * zone)818 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
819 UNREACHABLE();
820 }
821
822
AppendToText(RegExpText * text,Zone * zone)823 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
824 text->AddElement(TextElement::Atom(this), zone);
825 }
826
827
AppendToText(RegExpText * text,Zone * zone)828 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
829 text->AddElement(TextElement::CharClass(this), zone);
830 }
831
832
AppendToText(RegExpText * text,Zone * zone)833 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
834 for (int i = 0; i < elements()->length(); i++)
835 text->AddElement(elements()->at(i), zone);
836 }
837
838
Atom(RegExpAtom * atom)839 TextElement TextElement::Atom(RegExpAtom* atom) {
840 return TextElement(ATOM, atom);
841 }
842
843
CharClass(RegExpCharacterClass * char_class)844 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
845 return TextElement(CHAR_CLASS, char_class);
846 }
847
848
length() const849 int TextElement::length() const {
850 switch (text_type()) {
851 case ATOM:
852 return atom()->length();
853
854 case CHAR_CLASS:
855 return 1;
856 }
857 UNREACHABLE();
858 }
859
860
GetTable(bool ignore_case)861 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
862 if (table_ == nullptr) {
863 table_ = new(zone()) DispatchTable(zone());
864 DispatchTableConstructor cons(table_, ignore_case, zone());
865 cons.BuildTable(this);
866 }
867 return table_;
868 }
869
870
871 class FrequencyCollator {
872 public:
FrequencyCollator()873 FrequencyCollator() : total_samples_(0) {
874 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
875 frequencies_[i] = CharacterFrequency(i);
876 }
877 }
878
CountCharacter(int character)879 void CountCharacter(int character) {
880 int index = (character & RegExpMacroAssembler::kTableMask);
881 frequencies_[index].Increment();
882 total_samples_++;
883 }
884
885 // Does not measure in percent, but rather per-128 (the table size from the
886 // regexp macro assembler).
Frequency(int in_character)887 int Frequency(int in_character) {
888 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
889 if (total_samples_ < 1) return 1; // Division by zero.
890 int freq_in_per128 =
891 (frequencies_[in_character].counter() * 128) / total_samples_;
892 return freq_in_per128;
893 }
894
895 private:
896 class CharacterFrequency {
897 public:
CharacterFrequency()898 CharacterFrequency() : counter_(0), character_(-1) { }
CharacterFrequency(int character)899 explicit CharacterFrequency(int character)
900 : counter_(0), character_(character) { }
901
Increment()902 void Increment() { counter_++; }
counter()903 int counter() { return counter_; }
character()904 int character() { return character_; }
905
906 private:
907 int counter_;
908 int character_;
909 };
910
911
912 private:
913 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
914 int total_samples_;
915 };
916
917
918 class RegExpCompiler {
919 public:
920 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
921 bool is_one_byte);
922
AllocateRegister()923 int AllocateRegister() {
924 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
925 reg_exp_too_big_ = true;
926 return next_register_;
927 }
928 return next_register_++;
929 }
930
931 // Lookarounds to match lone surrogates for unicode character class matches
932 // are never nested. We can therefore reuse registers.
UnicodeLookaroundStackRegister()933 int UnicodeLookaroundStackRegister() {
934 if (unicode_lookaround_stack_register_ == kNoRegister) {
935 unicode_lookaround_stack_register_ = AllocateRegister();
936 }
937 return unicode_lookaround_stack_register_;
938 }
939
UnicodeLookaroundPositionRegister()940 int UnicodeLookaroundPositionRegister() {
941 if (unicode_lookaround_position_register_ == kNoRegister) {
942 unicode_lookaround_position_register_ = AllocateRegister();
943 }
944 return unicode_lookaround_position_register_;
945 }
946
947 RegExpEngine::CompilationResult Assemble(Isolate* isolate,
948 RegExpMacroAssembler* assembler,
949 RegExpNode* start, int capture_count,
950 Handle<String> pattern);
951
AddWork(RegExpNode * node)952 inline void AddWork(RegExpNode* node) {
953 if (!node->on_work_list() && !node->label()->is_bound()) {
954 node->set_on_work_list(true);
955 work_list_->push_back(node);
956 }
957 }
958
959 static const int kImplementationOffset = 0;
960 static const int kNumberOfRegistersOffset = 0;
961 static const int kCodeOffset = 1;
962
macro_assembler()963 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()964 EndNode* accept() { return accept_; }
965
966 static const int kMaxRecursion = 100;
recursion_depth()967 inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()968 inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()969 inline void DecrementRecursionDepth() { recursion_depth_--; }
970
SetRegExpTooBig()971 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
972
one_byte()973 inline bool one_byte() { return one_byte_; }
optimize()974 inline bool optimize() { return optimize_; }
set_optimize(bool value)975 inline void set_optimize(bool value) { optimize_ = value; }
limiting_recursion()976 inline bool limiting_recursion() { return limiting_recursion_; }
set_limiting_recursion(bool value)977 inline void set_limiting_recursion(bool value) {
978 limiting_recursion_ = value;
979 }
read_backward()980 bool read_backward() { return read_backward_; }
set_read_backward(bool value)981 void set_read_backward(bool value) { read_backward_ = value; }
frequency_collator()982 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
983
current_expansion_factor()984 int current_expansion_factor() { return current_expansion_factor_; }
set_current_expansion_factor(int value)985 void set_current_expansion_factor(int value) {
986 current_expansion_factor_ = value;
987 }
988
isolate() const989 Isolate* isolate() const { return isolate_; }
zone() const990 Zone* zone() const { return zone_; }
991
992 static const int kNoRegister = -1;
993
994 private:
995 EndNode* accept_;
996 int next_register_;
997 int unicode_lookaround_stack_register_;
998 int unicode_lookaround_position_register_;
999 std::vector<RegExpNode*>* work_list_;
1000 int recursion_depth_;
1001 RegExpMacroAssembler* macro_assembler_;
1002 bool one_byte_;
1003 bool reg_exp_too_big_;
1004 bool limiting_recursion_;
1005 bool optimize_;
1006 bool read_backward_;
1007 int current_expansion_factor_;
1008 FrequencyCollator frequency_collator_;
1009 Isolate* isolate_;
1010 Zone* zone_;
1011 };
1012
1013
1014 class RecursionCheck {
1015 public:
RecursionCheck(RegExpCompiler * compiler)1016 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1017 compiler->IncrementRecursionDepth();
1018 }
~RecursionCheck()1019 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1020 private:
1021 RegExpCompiler* compiler_;
1022 };
1023
1024
IrregexpRegExpTooBig(Isolate * isolate)1025 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1026 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1027 }
1028
1029
1030 // Attempts to compile the regexp using an Irregexp code generator. Returns
1031 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(Isolate * isolate,Zone * zone,int capture_count,bool one_byte)1032 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1033 bool one_byte)
1034 : next_register_(2 * (capture_count + 1)),
1035 unicode_lookaround_stack_register_(kNoRegister),
1036 unicode_lookaround_position_register_(kNoRegister),
1037 work_list_(nullptr),
1038 recursion_depth_(0),
1039 one_byte_(one_byte),
1040 reg_exp_too_big_(false),
1041 limiting_recursion_(false),
1042 optimize_(FLAG_regexp_optimization),
1043 read_backward_(false),
1044 current_expansion_factor_(1),
1045 frequency_collator_(),
1046 isolate_(isolate),
1047 zone_(zone) {
1048 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1049 DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
1050 }
1051
Assemble(Isolate * isolate,RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)1052 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1053 Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
1054 int capture_count, Handle<String> pattern) {
1055 #ifdef DEBUG
1056 if (FLAG_trace_regexp_assembler)
1057 macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
1058 else
1059 #endif
1060 macro_assembler_ = macro_assembler;
1061
1062 std::vector<RegExpNode*> work_list;
1063 work_list_ = &work_list;
1064 Label fail;
1065 macro_assembler_->PushBacktrack(&fail);
1066 Trace new_trace;
1067 start->Emit(this, &new_trace);
1068 macro_assembler_->Bind(&fail);
1069 macro_assembler_->Fail();
1070 while (!work_list.empty()) {
1071 RegExpNode* node = work_list.back();
1072 work_list.pop_back();
1073 node->set_on_work_list(false);
1074 if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1075 }
1076 if (reg_exp_too_big_) {
1077 macro_assembler_->AbortedCodeGeneration();
1078 return IrregexpRegExpTooBig(isolate_);
1079 }
1080
1081 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1082 isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
1083 work_list_ = nullptr;
1084 #if defined(ENABLE_DISASSEMBLER) && !defined(V8_INTERPRETED_REGEXP)
1085 if (FLAG_print_code) {
1086 CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1087 OFStream os(trace_scope.file());
1088 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1089 }
1090 #endif
1091 #ifdef DEBUG
1092 if (FLAG_trace_regexp_assembler) {
1093 delete macro_assembler_;
1094 }
1095 #endif
1096 return RegExpEngine::CompilationResult(*code, next_register_);
1097 }
1098
1099
Mentions(int that)1100 bool Trace::DeferredAction::Mentions(int that) {
1101 if (action_type() == ActionNode::CLEAR_CAPTURES) {
1102 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1103 return range.Contains(that);
1104 } else {
1105 return reg() == that;
1106 }
1107 }
1108
1109
mentions_reg(int reg)1110 bool Trace::mentions_reg(int reg) {
1111 for (DeferredAction* action = actions_; action != nullptr;
1112 action = action->next()) {
1113 if (action->Mentions(reg))
1114 return true;
1115 }
1116 return false;
1117 }
1118
1119
GetStoredPosition(int reg,int * cp_offset)1120 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1121 DCHECK_EQ(0, *cp_offset);
1122 for (DeferredAction* action = actions_; action != nullptr;
1123 action = action->next()) {
1124 if (action->Mentions(reg)) {
1125 if (action->action_type() == ActionNode::STORE_POSITION) {
1126 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1127 return true;
1128 } else {
1129 return false;
1130 }
1131 }
1132 }
1133 return false;
1134 }
1135
1136
FindAffectedRegisters(OutSet * affected_registers,Zone * zone)1137 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1138 Zone* zone) {
1139 int max_register = RegExpCompiler::kNoRegister;
1140 for (DeferredAction* action = actions_; action != nullptr;
1141 action = action->next()) {
1142 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1143 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1144 for (int i = range.from(); i <= range.to(); i++)
1145 affected_registers->Set(i, zone);
1146 if (range.to() > max_register) max_register = range.to();
1147 } else {
1148 affected_registers->Set(action->reg(), zone);
1149 if (action->reg() > max_register) max_register = action->reg();
1150 }
1151 }
1152 return max_register;
1153 }
1154
1155
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const OutSet & registers_to_pop,const OutSet & registers_to_clear)1156 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1157 int max_register,
1158 const OutSet& registers_to_pop,
1159 const OutSet& registers_to_clear) {
1160 for (int reg = max_register; reg >= 0; reg--) {
1161 if (registers_to_pop.Get(reg)) {
1162 assembler->PopRegister(reg);
1163 } else if (registers_to_clear.Get(reg)) {
1164 int clear_to = reg;
1165 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1166 reg--;
1167 }
1168 assembler->ClearRegisters(reg, clear_to);
1169 }
1170 }
1171 }
1172
1173
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const OutSet & affected_registers,OutSet * registers_to_pop,OutSet * registers_to_clear,Zone * zone)1174 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1175 int max_register,
1176 const OutSet& affected_registers,
1177 OutSet* registers_to_pop,
1178 OutSet* registers_to_clear,
1179 Zone* zone) {
1180 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1181 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1182
1183 // Count pushes performed to force a stack limit check occasionally.
1184 int pushes = 0;
1185
1186 for (int reg = 0; reg <= max_register; reg++) {
1187 if (!affected_registers.Get(reg)) {
1188 continue;
1189 }
1190
1191 // The chronologically first deferred action in the trace
1192 // is used to infer the action needed to restore a register
1193 // to its previous state (or not, if it's safe to ignore it).
1194 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1195 DeferredActionUndoType undo_action = IGNORE;
1196
1197 int value = 0;
1198 bool absolute = false;
1199 bool clear = false;
1200 static const int kNoStore = kMinInt;
1201 int store_position = kNoStore;
1202 // This is a little tricky because we are scanning the actions in reverse
1203 // historical order (newest first).
1204 for (DeferredAction* action = actions_; action != nullptr;
1205 action = action->next()) {
1206 if (action->Mentions(reg)) {
1207 switch (action->action_type()) {
1208 case ActionNode::SET_REGISTER: {
1209 Trace::DeferredSetRegister* psr =
1210 static_cast<Trace::DeferredSetRegister*>(action);
1211 if (!absolute) {
1212 value += psr->value();
1213 absolute = true;
1214 }
1215 // SET_REGISTER is currently only used for newly introduced loop
1216 // counters. They can have a significant previous value if they
1217 // occur in a loop. TODO(lrn): Propagate this information, so
1218 // we can set undo_action to IGNORE if we know there is no value to
1219 // restore.
1220 undo_action = RESTORE;
1221 DCHECK_EQ(store_position, kNoStore);
1222 DCHECK(!clear);
1223 break;
1224 }
1225 case ActionNode::INCREMENT_REGISTER:
1226 if (!absolute) {
1227 value++;
1228 }
1229 DCHECK_EQ(store_position, kNoStore);
1230 DCHECK(!clear);
1231 undo_action = RESTORE;
1232 break;
1233 case ActionNode::STORE_POSITION: {
1234 Trace::DeferredCapture* pc =
1235 static_cast<Trace::DeferredCapture*>(action);
1236 if (!clear && store_position == kNoStore) {
1237 store_position = pc->cp_offset();
1238 }
1239
1240 // For captures we know that stores and clears alternate.
1241 // Other register, are never cleared, and if the occur
1242 // inside a loop, they might be assigned more than once.
1243 if (reg <= 1) {
1244 // Registers zero and one, aka "capture zero", is
1245 // always set correctly if we succeed. There is no
1246 // need to undo a setting on backtrack, because we
1247 // will set it again or fail.
1248 undo_action = IGNORE;
1249 } else {
1250 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1251 }
1252 DCHECK(!absolute);
1253 DCHECK_EQ(value, 0);
1254 break;
1255 }
1256 case ActionNode::CLEAR_CAPTURES: {
1257 // Since we're scanning in reverse order, if we've already
1258 // set the position we have to ignore historically earlier
1259 // clearing operations.
1260 if (store_position == kNoStore) {
1261 clear = true;
1262 }
1263 undo_action = RESTORE;
1264 DCHECK(!absolute);
1265 DCHECK_EQ(value, 0);
1266 break;
1267 }
1268 default:
1269 UNREACHABLE();
1270 break;
1271 }
1272 }
1273 }
1274 // Prepare for the undo-action (e.g., push if it's going to be popped).
1275 if (undo_action == RESTORE) {
1276 pushes++;
1277 RegExpMacroAssembler::StackCheckFlag stack_check =
1278 RegExpMacroAssembler::kNoStackLimitCheck;
1279 if (pushes == push_limit) {
1280 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1281 pushes = 0;
1282 }
1283
1284 assembler->PushRegister(reg, stack_check);
1285 registers_to_pop->Set(reg, zone);
1286 } else if (undo_action == CLEAR) {
1287 registers_to_clear->Set(reg, zone);
1288 }
1289 // Perform the chronologically last action (or accumulated increment)
1290 // for the register.
1291 if (store_position != kNoStore) {
1292 assembler->WriteCurrentPositionToRegister(reg, store_position);
1293 } else if (clear) {
1294 assembler->ClearRegisters(reg, reg);
1295 } else if (absolute) {
1296 assembler->SetRegister(reg, value);
1297 } else if (value != 0) {
1298 assembler->AdvanceRegister(reg, value);
1299 }
1300 }
1301 }
1302
1303
1304 // This is called as we come into a loop choice node and some other tricky
1305 // nodes. It normalizes the state of the code generator to ensure we can
1306 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)1307 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1308 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1309
1310 DCHECK(!is_trivial());
1311
1312 if (actions_ == nullptr && backtrack() == nullptr) {
1313 // Here we just have some deferred cp advances to fix and we are back to
1314 // a normal situation. We may also have to forget some information gained
1315 // through a quick check that was already performed.
1316 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1317 // Create a new trivial state and generate the node with that.
1318 Trace new_state;
1319 successor->Emit(compiler, &new_state);
1320 return;
1321 }
1322
1323 // Generate deferred actions here along with code to undo them again.
1324 OutSet affected_registers;
1325
1326 if (backtrack() != nullptr) {
1327 // Here we have a concrete backtrack location. These are set up by choice
1328 // nodes and so they indicate that we have a deferred save of the current
1329 // position which we may need to emit here.
1330 assembler->PushCurrentPosition();
1331 }
1332
1333 int max_register = FindAffectedRegisters(&affected_registers,
1334 compiler->zone());
1335 OutSet registers_to_pop;
1336 OutSet registers_to_clear;
1337 PerformDeferredActions(assembler,
1338 max_register,
1339 affected_registers,
1340 ®isters_to_pop,
1341 ®isters_to_clear,
1342 compiler->zone());
1343 if (cp_offset_ != 0) {
1344 assembler->AdvanceCurrentPosition(cp_offset_);
1345 }
1346
1347 // Create a new trivial state and generate the node with that.
1348 Label undo;
1349 assembler->PushBacktrack(&undo);
1350 if (successor->KeepRecursing(compiler)) {
1351 Trace new_state;
1352 successor->Emit(compiler, &new_state);
1353 } else {
1354 compiler->AddWork(successor);
1355 assembler->GoTo(successor->label());
1356 }
1357
1358 // On backtrack we need to restore state.
1359 assembler->Bind(&undo);
1360 RestoreAffectedRegisters(assembler,
1361 max_register,
1362 registers_to_pop,
1363 registers_to_clear);
1364 if (backtrack() == nullptr) {
1365 assembler->Backtrack();
1366 } else {
1367 assembler->PopCurrentPosition();
1368 assembler->GoTo(backtrack());
1369 }
1370 }
1371
1372
Emit(RegExpCompiler * compiler,Trace * trace)1373 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1374 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1375
1376 // Omit flushing the trace. We discard the entire stack frame anyway.
1377
1378 if (!label()->is_bound()) {
1379 // We are completely independent of the trace, since we ignore it,
1380 // so this code can be used as the generic version.
1381 assembler->Bind(label());
1382 }
1383
1384 // Throw away everything on the backtrack stack since the start
1385 // of the negative submatch and restore the character position.
1386 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1387 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1388 if (clear_capture_count_ > 0) {
1389 // Clear any captures that might have been performed during the success
1390 // of the body of the negative look-ahead.
1391 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1392 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1393 }
1394 // Now that we have unwound the stack we find at the top of the stack the
1395 // backtrack that the BeginSubmatch node got.
1396 assembler->Backtrack();
1397 }
1398
1399
Emit(RegExpCompiler * compiler,Trace * trace)1400 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1401 if (!trace->is_trivial()) {
1402 trace->Flush(compiler, this);
1403 return;
1404 }
1405 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1406 if (!label()->is_bound()) {
1407 assembler->Bind(label());
1408 }
1409 switch (action_) {
1410 case ACCEPT:
1411 assembler->Succeed();
1412 return;
1413 case BACKTRACK:
1414 assembler->GoTo(trace->backtrack());
1415 return;
1416 case NEGATIVE_SUBMATCH_SUCCESS:
1417 // This case is handled in a different virtual method.
1418 UNREACHABLE();
1419 }
1420 UNIMPLEMENTED();
1421 }
1422
1423
AddGuard(Guard * guard,Zone * zone)1424 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1425 if (guards_ == nullptr) guards_ = new (zone) ZoneList<Guard*>(1, zone);
1426 guards_->Add(guard, zone);
1427 }
1428
1429
SetRegister(int reg,int val,RegExpNode * on_success)1430 ActionNode* ActionNode::SetRegister(int reg,
1431 int val,
1432 RegExpNode* on_success) {
1433 ActionNode* result =
1434 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1435 result->data_.u_store_register.reg = reg;
1436 result->data_.u_store_register.value = val;
1437 return result;
1438 }
1439
1440
IncrementRegister(int reg,RegExpNode * on_success)1441 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1442 ActionNode* result =
1443 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1444 result->data_.u_increment_register.reg = reg;
1445 return result;
1446 }
1447
1448
StorePosition(int reg,bool is_capture,RegExpNode * on_success)1449 ActionNode* ActionNode::StorePosition(int reg,
1450 bool is_capture,
1451 RegExpNode* on_success) {
1452 ActionNode* result =
1453 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1454 result->data_.u_position_register.reg = reg;
1455 result->data_.u_position_register.is_capture = is_capture;
1456 return result;
1457 }
1458
1459
ClearCaptures(Interval range,RegExpNode * on_success)1460 ActionNode* ActionNode::ClearCaptures(Interval range,
1461 RegExpNode* on_success) {
1462 ActionNode* result =
1463 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1464 result->data_.u_clear_captures.range_from = range.from();
1465 result->data_.u_clear_captures.range_to = range.to();
1466 return result;
1467 }
1468
1469
BeginSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)1470 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1471 int position_reg,
1472 RegExpNode* on_success) {
1473 ActionNode* result =
1474 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1475 result->data_.u_submatch.stack_pointer_register = stack_reg;
1476 result->data_.u_submatch.current_position_register = position_reg;
1477 return result;
1478 }
1479
1480
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)1481 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1482 int position_reg,
1483 int clear_register_count,
1484 int clear_register_from,
1485 RegExpNode* on_success) {
1486 ActionNode* result =
1487 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1488 result->data_.u_submatch.stack_pointer_register = stack_reg;
1489 result->data_.u_submatch.current_position_register = position_reg;
1490 result->data_.u_submatch.clear_register_count = clear_register_count;
1491 result->data_.u_submatch.clear_register_from = clear_register_from;
1492 return result;
1493 }
1494
1495
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)1496 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1497 int repetition_register,
1498 int repetition_limit,
1499 RegExpNode* on_success) {
1500 ActionNode* result =
1501 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1502 result->data_.u_empty_match_check.start_register = start_register;
1503 result->data_.u_empty_match_check.repetition_register = repetition_register;
1504 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1505 return result;
1506 }
1507
1508
1509 #define DEFINE_ACCEPT(Type) \
1510 void Type##Node::Accept(NodeVisitor* visitor) { \
1511 visitor->Visit##Type(this); \
1512 }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)1513 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1514 #undef DEFINE_ACCEPT
1515
1516
1517 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1518 visitor->VisitLoopChoice(this);
1519 }
1520
1521
1522 // -------------------------------------------------------------------
1523 // Emit code.
1524
1525
GenerateGuard(RegExpMacroAssembler * macro_assembler,Guard * guard,Trace * trace)1526 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1527 Guard* guard,
1528 Trace* trace) {
1529 switch (guard->op()) {
1530 case Guard::LT:
1531 DCHECK(!trace->mentions_reg(guard->reg()));
1532 macro_assembler->IfRegisterGE(guard->reg(),
1533 guard->value(),
1534 trace->backtrack());
1535 break;
1536 case Guard::GEQ:
1537 DCHECK(!trace->mentions_reg(guard->reg()));
1538 macro_assembler->IfRegisterLT(guard->reg(),
1539 guard->value(),
1540 trace->backtrack());
1541 break;
1542 }
1543 }
1544
1545
1546 // Returns the number of characters in the equivalence class, omitting those
1547 // that cannot occur in the source string because it is Latin1.
GetCaseIndependentLetters(Isolate * isolate,uc16 character,bool one_byte_subject,unibrow::uchar * letters)1548 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1549 bool one_byte_subject,
1550 unibrow::uchar* letters) {
1551 int length =
1552 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1553 // Unibrow returns 0 or 1 for characters where case independence is
1554 // trivial.
1555 if (length == 0) {
1556 letters[0] = character;
1557 length = 1;
1558 }
1559
1560 if (one_byte_subject) {
1561 int new_length = 0;
1562 for (int i = 0; i < length; i++) {
1563 if (letters[i] <= String::kMaxOneByteCharCode) {
1564 letters[new_length++] = letters[i];
1565 }
1566 }
1567 length = new_length;
1568 }
1569
1570 return length;
1571 }
1572
1573
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1574 static inline bool EmitSimpleCharacter(Isolate* isolate,
1575 RegExpCompiler* compiler,
1576 uc16 c,
1577 Label* on_failure,
1578 int cp_offset,
1579 bool check,
1580 bool preloaded) {
1581 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1582 bool bound_checked = false;
1583 if (!preloaded) {
1584 assembler->LoadCurrentCharacter(
1585 cp_offset,
1586 on_failure,
1587 check);
1588 bound_checked = true;
1589 }
1590 assembler->CheckNotCharacter(c, on_failure);
1591 return bound_checked;
1592 }
1593
1594
1595 // Only emits non-letters (things that don't have case). Only used for case
1596 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1597 static inline bool EmitAtomNonLetter(Isolate* isolate,
1598 RegExpCompiler* compiler,
1599 uc16 c,
1600 Label* on_failure,
1601 int cp_offset,
1602 bool check,
1603 bool preloaded) {
1604 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1605 bool one_byte = compiler->one_byte();
1606 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1607 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1608 if (length < 1) {
1609 // This can't match. Must be an one-byte subject and a non-one-byte
1610 // character. We do not need to do anything since the one-byte pass
1611 // already handled this.
1612 return false; // Bounds not checked.
1613 }
1614 bool checked = false;
1615 // We handle the length > 1 case in a later pass.
1616 if (length == 1) {
1617 if (one_byte && c > String::kMaxOneByteCharCodeU) {
1618 // Can't match - see above.
1619 return false; // Bounds not checked.
1620 }
1621 if (!preloaded) {
1622 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1623 checked = check;
1624 }
1625 macro_assembler->CheckNotCharacter(c, on_failure);
1626 }
1627 return checked;
1628 }
1629
1630
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,uc16 c1,uc16 c2,Label * on_failure)1631 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1632 bool one_byte, uc16 c1, uc16 c2,
1633 Label* on_failure) {
1634 uc16 char_mask;
1635 if (one_byte) {
1636 char_mask = String::kMaxOneByteCharCode;
1637 } else {
1638 char_mask = String::kMaxUtf16CodeUnit;
1639 }
1640 uc16 exor = c1 ^ c2;
1641 // Check whether exor has only one bit set.
1642 if (((exor - 1) & exor) == 0) {
1643 // If c1 and c2 differ only by one bit.
1644 // Ecma262UnCanonicalize always gives the highest number last.
1645 DCHECK(c2 > c1);
1646 uc16 mask = char_mask ^ exor;
1647 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1648 return true;
1649 }
1650 DCHECK(c2 > c1);
1651 uc16 diff = c2 - c1;
1652 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1653 // If the characters differ by 2^n but don't differ by one bit then
1654 // subtract the difference from the found character, then do the or
1655 // trick. We avoid the theoretical case where negative numbers are
1656 // involved in order to simplify code generation.
1657 uc16 mask = char_mask ^ diff;
1658 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1659 diff,
1660 mask,
1661 on_failure);
1662 return true;
1663 }
1664 return false;
1665 }
1666
1667
1668 typedef bool EmitCharacterFunction(Isolate* isolate,
1669 RegExpCompiler* compiler,
1670 uc16 c,
1671 Label* on_failure,
1672 int cp_offset,
1673 bool check,
1674 bool preloaded);
1675
1676 // Only emits letters (things that have case). Only used for case independent
1677 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1678 static inline bool EmitAtomLetter(Isolate* isolate,
1679 RegExpCompiler* compiler,
1680 uc16 c,
1681 Label* on_failure,
1682 int cp_offset,
1683 bool check,
1684 bool preloaded) {
1685 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1686 bool one_byte = compiler->one_byte();
1687 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1688 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1689 if (length <= 1) return false;
1690 // We may not need to check against the end of the input string
1691 // if this character lies before a character that matched.
1692 if (!preloaded) {
1693 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1694 }
1695 Label ok;
1696 DCHECK_EQ(4, unibrow::Ecma262UnCanonicalize::kMaxWidth);
1697 switch (length) {
1698 case 2: {
1699 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1700 chars[1], on_failure)) {
1701 } else {
1702 macro_assembler->CheckCharacter(chars[0], &ok);
1703 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1704 macro_assembler->Bind(&ok);
1705 }
1706 break;
1707 }
1708 case 4:
1709 macro_assembler->CheckCharacter(chars[3], &ok);
1710 V8_FALLTHROUGH;
1711 case 3:
1712 macro_assembler->CheckCharacter(chars[0], &ok);
1713 macro_assembler->CheckCharacter(chars[1], &ok);
1714 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1715 macro_assembler->Bind(&ok);
1716 break;
1717 default:
1718 UNREACHABLE();
1719 break;
1720 }
1721 return true;
1722 }
1723
1724
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)1725 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1726 int border,
1727 Label* fall_through,
1728 Label* above_or_equal,
1729 Label* below) {
1730 if (below != fall_through) {
1731 masm->CheckCharacterLT(border, below);
1732 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1733 } else {
1734 masm->CheckCharacterGT(border - 1, above_or_equal);
1735 }
1736 }
1737
1738
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)1739 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1740 int first,
1741 int last,
1742 Label* fall_through,
1743 Label* in_range,
1744 Label* out_of_range) {
1745 if (in_range == fall_through) {
1746 if (first == last) {
1747 masm->CheckNotCharacter(first, out_of_range);
1748 } else {
1749 masm->CheckCharacterNotInRange(first, last, out_of_range);
1750 }
1751 } else {
1752 if (first == last) {
1753 masm->CheckCharacter(first, in_range);
1754 } else {
1755 masm->CheckCharacterInRange(first, last, in_range);
1756 }
1757 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1758 }
1759 }
1760
1761
1762 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1763 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int min_char,Label * fall_through,Label * even_label,Label * odd_label)1764 static void EmitUseLookupTable(
1765 RegExpMacroAssembler* masm,
1766 ZoneList<int>* ranges,
1767 int start_index,
1768 int end_index,
1769 int min_char,
1770 Label* fall_through,
1771 Label* even_label,
1772 Label* odd_label) {
1773 static const int kSize = RegExpMacroAssembler::kTableSize;
1774 static const int kMask = RegExpMacroAssembler::kTableMask;
1775
1776 int base = (min_char & ~kMask);
1777 USE(base);
1778
1779 // Assert that everything is on one kTableSize page.
1780 for (int i = start_index; i <= end_index; i++) {
1781 DCHECK_EQ(ranges->at(i) & ~kMask, base);
1782 }
1783 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1784
1785 char templ[kSize];
1786 Label* on_bit_set;
1787 Label* on_bit_clear;
1788 int bit;
1789 if (even_label == fall_through) {
1790 on_bit_set = odd_label;
1791 on_bit_clear = even_label;
1792 bit = 1;
1793 } else {
1794 on_bit_set = even_label;
1795 on_bit_clear = odd_label;
1796 bit = 0;
1797 }
1798 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1799 templ[i] = bit;
1800 }
1801 int j = 0;
1802 bit ^= 1;
1803 for (int i = start_index; i < end_index; i++) {
1804 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1805 templ[j] = bit;
1806 }
1807 bit ^= 1;
1808 }
1809 for (int i = j; i < kSize; i++) {
1810 templ[i] = bit;
1811 }
1812 Factory* factory = masm->isolate()->factory();
1813 // TODO(erikcorry): Cache these.
1814 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1815 for (int i = 0; i < kSize; i++) {
1816 ba->set(i, templ[i]);
1817 }
1818 masm->CheckBitInTable(ba, on_bit_set);
1819 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1820 }
1821
1822
CutOutRange(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int cut_index,Label * even_label,Label * odd_label)1823 static void CutOutRange(RegExpMacroAssembler* masm,
1824 ZoneList<int>* ranges,
1825 int start_index,
1826 int end_index,
1827 int cut_index,
1828 Label* even_label,
1829 Label* odd_label) {
1830 bool odd = (((cut_index - start_index) & 1) == 1);
1831 Label* in_range_label = odd ? odd_label : even_label;
1832 Label dummy;
1833 EmitDoubleBoundaryTest(masm,
1834 ranges->at(cut_index),
1835 ranges->at(cut_index + 1) - 1,
1836 &dummy,
1837 in_range_label,
1838 &dummy);
1839 DCHECK(!dummy.is_linked());
1840 // Cut out the single range by rewriting the array. This creates a new
1841 // range that is a merger of the two ranges on either side of the one we
1842 // are cutting out. The oddity of the labels is preserved.
1843 for (int j = cut_index; j > start_index; j--) {
1844 ranges->at(j) = ranges->at(j - 1);
1845 }
1846 for (int j = cut_index + 1; j < end_index; j++) {
1847 ranges->at(j) = ranges->at(j + 1);
1848 }
1849 }
1850
1851
1852 // Unicode case. Split the search space into kSize spaces that are handled
1853 // with recursion.
SplitSearchSpace(ZoneList<int> * ranges,int start_index,int end_index,int * new_start_index,int * new_end_index,int * border)1854 static void SplitSearchSpace(ZoneList<int>* ranges,
1855 int start_index,
1856 int end_index,
1857 int* new_start_index,
1858 int* new_end_index,
1859 int* border) {
1860 static const int kSize = RegExpMacroAssembler::kTableSize;
1861 static const int kMask = RegExpMacroAssembler::kTableMask;
1862
1863 int first = ranges->at(start_index);
1864 int last = ranges->at(end_index) - 1;
1865
1866 *new_start_index = start_index;
1867 *border = (ranges->at(start_index) & ~kMask) + kSize;
1868 while (*new_start_index < end_index) {
1869 if (ranges->at(*new_start_index) > *border) break;
1870 (*new_start_index)++;
1871 }
1872 // new_start_index is the index of the first edge that is beyond the
1873 // current kSize space.
1874
1875 // For very large search spaces we do a binary chop search of the non-Latin1
1876 // space instead of just going to the end of the current kSize space. The
1877 // heuristics are complicated a little by the fact that any 128-character
1878 // encoding space can be quickly tested with a table lookup, so we don't
1879 // wish to do binary chop search at a smaller granularity than that. A
1880 // 128-character space can take up a lot of space in the ranges array if,
1881 // for example, we only want to match every second character (eg. the lower
1882 // case characters on some Unicode pages).
1883 int binary_chop_index = (end_index + start_index) / 2;
1884 // The first test ensures that we get to the code that handles the Latin1
1885 // range with a single not-taken branch, speeding up this important
1886 // character range (even non-Latin1 charset-based text has spaces and
1887 // punctuation).
1888 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1889 end_index - start_index > (*new_start_index - start_index) * 2 &&
1890 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1891 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1892 int scan_forward_for_section_border = binary_chop_index;;
1893 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1894
1895 while (scan_forward_for_section_border < end_index) {
1896 if (ranges->at(scan_forward_for_section_border) > new_border) {
1897 *new_start_index = scan_forward_for_section_border;
1898 *border = new_border;
1899 break;
1900 }
1901 scan_forward_for_section_border++;
1902 }
1903 }
1904
1905 DCHECK(*new_start_index > start_index);
1906 *new_end_index = *new_start_index - 1;
1907 if (ranges->at(*new_end_index) == *border) {
1908 (*new_end_index)--;
1909 }
1910 if (*border >= ranges->at(end_index)) {
1911 *border = ranges->at(end_index);
1912 *new_start_index = end_index; // Won't be used.
1913 *new_end_index = end_index - 1;
1914 }
1915 }
1916
1917 // Gets a series of segment boundaries representing a character class. If the
1918 // character is in the range between an even and an odd boundary (counting from
1919 // start_index) then go to even_label, otherwise go to odd_label. We already
1920 // know that the character is in the range of min_char to max_char inclusive.
1921 // Either label can be nullptr indicating backtracking. Either label can also
1922 // be equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,uc32 min_char,uc32 max_char,Label * fall_through,Label * even_label,Label * odd_label)1923 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1924 int start_index, int end_index, uc32 min_char,
1925 uc32 max_char, Label* fall_through,
1926 Label* even_label, Label* odd_label) {
1927 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1928 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1929
1930 int first = ranges->at(start_index);
1931 int last = ranges->at(end_index) - 1;
1932
1933 DCHECK_LT(min_char, first);
1934
1935 // Just need to test if the character is before or on-or-after
1936 // a particular character.
1937 if (start_index == end_index) {
1938 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1939 return;
1940 }
1941
1942 // Another almost trivial case: There is one interval in the middle that is
1943 // different from the end intervals.
1944 if (start_index + 1 == end_index) {
1945 EmitDoubleBoundaryTest(
1946 masm, first, last, fall_through, even_label, odd_label);
1947 return;
1948 }
1949
1950 // It's not worth using table lookup if there are very few intervals in the
1951 // character class.
1952 if (end_index - start_index <= 6) {
1953 // It is faster to test for individual characters, so we look for those
1954 // first, then try arbitrary ranges in the second round.
1955 static int kNoCutIndex = -1;
1956 int cut = kNoCutIndex;
1957 for (int i = start_index; i < end_index; i++) {
1958 if (ranges->at(i) == ranges->at(i + 1) - 1) {
1959 cut = i;
1960 break;
1961 }
1962 }
1963 if (cut == kNoCutIndex) cut = start_index;
1964 CutOutRange(
1965 masm, ranges, start_index, end_index, cut, even_label, odd_label);
1966 DCHECK_GE(end_index - start_index, 2);
1967 GenerateBranches(masm,
1968 ranges,
1969 start_index + 1,
1970 end_index - 1,
1971 min_char,
1972 max_char,
1973 fall_through,
1974 even_label,
1975 odd_label);
1976 return;
1977 }
1978
1979 // If there are a lot of intervals in the regexp, then we will use tables to
1980 // determine whether the character is inside or outside the character class.
1981 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1982
1983 if ((max_char >> kBits) == (min_char >> kBits)) {
1984 EmitUseLookupTable(masm,
1985 ranges,
1986 start_index,
1987 end_index,
1988 min_char,
1989 fall_through,
1990 even_label,
1991 odd_label);
1992 return;
1993 }
1994
1995 if ((min_char >> kBits) != (first >> kBits)) {
1996 masm->CheckCharacterLT(first, odd_label);
1997 GenerateBranches(masm,
1998 ranges,
1999 start_index + 1,
2000 end_index,
2001 first,
2002 max_char,
2003 fall_through,
2004 odd_label,
2005 even_label);
2006 return;
2007 }
2008
2009 int new_start_index = 0;
2010 int new_end_index = 0;
2011 int border = 0;
2012
2013 SplitSearchSpace(ranges,
2014 start_index,
2015 end_index,
2016 &new_start_index,
2017 &new_end_index,
2018 &border);
2019
2020 Label handle_rest;
2021 Label* above = &handle_rest;
2022 if (border == last + 1) {
2023 // We didn't find any section that started after the limit, so everything
2024 // above the border is one of the terminal labels.
2025 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2026 DCHECK(new_end_index == end_index - 1);
2027 }
2028
2029 DCHECK_LE(start_index, new_end_index);
2030 DCHECK_LE(new_start_index, end_index);
2031 DCHECK_LT(start_index, new_start_index);
2032 DCHECK_LT(new_end_index, end_index);
2033 DCHECK(new_end_index + 1 == new_start_index ||
2034 (new_end_index + 2 == new_start_index &&
2035 border == ranges->at(new_end_index + 1)));
2036 DCHECK_LT(min_char, border - 1);
2037 DCHECK_LT(border, max_char);
2038 DCHECK_LT(ranges->at(new_end_index), border);
2039 DCHECK(border < ranges->at(new_start_index) ||
2040 (border == ranges->at(new_start_index) &&
2041 new_start_index == end_index &&
2042 new_end_index == end_index - 1 &&
2043 border == last + 1));
2044 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2045
2046 masm->CheckCharacterGT(border - 1, above);
2047 Label dummy;
2048 GenerateBranches(masm,
2049 ranges,
2050 start_index,
2051 new_end_index,
2052 min_char,
2053 border - 1,
2054 &dummy,
2055 even_label,
2056 odd_label);
2057 if (handle_rest.is_linked()) {
2058 masm->Bind(&handle_rest);
2059 bool flip = (new_start_index & 1) != (start_index & 1);
2060 GenerateBranches(masm,
2061 ranges,
2062 new_start_index,
2063 end_index,
2064 border,
2065 max_char,
2066 &dummy,
2067 flip ? odd_label : even_label,
2068 flip ? even_label : odd_label);
2069 }
2070 }
2071
2072
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)2073 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2074 RegExpCharacterClass* cc, bool one_byte,
2075 Label* on_failure, int cp_offset, bool check_offset,
2076 bool preloaded, Zone* zone) {
2077 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2078 CharacterRange::Canonicalize(ranges);
2079
2080 int max_char;
2081 if (one_byte) {
2082 max_char = String::kMaxOneByteCharCode;
2083 } else {
2084 max_char = String::kMaxUtf16CodeUnit;
2085 }
2086
2087 int range_count = ranges->length();
2088
2089 int last_valid_range = range_count - 1;
2090 while (last_valid_range >= 0) {
2091 CharacterRange& range = ranges->at(last_valid_range);
2092 if (range.from() <= max_char) {
2093 break;
2094 }
2095 last_valid_range--;
2096 }
2097
2098 if (last_valid_range < 0) {
2099 if (!cc->is_negated()) {
2100 macro_assembler->GoTo(on_failure);
2101 }
2102 if (check_offset) {
2103 macro_assembler->CheckPosition(cp_offset, on_failure);
2104 }
2105 return;
2106 }
2107
2108 if (last_valid_range == 0 &&
2109 ranges->at(0).IsEverything(max_char)) {
2110 if (cc->is_negated()) {
2111 macro_assembler->GoTo(on_failure);
2112 } else {
2113 // This is a common case hit by non-anchored expressions.
2114 if (check_offset) {
2115 macro_assembler->CheckPosition(cp_offset, on_failure);
2116 }
2117 }
2118 return;
2119 }
2120
2121 if (!preloaded) {
2122 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2123 }
2124
2125 if (cc->is_standard(zone) &&
2126 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2127 on_failure)) {
2128 return;
2129 }
2130
2131
2132 // A new list with ascending entries. Each entry is a code unit
2133 // where there is a boundary between code units that are part of
2134 // the class and code units that are not. Normally we insert an
2135 // entry at zero which goes to the failure label, but if there
2136 // was already one there we fall through for success on that entry.
2137 // Subsequent entries have alternating meaning (success/failure).
2138 ZoneList<int>* range_boundaries =
2139 new(zone) ZoneList<int>(last_valid_range, zone);
2140
2141 bool zeroth_entry_is_failure = !cc->is_negated();
2142
2143 for (int i = 0; i <= last_valid_range; i++) {
2144 CharacterRange& range = ranges->at(i);
2145 if (range.from() == 0) {
2146 DCHECK_EQ(i, 0);
2147 zeroth_entry_is_failure = !zeroth_entry_is_failure;
2148 } else {
2149 range_boundaries->Add(range.from(), zone);
2150 }
2151 range_boundaries->Add(range.to() + 1, zone);
2152 }
2153 int end_index = range_boundaries->length() - 1;
2154 if (range_boundaries->at(end_index) > max_char) {
2155 end_index--;
2156 }
2157
2158 Label fall_through;
2159 GenerateBranches(macro_assembler,
2160 range_boundaries,
2161 0, // start_index.
2162 end_index,
2163 0, // min_char.
2164 max_char,
2165 &fall_through,
2166 zeroth_entry_is_failure ? &fall_through : on_failure,
2167 zeroth_entry_is_failure ? on_failure : &fall_through);
2168 macro_assembler->Bind(&fall_through);
2169 }
2170
2171
~RegExpNode()2172 RegExpNode::~RegExpNode() {
2173 }
2174
2175
LimitVersions(RegExpCompiler * compiler,Trace * trace)2176 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2177 Trace* trace) {
2178 // If we are generating a greedy loop then don't stop and don't reuse code.
2179 if (trace->stop_node() != nullptr) {
2180 return CONTINUE;
2181 }
2182
2183 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2184 if (trace->is_trivial()) {
2185 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2186 // If a generic version is already scheduled to be generated or we have
2187 // recursed too deeply then just generate a jump to that code.
2188 macro_assembler->GoTo(&label_);
2189 // This will queue it up for generation of a generic version if it hasn't
2190 // already been queued.
2191 compiler->AddWork(this);
2192 return DONE;
2193 }
2194 // Generate generic version of the node and bind the label for later use.
2195 macro_assembler->Bind(&label_);
2196 return CONTINUE;
2197 }
2198
2199 // We are being asked to make a non-generic version. Keep track of how many
2200 // non-generic versions we generate so as not to overdo it.
2201 trace_count_++;
2202 if (KeepRecursing(compiler) && compiler->optimize() &&
2203 trace_count_ < kMaxCopiesCodeGenerated) {
2204 return CONTINUE;
2205 }
2206
2207 // If we get here code has been generated for this node too many times or
2208 // recursion is too deep. Time to switch to a generic version. The code for
2209 // generic versions above can handle deep recursion properly.
2210 bool was_limiting = compiler->limiting_recursion();
2211 compiler->set_limiting_recursion(true);
2212 trace->Flush(compiler, this);
2213 compiler->set_limiting_recursion(was_limiting);
2214 return DONE;
2215 }
2216
2217
KeepRecursing(RegExpCompiler * compiler)2218 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2219 return !compiler->limiting_recursion() &&
2220 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2221 }
2222
2223
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2224 int ActionNode::EatsAtLeast(int still_to_find,
2225 int budget,
2226 bool not_at_start) {
2227 if (budget <= 0) return 0;
2228 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2229 return on_success()->EatsAtLeast(still_to_find,
2230 budget - 1,
2231 not_at_start);
2232 }
2233
2234
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2235 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2236 BoyerMooreLookahead* bm, bool not_at_start) {
2237 if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2238 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2239 }
2240 SaveBMInfo(bm, not_at_start, offset);
2241 }
2242
2243
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2244 int AssertionNode::EatsAtLeast(int still_to_find,
2245 int budget,
2246 bool not_at_start) {
2247 if (budget <= 0) return 0;
2248 // If we know we are not at the start and we are asked "how many characters
2249 // will you match if you succeed?" then we can answer anything since false
2250 // implies false. So lets just return the max answer (still_to_find) since
2251 // that won't prevent us from preloading a lot of characters for the other
2252 // branches in the node graph.
2253 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2254 return on_success()->EatsAtLeast(still_to_find,
2255 budget - 1,
2256 not_at_start);
2257 }
2258
2259
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2260 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2261 BoyerMooreLookahead* bm, bool not_at_start) {
2262 // Match the behaviour of EatsAtLeast on this node.
2263 if (assertion_type() == AT_START && not_at_start) return;
2264 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2265 SaveBMInfo(bm, not_at_start, offset);
2266 }
2267
2268
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2269 int BackReferenceNode::EatsAtLeast(int still_to_find,
2270 int budget,
2271 bool not_at_start) {
2272 if (read_backward()) return 0;
2273 if (budget <= 0) return 0;
2274 return on_success()->EatsAtLeast(still_to_find,
2275 budget - 1,
2276 not_at_start);
2277 }
2278
2279
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2280 int TextNode::EatsAtLeast(int still_to_find,
2281 int budget,
2282 bool not_at_start) {
2283 if (read_backward()) return 0;
2284 int answer = Length();
2285 if (answer >= still_to_find) return answer;
2286 if (budget <= 0) return answer;
2287 // We are not at start after this node so we set the last argument to 'true'.
2288 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2289 budget - 1,
2290 true);
2291 }
2292
2293
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2294 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2295 bool not_at_start) {
2296 if (budget <= 0) return 0;
2297 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2298 // afterwards.
2299 RegExpNode* node = alternatives_->at(1).node();
2300 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2301 }
2302
2303
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2304 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2305 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2306 bool not_at_start) {
2307 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2308 // afterwards.
2309 RegExpNode* node = alternatives_->at(1).node();
2310 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2311 }
2312
2313
EatsAtLeastHelper(int still_to_find,int budget,RegExpNode * ignore_this_node,bool not_at_start)2314 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2315 int budget,
2316 RegExpNode* ignore_this_node,
2317 bool not_at_start) {
2318 if (budget <= 0) return 0;
2319 int min = 100;
2320 int choice_count = alternatives_->length();
2321 budget = (budget - 1) / choice_count;
2322 for (int i = 0; i < choice_count; i++) {
2323 RegExpNode* node = alternatives_->at(i).node();
2324 if (node == ignore_this_node) continue;
2325 int node_eats_at_least =
2326 node->EatsAtLeast(still_to_find, budget, not_at_start);
2327 if (node_eats_at_least < min) min = node_eats_at_least;
2328 if (min == 0) return 0;
2329 }
2330 return min;
2331 }
2332
2333
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2334 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2335 int budget,
2336 bool not_at_start) {
2337 return EatsAtLeastHelper(still_to_find,
2338 budget - 1,
2339 loop_node_,
2340 not_at_start);
2341 }
2342
2343
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2344 int ChoiceNode::EatsAtLeast(int still_to_find,
2345 int budget,
2346 bool not_at_start) {
2347 return EatsAtLeastHelper(still_to_find, budget, nullptr, not_at_start);
2348 }
2349
2350
2351 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)2352 static inline uint32_t SmearBitsRight(uint32_t v) {
2353 v |= v >> 1;
2354 v |= v >> 2;
2355 v |= v >> 4;
2356 v |= v >> 8;
2357 v |= v >> 16;
2358 return v;
2359 }
2360
2361
Rationalize(bool asc)2362 bool QuickCheckDetails::Rationalize(bool asc) {
2363 bool found_useful_op = false;
2364 uint32_t char_mask;
2365 if (asc) {
2366 char_mask = String::kMaxOneByteCharCode;
2367 } else {
2368 char_mask = String::kMaxUtf16CodeUnit;
2369 }
2370 mask_ = 0;
2371 value_ = 0;
2372 int char_shift = 0;
2373 for (int i = 0; i < characters_; i++) {
2374 Position* pos = &positions_[i];
2375 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2376 found_useful_op = true;
2377 }
2378 mask_ |= (pos->mask & char_mask) << char_shift;
2379 value_ |= (pos->value & char_mask) << char_shift;
2380 char_shift += asc ? 8 : 16;
2381 }
2382 return found_useful_op;
2383 }
2384
2385
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure)2386 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2387 Trace* bounds_check_trace,
2388 Trace* trace,
2389 bool preload_has_checked_bounds,
2390 Label* on_possible_success,
2391 QuickCheckDetails* details,
2392 bool fall_through_on_failure) {
2393 if (details->characters() == 0) return false;
2394 GetQuickCheckDetails(
2395 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2396 if (details->cannot_match()) return false;
2397 if (!details->Rationalize(compiler->one_byte())) return false;
2398 DCHECK(details->characters() == 1 ||
2399 compiler->macro_assembler()->CanReadUnaligned());
2400 uint32_t mask = details->mask();
2401 uint32_t value = details->value();
2402
2403 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2404
2405 if (trace->characters_preloaded() != details->characters()) {
2406 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2407 // We are attempting to preload the minimum number of characters
2408 // any choice would eat, so if the bounds check fails, then none of the
2409 // choices can succeed, so we can just immediately backtrack, rather
2410 // than go to the next choice.
2411 assembler->LoadCurrentCharacter(trace->cp_offset(),
2412 bounds_check_trace->backtrack(),
2413 !preload_has_checked_bounds,
2414 details->characters());
2415 }
2416
2417
2418 bool need_mask = true;
2419
2420 if (details->characters() == 1) {
2421 // If number of characters preloaded is 1 then we used a byte or 16 bit
2422 // load so the value is already masked down.
2423 uint32_t char_mask;
2424 if (compiler->one_byte()) {
2425 char_mask = String::kMaxOneByteCharCode;
2426 } else {
2427 char_mask = String::kMaxUtf16CodeUnit;
2428 }
2429 if ((mask & char_mask) == char_mask) need_mask = false;
2430 mask &= char_mask;
2431 } else {
2432 // For 2-character preloads in one-byte mode or 1-character preloads in
2433 // two-byte mode we also use a 16 bit load with zero extend.
2434 static const uint32_t kTwoByteMask = 0xFFFF;
2435 static const uint32_t kFourByteMask = 0xFFFFFFFF;
2436 if (details->characters() == 2 && compiler->one_byte()) {
2437 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2438 } else if (details->characters() == 1 && !compiler->one_byte()) {
2439 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2440 } else {
2441 if (mask == kFourByteMask) need_mask = false;
2442 }
2443 }
2444
2445 if (fall_through_on_failure) {
2446 if (need_mask) {
2447 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2448 } else {
2449 assembler->CheckCharacter(value, on_possible_success);
2450 }
2451 } else {
2452 if (need_mask) {
2453 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2454 } else {
2455 assembler->CheckNotCharacter(value, trace->backtrack());
2456 }
2457 }
2458 return true;
2459 }
2460
2461
2462 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2463 // super-class in the .h file).
2464 //
2465 // We iterate along the text object, building up for each character a
2466 // mask and value that can be used to test for a quick failure to match.
2467 // The masks and values for the positions will be combined into a single
2468 // machine word for the current character width in order to be used in
2469 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2470 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2471 RegExpCompiler* compiler,
2472 int characters_filled_in,
2473 bool not_at_start) {
2474 // Do not collect any quick check details if the text node reads backward,
2475 // since it reads in the opposite direction than we use for quick checks.
2476 if (read_backward()) return;
2477 Isolate* isolate = compiler->macro_assembler()->isolate();
2478 DCHECK(characters_filled_in < details->characters());
2479 int characters = details->characters();
2480 int char_mask;
2481 if (compiler->one_byte()) {
2482 char_mask = String::kMaxOneByteCharCode;
2483 } else {
2484 char_mask = String::kMaxUtf16CodeUnit;
2485 }
2486 for (int k = 0; k < elements()->length(); k++) {
2487 TextElement elm = elements()->at(k);
2488 if (elm.text_type() == TextElement::ATOM) {
2489 Vector<const uc16> quarks = elm.atom()->data();
2490 for (int i = 0; i < characters && i < quarks.length(); i++) {
2491 QuickCheckDetails::Position* pos =
2492 details->positions(characters_filled_in);
2493 uc16 c = quarks[i];
2494 if (elm.atom()->ignore_case()) {
2495 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2496 int length = GetCaseIndependentLetters(isolate, c,
2497 compiler->one_byte(), chars);
2498 if (length == 0) {
2499 // This can happen because all case variants are non-Latin1, but we
2500 // know the input is Latin1.
2501 details->set_cannot_match();
2502 pos->determines_perfectly = false;
2503 return;
2504 }
2505 if (length == 1) {
2506 // This letter has no case equivalents, so it's nice and simple
2507 // and the mask-compare will determine definitely whether we have
2508 // a match at this character position.
2509 pos->mask = char_mask;
2510 pos->value = c;
2511 pos->determines_perfectly = true;
2512 } else {
2513 uint32_t common_bits = char_mask;
2514 uint32_t bits = chars[0];
2515 for (int j = 1; j < length; j++) {
2516 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2517 common_bits ^= differing_bits;
2518 bits &= common_bits;
2519 }
2520 // If length is 2 and common bits has only one zero in it then
2521 // our mask and compare instruction will determine definitely
2522 // whether we have a match at this character position. Otherwise
2523 // it can only be an approximate check.
2524 uint32_t one_zero = (common_bits | ~char_mask);
2525 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2526 pos->determines_perfectly = true;
2527 }
2528 pos->mask = common_bits;
2529 pos->value = bits;
2530 }
2531 } else {
2532 // Don't ignore case. Nice simple case where the mask-compare will
2533 // determine definitely whether we have a match at this character
2534 // position.
2535 if (c > char_mask) {
2536 details->set_cannot_match();
2537 pos->determines_perfectly = false;
2538 return;
2539 }
2540 pos->mask = char_mask;
2541 pos->value = c;
2542 pos->determines_perfectly = true;
2543 }
2544 characters_filled_in++;
2545 DCHECK(characters_filled_in <= details->characters());
2546 if (characters_filled_in == details->characters()) {
2547 return;
2548 }
2549 }
2550 } else {
2551 QuickCheckDetails::Position* pos =
2552 details->positions(characters_filled_in);
2553 RegExpCharacterClass* tree = elm.char_class();
2554 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2555 DCHECK(!ranges->is_empty());
2556 if (tree->is_negated()) {
2557 // A quick check uses multi-character mask and compare. There is no
2558 // useful way to incorporate a negative char class into this scheme
2559 // so we just conservatively create a mask and value that will always
2560 // succeed.
2561 pos->mask = 0;
2562 pos->value = 0;
2563 } else {
2564 int first_range = 0;
2565 while (ranges->at(first_range).from() > char_mask) {
2566 first_range++;
2567 if (first_range == ranges->length()) {
2568 details->set_cannot_match();
2569 pos->determines_perfectly = false;
2570 return;
2571 }
2572 }
2573 CharacterRange range = ranges->at(first_range);
2574 uc16 from = range.from();
2575 uc16 to = range.to();
2576 if (to > char_mask) {
2577 to = char_mask;
2578 }
2579 uint32_t differing_bits = (from ^ to);
2580 // A mask and compare is only perfect if the differing bits form a
2581 // number like 00011111 with one single block of trailing 1s.
2582 if ((differing_bits & (differing_bits + 1)) == 0 &&
2583 from + differing_bits == to) {
2584 pos->determines_perfectly = true;
2585 }
2586 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2587 uint32_t bits = (from & common_bits);
2588 for (int i = first_range + 1; i < ranges->length(); i++) {
2589 CharacterRange range = ranges->at(i);
2590 uc16 from = range.from();
2591 uc16 to = range.to();
2592 if (from > char_mask) continue;
2593 if (to > char_mask) to = char_mask;
2594 // Here we are combining more ranges into the mask and compare
2595 // value. With each new range the mask becomes more sparse and
2596 // so the chances of a false positive rise. A character class
2597 // with multiple ranges is assumed never to be equivalent to a
2598 // mask and compare operation.
2599 pos->determines_perfectly = false;
2600 uint32_t new_common_bits = (from ^ to);
2601 new_common_bits = ~SmearBitsRight(new_common_bits);
2602 common_bits &= new_common_bits;
2603 bits &= new_common_bits;
2604 uint32_t differing_bits = (from & common_bits) ^ bits;
2605 common_bits ^= differing_bits;
2606 bits &= common_bits;
2607 }
2608 pos->mask = common_bits;
2609 pos->value = bits;
2610 }
2611 characters_filled_in++;
2612 DCHECK(characters_filled_in <= details->characters());
2613 if (characters_filled_in == details->characters()) {
2614 return;
2615 }
2616 }
2617 }
2618 DCHECK(characters_filled_in != details->characters());
2619 if (!details->cannot_match()) {
2620 on_success()-> GetQuickCheckDetails(details,
2621 compiler,
2622 characters_filled_in,
2623 true);
2624 }
2625 }
2626
2627
Clear()2628 void QuickCheckDetails::Clear() {
2629 for (int i = 0; i < characters_; i++) {
2630 positions_[i].mask = 0;
2631 positions_[i].value = 0;
2632 positions_[i].determines_perfectly = false;
2633 }
2634 characters_ = 0;
2635 }
2636
2637
Advance(int by,bool one_byte)2638 void QuickCheckDetails::Advance(int by, bool one_byte) {
2639 if (by >= characters_ || by < 0) {
2640 DCHECK_IMPLIES(by < 0, characters_ == 0);
2641 Clear();
2642 return;
2643 }
2644 DCHECK_LE(characters_ - by, 4);
2645 DCHECK_LE(characters_, 4);
2646 for (int i = 0; i < characters_ - by; i++) {
2647 positions_[i] = positions_[by + i];
2648 }
2649 for (int i = characters_ - by; i < characters_; i++) {
2650 positions_[i].mask = 0;
2651 positions_[i].value = 0;
2652 positions_[i].determines_perfectly = false;
2653 }
2654 characters_ -= by;
2655 // We could change mask_ and value_ here but we would never advance unless
2656 // they had already been used in a check and they won't be used again because
2657 // it would gain us nothing. So there's no point.
2658 }
2659
2660
Merge(QuickCheckDetails * other,int from_index)2661 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2662 DCHECK(characters_ == other->characters_);
2663 if (other->cannot_match_) {
2664 return;
2665 }
2666 if (cannot_match_) {
2667 *this = *other;
2668 return;
2669 }
2670 for (int i = from_index; i < characters_; i++) {
2671 QuickCheckDetails::Position* pos = positions(i);
2672 QuickCheckDetails::Position* other_pos = other->positions(i);
2673 if (pos->mask != other_pos->mask ||
2674 pos->value != other_pos->value ||
2675 !other_pos->determines_perfectly) {
2676 // Our mask-compare operation will be approximate unless we have the
2677 // exact same operation on both sides of the alternation.
2678 pos->determines_perfectly = false;
2679 }
2680 pos->mask &= other_pos->mask;
2681 pos->value &= pos->mask;
2682 other_pos->value &= pos->mask;
2683 uc16 differing_bits = (pos->value ^ other_pos->value);
2684 pos->mask &= ~differing_bits;
2685 pos->value &= pos->mask;
2686 }
2687 }
2688
2689
2690 class VisitMarker {
2691 public:
VisitMarker(NodeInfo * info)2692 explicit VisitMarker(NodeInfo* info) : info_(info) {
2693 DCHECK(!info->visited);
2694 info->visited = true;
2695 }
~VisitMarker()2696 ~VisitMarker() {
2697 info_->visited = false;
2698 }
2699 private:
2700 NodeInfo* info_;
2701 };
2702
FilterOneByte(int depth)2703 RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
2704 if (info()->replacement_calculated) return replacement();
2705 if (depth < 0) return this;
2706 DCHECK(!info()->visited);
2707 VisitMarker marker(info());
2708 return FilterSuccessor(depth - 1);
2709 }
2710
FilterSuccessor(int depth)2711 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
2712 RegExpNode* next = on_success_->FilterOneByte(depth - 1);
2713 if (next == nullptr) return set_replacement(nullptr);
2714 on_success_ = next;
2715 return set_replacement(this);
2716 }
2717
2718 // We need to check for the following characters: 0x39C 0x3BC 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)2719 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2720 // TODO(dcarney): this could be a lot more efficient.
2721 return range.Contains(0x039C) || range.Contains(0x03BC) ||
2722 range.Contains(0x0178);
2723 }
2724
2725
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)2726 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2727 for (int i = 0; i < ranges->length(); i++) {
2728 // TODO(dcarney): this could be a lot more efficient.
2729 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2730 }
2731 return false;
2732 }
2733
FilterOneByte(int depth)2734 RegExpNode* TextNode::FilterOneByte(int depth) {
2735 if (info()->replacement_calculated) return replacement();
2736 if (depth < 0) return this;
2737 DCHECK(!info()->visited);
2738 VisitMarker marker(info());
2739 int element_count = elements()->length();
2740 for (int i = 0; i < element_count; i++) {
2741 TextElement elm = elements()->at(i);
2742 if (elm.text_type() == TextElement::ATOM) {
2743 Vector<const uc16> quarks = elm.atom()->data();
2744 for (int j = 0; j < quarks.length(); j++) {
2745 uint16_t c = quarks[j];
2746 if (elm.atom()->ignore_case()) {
2747 c = unibrow::Latin1::TryConvertToLatin1(c);
2748 }
2749 if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
2750 // Replace quark in case we converted to Latin-1.
2751 uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.start());
2752 writable_quarks[j] = c;
2753 }
2754 } else {
2755 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2756 RegExpCharacterClass* cc = elm.char_class();
2757 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2758 CharacterRange::Canonicalize(ranges);
2759 // Now they are in order so we only need to look at the first.
2760 int range_count = ranges->length();
2761 if (cc->is_negated()) {
2762 if (range_count != 0 &&
2763 ranges->at(0).from() == 0 &&
2764 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2765 // This will be handled in a later filter.
2766 if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2767 continue;
2768 return set_replacement(nullptr);
2769 }
2770 } else {
2771 if (range_count == 0 ||
2772 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2773 // This will be handled in a later filter.
2774 if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2775 continue;
2776 return set_replacement(nullptr);
2777 }
2778 }
2779 }
2780 }
2781 return FilterSuccessor(depth - 1);
2782 }
2783
FilterOneByte(int depth)2784 RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
2785 if (info()->replacement_calculated) return replacement();
2786 if (depth < 0) return this;
2787 if (info()->visited) return this;
2788 {
2789 VisitMarker marker(info());
2790
2791 RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
2792 // If we can't continue after the loop then there is no sense in doing the
2793 // loop.
2794 if (continue_replacement == nullptr) return set_replacement(nullptr);
2795 }
2796
2797 return ChoiceNode::FilterOneByte(depth - 1);
2798 }
2799
FilterOneByte(int depth)2800 RegExpNode* ChoiceNode::FilterOneByte(int depth) {
2801 if (info()->replacement_calculated) return replacement();
2802 if (depth < 0) return this;
2803 if (info()->visited) return this;
2804 VisitMarker marker(info());
2805 int choice_count = alternatives_->length();
2806
2807 for (int i = 0; i < choice_count; i++) {
2808 GuardedAlternative alternative = alternatives_->at(i);
2809 if (alternative.guards() != nullptr &&
2810 alternative.guards()->length() != 0) {
2811 set_replacement(this);
2812 return this;
2813 }
2814 }
2815
2816 int surviving = 0;
2817 RegExpNode* survivor = nullptr;
2818 for (int i = 0; i < choice_count; i++) {
2819 GuardedAlternative alternative = alternatives_->at(i);
2820 RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
2821 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2822 if (replacement != nullptr) {
2823 alternatives_->at(i).set_node(replacement);
2824 surviving++;
2825 survivor = replacement;
2826 }
2827 }
2828 if (surviving < 2) return set_replacement(survivor);
2829
2830 set_replacement(this);
2831 if (surviving == choice_count) {
2832 return this;
2833 }
2834 // Only some of the nodes survived the filtering. We need to rebuild the
2835 // alternatives list.
2836 ZoneList<GuardedAlternative>* new_alternatives =
2837 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2838 for (int i = 0; i < choice_count; i++) {
2839 RegExpNode* replacement =
2840 alternatives_->at(i).node()->FilterOneByte(depth - 1);
2841 if (replacement != nullptr) {
2842 alternatives_->at(i).set_node(replacement);
2843 new_alternatives->Add(alternatives_->at(i), zone());
2844 }
2845 }
2846 alternatives_ = new_alternatives;
2847 return this;
2848 }
2849
FilterOneByte(int depth)2850 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
2851 if (info()->replacement_calculated) return replacement();
2852 if (depth < 0) return this;
2853 if (info()->visited) return this;
2854 VisitMarker marker(info());
2855 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2856 // afterwards.
2857 RegExpNode* node = alternatives_->at(1).node();
2858 RegExpNode* replacement = node->FilterOneByte(depth - 1);
2859 if (replacement == nullptr) return set_replacement(nullptr);
2860 alternatives_->at(1).set_node(replacement);
2861
2862 RegExpNode* neg_node = alternatives_->at(0).node();
2863 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
2864 // If the negative lookahead is always going to fail then
2865 // we don't need to check it.
2866 if (neg_replacement == nullptr) return set_replacement(replacement);
2867 alternatives_->at(0).set_node(neg_replacement);
2868 return set_replacement(this);
2869 }
2870
2871
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2872 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2873 RegExpCompiler* compiler,
2874 int characters_filled_in,
2875 bool not_at_start) {
2876 if (body_can_be_zero_length_ || info()->visited) return;
2877 VisitMarker marker(info());
2878 return ChoiceNode::GetQuickCheckDetails(details,
2879 compiler,
2880 characters_filled_in,
2881 not_at_start);
2882 }
2883
2884
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2885 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2886 BoyerMooreLookahead* bm, bool not_at_start) {
2887 if (body_can_be_zero_length_ || budget <= 0) {
2888 bm->SetRest(offset);
2889 SaveBMInfo(bm, not_at_start, offset);
2890 return;
2891 }
2892 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2893 SaveBMInfo(bm, not_at_start, offset);
2894 }
2895
2896
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2897 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2898 RegExpCompiler* compiler,
2899 int characters_filled_in,
2900 bool not_at_start) {
2901 not_at_start = (not_at_start || not_at_start_);
2902 int choice_count = alternatives_->length();
2903 DCHECK_LT(0, choice_count);
2904 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2905 compiler,
2906 characters_filled_in,
2907 not_at_start);
2908 for (int i = 1; i < choice_count; i++) {
2909 QuickCheckDetails new_details(details->characters());
2910 RegExpNode* node = alternatives_->at(i).node();
2911 node->GetQuickCheckDetails(&new_details, compiler,
2912 characters_filled_in,
2913 not_at_start);
2914 // Here we merge the quick match details of the two branches.
2915 details->Merge(&new_details, characters_filled_in);
2916 }
2917 }
2918
2919
2920 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2921 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2922 Label* word,
2923 Label* non_word,
2924 bool fall_through_on_word) {
2925 if (assembler->CheckSpecialCharacterClass(
2926 fall_through_on_word ? 'w' : 'W',
2927 fall_through_on_word ? non_word : word)) {
2928 // Optimized implementation available.
2929 return;
2930 }
2931 assembler->CheckCharacterGT('z', non_word);
2932 assembler->CheckCharacterLT('0', non_word);
2933 assembler->CheckCharacterGT('a' - 1, word);
2934 assembler->CheckCharacterLT('9' + 1, word);
2935 assembler->CheckCharacterLT('A', non_word);
2936 assembler->CheckCharacterLT('Z' + 1, word);
2937 if (fall_through_on_word) {
2938 assembler->CheckNotCharacter('_', non_word);
2939 } else {
2940 assembler->CheckCharacter('_', word);
2941 }
2942 }
2943
2944
2945 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2946 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2947 static void EmitHat(RegExpCompiler* compiler,
2948 RegExpNode* on_success,
2949 Trace* trace) {
2950 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2951 // We will be loading the previous character into the current character
2952 // register.
2953 Trace new_trace(*trace);
2954 new_trace.InvalidateCurrentCharacter();
2955
2956 Label ok;
2957 if (new_trace.cp_offset() == 0) {
2958 // The start of input counts as a newline in this context, so skip to
2959 // ok if we are at the start.
2960 assembler->CheckAtStart(&ok);
2961 }
2962 // We already checked that we are not at the start of input so it must be
2963 // OK to load the previous character.
2964 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2965 new_trace.backtrack(),
2966 false);
2967 if (!assembler->CheckSpecialCharacterClass('n',
2968 new_trace.backtrack())) {
2969 // Newline means \n, \r, 0x2028 or 0x2029.
2970 if (!compiler->one_byte()) {
2971 assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2972 }
2973 assembler->CheckCharacter('\n', &ok);
2974 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2975 }
2976 assembler->Bind(&ok);
2977 on_success->Emit(compiler, &new_trace);
2978 }
2979
2980
2981 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)2982 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2983 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2984 Isolate* isolate = assembler->isolate();
2985 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2986 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2987 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2988 if (lookahead == nullptr) {
2989 int eats_at_least =
2990 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
2991 kRecursionBudget,
2992 not_at_start));
2993 if (eats_at_least >= 1) {
2994 BoyerMooreLookahead* bm =
2995 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
2996 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
2997 if (bm->at(0)->is_non_word())
2998 next_is_word_character = Trace::FALSE_VALUE;
2999 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3000 }
3001 } else {
3002 if (lookahead->at(0)->is_non_word())
3003 next_is_word_character = Trace::FALSE_VALUE;
3004 if (lookahead->at(0)->is_word())
3005 next_is_word_character = Trace::TRUE_VALUE;
3006 }
3007 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3008 if (next_is_word_character == Trace::UNKNOWN) {
3009 Label before_non_word;
3010 Label before_word;
3011 if (trace->characters_preloaded() != 1) {
3012 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3013 }
3014 // Fall through on non-word.
3015 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3016 // Next character is not a word character.
3017 assembler->Bind(&before_non_word);
3018 Label ok;
3019 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3020 assembler->GoTo(&ok);
3021
3022 assembler->Bind(&before_word);
3023 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3024 assembler->Bind(&ok);
3025 } else if (next_is_word_character == Trace::TRUE_VALUE) {
3026 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3027 } else {
3028 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3029 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3030 }
3031 }
3032
3033
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)3034 void AssertionNode::BacktrackIfPrevious(
3035 RegExpCompiler* compiler,
3036 Trace* trace,
3037 AssertionNode::IfPrevious backtrack_if_previous) {
3038 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3039 Trace new_trace(*trace);
3040 new_trace.InvalidateCurrentCharacter();
3041
3042 Label fall_through, dummy;
3043
3044 Label* non_word = backtrack_if_previous == kIsNonWord ?
3045 new_trace.backtrack() :
3046 &fall_through;
3047 Label* word = backtrack_if_previous == kIsNonWord ?
3048 &fall_through :
3049 new_trace.backtrack();
3050
3051 if (new_trace.cp_offset() == 0) {
3052 // The start of input counts as a non-word character, so the question is
3053 // decided if we are at the start.
3054 assembler->CheckAtStart(non_word);
3055 }
3056 // We already checked that we are not at the start of input so it must be
3057 // OK to load the previous character.
3058 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3059 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3060
3061 assembler->Bind(&fall_through);
3062 on_success()->Emit(compiler, &new_trace);
3063 }
3064
3065
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)3066 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3067 RegExpCompiler* compiler,
3068 int filled_in,
3069 bool not_at_start) {
3070 if (assertion_type_ == AT_START && not_at_start) {
3071 details->set_cannot_match();
3072 return;
3073 }
3074 return on_success()->GetQuickCheckDetails(details,
3075 compiler,
3076 filled_in,
3077 not_at_start);
3078 }
3079
3080
Emit(RegExpCompiler * compiler,Trace * trace)3081 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3082 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3083 switch (assertion_type_) {
3084 case AT_END: {
3085 Label ok;
3086 assembler->CheckPosition(trace->cp_offset(), &ok);
3087 assembler->GoTo(trace->backtrack());
3088 assembler->Bind(&ok);
3089 break;
3090 }
3091 case AT_START: {
3092 if (trace->at_start() == Trace::FALSE_VALUE) {
3093 assembler->GoTo(trace->backtrack());
3094 return;
3095 }
3096 if (trace->at_start() == Trace::UNKNOWN) {
3097 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3098 Trace at_start_trace = *trace;
3099 at_start_trace.set_at_start(Trace::TRUE_VALUE);
3100 on_success()->Emit(compiler, &at_start_trace);
3101 return;
3102 }
3103 }
3104 break;
3105 case AFTER_NEWLINE:
3106 EmitHat(compiler, on_success(), trace);
3107 return;
3108 case AT_BOUNDARY:
3109 case AT_NON_BOUNDARY: {
3110 EmitBoundaryCheck(compiler, trace);
3111 return;
3112 }
3113 }
3114 on_success()->Emit(compiler, trace);
3115 }
3116
3117
DeterminedAlready(QuickCheckDetails * quick_check,int offset)3118 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3119 if (quick_check == nullptr) return false;
3120 if (offset >= quick_check->characters()) return false;
3121 return quick_check->positions(offset)->determines_perfectly;
3122 }
3123
3124
UpdateBoundsCheck(int index,int * checked_up_to)3125 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3126 if (index > *checked_up_to) {
3127 *checked_up_to = index;
3128 }
3129 }
3130
3131
3132 // We call this repeatedly to generate code for each pass over the text node.
3133 // The passes are in increasing order of difficulty because we hope one
3134 // of the first passes will fail in which case we are saved the work of the
3135 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3136 // we will check the '%' in the first pass, the case independent 'a' in the
3137 // second pass and the character class in the last pass.
3138 //
3139 // The passes are done from right to left, so for example to test for /bar/
3140 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3141 // and then a 'b' with offset 0. This means we can avoid the end-of-input
3142 // bounds check most of the time. In the example we only need to check for
3143 // end-of-input when loading the putative 'r'.
3144 //
3145 // A slight complication involves the fact that the first character may already
3146 // be fetched into a register by the previous node. In this case we want to
3147 // do the test for that character first. We do this in separate passes. The
3148 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3149 // pass has been performed then subsequent passes will have true in
3150 // first_element_checked to indicate that that character does not need to be
3151 // checked again.
3152 //
3153 // In addition to all this we are passed a Trace, which can
3154 // contain an AlternativeGeneration object. In this AlternativeGeneration
3155 // object we can see details of any quick check that was already passed in
3156 // order to get to the code we are now generating. The quick check can involve
3157 // loading characters, which means we do not need to recheck the bounds
3158 // up to the limit the quick check already checked. In addition the quick
3159 // check can have involved a mask and compare operation which may simplify
3160 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)3161 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3162 TextEmitPassType pass,
3163 bool preloaded,
3164 Trace* trace,
3165 bool first_element_checked,
3166 int* checked_up_to) {
3167 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3168 Isolate* isolate = assembler->isolate();
3169 bool one_byte = compiler->one_byte();
3170 Label* backtrack = trace->backtrack();
3171 QuickCheckDetails* quick_check = trace->quick_check_performed();
3172 int element_count = elements()->length();
3173 int backward_offset = read_backward() ? -Length() : 0;
3174 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3175 TextElement elm = elements()->at(i);
3176 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3177 if (elm.text_type() == TextElement::ATOM) {
3178 if (SkipPass(pass, elm.atom()->ignore_case())) continue;
3179 Vector<const uc16> quarks = elm.atom()->data();
3180 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3181 if (first_element_checked && i == 0 && j == 0) continue;
3182 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3183 EmitCharacterFunction* emit_function = nullptr;
3184 uc16 quark = quarks[j];
3185 if (elm.atom()->ignore_case()) {
3186 // Everywhere else we assume that a non-Latin-1 character cannot match
3187 // a Latin-1 character. Avoid the cases where this is assumption is
3188 // invalid by using the Latin1 equivalent instead.
3189 quark = unibrow::Latin1::TryConvertToLatin1(quark);
3190 }
3191 switch (pass) {
3192 case NON_LATIN1_MATCH:
3193 DCHECK(one_byte);
3194 if (quark > String::kMaxOneByteCharCode) {
3195 assembler->GoTo(backtrack);
3196 return;
3197 }
3198 break;
3199 case NON_LETTER_CHARACTER_MATCH:
3200 emit_function = &EmitAtomNonLetter;
3201 break;
3202 case SIMPLE_CHARACTER_MATCH:
3203 emit_function = &EmitSimpleCharacter;
3204 break;
3205 case CASE_CHARACTER_MATCH:
3206 emit_function = &EmitAtomLetter;
3207 break;
3208 default:
3209 break;
3210 }
3211 if (emit_function != nullptr) {
3212 bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3213 bool bound_checked =
3214 emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
3215 bounds_check, preloaded);
3216 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3217 }
3218 }
3219 } else {
3220 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3221 if (pass == CHARACTER_CLASS_MATCH) {
3222 if (first_element_checked && i == 0) continue;
3223 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3224 RegExpCharacterClass* cc = elm.char_class();
3225 bool bounds_check = *checked_up_to < cp_offset || read_backward();
3226 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3227 bounds_check, preloaded, zone());
3228 UpdateBoundsCheck(cp_offset, checked_up_to);
3229 }
3230 }
3231 }
3232 }
3233
3234
Length()3235 int TextNode::Length() {
3236 TextElement elm = elements()->last();
3237 DCHECK_LE(0, elm.cp_offset());
3238 return elm.cp_offset() + elm.length();
3239 }
3240
SkipPass(TextEmitPassType pass,bool ignore_case)3241 bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
3242 if (ignore_case) {
3243 return pass == SIMPLE_CHARACTER_MATCH;
3244 } else {
3245 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3246 }
3247 }
3248
CreateForCharacterRanges(Zone * zone,ZoneList<CharacterRange> * ranges,bool read_backward,RegExpNode * on_success,JSRegExp::Flags flags)3249 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3250 ZoneList<CharacterRange>* ranges,
3251 bool read_backward,
3252 RegExpNode* on_success,
3253 JSRegExp::Flags flags) {
3254 DCHECK_NOT_NULL(ranges);
3255 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3256 elms->Add(TextElement::CharClass(
3257 new (zone) RegExpCharacterClass(zone, ranges, flags)),
3258 zone);
3259 return new (zone) TextNode(elms, read_backward, on_success);
3260 }
3261
CreateForSurrogatePair(Zone * zone,CharacterRange lead,CharacterRange trail,bool read_backward,RegExpNode * on_success,JSRegExp::Flags flags)3262 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3263 CharacterRange trail,
3264 bool read_backward,
3265 RegExpNode* on_success,
3266 JSRegExp::Flags flags) {
3267 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3268 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3269 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3270 elms->Add(TextElement::CharClass(
3271 new (zone) RegExpCharacterClass(zone, lead_ranges, flags)),
3272 zone);
3273 elms->Add(TextElement::CharClass(
3274 new (zone) RegExpCharacterClass(zone, trail_ranges, flags)),
3275 zone);
3276 return new (zone) TextNode(elms, read_backward, on_success);
3277 }
3278
3279
3280 // This generates the code to match a text node. A text node can contain
3281 // straight character sequences (possibly to be matched in a case-independent
3282 // way) and character classes. For efficiency we do not do this in a single
3283 // pass from left to right. Instead we pass over the text node several times,
3284 // emitting code for some character positions every time. See the comment on
3285 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)3286 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3287 LimitResult limit_result = LimitVersions(compiler, trace);
3288 if (limit_result == DONE) return;
3289 DCHECK(limit_result == CONTINUE);
3290
3291 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3292 compiler->SetRegExpTooBig();
3293 return;
3294 }
3295
3296 if (compiler->one_byte()) {
3297 int dummy = 0;
3298 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3299 }
3300
3301 bool first_elt_done = false;
3302 int bound_checked_to = trace->cp_offset() - 1;
3303 bound_checked_to += trace->bound_checked_up_to();
3304
3305 // If a character is preloaded into the current character register then
3306 // check that now.
3307 if (trace->characters_preloaded() == 1) {
3308 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3309 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
3310 false, &bound_checked_to);
3311 }
3312 first_elt_done = true;
3313 }
3314
3315 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3316 TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
3317 first_elt_done, &bound_checked_to);
3318 }
3319
3320 Trace successor_trace(*trace);
3321 // If we advance backward, we may end up at the start.
3322 successor_trace.AdvanceCurrentPositionInTrace(
3323 read_backward() ? -Length() : Length(), compiler);
3324 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3325 : Trace::FALSE_VALUE);
3326 RecursionCheck rc(compiler);
3327 on_success()->Emit(compiler, &successor_trace);
3328 }
3329
3330
InvalidateCurrentCharacter()3331 void Trace::InvalidateCurrentCharacter() {
3332 characters_preloaded_ = 0;
3333 }
3334
3335
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)3336 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3337 // We don't have an instruction for shifting the current character register
3338 // down or for using a shifted value for anything so lets just forget that
3339 // we preloaded any characters into it.
3340 characters_preloaded_ = 0;
3341 // Adjust the offsets of the quick check performed information. This
3342 // information is used to find out what we already determined about the
3343 // characters by means of mask and compare.
3344 quick_check_performed_.Advance(by, compiler->one_byte());
3345 cp_offset_ += by;
3346 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3347 compiler->SetRegExpTooBig();
3348 cp_offset_ = 0;
3349 }
3350 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3351 }
3352
3353
MakeCaseIndependent(Isolate * isolate,bool is_one_byte)3354 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3355 int element_count = elements()->length();
3356 for (int i = 0; i < element_count; i++) {
3357 TextElement elm = elements()->at(i);
3358 if (elm.text_type() == TextElement::CHAR_CLASS) {
3359 RegExpCharacterClass* cc = elm.char_class();
3360 #ifdef V8_INTL_SUPPORT
3361 bool case_equivalents_already_added =
3362 NeedsUnicodeCaseEquivalents(cc->flags());
3363 #else
3364 bool case_equivalents_already_added = false;
3365 #endif
3366 if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
3367 // None of the standard character classes is different in the case
3368 // independent case and it slows us down if we don't know that.
3369 if (cc->is_standard(zone())) continue;
3370 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3371 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
3372 is_one_byte);
3373 }
3374 }
3375 }
3376 }
3377
3378
GreedyLoopTextLength()3379 int TextNode::GreedyLoopTextLength() { return Length(); }
3380
3381
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)3382 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3383 RegExpCompiler* compiler) {
3384 if (read_backward()) return nullptr;
3385 if (elements()->length() != 1) return nullptr;
3386 TextElement elm = elements()->at(0);
3387 if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
3388 RegExpCharacterClass* node = elm.char_class();
3389 ZoneList<CharacterRange>* ranges = node->ranges(zone());
3390 CharacterRange::Canonicalize(ranges);
3391 if (node->is_negated()) {
3392 return ranges->length() == 0 ? on_success() : nullptr;
3393 }
3394 if (ranges->length() != 1) return nullptr;
3395 uint32_t max_char;
3396 if (compiler->one_byte()) {
3397 max_char = String::kMaxOneByteCharCode;
3398 } else {
3399 max_char = String::kMaxUtf16CodeUnit;
3400 }
3401 return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
3402 }
3403
3404
3405 // Finds the fixed match length of a sequence of nodes that goes from
3406 // this alternative and back to this choice node. If there are variable
3407 // length nodes or other complications in the way then return a sentinel
3408 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)3409 int ChoiceNode::GreedyLoopTextLengthForAlternative(
3410 GuardedAlternative* alternative) {
3411 int length = 0;
3412 RegExpNode* node = alternative->node();
3413 // Later we will generate code for all these text nodes using recursion
3414 // so we have to limit the max number.
3415 int recursion_depth = 0;
3416 while (node != this) {
3417 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3418 return kNodeIsTooComplexForGreedyLoops;
3419 }
3420 int node_length = node->GreedyLoopTextLength();
3421 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3422 return kNodeIsTooComplexForGreedyLoops;
3423 }
3424 length += node_length;
3425 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3426 node = seq_node->on_success();
3427 }
3428 return read_backward() ? -length : length;
3429 }
3430
3431
AddLoopAlternative(GuardedAlternative alt)3432 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3433 DCHECK_NULL(loop_node_);
3434 AddAlternative(alt);
3435 loop_node_ = alt.node();
3436 }
3437
3438
AddContinueAlternative(GuardedAlternative alt)3439 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3440 DCHECK_NULL(continue_node_);
3441 AddAlternative(alt);
3442 continue_node_ = alt.node();
3443 }
3444
3445
Emit(RegExpCompiler * compiler,Trace * trace)3446 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3447 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3448 if (trace->stop_node() == this) {
3449 // Back edge of greedy optimized loop node graph.
3450 int text_length =
3451 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3452 DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
3453 // Update the counter-based backtracking info on the stack. This is an
3454 // optimization for greedy loops (see below).
3455 DCHECK(trace->cp_offset() == text_length);
3456 macro_assembler->AdvanceCurrentPosition(text_length);
3457 macro_assembler->GoTo(trace->loop_label());
3458 return;
3459 }
3460 DCHECK_NULL(trace->stop_node());
3461 if (!trace->is_trivial()) {
3462 trace->Flush(compiler, this);
3463 return;
3464 }
3465 ChoiceNode::Emit(compiler, trace);
3466 }
3467
3468
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)3469 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3470 int eats_at_least) {
3471 int preload_characters = Min(4, eats_at_least);
3472 DCHECK_LE(preload_characters, 4);
3473 if (compiler->macro_assembler()->CanReadUnaligned()) {
3474 bool one_byte = compiler->one_byte();
3475 if (one_byte) {
3476 // We can't preload 3 characters because there is no machine instruction
3477 // to do that. We can't just load 4 because we could be reading
3478 // beyond the end of the string, which could cause a memory fault.
3479 if (preload_characters == 3) preload_characters = 2;
3480 } else {
3481 if (preload_characters > 2) preload_characters = 2;
3482 }
3483 } else {
3484 if (preload_characters > 1) preload_characters = 1;
3485 }
3486 return preload_characters;
3487 }
3488
3489
3490 // This class is used when generating the alternatives in a choice node. It
3491 // records the way the alternative is being code generated.
3492 class AlternativeGeneration: public Malloced {
3493 public:
AlternativeGeneration()3494 AlternativeGeneration()
3495 : possible_success(),
3496 expects_preload(false),
3497 after(),
3498 quick_check_details() { }
3499 Label possible_success;
3500 bool expects_preload;
3501 Label after;
3502 QuickCheckDetails quick_check_details;
3503 };
3504
3505
3506 // Creates a list of AlternativeGenerations. If the list has a reasonable
3507 // size then it is on the stack, otherwise the excess is on the heap.
3508 class AlternativeGenerationList {
3509 public:
AlternativeGenerationList(int count,Zone * zone)3510 AlternativeGenerationList(int count, Zone* zone)
3511 : alt_gens_(count, zone) {
3512 for (int i = 0; i < count && i < kAFew; i++) {
3513 alt_gens_.Add(a_few_alt_gens_ + i, zone);
3514 }
3515 for (int i = kAFew; i < count; i++) {
3516 alt_gens_.Add(new AlternativeGeneration(), zone);
3517 }
3518 }
~AlternativeGenerationList()3519 ~AlternativeGenerationList() {
3520 for (int i = kAFew; i < alt_gens_.length(); i++) {
3521 delete alt_gens_[i];
3522 alt_gens_[i] = nullptr;
3523 }
3524 }
3525
at(int i)3526 AlternativeGeneration* at(int i) {
3527 return alt_gens_[i];
3528 }
3529
3530 private:
3531 static const int kAFew = 10;
3532 ZoneList<AlternativeGeneration*> alt_gens_;
3533 AlternativeGeneration a_few_alt_gens_[kAFew];
3534 };
3535
3536
3537 static const uc32 kRangeEndMarker = 0x110000;
3538
3539 // The '2' variant is has inclusive from and exclusive to.
3540 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3541 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3542 static const int kSpaceRanges[] = {
3543 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
3544 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3545 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
3546 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3547
3548 static const int kWordRanges[] = {
3549 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3550 static const int kWordRangeCount = arraysize(kWordRanges);
3551 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3552 static const int kDigitRangeCount = arraysize(kDigitRanges);
3553 static const int kSurrogateRanges[] = {
3554 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3555 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3556 static const int kLineTerminatorRanges[] = {
3557 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3558 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3559
Set(int character)3560 void BoyerMoorePositionInfo::Set(int character) {
3561 SetInterval(Interval(character, character));
3562 }
3563
3564
SetInterval(const Interval & interval)3565 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3566 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3567 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3568 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3569 surrogate_ =
3570 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3571 if (interval.to() - interval.from() >= kMapSize - 1) {
3572 if (map_count_ != kMapSize) {
3573 map_count_ = kMapSize;
3574 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3575 }
3576 return;
3577 }
3578 for (int i = interval.from(); i <= interval.to(); i++) {
3579 int mod_character = (i & kMask);
3580 if (!map_->at(mod_character)) {
3581 map_count_++;
3582 map_->at(mod_character) = true;
3583 }
3584 if (map_count_ == kMapSize) return;
3585 }
3586 }
3587
3588
SetAll()3589 void BoyerMoorePositionInfo::SetAll() {
3590 s_ = w_ = d_ = kLatticeUnknown;
3591 if (map_count_ != kMapSize) {
3592 map_count_ = kMapSize;
3593 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3594 }
3595 }
3596
3597
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)3598 BoyerMooreLookahead::BoyerMooreLookahead(
3599 int length, RegExpCompiler* compiler, Zone* zone)
3600 : length_(length),
3601 compiler_(compiler) {
3602 if (compiler->one_byte()) {
3603 max_char_ = String::kMaxOneByteCharCode;
3604 } else {
3605 max_char_ = String::kMaxUtf16CodeUnit;
3606 }
3607 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3608 for (int i = 0; i < length; i++) {
3609 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3610 }
3611 }
3612
3613
3614 // Find the longest range of lookahead that has the fewest number of different
3615 // characters that can occur at a given position. Since we are optimizing two
3616 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)3617 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3618 int biggest_points = 0;
3619 // If more than 32 characters out of 128 can occur it is unlikely that we can
3620 // be lucky enough to step forwards much of the time.
3621 const int kMaxMax = 32;
3622 for (int max_number_of_chars = 4;
3623 max_number_of_chars < kMaxMax;
3624 max_number_of_chars *= 2) {
3625 biggest_points =
3626 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3627 }
3628 if (biggest_points == 0) return false;
3629 return true;
3630 }
3631
3632
3633 // Find the highest-points range between 0 and length_ where the character
3634 // information is not too vague. 'Too vague' means that there are more than
3635 // max_number_of_chars that can occur at this position. Calculates the number
3636 // of points as the product of width-of-the-range and
3637 // probability-of-finding-one-of-the-characters, where the probability is
3638 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)3639 int BoyerMooreLookahead::FindBestInterval(
3640 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3641 int biggest_points = old_biggest_points;
3642 static const int kSize = RegExpMacroAssembler::kTableSize;
3643 for (int i = 0; i < length_; ) {
3644 while (i < length_ && Count(i) > max_number_of_chars) i++;
3645 if (i == length_) break;
3646 int remembered_from = i;
3647 bool union_map[kSize];
3648 for (int j = 0; j < kSize; j++) union_map[j] = false;
3649 while (i < length_ && Count(i) <= max_number_of_chars) {
3650 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3651 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3652 i++;
3653 }
3654 int frequency = 0;
3655 for (int j = 0; j < kSize; j++) {
3656 if (union_map[j]) {
3657 // Add 1 to the frequency to give a small per-character boost for
3658 // the cases where our sampling is not good enough and many
3659 // characters have a frequency of zero. This means the frequency
3660 // can theoretically be up to 2*kSize though we treat it mostly as
3661 // a fraction of kSize.
3662 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3663 }
3664 }
3665 // We use the probability of skipping times the distance we are skipping to
3666 // judge the effectiveness of this. Actually we have a cut-off: By
3667 // dividing by 2 we switch off the skipping if the probability of skipping
3668 // is less than 50%. This is because the multibyte mask-and-compare
3669 // skipping in quickcheck is more likely to do well on this case.
3670 bool in_quickcheck_range =
3671 ((i - remembered_from < 4) ||
3672 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3673 // Called 'probability' but it is only a rough estimate and can actually
3674 // be outside the 0-kSize range.
3675 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3676 int points = (i - remembered_from) * probability;
3677 if (points > biggest_points) {
3678 *from = remembered_from;
3679 *to = i - 1;
3680 biggest_points = points;
3681 }
3682 }
3683 return biggest_points;
3684 }
3685
3686
3687 // Take all the characters that will not prevent a successful match if they
3688 // occur in the subject string in the range between min_lookahead and
3689 // max_lookahead (inclusive) measured from the current position. If the
3690 // character at max_lookahead offset is not one of these characters, then we
3691 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)3692 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3693 int max_lookahead,
3694 Handle<ByteArray> boolean_skip_table) {
3695 const int kSize = RegExpMacroAssembler::kTableSize;
3696
3697 const int kSkipArrayEntry = 0;
3698 const int kDontSkipArrayEntry = 1;
3699
3700 for (int i = 0; i < kSize; i++) {
3701 boolean_skip_table->set(i, kSkipArrayEntry);
3702 }
3703 int skip = max_lookahead + 1 - min_lookahead;
3704
3705 for (int i = max_lookahead; i >= min_lookahead; i--) {
3706 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3707 for (int j = 0; j < kSize; j++) {
3708 if (map->at(j)) {
3709 boolean_skip_table->set(j, kDontSkipArrayEntry);
3710 }
3711 }
3712 }
3713
3714 return skip;
3715 }
3716
3717
3718 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)3719 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3720 const int kSize = RegExpMacroAssembler::kTableSize;
3721
3722 int min_lookahead = 0;
3723 int max_lookahead = 0;
3724
3725 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3726
3727 bool found_single_character = false;
3728 int single_character = 0;
3729 for (int i = max_lookahead; i >= min_lookahead; i--) {
3730 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3731 if (map->map_count() > 1 ||
3732 (found_single_character && map->map_count() != 0)) {
3733 found_single_character = false;
3734 break;
3735 }
3736 for (int j = 0; j < kSize; j++) {
3737 if (map->at(j)) {
3738 found_single_character = true;
3739 single_character = j;
3740 break;
3741 }
3742 }
3743 }
3744
3745 int lookahead_width = max_lookahead + 1 - min_lookahead;
3746
3747 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3748 // The mask-compare can probably handle this better.
3749 return;
3750 }
3751
3752 if (found_single_character) {
3753 Label cont, again;
3754 masm->Bind(&again);
3755 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3756 if (max_char_ > kSize) {
3757 masm->CheckCharacterAfterAnd(single_character,
3758 RegExpMacroAssembler::kTableMask,
3759 &cont);
3760 } else {
3761 masm->CheckCharacter(single_character, &cont);
3762 }
3763 masm->AdvanceCurrentPosition(lookahead_width);
3764 masm->GoTo(&again);
3765 masm->Bind(&cont);
3766 return;
3767 }
3768
3769 Factory* factory = masm->isolate()->factory();
3770 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3771 int skip_distance = GetSkipTable(
3772 min_lookahead, max_lookahead, boolean_skip_table);
3773 DCHECK_NE(0, skip_distance);
3774
3775 Label cont, again;
3776 masm->Bind(&again);
3777 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3778 masm->CheckBitInTable(boolean_skip_table, &cont);
3779 masm->AdvanceCurrentPosition(skip_distance);
3780 masm->GoTo(&again);
3781 masm->Bind(&cont);
3782 }
3783
3784
3785 /* Code generation for choice nodes.
3786 *
3787 * We generate quick checks that do a mask and compare to eliminate a
3788 * choice. If the quick check succeeds then it jumps to the continuation to
3789 * do slow checks and check subsequent nodes. If it fails (the common case)
3790 * it falls through to the next choice.
3791 *
3792 * Here is the desired flow graph. Nodes directly below each other imply
3793 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3794 * 3 doesn't have a quick check so we have to call the slow check.
3795 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3796 * regexp continuation is generated directly after the Sn node, up to the
3797 * next GoTo if we decide to reuse some already generated code. Some
3798 * nodes expect preload_characters to be preloaded into the current
3799 * character register. R nodes do this preloading. Vertices are marked
3800 * F for failures and S for success (possible success in the case of quick
3801 * nodes). L, V, < and > are used as arrow heads.
3802 *
3803 * ----------> R
3804 * |
3805 * V
3806 * Q1 -----> S1
3807 * | S /
3808 * F| /
3809 * | F/
3810 * | /
3811 * | R
3812 * | /
3813 * V L
3814 * Q2 -----> S2
3815 * | S /
3816 * F| /
3817 * | F/
3818 * | /
3819 * | R
3820 * | /
3821 * V L
3822 * S3
3823 * |
3824 * F|
3825 * |
3826 * R
3827 * |
3828 * backtrack V
3829 * <----------Q4
3830 * \ F |
3831 * \ |S
3832 * \ F V
3833 * \-----S4
3834 *
3835 * For greedy loops we push the current position, then generate the code that
3836 * eats the input specially in EmitGreedyLoop. The other choice (the
3837 * continuation) is generated by the normal code in EmitChoices, and steps back
3838 * in the input to the starting position when it fails to match. The loop code
3839 * looks like this (U is the unwind code that steps back in the greedy loop).
3840 *
3841 * _____
3842 * / \
3843 * V |
3844 * ----------> S1 |
3845 * /| |
3846 * / |S |
3847 * F/ \_____/
3848 * /
3849 * |<-----
3850 * | \
3851 * V |S
3852 * Q2 ---> U----->backtrack
3853 * | F /
3854 * S| /
3855 * V F /
3856 * S2--/
3857 */
3858
GreedyLoopState(bool not_at_start)3859 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3860 counter_backtrack_trace_.set_backtrack(&label_);
3861 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3862 }
3863
3864
AssertGuardsMentionRegisters(Trace * trace)3865 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3866 #ifdef DEBUG
3867 int choice_count = alternatives_->length();
3868 for (int i = 0; i < choice_count - 1; i++) {
3869 GuardedAlternative alternative = alternatives_->at(i);
3870 ZoneList<Guard*>* guards = alternative.guards();
3871 int guard_count = (guards == nullptr) ? 0 : guards->length();
3872 for (int j = 0; j < guard_count; j++) {
3873 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3874 }
3875 }
3876 #endif
3877 }
3878
3879
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)3880 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3881 Trace* current_trace,
3882 PreloadState* state) {
3883 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3884 // Save some time by looking at most one machine word ahead.
3885 state->eats_at_least_ =
3886 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3887 current_trace->at_start() == Trace::FALSE_VALUE);
3888 }
3889 state->preload_characters_ =
3890 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3891
3892 state->preload_is_current_ =
3893 (current_trace->characters_preloaded() == state->preload_characters_);
3894 state->preload_has_checked_bounds_ = state->preload_is_current_;
3895 }
3896
3897
Emit(RegExpCompiler * compiler,Trace * trace)3898 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3899 int choice_count = alternatives_->length();
3900
3901 if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3902 alternatives_->at(0).node()->Emit(compiler, trace);
3903 return;
3904 }
3905
3906 AssertGuardsMentionRegisters(trace);
3907
3908 LimitResult limit_result = LimitVersions(compiler, trace);
3909 if (limit_result == DONE) return;
3910 DCHECK(limit_result == CONTINUE);
3911
3912 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3913 // other choice nodes we only flush if we are out of code size budget.
3914 if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3915 trace->Flush(compiler, this);
3916 return;
3917 }
3918
3919 RecursionCheck rc(compiler);
3920
3921 PreloadState preload;
3922 preload.init();
3923 GreedyLoopState greedy_loop_state(not_at_start());
3924
3925 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3926 AlternativeGenerationList alt_gens(choice_count, zone());
3927
3928 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3929 trace = EmitGreedyLoop(compiler,
3930 trace,
3931 &alt_gens,
3932 &preload,
3933 &greedy_loop_state,
3934 text_length);
3935 } else {
3936 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3937 // match the traces produced pre-cleanup.
3938 Label second_choice;
3939 compiler->macro_assembler()->Bind(&second_choice);
3940
3941 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3942
3943 EmitChoices(compiler,
3944 &alt_gens,
3945 0,
3946 trace,
3947 &preload);
3948 }
3949
3950 // At this point we need to generate slow checks for the alternatives where
3951 // the quick check was inlined. We can recognize these because the associated
3952 // label was bound.
3953 int new_flush_budget = trace->flush_budget() / choice_count;
3954 for (int i = 0; i < choice_count; i++) {
3955 AlternativeGeneration* alt_gen = alt_gens.at(i);
3956 Trace new_trace(*trace);
3957 // If there are actions to be flushed we have to limit how many times
3958 // they are flushed. Take the budget of the parent trace and distribute
3959 // it fairly amongst the children.
3960 if (new_trace.actions() != nullptr) {
3961 new_trace.set_flush_budget(new_flush_budget);
3962 }
3963 bool next_expects_preload =
3964 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3965 EmitOutOfLineContinuation(compiler,
3966 &new_trace,
3967 alternatives_->at(i),
3968 alt_gen,
3969 preload.preload_characters_,
3970 next_expects_preload);
3971 }
3972 }
3973
3974
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)3975 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3976 Trace* trace,
3977 AlternativeGenerationList* alt_gens,
3978 PreloadState* preload,
3979 GreedyLoopState* greedy_loop_state,
3980 int text_length) {
3981 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3982 // Here we have special handling for greedy loops containing only text nodes
3983 // and other simple nodes. These are handled by pushing the current
3984 // position on the stack and then incrementing the current position each
3985 // time around the switch. On backtrack we decrement the current position
3986 // and check it against the pushed value. This avoids pushing backtrack
3987 // information for each iteration of the loop, which could take up a lot of
3988 // space.
3989 DCHECK(trace->stop_node() == nullptr);
3990 macro_assembler->PushCurrentPosition();
3991 Label greedy_match_failed;
3992 Trace greedy_match_trace;
3993 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3994 greedy_match_trace.set_backtrack(&greedy_match_failed);
3995 Label loop_label;
3996 macro_assembler->Bind(&loop_label);
3997 greedy_match_trace.set_stop_node(this);
3998 greedy_match_trace.set_loop_label(&loop_label);
3999 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4000 macro_assembler->Bind(&greedy_match_failed);
4001
4002 Label second_choice; // For use in greedy matches.
4003 macro_assembler->Bind(&second_choice);
4004
4005 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4006
4007 EmitChoices(compiler,
4008 alt_gens,
4009 1,
4010 new_trace,
4011 preload);
4012
4013 macro_assembler->Bind(greedy_loop_state->label());
4014 // If we have unwound to the bottom then backtrack.
4015 macro_assembler->CheckGreedyLoop(trace->backtrack());
4016 // Otherwise try the second priority at an earlier position.
4017 macro_assembler->AdvanceCurrentPosition(-text_length);
4018 macro_assembler->GoTo(&second_choice);
4019 return new_trace;
4020 }
4021
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)4022 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4023 Trace* trace) {
4024 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4025 if (alternatives_->length() != 2) return eats_at_least;
4026
4027 GuardedAlternative alt1 = alternatives_->at(1);
4028 if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
4029 return eats_at_least;
4030 }
4031 RegExpNode* eats_anything_node = alt1.node();
4032 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4033 return eats_at_least;
4034 }
4035
4036 // Really we should be creating a new trace when we execute this function,
4037 // but there is no need, because the code it generates cannot backtrack, and
4038 // we always arrive here with a trivial trace (since it's the entry to a
4039 // loop. That also implies that there are no preloaded characters, which is
4040 // good, because it means we won't be violating any assumptions by
4041 // overwriting those characters with new load instructions.
4042 DCHECK(trace->is_trivial());
4043
4044 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4045 Isolate* isolate = macro_assembler->isolate();
4046 // At this point we know that we are at a non-greedy loop that will eat
4047 // any character one at a time. Any non-anchored regexp has such a
4048 // loop prepended to it in order to find where it starts. We look for
4049 // a pattern of the form ...abc... where we can look 6 characters ahead
4050 // and step forwards 3 if the character is not one of abc. Abc need
4051 // not be atoms, they can be any reasonably limited character class or
4052 // small alternation.
4053 BoyerMooreLookahead* bm = bm_info(false);
4054 if (bm == nullptr) {
4055 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4056 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4057 kRecursionBudget,
4058 false));
4059 if (eats_at_least >= 1) {
4060 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4061 compiler,
4062 zone());
4063 GuardedAlternative alt0 = alternatives_->at(0);
4064 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4065 }
4066 }
4067 if (bm != nullptr) {
4068 bm->EmitSkipInstructions(macro_assembler);
4069 }
4070 return eats_at_least;
4071 }
4072
4073
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)4074 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4075 AlternativeGenerationList* alt_gens,
4076 int first_choice,
4077 Trace* trace,
4078 PreloadState* preload) {
4079 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4080 SetUpPreLoad(compiler, trace, preload);
4081
4082 // For now we just call all choices one after the other. The idea ultimately
4083 // is to use the Dispatch table to try only the relevant ones.
4084 int choice_count = alternatives_->length();
4085
4086 int new_flush_budget = trace->flush_budget() / choice_count;
4087
4088 for (int i = first_choice; i < choice_count; i++) {
4089 bool is_last = i == choice_count - 1;
4090 bool fall_through_on_failure = !is_last;
4091 GuardedAlternative alternative = alternatives_->at(i);
4092 AlternativeGeneration* alt_gen = alt_gens->at(i);
4093 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4094 ZoneList<Guard*>* guards = alternative.guards();
4095 int guard_count = (guards == nullptr) ? 0 : guards->length();
4096 Trace new_trace(*trace);
4097 new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4098 preload->preload_characters_ :
4099 0);
4100 if (preload->preload_has_checked_bounds_) {
4101 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4102 }
4103 new_trace.quick_check_performed()->Clear();
4104 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4105 if (!is_last) {
4106 new_trace.set_backtrack(&alt_gen->after);
4107 }
4108 alt_gen->expects_preload = preload->preload_is_current_;
4109 bool generate_full_check_inline = false;
4110 if (compiler->optimize() &&
4111 try_to_emit_quick_check_for_alternative(i == 0) &&
4112 alternative.node()->EmitQuickCheck(
4113 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4114 &alt_gen->possible_success, &alt_gen->quick_check_details,
4115 fall_through_on_failure)) {
4116 // Quick check was generated for this choice.
4117 preload->preload_is_current_ = true;
4118 preload->preload_has_checked_bounds_ = true;
4119 // If we generated the quick check to fall through on possible success,
4120 // we now need to generate the full check inline.
4121 if (!fall_through_on_failure) {
4122 macro_assembler->Bind(&alt_gen->possible_success);
4123 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4124 new_trace.set_characters_preloaded(preload->preload_characters_);
4125 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4126 generate_full_check_inline = true;
4127 }
4128 } else if (alt_gen->quick_check_details.cannot_match()) {
4129 if (!fall_through_on_failure) {
4130 macro_assembler->GoTo(trace->backtrack());
4131 }
4132 continue;
4133 } else {
4134 // No quick check was generated. Put the full code here.
4135 // If this is not the first choice then there could be slow checks from
4136 // previous cases that go here when they fail. There's no reason to
4137 // insist that they preload characters since the slow check we are about
4138 // to generate probably can't use it.
4139 if (i != first_choice) {
4140 alt_gen->expects_preload = false;
4141 new_trace.InvalidateCurrentCharacter();
4142 }
4143 generate_full_check_inline = true;
4144 }
4145 if (generate_full_check_inline) {
4146 if (new_trace.actions() != nullptr) {
4147 new_trace.set_flush_budget(new_flush_budget);
4148 }
4149 for (int j = 0; j < guard_count; j++) {
4150 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4151 }
4152 alternative.node()->Emit(compiler, &new_trace);
4153 preload->preload_is_current_ = false;
4154 }
4155 macro_assembler->Bind(&alt_gen->after);
4156 }
4157 }
4158
4159
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)4160 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4161 Trace* trace,
4162 GuardedAlternative alternative,
4163 AlternativeGeneration* alt_gen,
4164 int preload_characters,
4165 bool next_expects_preload) {
4166 if (!alt_gen->possible_success.is_linked()) return;
4167
4168 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4169 macro_assembler->Bind(&alt_gen->possible_success);
4170 Trace out_of_line_trace(*trace);
4171 out_of_line_trace.set_characters_preloaded(preload_characters);
4172 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4173 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4174 ZoneList<Guard*>* guards = alternative.guards();
4175 int guard_count = (guards == nullptr) ? 0 : guards->length();
4176 if (next_expects_preload) {
4177 Label reload_current_char;
4178 out_of_line_trace.set_backtrack(&reload_current_char);
4179 for (int j = 0; j < guard_count; j++) {
4180 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4181 }
4182 alternative.node()->Emit(compiler, &out_of_line_trace);
4183 macro_assembler->Bind(&reload_current_char);
4184 // Reload the current character, since the next quick check expects that.
4185 // We don't need to check bounds here because we only get into this
4186 // code through a quick check which already did the checked load.
4187 macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
4188 preload_characters);
4189 macro_assembler->GoTo(&(alt_gen->after));
4190 } else {
4191 out_of_line_trace.set_backtrack(&(alt_gen->after));
4192 for (int j = 0; j < guard_count; j++) {
4193 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4194 }
4195 alternative.node()->Emit(compiler, &out_of_line_trace);
4196 }
4197 }
4198
4199
Emit(RegExpCompiler * compiler,Trace * trace)4200 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4201 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4202 LimitResult limit_result = LimitVersions(compiler, trace);
4203 if (limit_result == DONE) return;
4204 DCHECK(limit_result == CONTINUE);
4205
4206 RecursionCheck rc(compiler);
4207
4208 switch (action_type_) {
4209 case STORE_POSITION: {
4210 Trace::DeferredCapture
4211 new_capture(data_.u_position_register.reg,
4212 data_.u_position_register.is_capture,
4213 trace);
4214 Trace new_trace = *trace;
4215 new_trace.add_action(&new_capture);
4216 on_success()->Emit(compiler, &new_trace);
4217 break;
4218 }
4219 case INCREMENT_REGISTER: {
4220 Trace::DeferredIncrementRegister
4221 new_increment(data_.u_increment_register.reg);
4222 Trace new_trace = *trace;
4223 new_trace.add_action(&new_increment);
4224 on_success()->Emit(compiler, &new_trace);
4225 break;
4226 }
4227 case SET_REGISTER: {
4228 Trace::DeferredSetRegister
4229 new_set(data_.u_store_register.reg, data_.u_store_register.value);
4230 Trace new_trace = *trace;
4231 new_trace.add_action(&new_set);
4232 on_success()->Emit(compiler, &new_trace);
4233 break;
4234 }
4235 case CLEAR_CAPTURES: {
4236 Trace::DeferredClearCaptures
4237 new_capture(Interval(data_.u_clear_captures.range_from,
4238 data_.u_clear_captures.range_to));
4239 Trace new_trace = *trace;
4240 new_trace.add_action(&new_capture);
4241 on_success()->Emit(compiler, &new_trace);
4242 break;
4243 }
4244 case BEGIN_SUBMATCH:
4245 if (!trace->is_trivial()) {
4246 trace->Flush(compiler, this);
4247 } else {
4248 assembler->WriteCurrentPositionToRegister(
4249 data_.u_submatch.current_position_register, 0);
4250 assembler->WriteStackPointerToRegister(
4251 data_.u_submatch.stack_pointer_register);
4252 on_success()->Emit(compiler, trace);
4253 }
4254 break;
4255 case EMPTY_MATCH_CHECK: {
4256 int start_pos_reg = data_.u_empty_match_check.start_register;
4257 int stored_pos = 0;
4258 int rep_reg = data_.u_empty_match_check.repetition_register;
4259 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4260 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4261 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4262 // If we know we haven't advanced and there is no minimum we
4263 // can just backtrack immediately.
4264 assembler->GoTo(trace->backtrack());
4265 } else if (know_dist && stored_pos < trace->cp_offset()) {
4266 // If we know we've advanced we can generate the continuation
4267 // immediately.
4268 on_success()->Emit(compiler, trace);
4269 } else if (!trace->is_trivial()) {
4270 trace->Flush(compiler, this);
4271 } else {
4272 Label skip_empty_check;
4273 // If we have a minimum number of repetitions we check the current
4274 // number first and skip the empty check if it's not enough.
4275 if (has_minimum) {
4276 int limit = data_.u_empty_match_check.repetition_limit;
4277 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4278 }
4279 // If the match is empty we bail out, otherwise we fall through
4280 // to the on-success continuation.
4281 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4282 trace->backtrack());
4283 assembler->Bind(&skip_empty_check);
4284 on_success()->Emit(compiler, trace);
4285 }
4286 break;
4287 }
4288 case POSITIVE_SUBMATCH_SUCCESS: {
4289 if (!trace->is_trivial()) {
4290 trace->Flush(compiler, this);
4291 return;
4292 }
4293 assembler->ReadCurrentPositionFromRegister(
4294 data_.u_submatch.current_position_register);
4295 assembler->ReadStackPointerFromRegister(
4296 data_.u_submatch.stack_pointer_register);
4297 int clear_register_count = data_.u_submatch.clear_register_count;
4298 if (clear_register_count == 0) {
4299 on_success()->Emit(compiler, trace);
4300 return;
4301 }
4302 int clear_registers_from = data_.u_submatch.clear_register_from;
4303 Label clear_registers_backtrack;
4304 Trace new_trace = *trace;
4305 new_trace.set_backtrack(&clear_registers_backtrack);
4306 on_success()->Emit(compiler, &new_trace);
4307
4308 assembler->Bind(&clear_registers_backtrack);
4309 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4310 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4311
4312 DCHECK(trace->backtrack() == nullptr);
4313 assembler->Backtrack();
4314 return;
4315 }
4316 default:
4317 UNREACHABLE();
4318 }
4319 }
4320
4321
Emit(RegExpCompiler * compiler,Trace * trace)4322 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4323 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4324 if (!trace->is_trivial()) {
4325 trace->Flush(compiler, this);
4326 return;
4327 }
4328
4329 LimitResult limit_result = LimitVersions(compiler, trace);
4330 if (limit_result == DONE) return;
4331 DCHECK(limit_result == CONTINUE);
4332
4333 RecursionCheck rc(compiler);
4334
4335 DCHECK_EQ(start_reg_ + 1, end_reg_);
4336 if (IgnoreCase(flags_)) {
4337 assembler->CheckNotBackReferenceIgnoreCase(
4338 start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack());
4339 } else {
4340 assembler->CheckNotBackReference(start_reg_, read_backward(),
4341 trace->backtrack());
4342 }
4343 // We are going to advance backward, so we may end up at the start.
4344 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4345
4346 // Check that the back reference does not end inside a surrogate pair.
4347 if (IsUnicode(flags_) && !compiler->one_byte()) {
4348 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4349 }
4350 on_success()->Emit(compiler, trace);
4351 }
4352
4353
4354 // -------------------------------------------------------------------
4355 // Dot/dotty output
4356
4357
4358 #ifdef DEBUG
4359
4360
4361 class DotPrinter: public NodeVisitor {
4362 public:
DotPrinter(std::ostream & os,bool ignore_case)4363 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4364 : os_(os),
4365 ignore_case_(ignore_case) {}
4366 void PrintNode(const char* label, RegExpNode* node);
4367 void Visit(RegExpNode* node);
4368 void PrintAttributes(RegExpNode* from);
4369 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4370 #define DECLARE_VISIT(Type) \
4371 virtual void Visit##Type(Type##Node* that);
4372 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4373 #undef DECLARE_VISIT
4374 private:
4375 std::ostream& os_;
4376 bool ignore_case_;
4377 };
4378
4379
PrintNode(const char * label,RegExpNode * node)4380 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4381 os_ << "digraph G {\n graph [label=\"";
4382 for (int i = 0; label[i]; i++) {
4383 switch (label[i]) {
4384 case '\\':
4385 os_ << "\\\\";
4386 break;
4387 case '"':
4388 os_ << "\"";
4389 break;
4390 default:
4391 os_ << label[i];
4392 break;
4393 }
4394 }
4395 os_ << "\"];\n";
4396 Visit(node);
4397 os_ << "}" << std::endl;
4398 }
4399
4400
Visit(RegExpNode * node)4401 void DotPrinter::Visit(RegExpNode* node) {
4402 if (node->info()->visited) return;
4403 node->info()->visited = true;
4404 node->Accept(this);
4405 }
4406
4407
PrintOnFailure(RegExpNode * from,RegExpNode * on_failure)4408 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4409 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4410 Visit(on_failure);
4411 }
4412
4413
4414 class TableEntryBodyPrinter {
4415 public:
TableEntryBodyPrinter(std::ostream & os,ChoiceNode * choice)4416 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4417 : os_(os),
4418 choice_(choice) {}
Call(uc16 from,DispatchTable::Entry entry)4419 void Call(uc16 from, DispatchTable::Entry entry) {
4420 OutSet* out_set = entry.out_set();
4421 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4422 if (out_set->Get(i)) {
4423 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4424 << choice()->alternatives()->at(i).node() << ";\n";
4425 }
4426 }
4427 }
4428 private:
choice()4429 ChoiceNode* choice() { return choice_; }
4430 std::ostream& os_;
4431 ChoiceNode* choice_;
4432 };
4433
4434
4435 class TableEntryHeaderPrinter {
4436 public:
TableEntryHeaderPrinter(std::ostream & os)4437 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4438 : first_(true),
4439 os_(os) {}
Call(uc16 from,DispatchTable::Entry entry)4440 void Call(uc16 from, DispatchTable::Entry entry) {
4441 if (first_) {
4442 first_ = false;
4443 } else {
4444 os_ << "|";
4445 }
4446 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4447 OutSet* out_set = entry.out_set();
4448 int priority = 0;
4449 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4450 if (out_set->Get(i)) {
4451 if (priority > 0) os_ << "|";
4452 os_ << "<s" << from << "o" << i << "> " << priority;
4453 priority++;
4454 }
4455 }
4456 os_ << "}}";
4457 }
4458
4459 private:
4460 bool first_;
4461 std::ostream& os_;
4462 };
4463
4464
4465 class AttributePrinter {
4466 public:
AttributePrinter(std::ostream & os)4467 explicit AttributePrinter(std::ostream& os) // NOLINT
4468 : os_(os),
4469 first_(true) {}
PrintSeparator()4470 void PrintSeparator() {
4471 if (first_) {
4472 first_ = false;
4473 } else {
4474 os_ << "|";
4475 }
4476 }
PrintBit(const char * name,bool value)4477 void PrintBit(const char* name, bool value) {
4478 if (!value) return;
4479 PrintSeparator();
4480 os_ << "{" << name << "}";
4481 }
PrintPositive(const char * name,int value)4482 void PrintPositive(const char* name, int value) {
4483 if (value < 0) return;
4484 PrintSeparator();
4485 os_ << "{" << name << "|" << value << "}";
4486 }
4487
4488 private:
4489 std::ostream& os_;
4490 bool first_;
4491 };
4492
4493
PrintAttributes(RegExpNode * that)4494 void DotPrinter::PrintAttributes(RegExpNode* that) {
4495 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4496 << "margin=0.1, fontsize=10, label=\"{";
4497 AttributePrinter printer(os_);
4498 NodeInfo* info = that->info();
4499 printer.PrintBit("NI", info->follows_newline_interest);
4500 printer.PrintBit("WI", info->follows_word_interest);
4501 printer.PrintBit("SI", info->follows_start_interest);
4502 Label* label = that->label();
4503 if (label->is_bound())
4504 printer.PrintPositive("@", label->pos());
4505 os_ << "}\"];\n"
4506 << " a" << that << " -> n" << that
4507 << " [style=dashed, color=grey, arrowhead=none];\n";
4508 }
4509
4510
4511 static const bool kPrintDispatchTable = false;
VisitChoice(ChoiceNode * that)4512 void DotPrinter::VisitChoice(ChoiceNode* that) {
4513 if (kPrintDispatchTable) {
4514 os_ << " n" << that << " [shape=Mrecord, label=\"";
4515 TableEntryHeaderPrinter header_printer(os_);
4516 that->GetTable(ignore_case_)->ForEach(&header_printer);
4517 os_ << "\"]\n";
4518 PrintAttributes(that);
4519 TableEntryBodyPrinter body_printer(os_, that);
4520 that->GetTable(ignore_case_)->ForEach(&body_printer);
4521 } else {
4522 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4523 for (int i = 0; i < that->alternatives()->length(); i++) {
4524 GuardedAlternative alt = that->alternatives()->at(i);
4525 os_ << " n" << that << " -> n" << alt.node();
4526 }
4527 }
4528 for (int i = 0; i < that->alternatives()->length(); i++) {
4529 GuardedAlternative alt = that->alternatives()->at(i);
4530 alt.node()->Accept(this);
4531 }
4532 }
4533
4534
VisitText(TextNode * that)4535 void DotPrinter::VisitText(TextNode* that) {
4536 Zone* zone = that->zone();
4537 os_ << " n" << that << " [label=\"";
4538 for (int i = 0; i < that->elements()->length(); i++) {
4539 if (i > 0) os_ << " ";
4540 TextElement elm = that->elements()->at(i);
4541 switch (elm.text_type()) {
4542 case TextElement::ATOM: {
4543 Vector<const uc16> data = elm.atom()->data();
4544 for (int i = 0; i < data.length(); i++) {
4545 os_ << static_cast<char>(data[i]);
4546 }
4547 break;
4548 }
4549 case TextElement::CHAR_CLASS: {
4550 RegExpCharacterClass* node = elm.char_class();
4551 os_ << "[";
4552 if (node->is_negated()) os_ << "^";
4553 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4554 CharacterRange range = node->ranges(zone)->at(j);
4555 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4556 }
4557 os_ << "]";
4558 break;
4559 }
4560 default:
4561 UNREACHABLE();
4562 }
4563 }
4564 os_ << "\", shape=box, peripheries=2];\n";
4565 PrintAttributes(that);
4566 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4567 Visit(that->on_success());
4568 }
4569
4570
VisitBackReference(BackReferenceNode * that)4571 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4572 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4573 << that->end_register() << "\", shape=doubleoctagon];\n";
4574 PrintAttributes(that);
4575 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4576 Visit(that->on_success());
4577 }
4578
4579
VisitEnd(EndNode * that)4580 void DotPrinter::VisitEnd(EndNode* that) {
4581 os_ << " n" << that << " [style=bold, shape=point];\n";
4582 PrintAttributes(that);
4583 }
4584
4585
VisitAssertion(AssertionNode * that)4586 void DotPrinter::VisitAssertion(AssertionNode* that) {
4587 os_ << " n" << that << " [";
4588 switch (that->assertion_type()) {
4589 case AssertionNode::AT_END:
4590 os_ << "label=\"$\", shape=septagon";
4591 break;
4592 case AssertionNode::AT_START:
4593 os_ << "label=\"^\", shape=septagon";
4594 break;
4595 case AssertionNode::AT_BOUNDARY:
4596 os_ << "label=\"\\b\", shape=septagon";
4597 break;
4598 case AssertionNode::AT_NON_BOUNDARY:
4599 os_ << "label=\"\\B\", shape=septagon";
4600 break;
4601 case AssertionNode::AFTER_NEWLINE:
4602 os_ << "label=\"(?<=\\n)\", shape=septagon";
4603 break;
4604 }
4605 os_ << "];\n";
4606 PrintAttributes(that);
4607 RegExpNode* successor = that->on_success();
4608 os_ << " n" << that << " -> n" << successor << ";\n";
4609 Visit(successor);
4610 }
4611
4612
VisitAction(ActionNode * that)4613 void DotPrinter::VisitAction(ActionNode* that) {
4614 os_ << " n" << that << " [";
4615 switch (that->action_type_) {
4616 case ActionNode::SET_REGISTER:
4617 os_ << "label=\"$" << that->data_.u_store_register.reg
4618 << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4619 break;
4620 case ActionNode::INCREMENT_REGISTER:
4621 os_ << "label=\"$" << that->data_.u_increment_register.reg
4622 << "++\", shape=octagon";
4623 break;
4624 case ActionNode::STORE_POSITION:
4625 os_ << "label=\"$" << that->data_.u_position_register.reg
4626 << ":=$pos\", shape=octagon";
4627 break;
4628 case ActionNode::BEGIN_SUBMATCH:
4629 os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4630 << ":=$pos,begin\", shape=septagon";
4631 break;
4632 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4633 os_ << "label=\"escape\", shape=septagon";
4634 break;
4635 case ActionNode::EMPTY_MATCH_CHECK:
4636 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4637 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4638 << "<" << that->data_.u_empty_match_check.repetition_limit
4639 << "?\", shape=septagon";
4640 break;
4641 case ActionNode::CLEAR_CAPTURES: {
4642 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4643 << " to $" << that->data_.u_clear_captures.range_to
4644 << "\", shape=septagon";
4645 break;
4646 }
4647 }
4648 os_ << "];\n";
4649 PrintAttributes(that);
4650 RegExpNode* successor = that->on_success();
4651 os_ << " n" << that << " -> n" << successor << ";\n";
4652 Visit(successor);
4653 }
4654
4655
4656 class DispatchTableDumper {
4657 public:
DispatchTableDumper(std::ostream & os)4658 explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4659 void Call(uc16 key, DispatchTable::Entry entry);
4660 private:
4661 std::ostream& os_;
4662 };
4663
4664
Call(uc16 key,DispatchTable::Entry entry)4665 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4666 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4667 OutSet* set = entry.out_set();
4668 bool first = true;
4669 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4670 if (set->Get(i)) {
4671 if (first) {
4672 first = false;
4673 } else {
4674 os_ << ", ";
4675 }
4676 os_ << i;
4677 }
4678 }
4679 os_ << "}\n";
4680 }
4681
4682
Dump()4683 void DispatchTable::Dump() {
4684 OFStream os(stderr);
4685 DispatchTableDumper dumper(os);
4686 tree()->ForEach(&dumper);
4687 }
4688
4689
DotPrint(const char * label,RegExpNode * node,bool ignore_case)4690 void RegExpEngine::DotPrint(const char* label,
4691 RegExpNode* node,
4692 bool ignore_case) {
4693 StdoutStream os;
4694 DotPrinter printer(os, ignore_case);
4695 printer.PrintNode(label, node);
4696 }
4697
4698
4699 #endif // DEBUG
4700
4701
4702 // -------------------------------------------------------------------
4703 // Tree to graph conversion
4704
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4705 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4706 RegExpNode* on_success) {
4707 ZoneList<TextElement>* elms =
4708 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4709 elms->Add(TextElement::Atom(this), compiler->zone());
4710 return new (compiler->zone())
4711 TextNode(elms, compiler->read_backward(), on_success);
4712 }
4713
4714
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4715 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4716 RegExpNode* on_success) {
4717 return new (compiler->zone())
4718 TextNode(elements(), compiler->read_backward(), on_success);
4719 }
4720
4721
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4722 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4723 const int* special_class,
4724 int length) {
4725 length--; // Remove final marker.
4726 DCHECK_EQ(kRangeEndMarker, special_class[length]);
4727 DCHECK_NE(0, ranges->length());
4728 DCHECK_NE(0, length);
4729 DCHECK_NE(0, special_class[0]);
4730 if (ranges->length() != (length >> 1) + 1) {
4731 return false;
4732 }
4733 CharacterRange range = ranges->at(0);
4734 if (range.from() != 0) {
4735 return false;
4736 }
4737 for (int i = 0; i < length; i += 2) {
4738 if (special_class[i] != (range.to() + 1)) {
4739 return false;
4740 }
4741 range = ranges->at((i >> 1) + 1);
4742 if (special_class[i+1] != range.from()) {
4743 return false;
4744 }
4745 }
4746 if (range.to() != String::kMaxCodePoint) {
4747 return false;
4748 }
4749 return true;
4750 }
4751
4752
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4753 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4754 const int* special_class,
4755 int length) {
4756 length--; // Remove final marker.
4757 DCHECK_EQ(kRangeEndMarker, special_class[length]);
4758 if (ranges->length() * 2 != length) {
4759 return false;
4760 }
4761 for (int i = 0; i < length; i += 2) {
4762 CharacterRange range = ranges->at(i >> 1);
4763 if (range.from() != special_class[i] ||
4764 range.to() != special_class[i + 1] - 1) {
4765 return false;
4766 }
4767 }
4768 return true;
4769 }
4770
4771
is_standard(Zone * zone)4772 bool RegExpCharacterClass::is_standard(Zone* zone) {
4773 // TODO(lrn): Remove need for this function, by not throwing away information
4774 // along the way.
4775 if (is_negated()) {
4776 return false;
4777 }
4778 if (set_.is_standard()) {
4779 return true;
4780 }
4781 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4782 set_.set_standard_set_type('s');
4783 return true;
4784 }
4785 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4786 set_.set_standard_set_type('S');
4787 return true;
4788 }
4789 if (CompareInverseRanges(set_.ranges(zone),
4790 kLineTerminatorRanges,
4791 kLineTerminatorRangeCount)) {
4792 set_.set_standard_set_type('.');
4793 return true;
4794 }
4795 if (CompareRanges(set_.ranges(zone),
4796 kLineTerminatorRanges,
4797 kLineTerminatorRangeCount)) {
4798 set_.set_standard_set_type('n');
4799 return true;
4800 }
4801 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4802 set_.set_standard_set_type('w');
4803 return true;
4804 }
4805 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4806 set_.set_standard_set_type('W');
4807 return true;
4808 }
4809 return false;
4810 }
4811
4812
UnicodeRangeSplitter(Zone * zone,ZoneList<CharacterRange> * base)4813 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4814 ZoneList<CharacterRange>* base)
4815 : zone_(zone),
4816 table_(zone),
4817 bmp_(nullptr),
4818 lead_surrogates_(nullptr),
4819 trail_surrogates_(nullptr),
4820 non_bmp_(nullptr) {
4821 // The unicode range splitter categorizes given character ranges into:
4822 // - Code points from the BMP representable by one code unit.
4823 // - Code points outside the BMP that need to be split into surrogate pairs.
4824 // - Lone lead surrogates.
4825 // - Lone trail surrogates.
4826 // Lone surrogates are valid code points, even though no actual characters.
4827 // They require special matching to make sure we do not split surrogate pairs.
4828 // We use the dispatch table to accomplish this. The base range is split up
4829 // by the table by the overlay ranges, and the Call callback is used to
4830 // filter and collect ranges for each category.
4831 for (int i = 0; i < base->length(); i++) {
4832 table_.AddRange(base->at(i), kBase, zone_);
4833 }
4834 // Add overlay ranges.
4835 table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4836 kBmpCodePoints, zone_);
4837 table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4838 kLeadSurrogates, zone_);
4839 table_.AddRange(
4840 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4841 kTrailSurrogates, zone_);
4842 table_.AddRange(
4843 CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4844 kBmpCodePoints, zone_);
4845 table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4846 kNonBmpCodePoints, zone_);
4847 table_.ForEach(this);
4848 }
4849
4850
Call(uc32 from,DispatchTable::Entry entry)4851 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4852 OutSet* outset = entry.out_set();
4853 if (!outset->Get(kBase)) return;
4854 ZoneList<CharacterRange>** target = nullptr;
4855 if (outset->Get(kBmpCodePoints)) {
4856 target = &bmp_;
4857 } else if (outset->Get(kLeadSurrogates)) {
4858 target = &lead_surrogates_;
4859 } else if (outset->Get(kTrailSurrogates)) {
4860 target = &trail_surrogates_;
4861 } else {
4862 DCHECK(outset->Get(kNonBmpCodePoints));
4863 target = &non_bmp_;
4864 }
4865 if (*target == nullptr)
4866 *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4867 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4868 }
4869
AddBmpCharacters(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4870 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4871 RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4872 ZoneList<CharacterRange>* bmp = splitter->bmp();
4873 if (bmp == nullptr) return;
4874 JSRegExp::Flags default_flags = JSRegExp::Flags();
4875 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4876 compiler->zone(), bmp, compiler->read_backward(), on_success,
4877 default_flags)));
4878 }
4879
AddNonBmpSurrogatePairs(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4880 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4881 RegExpNode* on_success,
4882 UnicodeRangeSplitter* splitter) {
4883 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4884 if (non_bmp == nullptr) return;
4885 DCHECK(!compiler->one_byte());
4886 Zone* zone = compiler->zone();
4887 JSRegExp::Flags default_flags = JSRegExp::Flags();
4888 CharacterRange::Canonicalize(non_bmp);
4889 for (int i = 0; i < non_bmp->length(); i++) {
4890 // Match surrogate pair.
4891 // E.g. [\u10005-\u11005] becomes
4892 // \ud800[\udc05-\udfff]|
4893 // [\ud801-\ud803][\udc00-\udfff]|
4894 // \ud804[\udc00-\udc05]
4895 uc32 from = non_bmp->at(i).from();
4896 uc32 to = non_bmp->at(i).to();
4897 uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4898 uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4899 uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4900 uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4901 if (from_l == to_l) {
4902 // The lead surrogate is the same.
4903 result->AddAlternative(
4904 GuardedAlternative(TextNode::CreateForSurrogatePair(
4905 zone, CharacterRange::Singleton(from_l),
4906 CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4907 on_success, default_flags)));
4908 } else {
4909 if (from_t != kTrailSurrogateStart) {
4910 // Add [from_l][from_t-\udfff]
4911 result->AddAlternative(
4912 GuardedAlternative(TextNode::CreateForSurrogatePair(
4913 zone, CharacterRange::Singleton(from_l),
4914 CharacterRange::Range(from_t, kTrailSurrogateEnd),
4915 compiler->read_backward(), on_success, default_flags)));
4916 from_l++;
4917 }
4918 if (to_t != kTrailSurrogateEnd) {
4919 // Add [to_l][\udc00-to_t]
4920 result->AddAlternative(
4921 GuardedAlternative(TextNode::CreateForSurrogatePair(
4922 zone, CharacterRange::Singleton(to_l),
4923 CharacterRange::Range(kTrailSurrogateStart, to_t),
4924 compiler->read_backward(), on_success, default_flags)));
4925 to_l--;
4926 }
4927 if (from_l <= to_l) {
4928 // Add [from_l-to_l][\udc00-\udfff]
4929 result->AddAlternative(
4930 GuardedAlternative(TextNode::CreateForSurrogatePair(
4931 zone, CharacterRange::Range(from_l, to_l),
4932 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4933 compiler->read_backward(), on_success, default_flags)));
4934 }
4935 }
4936 }
4937 }
4938
NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler * compiler,ZoneList<CharacterRange> * lookbehind,ZoneList<CharacterRange> * match,RegExpNode * on_success,bool read_backward,JSRegExp::Flags flags)4939 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
4940 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
4941 ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
4942 JSRegExp::Flags flags) {
4943 Zone* zone = compiler->zone();
4944 RegExpNode* match_node = TextNode::CreateForCharacterRanges(
4945 zone, match, read_backward, on_success, flags);
4946 int stack_register = compiler->UnicodeLookaroundStackRegister();
4947 int position_register = compiler->UnicodeLookaroundPositionRegister();
4948 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4949 position_register);
4950 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4951 zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
4952 return lookaround.ForMatch(negative_match);
4953 }
4954
MatchAndNegativeLookaroundInReadDirection(RegExpCompiler * compiler,ZoneList<CharacterRange> * match,ZoneList<CharacterRange> * lookahead,RegExpNode * on_success,bool read_backward,JSRegExp::Flags flags)4955 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
4956 RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
4957 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
4958 bool read_backward, JSRegExp::Flags flags) {
4959 Zone* zone = compiler->zone();
4960 int stack_register = compiler->UnicodeLookaroundStackRegister();
4961 int position_register = compiler->UnicodeLookaroundPositionRegister();
4962 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4963 position_register);
4964 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4965 zone, lookahead, read_backward, lookaround.on_match_success(), flags);
4966 return TextNode::CreateForCharacterRanges(
4967 zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
4968 }
4969
AddLoneLeadSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4970 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4971 RegExpNode* on_success,
4972 UnicodeRangeSplitter* splitter) {
4973 JSRegExp::Flags default_flags = JSRegExp::Flags();
4974 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
4975 if (lead_surrogates == nullptr) return;
4976 Zone* zone = compiler->zone();
4977 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4978 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
4979 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
4980
4981 RegExpNode* match;
4982 if (compiler->read_backward()) {
4983 // Reading backward. Assert that reading forward, there is no trail
4984 // surrogate, and then backward match the lead surrogate.
4985 match = NegativeLookaroundAgainstReadDirectionAndMatch(
4986 compiler, trail_surrogates, lead_surrogates, on_success, true,
4987 default_flags);
4988 } else {
4989 // Reading forward. Forward match the lead surrogate and assert that
4990 // no trail surrogate follows.
4991 match = MatchAndNegativeLookaroundInReadDirection(
4992 compiler, lead_surrogates, trail_surrogates, on_success, false,
4993 default_flags);
4994 }
4995 result->AddAlternative(GuardedAlternative(match));
4996 }
4997
AddLoneTrailSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4998 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4999 RegExpNode* on_success,
5000 UnicodeRangeSplitter* splitter) {
5001 JSRegExp::Flags default_flags = JSRegExp::Flags();
5002 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5003 if (trail_surrogates == nullptr) return;
5004 Zone* zone = compiler->zone();
5005 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5006 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5007 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5008
5009 RegExpNode* match;
5010 if (compiler->read_backward()) {
5011 // Reading backward. Backward match the trail surrogate and assert that no
5012 // lead surrogate precedes it.
5013 match = MatchAndNegativeLookaroundInReadDirection(
5014 compiler, trail_surrogates, lead_surrogates, on_success, true,
5015 default_flags);
5016 } else {
5017 // Reading forward. Assert that reading backward, there is no lead
5018 // surrogate, and then forward match the trail surrogate.
5019 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5020 compiler, lead_surrogates, trail_surrogates, on_success, false,
5021 default_flags);
5022 }
5023 result->AddAlternative(GuardedAlternative(match));
5024 }
5025
UnanchoredAdvance(RegExpCompiler * compiler,RegExpNode * on_success)5026 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5027 RegExpNode* on_success) {
5028 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5029 DCHECK(!compiler->read_backward());
5030 Zone* zone = compiler->zone();
5031 // Advance any character. If the character happens to be a lead surrogate and
5032 // we advanced into the middle of a surrogate pair, it will work out, as
5033 // nothing will match from there. We will have to advance again, consuming
5034 // the associated trail surrogate.
5035 ZoneList<CharacterRange>* range = CharacterRange::List(
5036 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5037 JSRegExp::Flags default_flags = JSRegExp::Flags();
5038 return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
5039 default_flags);
5040 }
5041
AddUnicodeCaseEquivalents(ZoneList<CharacterRange> * ranges,Zone * zone)5042 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
5043 #ifdef V8_INTL_SUPPORT
5044 DCHECK(CharacterRange::IsCanonical(ranges));
5045
5046 // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
5047 // See also https://crbug.com/v8/6727.
5048 // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
5049 // which we use frequently internally. But large ranges can also easily be
5050 // created by the user. We might want to have a more general caching mechanism
5051 // for such ranges.
5052 if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
5053
5054 // Use ICU to compute the case fold closure over the ranges.
5055 icu::UnicodeSet set;
5056 for (int i = 0; i < ranges->length(); i++) {
5057 set.add(ranges->at(i).from(), ranges->at(i).to());
5058 }
5059 ranges->Clear();
5060 set.closeOver(USET_CASE_INSENSITIVE);
5061 // Full case mapping map single characters to multiple characters.
5062 // Those are represented as strings in the set. Remove them so that
5063 // we end up with only simple and common case mappings.
5064 set.removeAllStrings();
5065 for (int i = 0; i < set.getRangeCount(); i++) {
5066 ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
5067 zone);
5068 }
5069 // No errors and everything we collected have been ranges.
5070 CharacterRange::Canonicalize(ranges);
5071 #endif // V8_INTL_SUPPORT
5072 }
5073
5074
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5075 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5076 RegExpNode* on_success) {
5077 set_.Canonicalize();
5078 Zone* zone = compiler->zone();
5079 ZoneList<CharacterRange>* ranges = this->ranges(zone);
5080 if (NeedsUnicodeCaseEquivalents(flags_)) {
5081 AddUnicodeCaseEquivalents(ranges, zone);
5082 }
5083 if (IsUnicode(flags_) && !compiler->one_byte() &&
5084 !contains_split_surrogate()) {
5085 if (is_negated()) {
5086 ZoneList<CharacterRange>* negated =
5087 new (zone) ZoneList<CharacterRange>(2, zone);
5088 CharacterRange::Negate(ranges, negated, zone);
5089 ranges = negated;
5090 }
5091 if (ranges->length() == 0) {
5092 JSRegExp::Flags default_flags;
5093 RegExpCharacterClass* fail =
5094 new (zone) RegExpCharacterClass(zone, ranges, default_flags);
5095 return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5096 }
5097 if (standard_type() == '*') {
5098 return UnanchoredAdvance(compiler, on_success);
5099 } else {
5100 ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5101 UnicodeRangeSplitter splitter(zone, ranges);
5102 AddBmpCharacters(compiler, result, on_success, &splitter);
5103 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5104 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5105 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5106 return result;
5107 }
5108 } else {
5109 return new (zone) TextNode(this, compiler->read_backward(), on_success);
5110 }
5111 }
5112
5113
CompareFirstChar(RegExpTree * const * a,RegExpTree * const * b)5114 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5115 RegExpAtom* atom1 = (*a)->AsAtom();
5116 RegExpAtom* atom2 = (*b)->AsAtom();
5117 uc16 character1 = atom1->data().at(0);
5118 uc16 character2 = atom2->data().at(0);
5119 if (character1 < character2) return -1;
5120 if (character1 > character2) return 1;
5121 return 0;
5122 }
5123
5124
Canonical(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar c)5125 static unibrow::uchar Canonical(
5126 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5127 unibrow::uchar c) {
5128 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5129 int length = canonicalize->get(c, '\0', chars);
5130 DCHECK_LE(length, 1);
5131 unibrow::uchar canonical = c;
5132 if (length == 1) canonical = chars[0];
5133 return canonical;
5134 }
5135
5136
CompareFirstCharCaseIndependent(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,RegExpTree * const * a,RegExpTree * const * b)5137 int CompareFirstCharCaseIndependent(
5138 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5139 RegExpTree* const* a, RegExpTree* const* b) {
5140 RegExpAtom* atom1 = (*a)->AsAtom();
5141 RegExpAtom* atom2 = (*b)->AsAtom();
5142 unibrow::uchar character1 = atom1->data().at(0);
5143 unibrow::uchar character2 = atom2->data().at(0);
5144 if (character1 == character2) return 0;
5145 if (character1 >= 'a' || character2 >= 'a') {
5146 character1 = Canonical(canonicalize, character1);
5147 character2 = Canonical(canonicalize, character2);
5148 }
5149 return static_cast<int>(character1) - static_cast<int>(character2);
5150 }
5151
5152
5153 // We can stable sort runs of atoms, since the order does not matter if they
5154 // start with different characters.
5155 // Returns true if any consecutive atoms were found.
SortConsecutiveAtoms(RegExpCompiler * compiler)5156 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5157 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5158 int length = alternatives->length();
5159 bool found_consecutive_atoms = false;
5160 for (int i = 0; i < length; i++) {
5161 while (i < length) {
5162 RegExpTree* alternative = alternatives->at(i);
5163 if (alternative->IsAtom()) break;
5164 i++;
5165 }
5166 // i is length or it is the index of an atom.
5167 if (i == length) break;
5168 int first_atom = i;
5169 JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
5170 i++;
5171 while (i < length) {
5172 RegExpTree* alternative = alternatives->at(i);
5173 if (!alternative->IsAtom()) break;
5174 if (alternative->AsAtom()->flags() != flags) break;
5175 i++;
5176 }
5177 // Sort atoms to get ones with common prefixes together.
5178 // This step is more tricky if we are in a case-independent regexp,
5179 // because it would change /is|I/ to /I|is/, and order matters when
5180 // the regexp parts don't match only disjoint starting points. To fix
5181 // this we have a version of CompareFirstChar that uses case-
5182 // independent character classes for comparison.
5183 DCHECK_LT(first_atom, alternatives->length());
5184 DCHECK_LE(i, alternatives->length());
5185 DCHECK_LE(first_atom, i);
5186 if (IgnoreCase(flags)) {
5187 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5188 compiler->isolate()->regexp_macro_assembler_canonicalize();
5189 auto compare_closure =
5190 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5191 return CompareFirstCharCaseIndependent(canonicalize, a, b);
5192 };
5193 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5194 } else {
5195 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5196 }
5197 if (i - first_atom > 1) found_consecutive_atoms = true;
5198 }
5199 return found_consecutive_atoms;
5200 }
5201
5202
5203 // Optimizes ab|ac|az to a(?:b|c|d).
RationalizeConsecutiveAtoms(RegExpCompiler * compiler)5204 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5205 Zone* zone = compiler->zone();
5206 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5207 int length = alternatives->length();
5208
5209 int write_posn = 0;
5210 int i = 0;
5211 while (i < length) {
5212 RegExpTree* alternative = alternatives->at(i);
5213 if (!alternative->IsAtom()) {
5214 alternatives->at(write_posn++) = alternatives->at(i);
5215 i++;
5216 continue;
5217 }
5218 RegExpAtom* const atom = alternative->AsAtom();
5219 JSRegExp::Flags flags = atom->flags();
5220 unibrow::uchar common_prefix = atom->data().at(0);
5221 int first_with_prefix = i;
5222 int prefix_length = atom->length();
5223 i++;
5224 while (i < length) {
5225 alternative = alternatives->at(i);
5226 if (!alternative->IsAtom()) break;
5227 RegExpAtom* const atom = alternative->AsAtom();
5228 if (atom->flags() != flags) break;
5229 unibrow::uchar new_prefix = atom->data().at(0);
5230 if (new_prefix != common_prefix) {
5231 if (!IgnoreCase(flags)) break;
5232 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5233 compiler->isolate()->regexp_macro_assembler_canonicalize();
5234 new_prefix = Canonical(canonicalize, new_prefix);
5235 common_prefix = Canonical(canonicalize, common_prefix);
5236 if (new_prefix != common_prefix) break;
5237 }
5238 prefix_length = Min(prefix_length, atom->length());
5239 i++;
5240 }
5241 if (i > first_with_prefix + 2) {
5242 // Found worthwhile run of alternatives with common prefix of at least one
5243 // character. The sorting function above did not sort on more than one
5244 // character for reasons of correctness, but there may still be a longer
5245 // common prefix if the terms were similar or presorted in the input.
5246 // Find out how long the common prefix is.
5247 int run_length = i - first_with_prefix;
5248 RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
5249 for (int j = 1; j < run_length && prefix_length > 1; j++) {
5250 RegExpAtom* old_atom =
5251 alternatives->at(j + first_with_prefix)->AsAtom();
5252 for (int k = 1; k < prefix_length; k++) {
5253 if (atom->data().at(k) != old_atom->data().at(k)) {
5254 prefix_length = k;
5255 break;
5256 }
5257 }
5258 }
5259 RegExpAtom* prefix = new (zone)
5260 RegExpAtom(atom->data().SubVector(0, prefix_length), flags);
5261 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5262 pair->Add(prefix, zone);
5263 ZoneList<RegExpTree*>* suffixes =
5264 new (zone) ZoneList<RegExpTree*>(run_length, zone);
5265 for (int j = 0; j < run_length; j++) {
5266 RegExpAtom* old_atom =
5267 alternatives->at(j + first_with_prefix)->AsAtom();
5268 int len = old_atom->length();
5269 if (len == prefix_length) {
5270 suffixes->Add(new (zone) RegExpEmpty(), zone);
5271 } else {
5272 RegExpTree* suffix = new (zone) RegExpAtom(
5273 old_atom->data().SubVector(prefix_length, old_atom->length()),
5274 flags);
5275 suffixes->Add(suffix, zone);
5276 }
5277 }
5278 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5279 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5280 } else {
5281 // Just copy any non-worthwhile alternatives.
5282 for (int j = first_with_prefix; j < i; j++) {
5283 alternatives->at(write_posn++) = alternatives->at(j);
5284 }
5285 }
5286 }
5287 alternatives->Rewind(write_posn); // Trim end of array.
5288 }
5289
5290
5291 // Optimizes b|c|z to [bcz].
FixSingleCharacterDisjunctions(RegExpCompiler * compiler)5292 void RegExpDisjunction::FixSingleCharacterDisjunctions(
5293 RegExpCompiler* compiler) {
5294 Zone* zone = compiler->zone();
5295 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5296 int length = alternatives->length();
5297
5298 int write_posn = 0;
5299 int i = 0;
5300 while (i < length) {
5301 RegExpTree* alternative = alternatives->at(i);
5302 if (!alternative->IsAtom()) {
5303 alternatives->at(write_posn++) = alternatives->at(i);
5304 i++;
5305 continue;
5306 }
5307 RegExpAtom* const atom = alternative->AsAtom();
5308 if (atom->length() != 1) {
5309 alternatives->at(write_posn++) = alternatives->at(i);
5310 i++;
5311 continue;
5312 }
5313 JSRegExp::Flags flags = atom->flags();
5314 DCHECK_IMPLIES(IsUnicode(flags),
5315 !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5316 bool contains_trail_surrogate =
5317 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5318 int first_in_run = i;
5319 i++;
5320 // Find a run of single-character atom alternatives that have identical
5321 // flags (case independence and unicode-ness).
5322 while (i < length) {
5323 alternative = alternatives->at(i);
5324 if (!alternative->IsAtom()) break;
5325 RegExpAtom* const atom = alternative->AsAtom();
5326 if (atom->length() != 1) break;
5327 if (atom->flags() != flags) break;
5328 DCHECK_IMPLIES(IsUnicode(flags),
5329 !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5330 contains_trail_surrogate |=
5331 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5332 i++;
5333 }
5334 if (i > first_in_run + 1) {
5335 // Found non-trivial run of single-character alternatives.
5336 int run_length = i - first_in_run;
5337 ZoneList<CharacterRange>* ranges =
5338 new (zone) ZoneList<CharacterRange>(2, zone);
5339 for (int j = 0; j < run_length; j++) {
5340 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5341 DCHECK_EQ(old_atom->length(), 1);
5342 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5343 }
5344 RegExpCharacterClass::CharacterClassFlags character_class_flags;
5345 if (IsUnicode(flags) && contains_trail_surrogate) {
5346 character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
5347 }
5348 alternatives->at(write_posn++) = new (zone)
5349 RegExpCharacterClass(zone, ranges, flags, character_class_flags);
5350 } else {
5351 // Just copy any trivial alternatives.
5352 for (int j = first_in_run; j < i; j++) {
5353 alternatives->at(write_posn++) = alternatives->at(j);
5354 }
5355 }
5356 }
5357 alternatives->Rewind(write_posn); // Trim end of array.
5358 }
5359
5360
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5361 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5362 RegExpNode* on_success) {
5363 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5364
5365 if (alternatives->length() > 2) {
5366 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5367 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5368 FixSingleCharacterDisjunctions(compiler);
5369 if (alternatives->length() == 1) {
5370 return alternatives->at(0)->ToNode(compiler, on_success);
5371 }
5372 }
5373
5374 int length = alternatives->length();
5375
5376 ChoiceNode* result =
5377 new(compiler->zone()) ChoiceNode(length, compiler->zone());
5378 for (int i = 0; i < length; i++) {
5379 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5380 on_success));
5381 result->AddAlternative(alternative);
5382 }
5383 return result;
5384 }
5385
5386
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5387 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5388 RegExpNode* on_success) {
5389 return ToNode(min(),
5390 max(),
5391 is_greedy(),
5392 body(),
5393 compiler,
5394 on_success);
5395 }
5396
5397
5398 // Scoped object to keep track of how much we unroll quantifier loops in the
5399 // regexp graph generator.
5400 class RegExpExpansionLimiter {
5401 public:
5402 static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)5403 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5404 : compiler_(compiler),
5405 saved_expansion_factor_(compiler->current_expansion_factor()),
5406 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5407 DCHECK_LT(0, factor);
5408 if (ok_to_expand_) {
5409 if (factor > kMaxExpansionFactor) {
5410 // Avoid integer overflow of the current expansion factor.
5411 ok_to_expand_ = false;
5412 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5413 } else {
5414 int new_factor = saved_expansion_factor_ * factor;
5415 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5416 compiler->set_current_expansion_factor(new_factor);
5417 }
5418 }
5419 }
5420
~RegExpExpansionLimiter()5421 ~RegExpExpansionLimiter() {
5422 compiler_->set_current_expansion_factor(saved_expansion_factor_);
5423 }
5424
ok_to_expand()5425 bool ok_to_expand() { return ok_to_expand_; }
5426
5427 private:
5428 RegExpCompiler* compiler_;
5429 int saved_expansion_factor_;
5430 bool ok_to_expand_;
5431
5432 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5433 };
5434
5435
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)5436 RegExpNode* RegExpQuantifier::ToNode(int min,
5437 int max,
5438 bool is_greedy,
5439 RegExpTree* body,
5440 RegExpCompiler* compiler,
5441 RegExpNode* on_success,
5442 bool not_at_start) {
5443 // x{f, t} becomes this:
5444 //
5445 // (r++)<-.
5446 // | `
5447 // | (x)
5448 // v ^
5449 // (r=0)-->(?)---/ [if r < t]
5450 // |
5451 // [if r >= f] \----> ...
5452 //
5453
5454 // 15.10.2.5 RepeatMatcher algorithm.
5455 // The parser has already eliminated the case where max is 0. In the case
5456 // where max_match is zero the parser has removed the quantifier if min was
5457 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5458
5459 // If we know that we cannot match zero length then things are a little
5460 // simpler since we don't need to make the special zero length match check
5461 // from step 2.1. If the min and max are small we can unroll a little in
5462 // this case.
5463 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5464 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5465 if (max == 0) return on_success; // This can happen due to recursion.
5466 bool body_can_be_empty = (body->min_match() == 0);
5467 int body_start_reg = RegExpCompiler::kNoRegister;
5468 Interval capture_registers = body->CaptureRegisters();
5469 bool needs_capture_clearing = !capture_registers.is_empty();
5470 Zone* zone = compiler->zone();
5471
5472 if (body_can_be_empty) {
5473 body_start_reg = compiler->AllocateRegister();
5474 } else if (compiler->optimize() && !needs_capture_clearing) {
5475 // Only unroll if there are no captures and the body can't be
5476 // empty.
5477 {
5478 RegExpExpansionLimiter limiter(
5479 compiler, min + ((max != min) ? 1 : 0));
5480 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5481 int new_max = (max == kInfinity) ? max : max - min;
5482 // Recurse once to get the loop or optional matches after the fixed
5483 // ones.
5484 RegExpNode* answer = ToNode(
5485 0, new_max, is_greedy, body, compiler, on_success, true);
5486 // Unroll the forced matches from 0 to min. This can cause chains of
5487 // TextNodes (which the parser does not generate). These should be
5488 // combined if it turns out they hinder good code generation.
5489 for (int i = 0; i < min; i++) {
5490 answer = body->ToNode(compiler, answer);
5491 }
5492 return answer;
5493 }
5494 }
5495 if (max <= kMaxUnrolledMaxMatches && min == 0) {
5496 DCHECK_LT(0, max); // Due to the 'if' above.
5497 RegExpExpansionLimiter limiter(compiler, max);
5498 if (limiter.ok_to_expand()) {
5499 // Unroll the optional matches up to max.
5500 RegExpNode* answer = on_success;
5501 for (int i = 0; i < max; i++) {
5502 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5503 if (is_greedy) {
5504 alternation->AddAlternative(
5505 GuardedAlternative(body->ToNode(compiler, answer)));
5506 alternation->AddAlternative(GuardedAlternative(on_success));
5507 } else {
5508 alternation->AddAlternative(GuardedAlternative(on_success));
5509 alternation->AddAlternative(
5510 GuardedAlternative(body->ToNode(compiler, answer)));
5511 }
5512 answer = alternation;
5513 if (not_at_start && !compiler->read_backward()) {
5514 alternation->set_not_at_start();
5515 }
5516 }
5517 return answer;
5518 }
5519 }
5520 }
5521 bool has_min = min > 0;
5522 bool has_max = max < RegExpTree::kInfinity;
5523 bool needs_counter = has_min || has_max;
5524 int reg_ctr = needs_counter
5525 ? compiler->AllocateRegister()
5526 : RegExpCompiler::kNoRegister;
5527 LoopChoiceNode* center = new (zone)
5528 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5529 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5530 RegExpNode* loop_return = needs_counter
5531 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5532 : static_cast<RegExpNode*>(center);
5533 if (body_can_be_empty) {
5534 // If the body can be empty we need to check if it was and then
5535 // backtrack.
5536 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5537 reg_ctr,
5538 min,
5539 loop_return);
5540 }
5541 RegExpNode* body_node = body->ToNode(compiler, loop_return);
5542 if (body_can_be_empty) {
5543 // If the body can be empty we need to store the start position
5544 // so we can bail out if it was empty.
5545 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5546 }
5547 if (needs_capture_clearing) {
5548 // Before entering the body of this loop we need to clear captures.
5549 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5550 }
5551 GuardedAlternative body_alt(body_node);
5552 if (has_max) {
5553 Guard* body_guard =
5554 new(zone) Guard(reg_ctr, Guard::LT, max);
5555 body_alt.AddGuard(body_guard, zone);
5556 }
5557 GuardedAlternative rest_alt(on_success);
5558 if (has_min) {
5559 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5560 rest_alt.AddGuard(rest_guard, zone);
5561 }
5562 if (is_greedy) {
5563 center->AddLoopAlternative(body_alt);
5564 center->AddContinueAlternative(rest_alt);
5565 } else {
5566 center->AddContinueAlternative(rest_alt);
5567 center->AddLoopAlternative(body_alt);
5568 }
5569 if (needs_counter) {
5570 return ActionNode::SetRegister(reg_ctr, 0, center);
5571 } else {
5572 return center;
5573 }
5574 }
5575
5576 namespace {
5577 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
5578 // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
BoundaryAssertionAsLookaround(RegExpCompiler * compiler,RegExpNode * on_success,RegExpAssertion::AssertionType type,JSRegExp::Flags flags)5579 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
5580 RegExpNode* on_success,
5581 RegExpAssertion::AssertionType type,
5582 JSRegExp::Flags flags) {
5583 DCHECK(NeedsUnicodeCaseEquivalents(flags));
5584 Zone* zone = compiler->zone();
5585 ZoneList<CharacterRange>* word_range =
5586 new (zone) ZoneList<CharacterRange>(2, zone);
5587 CharacterRange::AddClassEscape('w', word_range, true, zone);
5588 int stack_register = compiler->UnicodeLookaroundStackRegister();
5589 int position_register = compiler->UnicodeLookaroundPositionRegister();
5590 ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5591 // Add two choices. The (non-)boundary could start with a word or
5592 // a non-word-character.
5593 for (int i = 0; i < 2; i++) {
5594 bool lookbehind_for_word = i == 0;
5595 bool lookahead_for_word =
5596 (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
5597 // Look to the left.
5598 RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
5599 stack_register, position_register);
5600 RegExpNode* backward = TextNode::CreateForCharacterRanges(
5601 zone, word_range, true, lookbehind.on_match_success(), flags);
5602 // Look to the right.
5603 RegExpLookaround::Builder lookahead(lookahead_for_word,
5604 lookbehind.ForMatch(backward),
5605 stack_register, position_register);
5606 RegExpNode* forward = TextNode::CreateForCharacterRanges(
5607 zone, word_range, false, lookahead.on_match_success(), flags);
5608 result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
5609 }
5610 return result;
5611 }
5612 } // anonymous namespace
5613
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5614 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5615 RegExpNode* on_success) {
5616 NodeInfo info;
5617 Zone* zone = compiler->zone();
5618
5619 switch (assertion_type()) {
5620 case START_OF_LINE:
5621 return AssertionNode::AfterNewline(on_success);
5622 case START_OF_INPUT:
5623 return AssertionNode::AtStart(on_success);
5624 case BOUNDARY:
5625 return NeedsUnicodeCaseEquivalents(flags_)
5626 ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
5627 flags_)
5628 : AssertionNode::AtBoundary(on_success);
5629 case NON_BOUNDARY:
5630 return NeedsUnicodeCaseEquivalents(flags_)
5631 ? BoundaryAssertionAsLookaround(compiler, on_success,
5632 NON_BOUNDARY, flags_)
5633 : AssertionNode::AtNonBoundary(on_success);
5634 case END_OF_INPUT:
5635 return AssertionNode::AtEnd(on_success);
5636 case END_OF_LINE: {
5637 // Compile $ in multiline regexps as an alternation with a positive
5638 // lookahead in one side and an end-of-input on the other side.
5639 // We need two registers for the lookahead.
5640 int stack_pointer_register = compiler->AllocateRegister();
5641 int position_register = compiler->AllocateRegister();
5642 // The ChoiceNode to distinguish between a newline and end-of-input.
5643 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5644 // Create a newline atom.
5645 ZoneList<CharacterRange>* newline_ranges =
5646 new(zone) ZoneList<CharacterRange>(3, zone);
5647 CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
5648 JSRegExp::Flags default_flags = JSRegExp::Flags();
5649 RegExpCharacterClass* newline_atom =
5650 new (zone) RegExpCharacterClass('n', default_flags);
5651 TextNode* newline_matcher = new (zone) TextNode(
5652 newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5653 stack_pointer_register, position_register,
5654 0, // No captures inside.
5655 -1, // Ignored if no captures.
5656 on_success));
5657 // Create an end-of-input matcher.
5658 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5659 stack_pointer_register,
5660 position_register,
5661 newline_matcher);
5662 // Add the two alternatives to the ChoiceNode.
5663 GuardedAlternative eol_alternative(end_of_line);
5664 result->AddAlternative(eol_alternative);
5665 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5666 result->AddAlternative(end_alternative);
5667 return result;
5668 }
5669 default:
5670 UNREACHABLE();
5671 }
5672 return on_success;
5673 }
5674
5675
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5676 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5677 RegExpNode* on_success) {
5678 return new (compiler->zone())
5679 BackReferenceNode(RegExpCapture::StartRegister(index()),
5680 RegExpCapture::EndRegister(index()), flags_,
5681 compiler->read_backward(), on_success);
5682 }
5683
5684
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5685 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5686 RegExpNode* on_success) {
5687 return on_success;
5688 }
5689
5690
Builder(bool is_positive,RegExpNode * on_success,int stack_pointer_register,int position_register,int capture_register_count,int capture_register_start)5691 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5692 int stack_pointer_register,
5693 int position_register,
5694 int capture_register_count,
5695 int capture_register_start)
5696 : is_positive_(is_positive),
5697 on_success_(on_success),
5698 stack_pointer_register_(stack_pointer_register),
5699 position_register_(position_register) {
5700 if (is_positive_) {
5701 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5702 stack_pointer_register, position_register, capture_register_count,
5703 capture_register_start, on_success_);
5704 } else {
5705 Zone* zone = on_success_->zone();
5706 on_match_success_ = new (zone) NegativeSubmatchSuccess(
5707 stack_pointer_register, position_register, capture_register_count,
5708 capture_register_start, zone);
5709 }
5710 }
5711
5712
ForMatch(RegExpNode * match)5713 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5714 if (is_positive_) {
5715 return ActionNode::BeginSubmatch(stack_pointer_register_,
5716 position_register_, match);
5717 } else {
5718 Zone* zone = on_success_->zone();
5719 // We use a ChoiceNode to represent the negative lookaround. The first
5720 // alternative is the negative match. On success, the end node backtracks.
5721 // On failure, the second alternative is tried and leads to success.
5722 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5723 // first exit when calculating quick checks.
5724 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5725 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5726 return ActionNode::BeginSubmatch(stack_pointer_register_,
5727 position_register_, choice_node);
5728 }
5729 }
5730
5731
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5732 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5733 RegExpNode* on_success) {
5734 int stack_pointer_register = compiler->AllocateRegister();
5735 int position_register = compiler->AllocateRegister();
5736
5737 const int registers_per_capture = 2;
5738 const int register_of_first_capture = 2;
5739 int register_count = capture_count_ * registers_per_capture;
5740 int register_start =
5741 register_of_first_capture + capture_from_ * registers_per_capture;
5742
5743 RegExpNode* result;
5744 bool was_reading_backward = compiler->read_backward();
5745 compiler->set_read_backward(type() == LOOKBEHIND);
5746 Builder builder(is_positive(), on_success, stack_pointer_register,
5747 position_register, register_count, register_start);
5748 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5749 result = builder.ForMatch(match);
5750 compiler->set_read_backward(was_reading_backward);
5751 return result;
5752 }
5753
5754
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5755 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5756 RegExpNode* on_success) {
5757 return ToNode(body(), index(), compiler, on_success);
5758 }
5759
5760
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)5761 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5762 int index,
5763 RegExpCompiler* compiler,
5764 RegExpNode* on_success) {
5765 DCHECK_NOT_NULL(body);
5766 int start_reg = RegExpCapture::StartRegister(index);
5767 int end_reg = RegExpCapture::EndRegister(index);
5768 if (compiler->read_backward()) std::swap(start_reg, end_reg);
5769 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5770 RegExpNode* body_node = body->ToNode(compiler, store_end);
5771 return ActionNode::StorePosition(start_reg, true, body_node);
5772 }
5773
5774
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5775 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5776 RegExpNode* on_success) {
5777 ZoneList<RegExpTree*>* children = nodes();
5778 RegExpNode* current = on_success;
5779 if (compiler->read_backward()) {
5780 for (int i = 0; i < children->length(); i++) {
5781 current = children->at(i)->ToNode(compiler, current);
5782 }
5783 } else {
5784 for (int i = children->length() - 1; i >= 0; i--) {
5785 current = children->at(i)->ToNode(compiler, current);
5786 }
5787 }
5788 return current;
5789 }
5790
5791
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5792 static void AddClass(const int* elmv,
5793 int elmc,
5794 ZoneList<CharacterRange>* ranges,
5795 Zone* zone) {
5796 elmc--;
5797 DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5798 for (int i = 0; i < elmc; i += 2) {
5799 DCHECK(elmv[i] < elmv[i + 1]);
5800 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5801 }
5802 }
5803
5804
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5805 static void AddClassNegated(const int *elmv,
5806 int elmc,
5807 ZoneList<CharacterRange>* ranges,
5808 Zone* zone) {
5809 elmc--;
5810 DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5811 DCHECK_NE(0x0000, elmv[0]);
5812 DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
5813 uc16 last = 0x0000;
5814 for (int i = 0; i < elmc; i += 2) {
5815 DCHECK(last <= elmv[i] - 1);
5816 DCHECK(elmv[i] < elmv[i + 1]);
5817 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5818 last = elmv[i + 1];
5819 }
5820 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5821 }
5822
AddClassEscape(char type,ZoneList<CharacterRange> * ranges,bool add_unicode_case_equivalents,Zone * zone)5823 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5824 bool add_unicode_case_equivalents,
5825 Zone* zone) {
5826 if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
5827 // See #sec-runtime-semantics-wordcharacters-abstract-operation
5828 // In case of unicode and ignore_case, we need to create the closure over
5829 // case equivalent characters before negating.
5830 ZoneList<CharacterRange>* new_ranges =
5831 new (zone) ZoneList<CharacterRange>(2, zone);
5832 AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
5833 AddUnicodeCaseEquivalents(new_ranges, zone);
5834 if (type == 'W') {
5835 ZoneList<CharacterRange>* negated =
5836 new (zone) ZoneList<CharacterRange>(2, zone);
5837 CharacterRange::Negate(new_ranges, negated, zone);
5838 new_ranges = negated;
5839 }
5840 ranges->AddAll(*new_ranges, zone);
5841 return;
5842 }
5843 AddClassEscape(type, ranges, zone);
5844 }
5845
AddClassEscape(char type,ZoneList<CharacterRange> * ranges,Zone * zone)5846 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5847 Zone* zone) {
5848 switch (type) {
5849 case 's':
5850 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5851 break;
5852 case 'S':
5853 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5854 break;
5855 case 'w':
5856 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5857 break;
5858 case 'W':
5859 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5860 break;
5861 case 'd':
5862 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5863 break;
5864 case 'D':
5865 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5866 break;
5867 case '.':
5868 AddClassNegated(kLineTerminatorRanges,
5869 kLineTerminatorRangeCount,
5870 ranges,
5871 zone);
5872 break;
5873 // This is not a character range as defined by the spec but a
5874 // convenient shorthand for a character class that matches any
5875 // character.
5876 case '*':
5877 ranges->Add(CharacterRange::Everything(), zone);
5878 break;
5879 // This is the set of characters matched by the $ and ^ symbols
5880 // in multiline mode.
5881 case 'n':
5882 AddClass(kLineTerminatorRanges,
5883 kLineTerminatorRangeCount,
5884 ranges,
5885 zone);
5886 break;
5887 default:
5888 UNREACHABLE();
5889 }
5890 }
5891
5892
GetWordBounds()5893 Vector<const int> CharacterRange::GetWordBounds() {
5894 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5895 }
5896
5897 // static
AddCaseEquivalents(Isolate * isolate,Zone * zone,ZoneList<CharacterRange> * ranges,bool is_one_byte)5898 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5899 ZoneList<CharacterRange>* ranges,
5900 bool is_one_byte) {
5901 CharacterRange::Canonicalize(ranges);
5902 int range_count = ranges->length();
5903 for (int i = 0; i < range_count; i++) {
5904 CharacterRange range = ranges->at(i);
5905 uc32 bottom = range.from();
5906 if (bottom > String::kMaxUtf16CodeUnit) continue;
5907 uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5908 // Nothing to be done for surrogates.
5909 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
5910 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5911 if (bottom > String::kMaxOneByteCharCode) continue;
5912 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5913 }
5914 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5915 if (top == bottom) {
5916 // If this is a singleton we just expand the one character.
5917 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5918 for (int i = 0; i < length; i++) {
5919 uc32 chr = chars[i];
5920 if (chr != bottom) {
5921 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5922 }
5923 }
5924 } else {
5925 // If this is a range we expand the characters block by block, expanding
5926 // contiguous subranges (blocks) one at a time. The approach is as
5927 // follows. For a given start character we look up the remainder of the
5928 // block that contains it (represented by the end point), for instance we
5929 // find 'z' if the character is 'c'. A block is characterized by the
5930 // property that all characters uncanonicalize in the same way, except
5931 // that each entry in the result is incremented by the distance from the
5932 // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5933 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5934 // we've found the end point we look up its uncanonicalization and
5935 // produce a range for each element. For instance for [c-f] we look up
5936 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5937 // it is not already contained in the input, so [c-f] will be skipped but
5938 // [C-F] will be added. If this range is not completely contained in a
5939 // block we do this for all the blocks covered by the range (handling
5940 // characters that is not in a block as a "singleton block").
5941 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5942 int pos = bottom;
5943 while (pos <= top) {
5944 int length =
5945 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5946 uc32 block_end;
5947 if (length == 0) {
5948 block_end = pos;
5949 } else {
5950 DCHECK_EQ(1, length);
5951 block_end = equivalents[0];
5952 }
5953 int end = (block_end > top) ? top : block_end;
5954 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5955 equivalents);
5956 for (int i = 0; i < length; i++) {
5957 uc32 c = equivalents[i];
5958 uc32 range_from = c - (block_end - pos);
5959 uc32 range_to = c - (block_end - end);
5960 if (!(bottom <= range_from && range_to <= top)) {
5961 ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5962 }
5963 }
5964 pos = end + 1;
5965 }
5966 }
5967 }
5968 }
5969
5970
IsCanonical(ZoneList<CharacterRange> * ranges)5971 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5972 DCHECK_NOT_NULL(ranges);
5973 int n = ranges->length();
5974 if (n <= 1) return true;
5975 int max = ranges->at(0).to();
5976 for (int i = 1; i < n; i++) {
5977 CharacterRange next_range = ranges->at(i);
5978 if (next_range.from() <= max + 1) return false;
5979 max = next_range.to();
5980 }
5981 return true;
5982 }
5983
5984
ranges(Zone * zone)5985 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5986 if (ranges_ == nullptr) {
5987 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5988 CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
5989 }
5990 return ranges_;
5991 }
5992
5993
5994 // Move a number of elements in a zonelist to another position
5995 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)5996 static void MoveRanges(ZoneList<CharacterRange>* list,
5997 int from,
5998 int to,
5999 int count) {
6000 // Ranges are potentially overlapping.
6001 if (from < to) {
6002 for (int i = count - 1; i >= 0; i--) {
6003 list->at(to + i) = list->at(from + i);
6004 }
6005 } else {
6006 for (int i = 0; i < count; i++) {
6007 list->at(to + i) = list->at(from + i);
6008 }
6009 }
6010 }
6011
6012
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)6013 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
6014 int count,
6015 CharacterRange insert) {
6016 // Inserts a range into list[0..count[, which must be sorted
6017 // by from value and non-overlapping and non-adjacent, using at most
6018 // list[0..count] for the result. Returns the number of resulting
6019 // canonicalized ranges. Inserting a range may collapse existing ranges into
6020 // fewer ranges, so the return value can be anything in the range 1..count+1.
6021 uc32 from = insert.from();
6022 uc32 to = insert.to();
6023 int start_pos = 0;
6024 int end_pos = count;
6025 for (int i = count - 1; i >= 0; i--) {
6026 CharacterRange current = list->at(i);
6027 if (current.from() > to + 1) {
6028 end_pos = i;
6029 } else if (current.to() + 1 < from) {
6030 start_pos = i + 1;
6031 break;
6032 }
6033 }
6034
6035 // Inserted range overlaps, or is adjacent to, ranges at positions
6036 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6037 // not affected by the insertion.
6038 // If start_pos == end_pos, the range must be inserted before start_pos.
6039 // if start_pos < end_pos, the entire range from start_pos to end_pos
6040 // must be merged with the insert range.
6041
6042 if (start_pos == end_pos) {
6043 // Insert between existing ranges at position start_pos.
6044 if (start_pos < count) {
6045 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6046 }
6047 list->at(start_pos) = insert;
6048 return count + 1;
6049 }
6050 if (start_pos + 1 == end_pos) {
6051 // Replace single existing range at position start_pos.
6052 CharacterRange to_replace = list->at(start_pos);
6053 int new_from = Min(to_replace.from(), from);
6054 int new_to = Max(to_replace.to(), to);
6055 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6056 return count;
6057 }
6058 // Replace a number of existing ranges from start_pos to end_pos - 1.
6059 // Move the remaining ranges down.
6060
6061 int new_from = Min(list->at(start_pos).from(), from);
6062 int new_to = Max(list->at(end_pos - 1).to(), to);
6063 if (end_pos < count) {
6064 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6065 }
6066 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6067 return count - (end_pos - start_pos) + 1;
6068 }
6069
6070
Canonicalize()6071 void CharacterSet::Canonicalize() {
6072 // Special/default classes are always considered canonical. The result
6073 // of calling ranges() will be sorted.
6074 if (ranges_ == nullptr) return;
6075 CharacterRange::Canonicalize(ranges_);
6076 }
6077
6078
Canonicalize(ZoneList<CharacterRange> * character_ranges)6079 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6080 if (character_ranges->length() <= 1) return;
6081 // Check whether ranges are already canonical (increasing, non-overlapping,
6082 // non-adjacent).
6083 int n = character_ranges->length();
6084 int max = character_ranges->at(0).to();
6085 int i = 1;
6086 while (i < n) {
6087 CharacterRange current = character_ranges->at(i);
6088 if (current.from() <= max + 1) {
6089 break;
6090 }
6091 max = current.to();
6092 i++;
6093 }
6094 // Canonical until the i'th range. If that's all of them, we are done.
6095 if (i == n) return;
6096
6097 // The ranges at index i and forward are not canonicalized. Make them so by
6098 // doing the equivalent of insertion sort (inserting each into the previous
6099 // list, in order).
6100 // Notice that inserting a range can reduce the number of ranges in the
6101 // result due to combining of adjacent and overlapping ranges.
6102 int read = i; // Range to insert.
6103 int num_canonical = i; // Length of canonicalized part of list.
6104 do {
6105 num_canonical = InsertRangeInCanonicalList(character_ranges,
6106 num_canonical,
6107 character_ranges->at(read));
6108 read++;
6109 } while (read < n);
6110 character_ranges->Rewind(num_canonical);
6111
6112 DCHECK(CharacterRange::IsCanonical(character_ranges));
6113 }
6114
6115
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)6116 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6117 ZoneList<CharacterRange>* negated_ranges,
6118 Zone* zone) {
6119 DCHECK(CharacterRange::IsCanonical(ranges));
6120 DCHECK_EQ(0, negated_ranges->length());
6121 int range_count = ranges->length();
6122 uc32 from = 0;
6123 int i = 0;
6124 if (range_count > 0 && ranges->at(0).from() == 0) {
6125 from = ranges->at(0).to() + 1;
6126 i = 1;
6127 }
6128 while (i < range_count) {
6129 CharacterRange range = ranges->at(i);
6130 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6131 from = range.to() + 1;
6132 i++;
6133 }
6134 if (from < String::kMaxCodePoint) {
6135 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6136 zone);
6137 }
6138 }
6139
6140
6141 // -------------------------------------------------------------------
6142 // Splay tree
6143
6144
Extend(unsigned value,Zone * zone)6145 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6146 if (Get(value))
6147 return this;
6148 if (successors(zone) != nullptr) {
6149 for (int i = 0; i < successors(zone)->length(); i++) {
6150 OutSet* successor = successors(zone)->at(i);
6151 if (successor->Get(value))
6152 return successor;
6153 }
6154 } else {
6155 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6156 }
6157 OutSet* result = new(zone) OutSet(first_, remaining_);
6158 result->Set(value, zone);
6159 successors(zone)->Add(result, zone);
6160 return result;
6161 }
6162
6163
Set(unsigned value,Zone * zone)6164 void OutSet::Set(unsigned value, Zone *zone) {
6165 if (value < kFirstLimit) {
6166 first_ |= (1 << value);
6167 } else {
6168 if (remaining_ == nullptr)
6169 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6170 if (remaining_->is_empty() || !remaining_->Contains(value))
6171 remaining_->Add(value, zone);
6172 }
6173 }
6174
6175
Get(unsigned value) const6176 bool OutSet::Get(unsigned value) const {
6177 if (value < kFirstLimit) {
6178 return (first_ & (1 << value)) != 0;
6179 } else if (remaining_ == nullptr) {
6180 return false;
6181 } else {
6182 return remaining_->Contains(value);
6183 }
6184 }
6185
6186
6187 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6188
6189
AddRange(CharacterRange full_range,int value,Zone * zone)6190 void DispatchTable::AddRange(CharacterRange full_range, int value,
6191 Zone* zone) {
6192 CharacterRange current = full_range;
6193 if (tree()->is_empty()) {
6194 // If this is the first range we just insert into the table.
6195 ZoneSplayTree<Config>::Locator loc;
6196 bool inserted = tree()->Insert(current.from(), &loc);
6197 DCHECK(inserted);
6198 USE(inserted);
6199 loc.set_value(Entry(current.from(), current.to(),
6200 empty()->Extend(value, zone)));
6201 return;
6202 }
6203 // First see if there is a range to the left of this one that
6204 // overlaps.
6205 ZoneSplayTree<Config>::Locator loc;
6206 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6207 Entry* entry = &loc.value();
6208 // If we've found a range that overlaps with this one, and it
6209 // starts strictly to the left of this one, we have to fix it
6210 // because the following code only handles ranges that start on
6211 // or after the start point of the range we're adding.
6212 if (entry->from() < current.from() && entry->to() >= current.from()) {
6213 // Snap the overlapping range in half around the start point of
6214 // the range we're adding.
6215 CharacterRange left =
6216 CharacterRange::Range(entry->from(), current.from() - 1);
6217 CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6218 // The left part of the overlapping range doesn't overlap.
6219 // Truncate the whole entry to be just the left part.
6220 entry->set_to(left.to());
6221 // The right part is the one that overlaps. We add this part
6222 // to the map and let the next step deal with merging it with
6223 // the range we're adding.
6224 ZoneSplayTree<Config>::Locator loc;
6225 bool inserted = tree()->Insert(right.from(), &loc);
6226 DCHECK(inserted);
6227 USE(inserted);
6228 loc.set_value(Entry(right.from(),
6229 right.to(),
6230 entry->out_set()));
6231 }
6232 }
6233 while (current.is_valid()) {
6234 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6235 (loc.value().from() <= current.to()) &&
6236 (loc.value().to() >= current.from())) {
6237 Entry* entry = &loc.value();
6238 // We have overlap. If there is space between the start point of
6239 // the range we're adding and where the overlapping range starts
6240 // then we have to add a range covering just that space.
6241 if (current.from() < entry->from()) {
6242 ZoneSplayTree<Config>::Locator ins;
6243 bool inserted = tree()->Insert(current.from(), &ins);
6244 DCHECK(inserted);
6245 USE(inserted);
6246 ins.set_value(Entry(current.from(),
6247 entry->from() - 1,
6248 empty()->Extend(value, zone)));
6249 current.set_from(entry->from());
6250 }
6251 DCHECK_EQ(current.from(), entry->from());
6252 // If the overlapping range extends beyond the one we want to add
6253 // we have to snap the right part off and add it separately.
6254 if (entry->to() > current.to()) {
6255 ZoneSplayTree<Config>::Locator ins;
6256 bool inserted = tree()->Insert(current.to() + 1, &ins);
6257 DCHECK(inserted);
6258 USE(inserted);
6259 ins.set_value(Entry(current.to() + 1,
6260 entry->to(),
6261 entry->out_set()));
6262 entry->set_to(current.to());
6263 }
6264 DCHECK(entry->to() <= current.to());
6265 // The overlapping range is now completely contained by the range
6266 // we're adding so we can just update it and move the start point
6267 // of the range we're adding just past it.
6268 entry->AddValue(value, zone);
6269 DCHECK(entry->to() + 1 > current.from());
6270 current.set_from(entry->to() + 1);
6271 } else {
6272 // There is no overlap so we can just add the range
6273 ZoneSplayTree<Config>::Locator ins;
6274 bool inserted = tree()->Insert(current.from(), &ins);
6275 DCHECK(inserted);
6276 USE(inserted);
6277 ins.set_value(Entry(current.from(),
6278 current.to(),
6279 empty()->Extend(value, zone)));
6280 break;
6281 }
6282 }
6283 }
6284
6285
Get(uc32 value)6286 OutSet* DispatchTable::Get(uc32 value) {
6287 ZoneSplayTree<Config>::Locator loc;
6288 if (!tree()->FindGreatestLessThan(value, &loc))
6289 return empty();
6290 Entry* entry = &loc.value();
6291 if (value <= entry->to())
6292 return entry->out_set();
6293 else
6294 return empty();
6295 }
6296
6297
6298 // -------------------------------------------------------------------
6299 // Analysis
6300
6301
EnsureAnalyzed(RegExpNode * that)6302 void Analysis::EnsureAnalyzed(RegExpNode* that) {
6303 StackLimitCheck check(isolate());
6304 if (check.HasOverflowed()) {
6305 fail("Stack overflow");
6306 return;
6307 }
6308 if (that->info()->been_analyzed || that->info()->being_analyzed)
6309 return;
6310 that->info()->being_analyzed = true;
6311 that->Accept(this);
6312 that->info()->being_analyzed = false;
6313 that->info()->been_analyzed = true;
6314 }
6315
6316
VisitEnd(EndNode * that)6317 void Analysis::VisitEnd(EndNode* that) {
6318 // nothing to do
6319 }
6320
6321
CalculateOffsets()6322 void TextNode::CalculateOffsets() {
6323 int element_count = elements()->length();
6324 // Set up the offsets of the elements relative to the start. This is a fixed
6325 // quantity since a TextNode can only contain fixed-width things.
6326 int cp_offset = 0;
6327 for (int i = 0; i < element_count; i++) {
6328 TextElement& elm = elements()->at(i);
6329 elm.set_cp_offset(cp_offset);
6330 cp_offset += elm.length();
6331 }
6332 }
6333
6334
VisitText(TextNode * that)6335 void Analysis::VisitText(TextNode* that) {
6336 that->MakeCaseIndependent(isolate(), is_one_byte_);
6337 EnsureAnalyzed(that->on_success());
6338 if (!has_failed()) {
6339 that->CalculateOffsets();
6340 }
6341 }
6342
6343
VisitAction(ActionNode * that)6344 void Analysis::VisitAction(ActionNode* that) {
6345 RegExpNode* target = that->on_success();
6346 EnsureAnalyzed(target);
6347 if (!has_failed()) {
6348 // If the next node is interested in what it follows then this node
6349 // has to be interested too so it can pass the information on.
6350 that->info()->AddFromFollowing(target->info());
6351 }
6352 }
6353
6354
VisitChoice(ChoiceNode * that)6355 void Analysis::VisitChoice(ChoiceNode* that) {
6356 NodeInfo* info = that->info();
6357 for (int i = 0; i < that->alternatives()->length(); i++) {
6358 RegExpNode* node = that->alternatives()->at(i).node();
6359 EnsureAnalyzed(node);
6360 if (has_failed()) return;
6361 // Anything the following nodes need to know has to be known by
6362 // this node also, so it can pass it on.
6363 info->AddFromFollowing(node->info());
6364 }
6365 }
6366
6367
VisitLoopChoice(LoopChoiceNode * that)6368 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6369 NodeInfo* info = that->info();
6370 for (int i = 0; i < that->alternatives()->length(); i++) {
6371 RegExpNode* node = that->alternatives()->at(i).node();
6372 if (node != that->loop_node()) {
6373 EnsureAnalyzed(node);
6374 if (has_failed()) return;
6375 info->AddFromFollowing(node->info());
6376 }
6377 }
6378 // Check the loop last since it may need the value of this node
6379 // to get a correct result.
6380 EnsureAnalyzed(that->loop_node());
6381 if (!has_failed()) {
6382 info->AddFromFollowing(that->loop_node()->info());
6383 }
6384 }
6385
6386
VisitBackReference(BackReferenceNode * that)6387 void Analysis::VisitBackReference(BackReferenceNode* that) {
6388 EnsureAnalyzed(that->on_success());
6389 }
6390
6391
VisitAssertion(AssertionNode * that)6392 void Analysis::VisitAssertion(AssertionNode* that) {
6393 EnsureAnalyzed(that->on_success());
6394 }
6395
6396
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6397 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6398 BoyerMooreLookahead* bm,
6399 bool not_at_start) {
6400 // Working out the set of characters that a backreference can match is too
6401 // hard, so we just say that any character can match.
6402 bm->SetRest(offset);
6403 SaveBMInfo(bm, not_at_start, offset);
6404 }
6405
6406
6407 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6408 RegExpMacroAssembler::kTableSize);
6409
6410
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6411 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6412 BoyerMooreLookahead* bm, bool not_at_start) {
6413 ZoneList<GuardedAlternative>* alts = alternatives();
6414 budget = (budget - 1) / alts->length();
6415 for (int i = 0; i < alts->length(); i++) {
6416 GuardedAlternative& alt = alts->at(i);
6417 if (alt.guards() != nullptr && alt.guards()->length() != 0) {
6418 bm->SetRest(offset); // Give up trying to fill in info.
6419 SaveBMInfo(bm, not_at_start, offset);
6420 return;
6421 }
6422 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6423 }
6424 SaveBMInfo(bm, not_at_start, offset);
6425 }
6426
6427
FillInBMInfo(Isolate * isolate,int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6428 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6429 BoyerMooreLookahead* bm, bool not_at_start) {
6430 if (initial_offset >= bm->length()) return;
6431 int offset = initial_offset;
6432 int max_char = bm->max_char();
6433 for (int i = 0; i < elements()->length(); i++) {
6434 if (offset >= bm->length()) {
6435 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6436 return;
6437 }
6438 TextElement text = elements()->at(i);
6439 if (text.text_type() == TextElement::ATOM) {
6440 RegExpAtom* atom = text.atom();
6441 for (int j = 0; j < atom->length(); j++, offset++) {
6442 if (offset >= bm->length()) {
6443 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6444 return;
6445 }
6446 uc16 character = atom->data()[j];
6447 if (IgnoreCase(atom->flags())) {
6448 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6449 int length = GetCaseIndependentLetters(
6450 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6451 chars);
6452 for (int j = 0; j < length; j++) {
6453 bm->Set(offset, chars[j]);
6454 }
6455 } else {
6456 if (character <= max_char) bm->Set(offset, character);
6457 }
6458 }
6459 } else {
6460 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6461 RegExpCharacterClass* char_class = text.char_class();
6462 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6463 if (char_class->is_negated()) {
6464 bm->SetAll(offset);
6465 } else {
6466 for (int k = 0; k < ranges->length(); k++) {
6467 CharacterRange& range = ranges->at(k);
6468 if (range.from() > max_char) continue;
6469 int to = Min(max_char, static_cast<int>(range.to()));
6470 bm->SetInterval(offset, Interval(range.from(), to));
6471 }
6472 }
6473 offset++;
6474 }
6475 }
6476 if (offset >= bm->length()) {
6477 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6478 return;
6479 }
6480 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6481 true); // Not at start after a text node.
6482 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6483 }
6484
6485
6486 // -------------------------------------------------------------------
6487 // Dispatch table construction
6488
6489
VisitEnd(EndNode * that)6490 void DispatchTableConstructor::VisitEnd(EndNode* that) {
6491 AddRange(CharacterRange::Everything());
6492 }
6493
6494
BuildTable(ChoiceNode * node)6495 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6496 node->set_being_calculated(true);
6497 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6498 for (int i = 0; i < alternatives->length(); i++) {
6499 set_choice_index(i);
6500 alternatives->at(i).node()->Accept(this);
6501 }
6502 node->set_being_calculated(false);
6503 }
6504
6505
6506 class AddDispatchRange {
6507 public:
AddDispatchRange(DispatchTableConstructor * constructor)6508 explicit AddDispatchRange(DispatchTableConstructor* constructor)
6509 : constructor_(constructor) { }
6510 void Call(uc32 from, DispatchTable::Entry entry);
6511 private:
6512 DispatchTableConstructor* constructor_;
6513 };
6514
6515
Call(uc32 from,DispatchTable::Entry entry)6516 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6517 constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6518 }
6519
6520
VisitChoice(ChoiceNode * node)6521 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6522 if (node->being_calculated())
6523 return;
6524 DispatchTable* table = node->GetTable(ignore_case_);
6525 AddDispatchRange adder(this);
6526 table->ForEach(&adder);
6527 }
6528
6529
VisitBackReference(BackReferenceNode * that)6530 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6531 // TODO(160): Find the node that we refer back to and propagate its start
6532 // set back to here. For now we just accept anything.
6533 AddRange(CharacterRange::Everything());
6534 }
6535
6536
VisitAssertion(AssertionNode * that)6537 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6538 RegExpNode* target = that->on_success();
6539 target->Accept(this);
6540 }
6541
6542
CompareRangeByFrom(const CharacterRange * a,const CharacterRange * b)6543 static int CompareRangeByFrom(const CharacterRange* a,
6544 const CharacterRange* b) {
6545 return Compare<uc16>(a->from(), b->from());
6546 }
6547
6548
AddInverse(ZoneList<CharacterRange> * ranges)6549 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6550 ranges->Sort(CompareRangeByFrom);
6551 uc16 last = 0;
6552 for (int i = 0; i < ranges->length(); i++) {
6553 CharacterRange range = ranges->at(i);
6554 if (last < range.from())
6555 AddRange(CharacterRange::Range(last, range.from() - 1));
6556 if (range.to() >= last) {
6557 if (range.to() == String::kMaxCodePoint) {
6558 return;
6559 } else {
6560 last = range.to() + 1;
6561 }
6562 }
6563 }
6564 AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6565 }
6566
6567
VisitText(TextNode * that)6568 void DispatchTableConstructor::VisitText(TextNode* that) {
6569 TextElement elm = that->elements()->at(0);
6570 switch (elm.text_type()) {
6571 case TextElement::ATOM: {
6572 uc16 c = elm.atom()->data()[0];
6573 AddRange(CharacterRange::Range(c, c));
6574 break;
6575 }
6576 case TextElement::CHAR_CLASS: {
6577 RegExpCharacterClass* tree = elm.char_class();
6578 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6579 if (tree->is_negated()) {
6580 AddInverse(ranges);
6581 } else {
6582 for (int i = 0; i < ranges->length(); i++)
6583 AddRange(ranges->at(i));
6584 }
6585 break;
6586 }
6587 default: {
6588 UNIMPLEMENTED();
6589 }
6590 }
6591 }
6592
6593
VisitAction(ActionNode * that)6594 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6595 RegExpNode* target = that->on_success();
6596 target->Accept(this);
6597 }
6598
OptionallyStepBackToLeadSurrogate(RegExpCompiler * compiler,RegExpNode * on_success,JSRegExp::Flags flags)6599 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6600 RegExpNode* on_success,
6601 JSRegExp::Flags flags) {
6602 // If the regexp matching starts within a surrogate pair, step back
6603 // to the lead surrogate and start matching from there.
6604 DCHECK(!compiler->read_backward());
6605 Zone* zone = compiler->zone();
6606 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6607 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6608 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6609 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6610
6611 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6612
6613 int stack_register = compiler->UnicodeLookaroundStackRegister();
6614 int position_register = compiler->UnicodeLookaroundPositionRegister();
6615 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6616 zone, lead_surrogates, true, on_success, flags);
6617 RegExpLookaround::Builder builder(true, step_back, stack_register,
6618 position_register);
6619 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6620 zone, trail_surrogates, false, builder.on_match_success(), flags);
6621
6622 optional_step_back->AddAlternative(
6623 GuardedAlternative(builder.ForMatch(match_trail)));
6624 optional_step_back->AddAlternative(GuardedAlternative(on_success));
6625
6626 return optional_step_back;
6627 }
6628
6629
Compile(Isolate * isolate,Zone * zone,RegExpCompileData * data,JSRegExp::Flags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte)6630 RegExpEngine::CompilationResult RegExpEngine::Compile(
6631 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6632 JSRegExp::Flags flags, Handle<String> pattern,
6633 Handle<String> sample_subject, bool is_one_byte) {
6634 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6635 return IrregexpRegExpTooBig(isolate);
6636 }
6637 bool is_sticky = IsSticky(flags);
6638 bool is_global = IsGlobal(flags);
6639 bool is_unicode = IsUnicode(flags);
6640 RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
6641
6642 if (compiler.optimize())
6643 compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
6644
6645 // Sample some characters from the middle of the string.
6646 static const int kSampleSize = 128;
6647
6648 sample_subject = String::Flatten(isolate, sample_subject);
6649 int chars_sampled = 0;
6650 int half_way = (sample_subject->length() - kSampleSize) / 2;
6651 for (int i = Max(0, half_way);
6652 i < sample_subject->length() && chars_sampled < kSampleSize;
6653 i++, chars_sampled++) {
6654 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6655 }
6656
6657 // Wrap the body of the regexp in capture #0.
6658 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6659 0,
6660 &compiler,
6661 compiler.accept());
6662 RegExpNode* node = captured_body;
6663 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6664 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6665 int max_length = data->tree->max_match();
6666 if (!is_start_anchored && !is_sticky) {
6667 // Add a .*? at the beginning, outside the body capture, unless
6668 // this expression is anchored at the beginning or sticky.
6669 JSRegExp::Flags default_flags = JSRegExp::Flags();
6670 RegExpNode* loop_node = RegExpQuantifier::ToNode(
6671 0, RegExpTree::kInfinity, false,
6672 new (zone) RegExpCharacterClass('*', default_flags), &compiler,
6673 captured_body, data->contains_anchor);
6674
6675 if (data->contains_anchor) {
6676 // Unroll loop once, to take care of the case that might start
6677 // at the start of input.
6678 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6679 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6680 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6681 new (zone) RegExpCharacterClass('*', default_flags), false,
6682 loop_node)));
6683 node = first_step_node;
6684 } else {
6685 node = loop_node;
6686 }
6687 }
6688 if (is_one_byte) {
6689 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6690 // Do it again to propagate the new nodes to places where they were not
6691 // put because they had not been calculated yet.
6692 if (node != nullptr) {
6693 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6694 }
6695 } else if (is_unicode && (is_global || is_sticky)) {
6696 node = OptionallyStepBackToLeadSurrogate(&compiler, node, flags);
6697 }
6698
6699 if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
6700 data->node = node;
6701 Analysis analysis(isolate, is_one_byte);
6702 analysis.EnsureAnalyzed(node);
6703 if (analysis.has_failed()) {
6704 const char* error_message = analysis.error_message();
6705 return CompilationResult(isolate, error_message);
6706 }
6707
6708 // Create the correct assembler for the architecture.
6709 #ifndef V8_INTERPRETED_REGEXP
6710 // Native regexp implementation.
6711
6712 NativeRegExpMacroAssembler::Mode mode =
6713 is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6714 : NativeRegExpMacroAssembler::UC16;
6715
6716 #if V8_TARGET_ARCH_IA32
6717 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6718 (data->capture_count + 1) * 2);
6719 #elif V8_TARGET_ARCH_X64
6720 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6721 (data->capture_count + 1) * 2);
6722 #elif V8_TARGET_ARCH_ARM
6723 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6724 (data->capture_count + 1) * 2);
6725 #elif V8_TARGET_ARCH_ARM64
6726 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6727 (data->capture_count + 1) * 2);
6728 #elif V8_TARGET_ARCH_S390
6729 RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6730 (data->capture_count + 1) * 2);
6731 #elif V8_TARGET_ARCH_PPC
6732 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6733 (data->capture_count + 1) * 2);
6734 #elif V8_TARGET_ARCH_MIPS
6735 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6736 (data->capture_count + 1) * 2);
6737 #elif V8_TARGET_ARCH_MIPS64
6738 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6739 (data->capture_count + 1) * 2);
6740 #else
6741 #error "Unsupported architecture"
6742 #endif
6743
6744 #else // V8_INTERPRETED_REGEXP
6745 // Interpreted regexp implementation.
6746 EmbeddedVector<byte, 1024> codes;
6747 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6748 #endif // V8_INTERPRETED_REGEXP
6749
6750 macro_assembler.set_slow_safe(TooMuchRegExpCode(isolate, pattern));
6751
6752 // Inserted here, instead of in Assembler, because it depends on information
6753 // in the AST that isn't replicated in the Node structure.
6754 static const int kMaxBacksearchLimit = 1024;
6755 if (is_end_anchored && !is_start_anchored && !is_sticky &&
6756 max_length < kMaxBacksearchLimit) {
6757 macro_assembler.SetCurrentPositionFromEnd(max_length);
6758 }
6759
6760 if (is_global) {
6761 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6762 if (data->tree->min_match() > 0) {
6763 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6764 } else if (is_unicode) {
6765 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6766 }
6767 macro_assembler.set_global_mode(mode);
6768 }
6769
6770 return compiler.Assemble(isolate, ¯o_assembler, node, data->capture_count,
6771 pattern);
6772 }
6773
TooMuchRegExpCode(Isolate * isolate,Handle<String> pattern)6774 bool RegExpEngine::TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
6775 Heap* heap = isolate->heap();
6776 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6777 if (heap->isolate()->total_regexp_code_generated() >
6778 RegExpImpl::kRegExpCompiledLimit &&
6779 heap->CommittedMemoryExecutable() >
6780 RegExpImpl::kRegExpExecutableMemoryLimit) {
6781 too_much = true;
6782 }
6783 return too_much;
6784 }
6785
6786
Lookup(Heap * heap,String * key_string,Object * key_pattern,FixedArray ** last_match_cache,ResultsCacheType type)6787 Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6788 Object* key_pattern,
6789 FixedArray** last_match_cache,
6790 ResultsCacheType type) {
6791 FixedArray* cache;
6792 if (!key_string->IsInternalizedString()) return Smi::kZero;
6793 if (type == STRING_SPLIT_SUBSTRINGS) {
6794 DCHECK(key_pattern->IsString());
6795 if (!key_pattern->IsInternalizedString()) return Smi::kZero;
6796 cache = heap->string_split_cache();
6797 } else {
6798 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6799 DCHECK(key_pattern->IsFixedArray());
6800 cache = heap->regexp_multiple_cache();
6801 }
6802
6803 uint32_t hash = key_string->Hash();
6804 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6805 ~(kArrayEntriesPerCacheEntry - 1));
6806 if (cache->get(index + kStringOffset) != key_string ||
6807 cache->get(index + kPatternOffset) != key_pattern) {
6808 index =
6809 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6810 if (cache->get(index + kStringOffset) != key_string ||
6811 cache->get(index + kPatternOffset) != key_pattern) {
6812 return Smi::kZero;
6813 }
6814 }
6815
6816 *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6817 return cache->get(index + kArrayOffset);
6818 }
6819
6820
Enter(Isolate * isolate,Handle<String> key_string,Handle<Object> key_pattern,Handle<FixedArray> value_array,Handle<FixedArray> last_match_cache,ResultsCacheType type)6821 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6822 Handle<Object> key_pattern,
6823 Handle<FixedArray> value_array,
6824 Handle<FixedArray> last_match_cache,
6825 ResultsCacheType type) {
6826 Factory* factory = isolate->factory();
6827 Handle<FixedArray> cache;
6828 if (!key_string->IsInternalizedString()) return;
6829 if (type == STRING_SPLIT_SUBSTRINGS) {
6830 DCHECK(key_pattern->IsString());
6831 if (!key_pattern->IsInternalizedString()) return;
6832 cache = factory->string_split_cache();
6833 } else {
6834 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6835 DCHECK(key_pattern->IsFixedArray());
6836 cache = factory->regexp_multiple_cache();
6837 }
6838
6839 uint32_t hash = key_string->Hash();
6840 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6841 ~(kArrayEntriesPerCacheEntry - 1));
6842 if (cache->get(index + kStringOffset) == Smi::kZero) {
6843 cache->set(index + kStringOffset, *key_string);
6844 cache->set(index + kPatternOffset, *key_pattern);
6845 cache->set(index + kArrayOffset, *value_array);
6846 cache->set(index + kLastMatchOffset, *last_match_cache);
6847 } else {
6848 uint32_t index2 =
6849 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6850 if (cache->get(index2 + kStringOffset) == Smi::kZero) {
6851 cache->set(index2 + kStringOffset, *key_string);
6852 cache->set(index2 + kPatternOffset, *key_pattern);
6853 cache->set(index2 + kArrayOffset, *value_array);
6854 cache->set(index2 + kLastMatchOffset, *last_match_cache);
6855 } else {
6856 cache->set(index2 + kStringOffset, Smi::kZero);
6857 cache->set(index2 + kPatternOffset, Smi::kZero);
6858 cache->set(index2 + kArrayOffset, Smi::kZero);
6859 cache->set(index2 + kLastMatchOffset, Smi::kZero);
6860 cache->set(index + kStringOffset, *key_string);
6861 cache->set(index + kPatternOffset, *key_pattern);
6862 cache->set(index + kArrayOffset, *value_array);
6863 cache->set(index + kLastMatchOffset, *last_match_cache);
6864 }
6865 }
6866 // If the array is a reasonably short list of substrings, convert it into a
6867 // list of internalized strings.
6868 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6869 for (int i = 0; i < value_array->length(); i++) {
6870 Handle<String> str(String::cast(value_array->get(i)), isolate);
6871 Handle<String> internalized_str = factory->InternalizeString(str);
6872 value_array->set(i, *internalized_str);
6873 }
6874 }
6875 // Convert backing store to a copy-on-write array.
6876 value_array->set_map_no_write_barrier(
6877 ReadOnlyRoots(isolate).fixed_cow_array_map());
6878 }
6879
6880
Clear(FixedArray * cache)6881 void RegExpResultsCache::Clear(FixedArray* cache) {
6882 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6883 cache->set(i, Smi::kZero);
6884 }
6885 }
6886
6887 } // namespace internal
6888 } // namespace v8
6889