• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/jsregexp.h"
6 
7 #include "src/ast/ast.h"
8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h"
10 #include "src/compiler.h"
11 #include "src/execution.h"
12 #include "src/factory.h"
13 #include "src/isolate-inl.h"
14 #include "src/messages.h"
15 #include "src/ostreams.h"
16 #include "src/regexp/interpreter-irregexp.h"
17 #include "src/regexp/jsregexp-inl.h"
18 #include "src/regexp/regexp-macro-assembler.h"
19 #include "src/regexp/regexp-macro-assembler-irregexp.h"
20 #include "src/regexp/regexp-macro-assembler-tracer.h"
21 #include "src/regexp/regexp-parser.h"
22 #include "src/regexp/regexp-stack.h"
23 #include "src/runtime/runtime.h"
24 #include "src/splay-tree-inl.h"
25 #include "src/string-search.h"
26 #include "src/unicode-decoder.h"
27 
28 #ifdef V8_I18N_SUPPORT
29 #include "unicode/uset.h"
30 #include "unicode/utypes.h"
31 #endif  // V8_I18N_SUPPORT
32 
33 #ifndef V8_INTERPRETED_REGEXP
34 #if V8_TARGET_ARCH_IA32
35 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
36 #elif V8_TARGET_ARCH_X64
37 #include "src/regexp/x64/regexp-macro-assembler-x64.h"
38 #elif V8_TARGET_ARCH_ARM64
39 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
40 #elif V8_TARGET_ARCH_ARM
41 #include "src/regexp/arm/regexp-macro-assembler-arm.h"
42 #elif V8_TARGET_ARCH_PPC
43 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
44 #elif V8_TARGET_ARCH_S390
45 #include "src/regexp/s390/regexp-macro-assembler-s390.h"
46 #elif V8_TARGET_ARCH_MIPS
47 #include "src/regexp/mips/regexp-macro-assembler-mips.h"
48 #elif V8_TARGET_ARCH_MIPS64
49 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
50 #elif V8_TARGET_ARCH_X87
51 #include "src/regexp/x87/regexp-macro-assembler-x87.h"
52 #else
53 #error Unsupported target architecture.
54 #endif
55 #endif
56 
57 
58 namespace v8 {
59 namespace internal {
60 
61 MUST_USE_RESULT
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> error_text)62 static inline MaybeHandle<Object> ThrowRegExpException(
63     Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
64   Isolate* isolate = re->GetIsolate();
65   THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
66                                           pattern, error_text),
67                   Object);
68 }
69 
70 
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> error_text)71 inline void ThrowRegExpException(Handle<JSRegExp> re,
72                                  Handle<String> error_text) {
73   USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
74 }
75 
76 
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)77 ContainedInLattice AddRange(ContainedInLattice containment,
78                             const int* ranges,
79                             int ranges_length,
80                             Interval new_range) {
81   DCHECK((ranges_length & 1) == 1);
82   DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
83   if (containment == kLatticeUnknown) return containment;
84   bool inside = false;
85   int last = 0;
86   for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
87     // Consider the range from last to ranges[i].
88     // We haven't got to the new range yet.
89     if (ranges[i] <= new_range.from()) continue;
90     // New range is wholly inside last-ranges[i].  Note that new_range.to() is
91     // inclusive, but the values in ranges are not.
92     if (last <= new_range.from() && new_range.to() < ranges[i]) {
93       return Combine(containment, inside ? kLatticeIn : kLatticeOut);
94     }
95     return kLatticeUnknown;
96   }
97   return containment;
98 }
99 
100 
101 // More makes code generation slower, less makes V8 benchmark score lower.
102 const int kMaxLookaheadForBoyerMoore = 8;
103 // In a 3-character pattern you can maximally step forwards 3 characters
104 // at a time, which is not always enough to pay for the extra logic.
105 const int kPatternTooShortForBoyerMoore = 2;
106 
107 
108 // Identifies the sort of regexps where the regexp engine is faster
109 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)110 static bool HasFewDifferentCharacters(Handle<String> pattern) {
111   int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
112   if (length <= kPatternTooShortForBoyerMoore) return false;
113   const int kMod = 128;
114   bool character_found[kMod];
115   int different = 0;
116   memset(&character_found[0], 0, sizeof(character_found));
117   for (int i = 0; i < length; i++) {
118     int ch = (pattern->Get(i) & (kMod - 1));
119     if (!character_found[ch]) {
120       character_found[ch] = true;
121       different++;
122       // We declare a regexp low-alphabet if it has at least 3 times as many
123       // characters as it has different characters.
124       if (different * 3 > length) return false;
125     }
126   }
127   return true;
128 }
129 
130 
131 // Generic RegExp methods. Dispatches to implementation specific methods.
132 
133 
Compile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags)134 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
135                                         Handle<String> pattern,
136                                         JSRegExp::Flags flags) {
137   Isolate* isolate = re->GetIsolate();
138   Zone zone(isolate->allocator());
139   CompilationCache* compilation_cache = isolate->compilation_cache();
140   MaybeHandle<FixedArray> maybe_cached =
141       compilation_cache->LookupRegExp(pattern, flags);
142   Handle<FixedArray> cached;
143   bool in_cache = maybe_cached.ToHandle(&cached);
144   LOG(isolate, RegExpCompileEvent(re, in_cache));
145 
146   Handle<Object> result;
147   if (in_cache) {
148     re->set_data(*cached);
149     return re;
150   }
151   pattern = String::Flatten(pattern);
152   PostponeInterruptsScope postpone(isolate);
153   RegExpCompileData parse_result;
154   FlatStringReader reader(isolate, pattern);
155   if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
156                                  &parse_result)) {
157     // Throw an exception if we fail to parse the pattern.
158     return ThrowRegExpException(re, pattern, parse_result.error);
159   }
160 
161   bool has_been_compiled = false;
162 
163   if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
164       !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
165     // Parse-tree is a single atom that is equal to the pattern.
166     AtomCompile(re, pattern, flags, pattern);
167     has_been_compiled = true;
168   } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
169              !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
170     RegExpAtom* atom = parse_result.tree->AsAtom();
171     Vector<const uc16> atom_pattern = atom->data();
172     Handle<String> atom_string;
173     ASSIGN_RETURN_ON_EXCEPTION(
174         isolate, atom_string,
175         isolate->factory()->NewStringFromTwoByte(atom_pattern),
176         Object);
177     if (!HasFewDifferentCharacters(atom_string)) {
178       AtomCompile(re, pattern, flags, atom_string);
179       has_been_compiled = true;
180     }
181   }
182   if (!has_been_compiled) {
183     IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
184   }
185   DCHECK(re->data()->IsFixedArray());
186   // Compilation succeeded so the data is set on the regexp
187   // and we can store it in the cache.
188   Handle<FixedArray> data(FixedArray::cast(re->data()));
189   compilation_cache->PutRegExp(pattern, flags, data);
190 
191   return re;
192 }
193 
194 
Exec(Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<JSArray> last_match_info)195 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
196                                      Handle<String> subject,
197                                      int index,
198                                      Handle<JSArray> last_match_info) {
199   switch (regexp->TypeTag()) {
200     case JSRegExp::ATOM:
201       return AtomExec(regexp, subject, index, last_match_info);
202     case JSRegExp::IRREGEXP: {
203       return IrregexpExec(regexp, subject, index, last_match_info);
204     }
205     default:
206       UNREACHABLE();
207       return MaybeHandle<Object>();
208   }
209 }
210 
211 
212 // RegExp Atom implementation: Simple string search using indexOf.
213 
214 
AtomCompile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)215 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
216                              Handle<String> pattern,
217                              JSRegExp::Flags flags,
218                              Handle<String> match_pattern) {
219   re->GetIsolate()->factory()->SetRegExpAtomData(re,
220                                                  JSRegExp::ATOM,
221                                                  pattern,
222                                                  flags,
223                                                  match_pattern);
224 }
225 
226 
SetAtomLastCapture(FixedArray * array,String * subject,int from,int to)227 static void SetAtomLastCapture(FixedArray* array,
228                                String* subject,
229                                int from,
230                                int to) {
231   SealHandleScope shs(array->GetIsolate());
232   RegExpImpl::SetLastCaptureCount(array, 2);
233   RegExpImpl::SetLastSubject(array, subject);
234   RegExpImpl::SetLastInput(array, subject);
235   RegExpImpl::SetCapture(array, 0, from);
236   RegExpImpl::SetCapture(array, 1, to);
237 }
238 
239 
AtomExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)240 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
241                             Handle<String> subject,
242                             int index,
243                             int32_t* output,
244                             int output_size) {
245   Isolate* isolate = regexp->GetIsolate();
246 
247   DCHECK(0 <= index);
248   DCHECK(index <= subject->length());
249 
250   subject = String::Flatten(subject);
251   DisallowHeapAllocation no_gc;  // ensure vectors stay valid
252 
253   String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
254   int needle_len = needle->length();
255   DCHECK(needle->IsFlat());
256   DCHECK_LT(0, needle_len);
257 
258   if (index + needle_len > subject->length()) {
259     return RegExpImpl::RE_FAILURE;
260   }
261 
262   for (int i = 0; i < output_size; i += 2) {
263     String::FlatContent needle_content = needle->GetFlatContent();
264     String::FlatContent subject_content = subject->GetFlatContent();
265     DCHECK(needle_content.IsFlat());
266     DCHECK(subject_content.IsFlat());
267     // dispatch on type of strings
268     index =
269         (needle_content.IsOneByte()
270              ? (subject_content.IsOneByte()
271                     ? SearchString(isolate, subject_content.ToOneByteVector(),
272                                    needle_content.ToOneByteVector(), index)
273                     : SearchString(isolate, subject_content.ToUC16Vector(),
274                                    needle_content.ToOneByteVector(), index))
275              : (subject_content.IsOneByte()
276                     ? SearchString(isolate, subject_content.ToOneByteVector(),
277                                    needle_content.ToUC16Vector(), index)
278                     : SearchString(isolate, subject_content.ToUC16Vector(),
279                                    needle_content.ToUC16Vector(), index)));
280     if (index == -1) {
281       return i / 2;  // Return number of matches.
282     } else {
283       output[i] = index;
284       output[i+1] = index + needle_len;
285       index += needle_len;
286     }
287   }
288   return output_size / 2;
289 }
290 
291 
AtomExec(Handle<JSRegExp> re,Handle<String> subject,int index,Handle<JSArray> last_match_info)292 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
293                                     Handle<String> subject,
294                                     int index,
295                                     Handle<JSArray> last_match_info) {
296   Isolate* isolate = re->GetIsolate();
297 
298   static const int kNumRegisters = 2;
299   STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
300   int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
301 
302   int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
303 
304   if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
305 
306   DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
307   SealHandleScope shs(isolate);
308   FixedArray* array = FixedArray::cast(last_match_info->elements());
309   SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
310   return last_match_info;
311 }
312 
313 
314 // Irregexp implementation.
315 
316 // Ensures that the regexp object contains a compiled version of the
317 // source for either one-byte or two-byte subject strings.
318 // If the compiled version doesn't already exist, it is compiled
319 // from the source pattern.
320 // If compilation fails, an exception is thrown and this function
321 // returns false.
EnsureCompiledIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)322 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
323                                         Handle<String> sample_subject,
324                                         bool is_one_byte) {
325   Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
326 #ifdef V8_INTERPRETED_REGEXP
327   if (compiled_code->IsByteArray()) return true;
328 #else  // V8_INTERPRETED_REGEXP (RegExp native code)
329   if (compiled_code->IsCode()) return true;
330 #endif
331   // We could potentially have marked this as flushable, but have kept
332   // a saved version if we did not flush it yet.
333   Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
334   if (saved_code->IsCode()) {
335     // Reinstate the code in the original place.
336     re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
337     DCHECK(compiled_code->IsSmi());
338     return true;
339   }
340   return CompileIrregexp(re, sample_subject, is_one_byte);
341 }
342 
343 
CompileIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)344 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
345                                  Handle<String> sample_subject,
346                                  bool is_one_byte) {
347   // Compile the RegExp.
348   Isolate* isolate = re->GetIsolate();
349   Zone zone(isolate->allocator());
350   PostponeInterruptsScope postpone(isolate);
351   // If we had a compilation error the last time this is saved at the
352   // saved code index.
353   Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
354   // When arriving here entry can only be a smi, either representing an
355   // uncompiled regexp, a previous compilation error, or code that has
356   // been flushed.
357   DCHECK(entry->IsSmi());
358   int entry_value = Smi::cast(entry)->value();
359   DCHECK(entry_value == JSRegExp::kUninitializedValue ||
360          entry_value == JSRegExp::kCompilationErrorValue ||
361          (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
362 
363   if (entry_value == JSRegExp::kCompilationErrorValue) {
364     // A previous compilation failed and threw an error which we store in
365     // the saved code index (we store the error message, not the actual
366     // error). Recreate the error object and throw it.
367     Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
368     DCHECK(error_string->IsString());
369     Handle<String> error_message(String::cast(error_string));
370     ThrowRegExpException(re, error_message);
371     return false;
372   }
373 
374   JSRegExp::Flags flags = re->GetFlags();
375 
376   Handle<String> pattern(re->Pattern());
377   pattern = String::Flatten(pattern);
378   RegExpCompileData compile_data;
379   FlatStringReader reader(isolate, pattern);
380   if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
381                                  &compile_data)) {
382     // Throw an exception if we fail to parse the pattern.
383     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
384     USE(ThrowRegExpException(re, pattern, compile_data.error));
385     return false;
386   }
387   RegExpEngine::CompilationResult result =
388       RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
389                             sample_subject, is_one_byte);
390   if (result.error_message != NULL) {
391     // Unable to compile regexp.
392     Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
393         CStrVector(result.error_message)).ToHandleChecked();
394     ThrowRegExpException(re, error_message);
395     return false;
396   }
397 
398   Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
399   data->set(JSRegExp::code_index(is_one_byte), result.code);
400   SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
401   int register_max = IrregexpMaxRegisterCount(*data);
402   if (result.num_registers > register_max) {
403     SetIrregexpMaxRegisterCount(*data, result.num_registers);
404   }
405 
406   return true;
407 }
408 
409 
IrregexpMaxRegisterCount(FixedArray * re)410 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
411   return Smi::cast(
412       re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
413 }
414 
415 
SetIrregexpMaxRegisterCount(FixedArray * re,int value)416 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
417   re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
418 }
419 
SetIrregexpCaptureNameMap(FixedArray * re,Handle<FixedArray> value)420 void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
421                                            Handle<FixedArray> value) {
422   if (value.is_null()) {
423     re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::FromInt(0));
424   } else {
425     re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
426   }
427 }
428 
IrregexpNumberOfCaptures(FixedArray * re)429 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
430   return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
431 }
432 
433 
IrregexpNumberOfRegisters(FixedArray * re)434 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
435   return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
436 }
437 
438 
IrregexpByteCode(FixedArray * re,bool is_one_byte)439 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
440   return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
441 }
442 
443 
IrregexpNativeCode(FixedArray * re,bool is_one_byte)444 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
445   return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
446 }
447 
448 
IrregexpInitialize(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count)449 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
450                                     Handle<String> pattern,
451                                     JSRegExp::Flags flags,
452                                     int capture_count) {
453   // Initialize compiled code entries to null.
454   re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
455                                                      JSRegExp::IRREGEXP,
456                                                      pattern,
457                                                      flags,
458                                                      capture_count);
459 }
460 
461 
IrregexpPrepare(Handle<JSRegExp> regexp,Handle<String> subject)462 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
463                                 Handle<String> subject) {
464   subject = String::Flatten(subject);
465 
466   // Check representation of the underlying storage.
467   bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
468   if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
469 
470 #ifdef V8_INTERPRETED_REGEXP
471   // Byte-code regexp needs space allocated for all its registers.
472   // The result captures are copied to the start of the registers array
473   // if the match succeeds.  This way those registers are not clobbered
474   // when we set the last match info from last successful match.
475   return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
476          (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
477 #else  // V8_INTERPRETED_REGEXP
478   // Native regexp only needs room to output captures. Registers are handled
479   // internally.
480   return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
481 #endif  // V8_INTERPRETED_REGEXP
482 }
483 
484 
IrregexpExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)485 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
486                                 Handle<String> subject,
487                                 int index,
488                                 int32_t* output,
489                                 int output_size) {
490   Isolate* isolate = regexp->GetIsolate();
491 
492   Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
493 
494   DCHECK(index >= 0);
495   DCHECK(index <= subject->length());
496   DCHECK(subject->IsFlat());
497 
498   bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
499 
500 #ifndef V8_INTERPRETED_REGEXP
501   DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
502   do {
503     EnsureCompiledIrregexp(regexp, subject, is_one_byte);
504     Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
505     // The stack is used to allocate registers for the compiled regexp code.
506     // This means that in case of failure, the output registers array is left
507     // untouched and contains the capture results from the previous successful
508     // match.  We can use that to set the last match info lazily.
509     NativeRegExpMacroAssembler::Result res =
510         NativeRegExpMacroAssembler::Match(code,
511                                           subject,
512                                           output,
513                                           output_size,
514                                           index,
515                                           isolate);
516     if (res != NativeRegExpMacroAssembler::RETRY) {
517       DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
518              isolate->has_pending_exception());
519       STATIC_ASSERT(
520           static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
521       STATIC_ASSERT(
522           static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
523       STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
524                     == RE_EXCEPTION);
525       return static_cast<IrregexpResult>(res);
526     }
527     // If result is RETRY, the string has changed representation, and we
528     // must restart from scratch.
529     // In this case, it means we must make sure we are prepared to handle
530     // the, potentially, different subject (the string can switch between
531     // being internal and external, and even between being Latin1 and UC16,
532     // but the characters are always the same).
533     IrregexpPrepare(regexp, subject);
534     is_one_byte = subject->IsOneByteRepresentationUnderneath();
535   } while (true);
536   UNREACHABLE();
537   return RE_EXCEPTION;
538 #else  // V8_INTERPRETED_REGEXP
539 
540   DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
541   // We must have done EnsureCompiledIrregexp, so we can get the number of
542   // registers.
543   int number_of_capture_registers =
544       (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
545   int32_t* raw_output = &output[number_of_capture_registers];
546   // We do not touch the actual capture result registers until we know there
547   // has been a match so that we can use those capture results to set the
548   // last match info.
549   for (int i = number_of_capture_registers - 1; i >= 0; i--) {
550     raw_output[i] = -1;
551   }
552   Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
553                                isolate);
554 
555   IrregexpResult result = IrregexpInterpreter::Match(isolate,
556                                                      byte_codes,
557                                                      subject,
558                                                      raw_output,
559                                                      index);
560   if (result == RE_SUCCESS) {
561     // Copy capture results to the start of the registers array.
562     MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
563   }
564   if (result == RE_EXCEPTION) {
565     DCHECK(!isolate->has_pending_exception());
566     isolate->StackOverflow();
567   }
568   return result;
569 #endif  // V8_INTERPRETED_REGEXP
570 }
571 
572 
IrregexpExec(Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<JSArray> last_match_info)573 MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
574                                              Handle<String> subject,
575                                              int previous_index,
576                                              Handle<JSArray> last_match_info) {
577   Isolate* isolate = regexp->GetIsolate();
578   DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
579 
580   // Prepare space for the return values.
581 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
582   if (FLAG_trace_regexp_bytecodes) {
583     String* pattern = regexp->Pattern();
584     PrintF("\n\nRegexp match:   /%s/\n\n", pattern->ToCString().get());
585     PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
586   }
587 #endif
588   int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
589   if (required_registers < 0) {
590     // Compiling failed with an exception.
591     DCHECK(isolate->has_pending_exception());
592     return MaybeHandle<Object>();
593   }
594 
595   int32_t* output_registers = NULL;
596   if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
597     output_registers = NewArray<int32_t>(required_registers);
598   }
599   base::SmartArrayPointer<int32_t> auto_release(output_registers);
600   if (output_registers == NULL) {
601     output_registers = isolate->jsregexp_static_offsets_vector();
602   }
603 
604   int res = RegExpImpl::IrregexpExecRaw(
605       regexp, subject, previous_index, output_registers, required_registers);
606   if (res == RE_SUCCESS) {
607     int capture_count =
608         IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
609     return SetLastMatchInfo(
610         last_match_info, subject, capture_count, output_registers);
611   }
612   if (res == RE_EXCEPTION) {
613     DCHECK(isolate->has_pending_exception());
614     return MaybeHandle<Object>();
615   }
616   DCHECK(res == RE_FAILURE);
617   return isolate->factory()->null_value();
618 }
619 
620 
EnsureSize(Handle<JSArray> array,uint32_t minimum_size)621 static void EnsureSize(Handle<JSArray> array, uint32_t minimum_size) {
622   if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) {
623     JSArray::SetLength(array, minimum_size);
624   }
625 }
626 
627 
SetLastMatchInfo(Handle<JSArray> last_match_info,Handle<String> subject,int capture_count,int32_t * match)628 Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
629                                              Handle<String> subject,
630                                              int capture_count,
631                                              int32_t* match) {
632   DCHECK(last_match_info->HasFastObjectElements());
633   int capture_register_count = (capture_count + 1) * 2;
634   EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);
635   DisallowHeapAllocation no_allocation;
636   FixedArray* array = FixedArray::cast(last_match_info->elements());
637   if (match != NULL) {
638     for (int i = 0; i < capture_register_count; i += 2) {
639       SetCapture(array, i, match[i]);
640       SetCapture(array, i + 1, match[i + 1]);
641     }
642   }
643   SetLastCaptureCount(array, capture_register_count);
644   SetLastSubject(array, *subject);
645   SetLastInput(array, *subject);
646   return last_match_info;
647 }
648 
649 
GlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,Isolate * isolate)650 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
651                                      Handle<String> subject,
652                                      Isolate* isolate)
653   : register_array_(NULL),
654     register_array_size_(0),
655     regexp_(regexp),
656     subject_(subject) {
657 #ifdef V8_INTERPRETED_REGEXP
658   bool interpreted = true;
659 #else
660   bool interpreted = false;
661 #endif  // V8_INTERPRETED_REGEXP
662 
663   if (regexp_->TypeTag() == JSRegExp::ATOM) {
664     static const int kAtomRegistersPerMatch = 2;
665     registers_per_match_ = kAtomRegistersPerMatch;
666     // There is no distinction between interpreted and native for atom regexps.
667     interpreted = false;
668   } else {
669     registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
670     if (registers_per_match_ < 0) {
671       num_matches_ = -1;  // Signal exception.
672       return;
673     }
674   }
675 
676   DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
677   if (!interpreted) {
678     register_array_size_ =
679         Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
680     max_matches_ = register_array_size_ / registers_per_match_;
681   } else {
682     // Global loop in interpreted regexp is not implemented.  We choose
683     // the size of the offsets vector so that it can only store one match.
684     register_array_size_ = registers_per_match_;
685     max_matches_ = 1;
686   }
687 
688   if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
689     register_array_ = NewArray<int32_t>(register_array_size_);
690   } else {
691     register_array_ = isolate->jsregexp_static_offsets_vector();
692   }
693 
694   // Set state so that fetching the results the first time triggers a call
695   // to the compiled regexp.
696   current_match_index_ = max_matches_ - 1;
697   num_matches_ = max_matches_;
698   DCHECK(registers_per_match_ >= 2);  // Each match has at least one capture.
699   DCHECK_GE(register_array_size_, registers_per_match_);
700   int32_t* last_match =
701       &register_array_[current_match_index_ * registers_per_match_];
702   last_match[0] = -1;
703   last_match[1] = 0;
704 }
705 
AdvanceZeroLength(int last_index)706 int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
707   if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
708       last_index + 1 < subject_->length() &&
709       unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
710       unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
711     // Advance over the surrogate pair.
712     return last_index + 2;
713   }
714   return last_index + 1;
715 }
716 
717 // -------------------------------------------------------------------
718 // Implementation of the Irregexp regular expression engine.
719 //
720 // The Irregexp regular expression engine is intended to be a complete
721 // implementation of ECMAScript regular expressions.  It generates either
722 // bytecodes or native code.
723 
724 //   The Irregexp regexp engine is structured in three steps.
725 //   1) The parser generates an abstract syntax tree.  See ast.cc.
726 //   2) From the AST a node network is created.  The nodes are all
727 //      subclasses of RegExpNode.  The nodes represent states when
728 //      executing a regular expression.  Several optimizations are
729 //      performed on the node network.
730 //   3) From the nodes we generate either byte codes or native code
731 //      that can actually execute the regular expression (perform
732 //      the search).  The code generation step is described in more
733 //      detail below.
734 
735 // Code generation.
736 //
737 //   The nodes are divided into four main categories.
738 //   * Choice nodes
739 //        These represent places where the regular expression can
740 //        match in more than one way.  For example on entry to an
741 //        alternation (foo|bar) or a repetition (*, +, ? or {}).
742 //   * Action nodes
743 //        These represent places where some action should be
744 //        performed.  Examples include recording the current position
745 //        in the input string to a register (in order to implement
746 //        captures) or other actions on register for example in order
747 //        to implement the counters needed for {} repetitions.
748 //   * Matching nodes
749 //        These attempt to match some element part of the input string.
750 //        Examples of elements include character classes, plain strings
751 //        or back references.
752 //   * End nodes
753 //        These are used to implement the actions required on finding
754 //        a successful match or failing to find a match.
755 //
756 //   The code generated (whether as byte codes or native code) maintains
757 //   some state as it runs.  This consists of the following elements:
758 //
759 //   * The capture registers.  Used for string captures.
760 //   * Other registers.  Used for counters etc.
761 //   * The current position.
762 //   * The stack of backtracking information.  Used when a matching node
763 //     fails to find a match and needs to try an alternative.
764 //
765 // Conceptual regular expression execution model:
766 //
767 //   There is a simple conceptual model of regular expression execution
768 //   which will be presented first.  The actual code generated is a more
769 //   efficient simulation of the simple conceptual model:
770 //
771 //   * Choice nodes are implemented as follows:
772 //     For each choice except the last {
773 //       push current position
774 //       push backtrack code location
775 //       <generate code to test for choice>
776 //       backtrack code location:
777 //       pop current position
778 //     }
779 //     <generate code to test for last choice>
780 //
781 //   * Actions nodes are generated as follows
782 //     <push affected registers on backtrack stack>
783 //     <generate code to perform action>
784 //     push backtrack code location
785 //     <generate code to test for following nodes>
786 //     backtrack code location:
787 //     <pop affected registers to restore their state>
788 //     <pop backtrack location from stack and go to it>
789 //
790 //   * Matching nodes are generated as follows:
791 //     if input string matches at current position
792 //       update current position
793 //       <generate code to test for following nodes>
794 //     else
795 //       <pop backtrack location from stack and go to it>
796 //
797 //   Thus it can be seen that the current position is saved and restored
798 //   by the choice nodes, whereas the registers are saved and restored by
799 //   by the action nodes that manipulate them.
800 //
801 //   The other interesting aspect of this model is that nodes are generated
802 //   at the point where they are needed by a recursive call to Emit().  If
803 //   the node has already been code generated then the Emit() call will
804 //   generate a jump to the previously generated code instead.  In order to
805 //   limit recursion it is possible for the Emit() function to put the node
806 //   on a work list for later generation and instead generate a jump.  The
807 //   destination of the jump is resolved later when the code is generated.
808 //
809 // Actual regular expression code generation.
810 //
811 //   Code generation is actually more complicated than the above.  In order
812 //   to improve the efficiency of the generated code some optimizations are
813 //   performed
814 //
815 //   * Choice nodes have 1-character lookahead.
816 //     A choice node looks at the following character and eliminates some of
817 //     the choices immediately based on that character.  This is not yet
818 //     implemented.
819 //   * Simple greedy loops store reduced backtracking information.
820 //     A quantifier like /.*foo/m will greedily match the whole input.  It will
821 //     then need to backtrack to a point where it can match "foo".  The naive
822 //     implementation of this would push each character position onto the
823 //     backtracking stack, then pop them off one by one.  This would use space
824 //     proportional to the length of the input string.  However since the "."
825 //     can only match in one way and always has a constant length (in this case
826 //     of 1) it suffices to store the current position on the top of the stack
827 //     once.  Matching now becomes merely incrementing the current position and
828 //     backtracking becomes decrementing the current position and checking the
829 //     result against the stored current position.  This is faster and saves
830 //     space.
831 //   * The current state is virtualized.
832 //     This is used to defer expensive operations until it is clear that they
833 //     are needed and to generate code for a node more than once, allowing
834 //     specialized an efficient versions of the code to be created. This is
835 //     explained in the section below.
836 //
837 // Execution state virtualization.
838 //
839 //   Instead of emitting code, nodes that manipulate the state can record their
840 //   manipulation in an object called the Trace.  The Trace object can record a
841 //   current position offset, an optional backtrack code location on the top of
842 //   the virtualized backtrack stack and some register changes.  When a node is
843 //   to be emitted it can flush the Trace or update it.  Flushing the Trace
844 //   will emit code to bring the actual state into line with the virtual state.
845 //   Avoiding flushing the state can postpone some work (e.g. updates of capture
846 //   registers).  Postponing work can save time when executing the regular
847 //   expression since it may be found that the work never has to be done as a
848 //   failure to match can occur.  In addition it is much faster to jump to a
849 //   known backtrack code location than it is to pop an unknown backtrack
850 //   location from the stack and jump there.
851 //
852 //   The virtual state found in the Trace affects code generation.  For example
853 //   the virtual state contains the difference between the actual current
854 //   position and the virtual current position, and matching code needs to use
855 //   this offset to attempt a match in the correct location of the input
856 //   string.  Therefore code generated for a non-trivial trace is specialized
857 //   to that trace.  The code generator therefore has the ability to generate
858 //   code for each node several times.  In order to limit the size of the
859 //   generated code there is an arbitrary limit on how many specialized sets of
860 //   code may be generated for a given node.  If the limit is reached, the
861 //   trace is flushed and a generic version of the code for a node is emitted.
862 //   This is subsequently used for that node.  The code emitted for non-generic
863 //   trace is not recorded in the node and so it cannot currently be reused in
864 //   the event that code generation is requested for an identical trace.
865 
866 
AppendToText(RegExpText * text,Zone * zone)867 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
868   UNREACHABLE();
869 }
870 
871 
AppendToText(RegExpText * text,Zone * zone)872 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
873   text->AddElement(TextElement::Atom(this), zone);
874 }
875 
876 
AppendToText(RegExpText * text,Zone * zone)877 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
878   text->AddElement(TextElement::CharClass(this), zone);
879 }
880 
881 
AppendToText(RegExpText * text,Zone * zone)882 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
883   for (int i = 0; i < elements()->length(); i++)
884     text->AddElement(elements()->at(i), zone);
885 }
886 
887 
Atom(RegExpAtom * atom)888 TextElement TextElement::Atom(RegExpAtom* atom) {
889   return TextElement(ATOM, atom);
890 }
891 
892 
CharClass(RegExpCharacterClass * char_class)893 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
894   return TextElement(CHAR_CLASS, char_class);
895 }
896 
897 
length() const898 int TextElement::length() const {
899   switch (text_type()) {
900     case ATOM:
901       return atom()->length();
902 
903     case CHAR_CLASS:
904       return 1;
905   }
906   UNREACHABLE();
907   return 0;
908 }
909 
910 
GetTable(bool ignore_case)911 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
912   if (table_ == NULL) {
913     table_ = new(zone()) DispatchTable(zone());
914     DispatchTableConstructor cons(table_, ignore_case, zone());
915     cons.BuildTable(this);
916   }
917   return table_;
918 }
919 
920 
921 class FrequencyCollator {
922  public:
FrequencyCollator()923   FrequencyCollator() : total_samples_(0) {
924     for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
925       frequencies_[i] = CharacterFrequency(i);
926     }
927   }
928 
CountCharacter(int character)929   void CountCharacter(int character) {
930     int index = (character & RegExpMacroAssembler::kTableMask);
931     frequencies_[index].Increment();
932     total_samples_++;
933   }
934 
935   // Does not measure in percent, but rather per-128 (the table size from the
936   // regexp macro assembler).
Frequency(int in_character)937   int Frequency(int in_character) {
938     DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
939     if (total_samples_ < 1) return 1;  // Division by zero.
940     int freq_in_per128 =
941         (frequencies_[in_character].counter() * 128) / total_samples_;
942     return freq_in_per128;
943   }
944 
945  private:
946   class CharacterFrequency {
947    public:
CharacterFrequency()948     CharacterFrequency() : counter_(0), character_(-1) { }
CharacterFrequency(int character)949     explicit CharacterFrequency(int character)
950         : counter_(0), character_(character) { }
951 
Increment()952     void Increment() { counter_++; }
counter()953     int counter() { return counter_; }
character()954     int character() { return character_; }
955 
956    private:
957     int counter_;
958     int character_;
959   };
960 
961 
962  private:
963   CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
964   int total_samples_;
965 };
966 
967 
968 class RegExpCompiler {
969  public:
970   RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
971                  JSRegExp::Flags flags, bool is_one_byte);
972 
AllocateRegister()973   int AllocateRegister() {
974     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
975       reg_exp_too_big_ = true;
976       return next_register_;
977     }
978     return next_register_++;
979   }
980 
981   // Lookarounds to match lone surrogates for unicode character class matches
982   // are never nested. We can therefore reuse registers.
UnicodeLookaroundStackRegister()983   int UnicodeLookaroundStackRegister() {
984     if (unicode_lookaround_stack_register_ == kNoRegister) {
985       unicode_lookaround_stack_register_ = AllocateRegister();
986     }
987     return unicode_lookaround_stack_register_;
988   }
989 
UnicodeLookaroundPositionRegister()990   int UnicodeLookaroundPositionRegister() {
991     if (unicode_lookaround_position_register_ == kNoRegister) {
992       unicode_lookaround_position_register_ = AllocateRegister();
993     }
994     return unicode_lookaround_position_register_;
995   }
996 
997   RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
998                                            RegExpNode* start,
999                                            int capture_count,
1000                                            Handle<String> pattern);
1001 
AddWork(RegExpNode * node)1002   inline void AddWork(RegExpNode* node) {
1003     if (!node->on_work_list() && !node->label()->is_bound()) {
1004       node->set_on_work_list(true);
1005       work_list_->Add(node);
1006     }
1007   }
1008 
1009   static const int kImplementationOffset = 0;
1010   static const int kNumberOfRegistersOffset = 0;
1011   static const int kCodeOffset = 1;
1012 
macro_assembler()1013   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()1014   EndNode* accept() { return accept_; }
1015 
1016   static const int kMaxRecursion = 100;
recursion_depth()1017   inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()1018   inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()1019   inline void DecrementRecursionDepth() { recursion_depth_--; }
1020 
SetRegExpTooBig()1021   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1022 
ignore_case()1023   inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
unicode()1024   inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
one_byte()1025   inline bool one_byte() { return one_byte_; }
optimize()1026   inline bool optimize() { return optimize_; }
set_optimize(bool value)1027   inline void set_optimize(bool value) { optimize_ = value; }
limiting_recursion()1028   inline bool limiting_recursion() { return limiting_recursion_; }
set_limiting_recursion(bool value)1029   inline void set_limiting_recursion(bool value) {
1030     limiting_recursion_ = value;
1031   }
read_backward()1032   bool read_backward() { return read_backward_; }
set_read_backward(bool value)1033   void set_read_backward(bool value) { read_backward_ = value; }
frequency_collator()1034   FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1035 
current_expansion_factor()1036   int current_expansion_factor() { return current_expansion_factor_; }
set_current_expansion_factor(int value)1037   void set_current_expansion_factor(int value) {
1038     current_expansion_factor_ = value;
1039   }
1040 
isolate() const1041   Isolate* isolate() const { return isolate_; }
zone() const1042   Zone* zone() const { return zone_; }
1043 
1044   static const int kNoRegister = -1;
1045 
1046  private:
1047   EndNode* accept_;
1048   int next_register_;
1049   int unicode_lookaround_stack_register_;
1050   int unicode_lookaround_position_register_;
1051   List<RegExpNode*>* work_list_;
1052   int recursion_depth_;
1053   RegExpMacroAssembler* macro_assembler_;
1054   JSRegExp::Flags flags_;
1055   bool one_byte_;
1056   bool reg_exp_too_big_;
1057   bool limiting_recursion_;
1058   bool optimize_;
1059   bool read_backward_;
1060   int current_expansion_factor_;
1061   FrequencyCollator frequency_collator_;
1062   Isolate* isolate_;
1063   Zone* zone_;
1064 };
1065 
1066 
1067 class RecursionCheck {
1068  public:
RecursionCheck(RegExpCompiler * compiler)1069   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1070     compiler->IncrementRecursionDepth();
1071   }
~RecursionCheck()1072   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1073  private:
1074   RegExpCompiler* compiler_;
1075 };
1076 
1077 
IrregexpRegExpTooBig(Isolate * isolate)1078 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1079   return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1080 }
1081 
1082 
1083 // Attempts to compile the regexp using an Irregexp code generator.  Returns
1084 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(Isolate * isolate,Zone * zone,int capture_count,JSRegExp::Flags flags,bool one_byte)1085 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1086                                JSRegExp::Flags flags, bool one_byte)
1087     : next_register_(2 * (capture_count + 1)),
1088       unicode_lookaround_stack_register_(kNoRegister),
1089       unicode_lookaround_position_register_(kNoRegister),
1090       work_list_(NULL),
1091       recursion_depth_(0),
1092       flags_(flags),
1093       one_byte_(one_byte),
1094       reg_exp_too_big_(false),
1095       limiting_recursion_(false),
1096       optimize_(FLAG_regexp_optimization),
1097       read_backward_(false),
1098       current_expansion_factor_(1),
1099       frequency_collator_(),
1100       isolate_(isolate),
1101       zone_(zone) {
1102   accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1103   DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1104 }
1105 
1106 
Assemble(RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)1107 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1108     RegExpMacroAssembler* macro_assembler,
1109     RegExpNode* start,
1110     int capture_count,
1111     Handle<String> pattern) {
1112   Heap* heap = pattern->GetHeap();
1113 
1114 #ifdef DEBUG
1115   if (FLAG_trace_regexp_assembler)
1116     macro_assembler_ =
1117         new RegExpMacroAssemblerTracer(isolate(), macro_assembler);
1118   else
1119 #endif
1120     macro_assembler_ = macro_assembler;
1121 
1122   List <RegExpNode*> work_list(0);
1123   work_list_ = &work_list;
1124   Label fail;
1125   macro_assembler_->PushBacktrack(&fail);
1126   Trace new_trace;
1127   start->Emit(this, &new_trace);
1128   macro_assembler_->Bind(&fail);
1129   macro_assembler_->Fail();
1130   while (!work_list.is_empty()) {
1131     RegExpNode* node = work_list.RemoveLast();
1132     node->set_on_work_list(false);
1133     if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1134   }
1135   if (reg_exp_too_big_) {
1136     macro_assembler_->AbortedCodeGeneration();
1137     return IrregexpRegExpTooBig(isolate_);
1138   }
1139 
1140   Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1141   heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1142   work_list_ = NULL;
1143 #ifdef ENABLE_DISASSEMBLER
1144   if (FLAG_print_code) {
1145     CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1146     OFStream os(trace_scope.file());
1147     Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1148   }
1149 #endif
1150 #ifdef DEBUG
1151   if (FLAG_trace_regexp_assembler) {
1152     delete macro_assembler_;
1153   }
1154 #endif
1155   return RegExpEngine::CompilationResult(*code, next_register_);
1156 }
1157 
1158 
Mentions(int that)1159 bool Trace::DeferredAction::Mentions(int that) {
1160   if (action_type() == ActionNode::CLEAR_CAPTURES) {
1161     Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1162     return range.Contains(that);
1163   } else {
1164     return reg() == that;
1165   }
1166 }
1167 
1168 
mentions_reg(int reg)1169 bool Trace::mentions_reg(int reg) {
1170   for (DeferredAction* action = actions_;
1171        action != NULL;
1172        action = action->next()) {
1173     if (action->Mentions(reg))
1174       return true;
1175   }
1176   return false;
1177 }
1178 
1179 
GetStoredPosition(int reg,int * cp_offset)1180 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1181   DCHECK_EQ(0, *cp_offset);
1182   for (DeferredAction* action = actions_;
1183        action != NULL;
1184        action = action->next()) {
1185     if (action->Mentions(reg)) {
1186       if (action->action_type() == ActionNode::STORE_POSITION) {
1187         *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1188         return true;
1189       } else {
1190         return false;
1191       }
1192     }
1193   }
1194   return false;
1195 }
1196 
1197 
FindAffectedRegisters(OutSet * affected_registers,Zone * zone)1198 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1199                                  Zone* zone) {
1200   int max_register = RegExpCompiler::kNoRegister;
1201   for (DeferredAction* action = actions_;
1202        action != NULL;
1203        action = action->next()) {
1204     if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1205       Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1206       for (int i = range.from(); i <= range.to(); i++)
1207         affected_registers->Set(i, zone);
1208       if (range.to() > max_register) max_register = range.to();
1209     } else {
1210       affected_registers->Set(action->reg(), zone);
1211       if (action->reg() > max_register) max_register = action->reg();
1212     }
1213   }
1214   return max_register;
1215 }
1216 
1217 
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const OutSet & registers_to_pop,const OutSet & registers_to_clear)1218 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1219                                      int max_register,
1220                                      const OutSet& registers_to_pop,
1221                                      const OutSet& registers_to_clear) {
1222   for (int reg = max_register; reg >= 0; reg--) {
1223     if (registers_to_pop.Get(reg)) {
1224       assembler->PopRegister(reg);
1225     } else if (registers_to_clear.Get(reg)) {
1226       int clear_to = reg;
1227       while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1228         reg--;
1229       }
1230       assembler->ClearRegisters(reg, clear_to);
1231     }
1232   }
1233 }
1234 
1235 
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const OutSet & affected_registers,OutSet * registers_to_pop,OutSet * registers_to_clear,Zone * zone)1236 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1237                                    int max_register,
1238                                    const OutSet& affected_registers,
1239                                    OutSet* registers_to_pop,
1240                                    OutSet* registers_to_clear,
1241                                    Zone* zone) {
1242   // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1243   const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1244 
1245   // Count pushes performed to force a stack limit check occasionally.
1246   int pushes = 0;
1247 
1248   for (int reg = 0; reg <= max_register; reg++) {
1249     if (!affected_registers.Get(reg)) {
1250       continue;
1251     }
1252 
1253     // The chronologically first deferred action in the trace
1254     // is used to infer the action needed to restore a register
1255     // to its previous state (or not, if it's safe to ignore it).
1256     enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1257     DeferredActionUndoType undo_action = IGNORE;
1258 
1259     int value = 0;
1260     bool absolute = false;
1261     bool clear = false;
1262     static const int kNoStore = kMinInt;
1263     int store_position = kNoStore;
1264     // This is a little tricky because we are scanning the actions in reverse
1265     // historical order (newest first).
1266     for (DeferredAction* action = actions_;
1267          action != NULL;
1268          action = action->next()) {
1269       if (action->Mentions(reg)) {
1270         switch (action->action_type()) {
1271           case ActionNode::SET_REGISTER: {
1272             Trace::DeferredSetRegister* psr =
1273                 static_cast<Trace::DeferredSetRegister*>(action);
1274             if (!absolute) {
1275               value += psr->value();
1276               absolute = true;
1277             }
1278             // SET_REGISTER is currently only used for newly introduced loop
1279             // counters. They can have a significant previous value if they
1280             // occour in a loop. TODO(lrn): Propagate this information, so
1281             // we can set undo_action to IGNORE if we know there is no value to
1282             // restore.
1283             undo_action = RESTORE;
1284             DCHECK_EQ(store_position, kNoStore);
1285             DCHECK(!clear);
1286             break;
1287           }
1288           case ActionNode::INCREMENT_REGISTER:
1289             if (!absolute) {
1290               value++;
1291             }
1292             DCHECK_EQ(store_position, kNoStore);
1293             DCHECK(!clear);
1294             undo_action = RESTORE;
1295             break;
1296           case ActionNode::STORE_POSITION: {
1297             Trace::DeferredCapture* pc =
1298                 static_cast<Trace::DeferredCapture*>(action);
1299             if (!clear && store_position == kNoStore) {
1300               store_position = pc->cp_offset();
1301             }
1302 
1303             // For captures we know that stores and clears alternate.
1304             // Other register, are never cleared, and if the occur
1305             // inside a loop, they might be assigned more than once.
1306             if (reg <= 1) {
1307               // Registers zero and one, aka "capture zero", is
1308               // always set correctly if we succeed. There is no
1309               // need to undo a setting on backtrack, because we
1310               // will set it again or fail.
1311               undo_action = IGNORE;
1312             } else {
1313               undo_action = pc->is_capture() ? CLEAR : RESTORE;
1314             }
1315             DCHECK(!absolute);
1316             DCHECK_EQ(value, 0);
1317             break;
1318           }
1319           case ActionNode::CLEAR_CAPTURES: {
1320             // Since we're scanning in reverse order, if we've already
1321             // set the position we have to ignore historically earlier
1322             // clearing operations.
1323             if (store_position == kNoStore) {
1324               clear = true;
1325             }
1326             undo_action = RESTORE;
1327             DCHECK(!absolute);
1328             DCHECK_EQ(value, 0);
1329             break;
1330           }
1331           default:
1332             UNREACHABLE();
1333             break;
1334         }
1335       }
1336     }
1337     // Prepare for the undo-action (e.g., push if it's going to be popped).
1338     if (undo_action == RESTORE) {
1339       pushes++;
1340       RegExpMacroAssembler::StackCheckFlag stack_check =
1341           RegExpMacroAssembler::kNoStackLimitCheck;
1342       if (pushes == push_limit) {
1343         stack_check = RegExpMacroAssembler::kCheckStackLimit;
1344         pushes = 0;
1345       }
1346 
1347       assembler->PushRegister(reg, stack_check);
1348       registers_to_pop->Set(reg, zone);
1349     } else if (undo_action == CLEAR) {
1350       registers_to_clear->Set(reg, zone);
1351     }
1352     // Perform the chronologically last action (or accumulated increment)
1353     // for the register.
1354     if (store_position != kNoStore) {
1355       assembler->WriteCurrentPositionToRegister(reg, store_position);
1356     } else if (clear) {
1357       assembler->ClearRegisters(reg, reg);
1358     } else if (absolute) {
1359       assembler->SetRegister(reg, value);
1360     } else if (value != 0) {
1361       assembler->AdvanceRegister(reg, value);
1362     }
1363   }
1364 }
1365 
1366 
1367 // This is called as we come into a loop choice node and some other tricky
1368 // nodes.  It normalizes the state of the code generator to ensure we can
1369 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)1370 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1371   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1372 
1373   DCHECK(!is_trivial());
1374 
1375   if (actions_ == NULL && backtrack() == NULL) {
1376     // Here we just have some deferred cp advances to fix and we are back to
1377     // a normal situation.  We may also have to forget some information gained
1378     // through a quick check that was already performed.
1379     if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1380     // Create a new trivial state and generate the node with that.
1381     Trace new_state;
1382     successor->Emit(compiler, &new_state);
1383     return;
1384   }
1385 
1386   // Generate deferred actions here along with code to undo them again.
1387   OutSet affected_registers;
1388 
1389   if (backtrack() != NULL) {
1390     // Here we have a concrete backtrack location.  These are set up by choice
1391     // nodes and so they indicate that we have a deferred save of the current
1392     // position which we may need to emit here.
1393     assembler->PushCurrentPosition();
1394   }
1395 
1396   int max_register = FindAffectedRegisters(&affected_registers,
1397                                            compiler->zone());
1398   OutSet registers_to_pop;
1399   OutSet registers_to_clear;
1400   PerformDeferredActions(assembler,
1401                          max_register,
1402                          affected_registers,
1403                          &registers_to_pop,
1404                          &registers_to_clear,
1405                          compiler->zone());
1406   if (cp_offset_ != 0) {
1407     assembler->AdvanceCurrentPosition(cp_offset_);
1408   }
1409 
1410   // Create a new trivial state and generate the node with that.
1411   Label undo;
1412   assembler->PushBacktrack(&undo);
1413   if (successor->KeepRecursing(compiler)) {
1414     Trace new_state;
1415     successor->Emit(compiler, &new_state);
1416   } else {
1417     compiler->AddWork(successor);
1418     assembler->GoTo(successor->label());
1419   }
1420 
1421   // On backtrack we need to restore state.
1422   assembler->Bind(&undo);
1423   RestoreAffectedRegisters(assembler,
1424                            max_register,
1425                            registers_to_pop,
1426                            registers_to_clear);
1427   if (backtrack() == NULL) {
1428     assembler->Backtrack();
1429   } else {
1430     assembler->PopCurrentPosition();
1431     assembler->GoTo(backtrack());
1432   }
1433 }
1434 
1435 
Emit(RegExpCompiler * compiler,Trace * trace)1436 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1437   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1438 
1439   // Omit flushing the trace. We discard the entire stack frame anyway.
1440 
1441   if (!label()->is_bound()) {
1442     // We are completely independent of the trace, since we ignore it,
1443     // so this code can be used as the generic version.
1444     assembler->Bind(label());
1445   }
1446 
1447   // Throw away everything on the backtrack stack since the start
1448   // of the negative submatch and restore the character position.
1449   assembler->ReadCurrentPositionFromRegister(current_position_register_);
1450   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1451   if (clear_capture_count_ > 0) {
1452     // Clear any captures that might have been performed during the success
1453     // of the body of the negative look-ahead.
1454     int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1455     assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1456   }
1457   // Now that we have unwound the stack we find at the top of the stack the
1458   // backtrack that the BeginSubmatch node got.
1459   assembler->Backtrack();
1460 }
1461 
1462 
Emit(RegExpCompiler * compiler,Trace * trace)1463 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1464   if (!trace->is_trivial()) {
1465     trace->Flush(compiler, this);
1466     return;
1467   }
1468   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1469   if (!label()->is_bound()) {
1470     assembler->Bind(label());
1471   }
1472   switch (action_) {
1473     case ACCEPT:
1474       assembler->Succeed();
1475       return;
1476     case BACKTRACK:
1477       assembler->GoTo(trace->backtrack());
1478       return;
1479     case NEGATIVE_SUBMATCH_SUCCESS:
1480       // This case is handled in a different virtual method.
1481       UNREACHABLE();
1482   }
1483   UNIMPLEMENTED();
1484 }
1485 
1486 
AddGuard(Guard * guard,Zone * zone)1487 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1488   if (guards_ == NULL)
1489     guards_ = new(zone) ZoneList<Guard*>(1, zone);
1490   guards_->Add(guard, zone);
1491 }
1492 
1493 
SetRegister(int reg,int val,RegExpNode * on_success)1494 ActionNode* ActionNode::SetRegister(int reg,
1495                                     int val,
1496                                     RegExpNode* on_success) {
1497   ActionNode* result =
1498       new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1499   result->data_.u_store_register.reg = reg;
1500   result->data_.u_store_register.value = val;
1501   return result;
1502 }
1503 
1504 
IncrementRegister(int reg,RegExpNode * on_success)1505 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1506   ActionNode* result =
1507       new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1508   result->data_.u_increment_register.reg = reg;
1509   return result;
1510 }
1511 
1512 
StorePosition(int reg,bool is_capture,RegExpNode * on_success)1513 ActionNode* ActionNode::StorePosition(int reg,
1514                                       bool is_capture,
1515                                       RegExpNode* on_success) {
1516   ActionNode* result =
1517       new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1518   result->data_.u_position_register.reg = reg;
1519   result->data_.u_position_register.is_capture = is_capture;
1520   return result;
1521 }
1522 
1523 
ClearCaptures(Interval range,RegExpNode * on_success)1524 ActionNode* ActionNode::ClearCaptures(Interval range,
1525                                       RegExpNode* on_success) {
1526   ActionNode* result =
1527       new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1528   result->data_.u_clear_captures.range_from = range.from();
1529   result->data_.u_clear_captures.range_to = range.to();
1530   return result;
1531 }
1532 
1533 
BeginSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)1534 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1535                                       int position_reg,
1536                                       RegExpNode* on_success) {
1537   ActionNode* result =
1538       new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1539   result->data_.u_submatch.stack_pointer_register = stack_reg;
1540   result->data_.u_submatch.current_position_register = position_reg;
1541   return result;
1542 }
1543 
1544 
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)1545 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1546                                                 int position_reg,
1547                                                 int clear_register_count,
1548                                                 int clear_register_from,
1549                                                 RegExpNode* on_success) {
1550   ActionNode* result =
1551       new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1552   result->data_.u_submatch.stack_pointer_register = stack_reg;
1553   result->data_.u_submatch.current_position_register = position_reg;
1554   result->data_.u_submatch.clear_register_count = clear_register_count;
1555   result->data_.u_submatch.clear_register_from = clear_register_from;
1556   return result;
1557 }
1558 
1559 
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)1560 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1561                                         int repetition_register,
1562                                         int repetition_limit,
1563                                         RegExpNode* on_success) {
1564   ActionNode* result =
1565       new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1566   result->data_.u_empty_match_check.start_register = start_register;
1567   result->data_.u_empty_match_check.repetition_register = repetition_register;
1568   result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1569   return result;
1570 }
1571 
1572 
1573 #define DEFINE_ACCEPT(Type)                                          \
1574   void Type##Node::Accept(NodeVisitor* visitor) {                    \
1575     visitor->Visit##Type(this);                                      \
1576   }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)1577 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1578 #undef DEFINE_ACCEPT
1579 
1580 
1581 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1582   visitor->VisitLoopChoice(this);
1583 }
1584 
1585 
1586 // -------------------------------------------------------------------
1587 // Emit code.
1588 
1589 
GenerateGuard(RegExpMacroAssembler * macro_assembler,Guard * guard,Trace * trace)1590 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1591                                Guard* guard,
1592                                Trace* trace) {
1593   switch (guard->op()) {
1594     case Guard::LT:
1595       DCHECK(!trace->mentions_reg(guard->reg()));
1596       macro_assembler->IfRegisterGE(guard->reg(),
1597                                     guard->value(),
1598                                     trace->backtrack());
1599       break;
1600     case Guard::GEQ:
1601       DCHECK(!trace->mentions_reg(guard->reg()));
1602       macro_assembler->IfRegisterLT(guard->reg(),
1603                                     guard->value(),
1604                                     trace->backtrack());
1605       break;
1606   }
1607 }
1608 
1609 
1610 // Returns the number of characters in the equivalence class, omitting those
1611 // that cannot occur in the source string because it is Latin1.
GetCaseIndependentLetters(Isolate * isolate,uc16 character,bool one_byte_subject,unibrow::uchar * letters)1612 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1613                                      bool one_byte_subject,
1614                                      unibrow::uchar* letters) {
1615   int length =
1616       isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1617   // Unibrow returns 0 or 1 for characters where case independence is
1618   // trivial.
1619   if (length == 0) {
1620     letters[0] = character;
1621     length = 1;
1622   }
1623 
1624   if (one_byte_subject) {
1625     int new_length = 0;
1626     for (int i = 0; i < length; i++) {
1627       if (letters[i] <= String::kMaxOneByteCharCode) {
1628         letters[new_length++] = letters[i];
1629       }
1630     }
1631     length = new_length;
1632   }
1633 
1634   return length;
1635 }
1636 
1637 
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1638 static inline bool EmitSimpleCharacter(Isolate* isolate,
1639                                        RegExpCompiler* compiler,
1640                                        uc16 c,
1641                                        Label* on_failure,
1642                                        int cp_offset,
1643                                        bool check,
1644                                        bool preloaded) {
1645   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1646   bool bound_checked = false;
1647   if (!preloaded) {
1648     assembler->LoadCurrentCharacter(
1649         cp_offset,
1650         on_failure,
1651         check);
1652     bound_checked = true;
1653   }
1654   assembler->CheckNotCharacter(c, on_failure);
1655   return bound_checked;
1656 }
1657 
1658 
1659 // Only emits non-letters (things that don't have case).  Only used for case
1660 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1661 static inline bool EmitAtomNonLetter(Isolate* isolate,
1662                                      RegExpCompiler* compiler,
1663                                      uc16 c,
1664                                      Label* on_failure,
1665                                      int cp_offset,
1666                                      bool check,
1667                                      bool preloaded) {
1668   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1669   bool one_byte = compiler->one_byte();
1670   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1671   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1672   if (length < 1) {
1673     // This can't match.  Must be an one-byte subject and a non-one-byte
1674     // character.  We do not need to do anything since the one-byte pass
1675     // already handled this.
1676     return false;  // Bounds not checked.
1677   }
1678   bool checked = false;
1679   // We handle the length > 1 case in a later pass.
1680   if (length == 1) {
1681     if (one_byte && c > String::kMaxOneByteCharCodeU) {
1682       // Can't match - see above.
1683       return false;  // Bounds not checked.
1684     }
1685     if (!preloaded) {
1686       macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1687       checked = check;
1688     }
1689     macro_assembler->CheckNotCharacter(c, on_failure);
1690   }
1691   return checked;
1692 }
1693 
1694 
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,uc16 c1,uc16 c2,Label * on_failure)1695 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1696                                       bool one_byte, uc16 c1, uc16 c2,
1697                                       Label* on_failure) {
1698   uc16 char_mask;
1699   if (one_byte) {
1700     char_mask = String::kMaxOneByteCharCode;
1701   } else {
1702     char_mask = String::kMaxUtf16CodeUnit;
1703   }
1704   uc16 exor = c1 ^ c2;
1705   // Check whether exor has only one bit set.
1706   if (((exor - 1) & exor) == 0) {
1707     // If c1 and c2 differ only by one bit.
1708     // Ecma262UnCanonicalize always gives the highest number last.
1709     DCHECK(c2 > c1);
1710     uc16 mask = char_mask ^ exor;
1711     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1712     return true;
1713   }
1714   DCHECK(c2 > c1);
1715   uc16 diff = c2 - c1;
1716   if (((diff - 1) & diff) == 0 && c1 >= diff) {
1717     // If the characters differ by 2^n but don't differ by one bit then
1718     // subtract the difference from the found character, then do the or
1719     // trick.  We avoid the theoretical case where negative numbers are
1720     // involved in order to simplify code generation.
1721     uc16 mask = char_mask ^ diff;
1722     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1723                                                     diff,
1724                                                     mask,
1725                                                     on_failure);
1726     return true;
1727   }
1728   return false;
1729 }
1730 
1731 
1732 typedef bool EmitCharacterFunction(Isolate* isolate,
1733                                    RegExpCompiler* compiler,
1734                                    uc16 c,
1735                                    Label* on_failure,
1736                                    int cp_offset,
1737                                    bool check,
1738                                    bool preloaded);
1739 
1740 // Only emits letters (things that have case).  Only used for case independent
1741 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1742 static inline bool EmitAtomLetter(Isolate* isolate,
1743                                   RegExpCompiler* compiler,
1744                                   uc16 c,
1745                                   Label* on_failure,
1746                                   int cp_offset,
1747                                   bool check,
1748                                   bool preloaded) {
1749   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1750   bool one_byte = compiler->one_byte();
1751   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1752   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1753   if (length <= 1) return false;
1754   // We may not need to check against the end of the input string
1755   // if this character lies before a character that matched.
1756   if (!preloaded) {
1757     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1758   }
1759   Label ok;
1760   DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1761   switch (length) {
1762     case 2: {
1763       if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1764                                     chars[1], on_failure)) {
1765       } else {
1766         macro_assembler->CheckCharacter(chars[0], &ok);
1767         macro_assembler->CheckNotCharacter(chars[1], on_failure);
1768         macro_assembler->Bind(&ok);
1769       }
1770       break;
1771     }
1772     case 4:
1773       macro_assembler->CheckCharacter(chars[3], &ok);
1774       // Fall through!
1775     case 3:
1776       macro_assembler->CheckCharacter(chars[0], &ok);
1777       macro_assembler->CheckCharacter(chars[1], &ok);
1778       macro_assembler->CheckNotCharacter(chars[2], on_failure);
1779       macro_assembler->Bind(&ok);
1780       break;
1781     default:
1782       UNREACHABLE();
1783       break;
1784   }
1785   return true;
1786 }
1787 
1788 
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)1789 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1790                              int border,
1791                              Label* fall_through,
1792                              Label* above_or_equal,
1793                              Label* below) {
1794   if (below != fall_through) {
1795     masm->CheckCharacterLT(border, below);
1796     if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1797   } else {
1798     masm->CheckCharacterGT(border - 1, above_or_equal);
1799   }
1800 }
1801 
1802 
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)1803 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1804                                    int first,
1805                                    int last,
1806                                    Label* fall_through,
1807                                    Label* in_range,
1808                                    Label* out_of_range) {
1809   if (in_range == fall_through) {
1810     if (first == last) {
1811       masm->CheckNotCharacter(first, out_of_range);
1812     } else {
1813       masm->CheckCharacterNotInRange(first, last, out_of_range);
1814     }
1815   } else {
1816     if (first == last) {
1817       masm->CheckCharacter(first, in_range);
1818     } else {
1819       masm->CheckCharacterInRange(first, last, in_range);
1820     }
1821     if (out_of_range != fall_through) masm->GoTo(out_of_range);
1822   }
1823 }
1824 
1825 
1826 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1827 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int min_char,Label * fall_through,Label * even_label,Label * odd_label)1828 static void EmitUseLookupTable(
1829     RegExpMacroAssembler* masm,
1830     ZoneList<int>* ranges,
1831     int start_index,
1832     int end_index,
1833     int min_char,
1834     Label* fall_through,
1835     Label* even_label,
1836     Label* odd_label) {
1837   static const int kSize = RegExpMacroAssembler::kTableSize;
1838   static const int kMask = RegExpMacroAssembler::kTableMask;
1839 
1840   int base = (min_char & ~kMask);
1841   USE(base);
1842 
1843   // Assert that everything is on one kTableSize page.
1844   for (int i = start_index; i <= end_index; i++) {
1845     DCHECK_EQ(ranges->at(i) & ~kMask, base);
1846   }
1847   DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1848 
1849   char templ[kSize];
1850   Label* on_bit_set;
1851   Label* on_bit_clear;
1852   int bit;
1853   if (even_label == fall_through) {
1854     on_bit_set = odd_label;
1855     on_bit_clear = even_label;
1856     bit = 1;
1857   } else {
1858     on_bit_set = even_label;
1859     on_bit_clear = odd_label;
1860     bit = 0;
1861   }
1862   for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1863     templ[i] = bit;
1864   }
1865   int j = 0;
1866   bit ^= 1;
1867   for (int i = start_index; i < end_index; i++) {
1868     for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1869       templ[j] = bit;
1870     }
1871     bit ^= 1;
1872   }
1873   for (int i = j; i < kSize; i++) {
1874     templ[i] = bit;
1875   }
1876   Factory* factory = masm->isolate()->factory();
1877   // TODO(erikcorry): Cache these.
1878   Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1879   for (int i = 0; i < kSize; i++) {
1880     ba->set(i, templ[i]);
1881   }
1882   masm->CheckBitInTable(ba, on_bit_set);
1883   if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1884 }
1885 
1886 
CutOutRange(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int cut_index,Label * even_label,Label * odd_label)1887 static void CutOutRange(RegExpMacroAssembler* masm,
1888                         ZoneList<int>* ranges,
1889                         int start_index,
1890                         int end_index,
1891                         int cut_index,
1892                         Label* even_label,
1893                         Label* odd_label) {
1894   bool odd = (((cut_index - start_index) & 1) == 1);
1895   Label* in_range_label = odd ? odd_label : even_label;
1896   Label dummy;
1897   EmitDoubleBoundaryTest(masm,
1898                          ranges->at(cut_index),
1899                          ranges->at(cut_index + 1) - 1,
1900                          &dummy,
1901                          in_range_label,
1902                          &dummy);
1903   DCHECK(!dummy.is_linked());
1904   // Cut out the single range by rewriting the array.  This creates a new
1905   // range that is a merger of the two ranges on either side of the one we
1906   // are cutting out.  The oddity of the labels is preserved.
1907   for (int j = cut_index; j > start_index; j--) {
1908     ranges->at(j) = ranges->at(j - 1);
1909   }
1910   for (int j = cut_index + 1; j < end_index; j++) {
1911     ranges->at(j) = ranges->at(j + 1);
1912   }
1913 }
1914 
1915 
1916 // Unicode case.  Split the search space into kSize spaces that are handled
1917 // with recursion.
SplitSearchSpace(ZoneList<int> * ranges,int start_index,int end_index,int * new_start_index,int * new_end_index,int * border)1918 static void SplitSearchSpace(ZoneList<int>* ranges,
1919                              int start_index,
1920                              int end_index,
1921                              int* new_start_index,
1922                              int* new_end_index,
1923                              int* border) {
1924   static const int kSize = RegExpMacroAssembler::kTableSize;
1925   static const int kMask = RegExpMacroAssembler::kTableMask;
1926 
1927   int first = ranges->at(start_index);
1928   int last = ranges->at(end_index) - 1;
1929 
1930   *new_start_index = start_index;
1931   *border = (ranges->at(start_index) & ~kMask) + kSize;
1932   while (*new_start_index < end_index) {
1933     if (ranges->at(*new_start_index) > *border) break;
1934     (*new_start_index)++;
1935   }
1936   // new_start_index is the index of the first edge that is beyond the
1937   // current kSize space.
1938 
1939   // For very large search spaces we do a binary chop search of the non-Latin1
1940   // space instead of just going to the end of the current kSize space.  The
1941   // heuristics are complicated a little by the fact that any 128-character
1942   // encoding space can be quickly tested with a table lookup, so we don't
1943   // wish to do binary chop search at a smaller granularity than that.  A
1944   // 128-character space can take up a lot of space in the ranges array if,
1945   // for example, we only want to match every second character (eg. the lower
1946   // case characters on some Unicode pages).
1947   int binary_chop_index = (end_index + start_index) / 2;
1948   // The first test ensures that we get to the code that handles the Latin1
1949   // range with a single not-taken branch, speeding up this important
1950   // character range (even non-Latin1 charset-based text has spaces and
1951   // punctuation).
1952   if (*border - 1 > String::kMaxOneByteCharCode &&  // Latin1 case.
1953       end_index - start_index > (*new_start_index - start_index) * 2 &&
1954       last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1955       ranges->at(binary_chop_index) >= first + 2 * kSize) {
1956     int scan_forward_for_section_border = binary_chop_index;;
1957     int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1958 
1959     while (scan_forward_for_section_border < end_index) {
1960       if (ranges->at(scan_forward_for_section_border) > new_border) {
1961         *new_start_index = scan_forward_for_section_border;
1962         *border = new_border;
1963         break;
1964       }
1965       scan_forward_for_section_border++;
1966     }
1967   }
1968 
1969   DCHECK(*new_start_index > start_index);
1970   *new_end_index = *new_start_index - 1;
1971   if (ranges->at(*new_end_index) == *border) {
1972     (*new_end_index)--;
1973   }
1974   if (*border >= ranges->at(end_index)) {
1975     *border = ranges->at(end_index);
1976     *new_start_index = end_index;  // Won't be used.
1977     *new_end_index = end_index - 1;
1978   }
1979 }
1980 
1981 
1982 // Gets a series of segment boundaries representing a character class.  If the
1983 // character is in the range between an even and an odd boundary (counting from
1984 // start_index) then go to even_label, otherwise go to odd_label.  We already
1985 // know that the character is in the range of min_char to max_char inclusive.
1986 // Either label can be NULL indicating backtracking.  Either label can also be
1987 // equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,uc32 min_char,uc32 max_char,Label * fall_through,Label * even_label,Label * odd_label)1988 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1989                              int start_index, int end_index, uc32 min_char,
1990                              uc32 max_char, Label* fall_through,
1991                              Label* even_label, Label* odd_label) {
1992   DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1993   DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1994 
1995   int first = ranges->at(start_index);
1996   int last = ranges->at(end_index) - 1;
1997 
1998   DCHECK_LT(min_char, first);
1999 
2000   // Just need to test if the character is before or on-or-after
2001   // a particular character.
2002   if (start_index == end_index) {
2003     EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
2004     return;
2005   }
2006 
2007   // Another almost trivial case:  There is one interval in the middle that is
2008   // different from the end intervals.
2009   if (start_index + 1 == end_index) {
2010     EmitDoubleBoundaryTest(
2011         masm, first, last, fall_through, even_label, odd_label);
2012     return;
2013   }
2014 
2015   // It's not worth using table lookup if there are very few intervals in the
2016   // character class.
2017   if (end_index - start_index <= 6) {
2018     // It is faster to test for individual characters, so we look for those
2019     // first, then try arbitrary ranges in the second round.
2020     static int kNoCutIndex = -1;
2021     int cut = kNoCutIndex;
2022     for (int i = start_index; i < end_index; i++) {
2023       if (ranges->at(i) == ranges->at(i + 1) - 1) {
2024         cut = i;
2025         break;
2026       }
2027     }
2028     if (cut == kNoCutIndex) cut = start_index;
2029     CutOutRange(
2030         masm, ranges, start_index, end_index, cut, even_label, odd_label);
2031     DCHECK_GE(end_index - start_index, 2);
2032     GenerateBranches(masm,
2033                      ranges,
2034                      start_index + 1,
2035                      end_index - 1,
2036                      min_char,
2037                      max_char,
2038                      fall_through,
2039                      even_label,
2040                      odd_label);
2041     return;
2042   }
2043 
2044   // If there are a lot of intervals in the regexp, then we will use tables to
2045   // determine whether the character is inside or outside the character class.
2046   static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2047 
2048   if ((max_char >> kBits) == (min_char >> kBits)) {
2049     EmitUseLookupTable(masm,
2050                        ranges,
2051                        start_index,
2052                        end_index,
2053                        min_char,
2054                        fall_through,
2055                        even_label,
2056                        odd_label);
2057     return;
2058   }
2059 
2060   if ((min_char >> kBits) != (first >> kBits)) {
2061     masm->CheckCharacterLT(first, odd_label);
2062     GenerateBranches(masm,
2063                      ranges,
2064                      start_index + 1,
2065                      end_index,
2066                      first,
2067                      max_char,
2068                      fall_through,
2069                      odd_label,
2070                      even_label);
2071     return;
2072   }
2073 
2074   int new_start_index = 0;
2075   int new_end_index = 0;
2076   int border = 0;
2077 
2078   SplitSearchSpace(ranges,
2079                    start_index,
2080                    end_index,
2081                    &new_start_index,
2082                    &new_end_index,
2083                    &border);
2084 
2085   Label handle_rest;
2086   Label* above = &handle_rest;
2087   if (border == last + 1) {
2088     // We didn't find any section that started after the limit, so everything
2089     // above the border is one of the terminal labels.
2090     above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2091     DCHECK(new_end_index == end_index - 1);
2092   }
2093 
2094   DCHECK_LE(start_index, new_end_index);
2095   DCHECK_LE(new_start_index, end_index);
2096   DCHECK_LT(start_index, new_start_index);
2097   DCHECK_LT(new_end_index, end_index);
2098   DCHECK(new_end_index + 1 == new_start_index ||
2099          (new_end_index + 2 == new_start_index &&
2100           border == ranges->at(new_end_index + 1)));
2101   DCHECK_LT(min_char, border - 1);
2102   DCHECK_LT(border, max_char);
2103   DCHECK_LT(ranges->at(new_end_index), border);
2104   DCHECK(border < ranges->at(new_start_index) ||
2105          (border == ranges->at(new_start_index) &&
2106           new_start_index == end_index &&
2107           new_end_index == end_index - 1 &&
2108           border == last + 1));
2109   DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2110 
2111   masm->CheckCharacterGT(border - 1, above);
2112   Label dummy;
2113   GenerateBranches(masm,
2114                    ranges,
2115                    start_index,
2116                    new_end_index,
2117                    min_char,
2118                    border - 1,
2119                    &dummy,
2120                    even_label,
2121                    odd_label);
2122   if (handle_rest.is_linked()) {
2123     masm->Bind(&handle_rest);
2124     bool flip = (new_start_index & 1) != (start_index & 1);
2125     GenerateBranches(masm,
2126                      ranges,
2127                      new_start_index,
2128                      end_index,
2129                      border,
2130                      max_char,
2131                      &dummy,
2132                      flip ? odd_label : even_label,
2133                      flip ? even_label : odd_label);
2134   }
2135 }
2136 
2137 
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)2138 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2139                           RegExpCharacterClass* cc, bool one_byte,
2140                           Label* on_failure, int cp_offset, bool check_offset,
2141                           bool preloaded, Zone* zone) {
2142   ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2143   CharacterRange::Canonicalize(ranges);
2144 
2145   int max_char;
2146   if (one_byte) {
2147     max_char = String::kMaxOneByteCharCode;
2148   } else {
2149     max_char = String::kMaxUtf16CodeUnit;
2150   }
2151 
2152   int range_count = ranges->length();
2153 
2154   int last_valid_range = range_count - 1;
2155   while (last_valid_range >= 0) {
2156     CharacterRange& range = ranges->at(last_valid_range);
2157     if (range.from() <= max_char) {
2158       break;
2159     }
2160     last_valid_range--;
2161   }
2162 
2163   if (last_valid_range < 0) {
2164     if (!cc->is_negated()) {
2165       macro_assembler->GoTo(on_failure);
2166     }
2167     if (check_offset) {
2168       macro_assembler->CheckPosition(cp_offset, on_failure);
2169     }
2170     return;
2171   }
2172 
2173   if (last_valid_range == 0 &&
2174       ranges->at(0).IsEverything(max_char)) {
2175     if (cc->is_negated()) {
2176       macro_assembler->GoTo(on_failure);
2177     } else {
2178       // This is a common case hit by non-anchored expressions.
2179       if (check_offset) {
2180         macro_assembler->CheckPosition(cp_offset, on_failure);
2181       }
2182     }
2183     return;
2184   }
2185 
2186   if (!preloaded) {
2187     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2188   }
2189 
2190   if (cc->is_standard(zone) &&
2191       macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2192                                                   on_failure)) {
2193       return;
2194   }
2195 
2196 
2197   // A new list with ascending entries.  Each entry is a code unit
2198   // where there is a boundary between code units that are part of
2199   // the class and code units that are not.  Normally we insert an
2200   // entry at zero which goes to the failure label, but if there
2201   // was already one there we fall through for success on that entry.
2202   // Subsequent entries have alternating meaning (success/failure).
2203   ZoneList<int>* range_boundaries =
2204       new(zone) ZoneList<int>(last_valid_range, zone);
2205 
2206   bool zeroth_entry_is_failure = !cc->is_negated();
2207 
2208   for (int i = 0; i <= last_valid_range; i++) {
2209     CharacterRange& range = ranges->at(i);
2210     if (range.from() == 0) {
2211       DCHECK_EQ(i, 0);
2212       zeroth_entry_is_failure = !zeroth_entry_is_failure;
2213     } else {
2214       range_boundaries->Add(range.from(), zone);
2215     }
2216     range_boundaries->Add(range.to() + 1, zone);
2217   }
2218   int end_index = range_boundaries->length() - 1;
2219   if (range_boundaries->at(end_index) > max_char) {
2220     end_index--;
2221   }
2222 
2223   Label fall_through;
2224   GenerateBranches(macro_assembler,
2225                    range_boundaries,
2226                    0,  // start_index.
2227                    end_index,
2228                    0,  // min_char.
2229                    max_char,
2230                    &fall_through,
2231                    zeroth_entry_is_failure ? &fall_through : on_failure,
2232                    zeroth_entry_is_failure ? on_failure : &fall_through);
2233   macro_assembler->Bind(&fall_through);
2234 }
2235 
2236 
~RegExpNode()2237 RegExpNode::~RegExpNode() {
2238 }
2239 
2240 
LimitVersions(RegExpCompiler * compiler,Trace * trace)2241 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2242                                                   Trace* trace) {
2243   // If we are generating a greedy loop then don't stop and don't reuse code.
2244   if (trace->stop_node() != NULL) {
2245     return CONTINUE;
2246   }
2247 
2248   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2249   if (trace->is_trivial()) {
2250     if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2251       // If a generic version is already scheduled to be generated or we have
2252       // recursed too deeply then just generate a jump to that code.
2253       macro_assembler->GoTo(&label_);
2254       // This will queue it up for generation of a generic version if it hasn't
2255       // already been queued.
2256       compiler->AddWork(this);
2257       return DONE;
2258     }
2259     // Generate generic version of the node and bind the label for later use.
2260     macro_assembler->Bind(&label_);
2261     return CONTINUE;
2262   }
2263 
2264   // We are being asked to make a non-generic version.  Keep track of how many
2265   // non-generic versions we generate so as not to overdo it.
2266   trace_count_++;
2267   if (KeepRecursing(compiler) && compiler->optimize() &&
2268       trace_count_ < kMaxCopiesCodeGenerated) {
2269     return CONTINUE;
2270   }
2271 
2272   // If we get here code has been generated for this node too many times or
2273   // recursion is too deep.  Time to switch to a generic version.  The code for
2274   // generic versions above can handle deep recursion properly.
2275   bool was_limiting = compiler->limiting_recursion();
2276   compiler->set_limiting_recursion(true);
2277   trace->Flush(compiler, this);
2278   compiler->set_limiting_recursion(was_limiting);
2279   return DONE;
2280 }
2281 
2282 
KeepRecursing(RegExpCompiler * compiler)2283 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2284   return !compiler->limiting_recursion() &&
2285          compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2286 }
2287 
2288 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2289 int ActionNode::EatsAtLeast(int still_to_find,
2290                             int budget,
2291                             bool not_at_start) {
2292   if (budget <= 0) return 0;
2293   if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
2294   return on_success()->EatsAtLeast(still_to_find,
2295                                    budget - 1,
2296                                    not_at_start);
2297 }
2298 
2299 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2300 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2301                               BoyerMooreLookahead* bm, bool not_at_start) {
2302   if (action_type_ == BEGIN_SUBMATCH) {
2303     bm->SetRest(offset);
2304   } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2305     on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2306   }
2307   SaveBMInfo(bm, not_at_start, offset);
2308 }
2309 
2310 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2311 int AssertionNode::EatsAtLeast(int still_to_find,
2312                                int budget,
2313                                bool not_at_start) {
2314   if (budget <= 0) return 0;
2315   // If we know we are not at the start and we are asked "how many characters
2316   // will you match if you succeed?" then we can answer anything since false
2317   // implies false.  So lets just return the max answer (still_to_find) since
2318   // that won't prevent us from preloading a lot of characters for the other
2319   // branches in the node graph.
2320   if (assertion_type() == AT_START && not_at_start) return still_to_find;
2321   return on_success()->EatsAtLeast(still_to_find,
2322                                    budget - 1,
2323                                    not_at_start);
2324 }
2325 
2326 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2327 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2328                                  BoyerMooreLookahead* bm, bool not_at_start) {
2329   // Match the behaviour of EatsAtLeast on this node.
2330   if (assertion_type() == AT_START && not_at_start) return;
2331   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2332   SaveBMInfo(bm, not_at_start, offset);
2333 }
2334 
2335 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2336 int BackReferenceNode::EatsAtLeast(int still_to_find,
2337                                    int budget,
2338                                    bool not_at_start) {
2339   if (read_backward()) return 0;
2340   if (budget <= 0) return 0;
2341   return on_success()->EatsAtLeast(still_to_find,
2342                                    budget - 1,
2343                                    not_at_start);
2344 }
2345 
2346 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2347 int TextNode::EatsAtLeast(int still_to_find,
2348                           int budget,
2349                           bool not_at_start) {
2350   if (read_backward()) return 0;
2351   int answer = Length();
2352   if (answer >= still_to_find) return answer;
2353   if (budget <= 0) return answer;
2354   // We are not at start after this node so we set the last argument to 'true'.
2355   return answer + on_success()->EatsAtLeast(still_to_find - answer,
2356                                             budget - 1,
2357                                             true);
2358 }
2359 
2360 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2361 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2362                                               bool not_at_start) {
2363   if (budget <= 0) return 0;
2364   // Alternative 0 is the negative lookahead, alternative 1 is what comes
2365   // afterwards.
2366   RegExpNode* node = alternatives_->at(1).node();
2367   return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2368 }
2369 
2370 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2371 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2372     QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2373     bool not_at_start) {
2374   // Alternative 0 is the negative lookahead, alternative 1 is what comes
2375   // afterwards.
2376   RegExpNode* node = alternatives_->at(1).node();
2377   return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2378 }
2379 
2380 
EatsAtLeastHelper(int still_to_find,int budget,RegExpNode * ignore_this_node,bool not_at_start)2381 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2382                                   int budget,
2383                                   RegExpNode* ignore_this_node,
2384                                   bool not_at_start) {
2385   if (budget <= 0) return 0;
2386   int min = 100;
2387   int choice_count = alternatives_->length();
2388   budget = (budget - 1) / choice_count;
2389   for (int i = 0; i < choice_count; i++) {
2390     RegExpNode* node = alternatives_->at(i).node();
2391     if (node == ignore_this_node) continue;
2392     int node_eats_at_least =
2393         node->EatsAtLeast(still_to_find, budget, not_at_start);
2394     if (node_eats_at_least < min) min = node_eats_at_least;
2395     if (min == 0) return 0;
2396   }
2397   return min;
2398 }
2399 
2400 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2401 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2402                                 int budget,
2403                                 bool not_at_start) {
2404   return EatsAtLeastHelper(still_to_find,
2405                            budget - 1,
2406                            loop_node_,
2407                            not_at_start);
2408 }
2409 
2410 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2411 int ChoiceNode::EatsAtLeast(int still_to_find,
2412                             int budget,
2413                             bool not_at_start) {
2414   return EatsAtLeastHelper(still_to_find,
2415                            budget,
2416                            NULL,
2417                            not_at_start);
2418 }
2419 
2420 
2421 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)2422 static inline uint32_t SmearBitsRight(uint32_t v) {
2423   v |= v >> 1;
2424   v |= v >> 2;
2425   v |= v >> 4;
2426   v |= v >> 8;
2427   v |= v >> 16;
2428   return v;
2429 }
2430 
2431 
Rationalize(bool asc)2432 bool QuickCheckDetails::Rationalize(bool asc) {
2433   bool found_useful_op = false;
2434   uint32_t char_mask;
2435   if (asc) {
2436     char_mask = String::kMaxOneByteCharCode;
2437   } else {
2438     char_mask = String::kMaxUtf16CodeUnit;
2439   }
2440   mask_ = 0;
2441   value_ = 0;
2442   int char_shift = 0;
2443   for (int i = 0; i < characters_; i++) {
2444     Position* pos = &positions_[i];
2445     if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2446       found_useful_op = true;
2447     }
2448     mask_ |= (pos->mask & char_mask) << char_shift;
2449     value_ |= (pos->value & char_mask) << char_shift;
2450     char_shift += asc ? 8 : 16;
2451   }
2452   return found_useful_op;
2453 }
2454 
2455 
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure)2456 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2457                                 Trace* bounds_check_trace,
2458                                 Trace* trace,
2459                                 bool preload_has_checked_bounds,
2460                                 Label* on_possible_success,
2461                                 QuickCheckDetails* details,
2462                                 bool fall_through_on_failure) {
2463   if (details->characters() == 0) return false;
2464   GetQuickCheckDetails(
2465       details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2466   if (details->cannot_match()) return false;
2467   if (!details->Rationalize(compiler->one_byte())) return false;
2468   DCHECK(details->characters() == 1 ||
2469          compiler->macro_assembler()->CanReadUnaligned());
2470   uint32_t mask = details->mask();
2471   uint32_t value = details->value();
2472 
2473   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2474 
2475   if (trace->characters_preloaded() != details->characters()) {
2476     DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2477     // We are attempting to preload the minimum number of characters
2478     // any choice would eat, so if the bounds check fails, then none of the
2479     // choices can succeed, so we can just immediately backtrack, rather
2480     // than go to the next choice.
2481     assembler->LoadCurrentCharacter(trace->cp_offset(),
2482                                     bounds_check_trace->backtrack(),
2483                                     !preload_has_checked_bounds,
2484                                     details->characters());
2485   }
2486 
2487 
2488   bool need_mask = true;
2489 
2490   if (details->characters() == 1) {
2491     // If number of characters preloaded is 1 then we used a byte or 16 bit
2492     // load so the value is already masked down.
2493     uint32_t char_mask;
2494     if (compiler->one_byte()) {
2495       char_mask = String::kMaxOneByteCharCode;
2496     } else {
2497       char_mask = String::kMaxUtf16CodeUnit;
2498     }
2499     if ((mask & char_mask) == char_mask) need_mask = false;
2500     mask &= char_mask;
2501   } else {
2502     // For 2-character preloads in one-byte mode or 1-character preloads in
2503     // two-byte mode we also use a 16 bit load with zero extend.
2504     static const uint32_t kTwoByteMask = 0xffff;
2505     static const uint32_t kFourByteMask = 0xffffffff;
2506     if (details->characters() == 2 && compiler->one_byte()) {
2507       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2508     } else if (details->characters() == 1 && !compiler->one_byte()) {
2509       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2510     } else {
2511       if (mask == kFourByteMask) need_mask = false;
2512     }
2513   }
2514 
2515   if (fall_through_on_failure) {
2516     if (need_mask) {
2517       assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2518     } else {
2519       assembler->CheckCharacter(value, on_possible_success);
2520     }
2521   } else {
2522     if (need_mask) {
2523       assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2524     } else {
2525       assembler->CheckNotCharacter(value, trace->backtrack());
2526     }
2527   }
2528   return true;
2529 }
2530 
2531 
2532 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2533 // super-class in the .h file).
2534 //
2535 // We iterate along the text object, building up for each character a
2536 // mask and value that can be used to test for a quick failure to match.
2537 // The masks and values for the positions will be combined into a single
2538 // machine word for the current character width in order to be used in
2539 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2540 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2541                                     RegExpCompiler* compiler,
2542                                     int characters_filled_in,
2543                                     bool not_at_start) {
2544   // Do not collect any quick check details if the text node reads backward,
2545   // since it reads in the opposite direction than we use for quick checks.
2546   if (read_backward()) return;
2547   Isolate* isolate = compiler->macro_assembler()->isolate();
2548   DCHECK(characters_filled_in < details->characters());
2549   int characters = details->characters();
2550   int char_mask;
2551   if (compiler->one_byte()) {
2552     char_mask = String::kMaxOneByteCharCode;
2553   } else {
2554     char_mask = String::kMaxUtf16CodeUnit;
2555   }
2556   for (int k = 0; k < elements()->length(); k++) {
2557     TextElement elm = elements()->at(k);
2558     if (elm.text_type() == TextElement::ATOM) {
2559       Vector<const uc16> quarks = elm.atom()->data();
2560       for (int i = 0; i < characters && i < quarks.length(); i++) {
2561         QuickCheckDetails::Position* pos =
2562             details->positions(characters_filled_in);
2563         uc16 c = quarks[i];
2564         if (compiler->ignore_case()) {
2565           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2566           int length = GetCaseIndependentLetters(isolate, c,
2567                                                  compiler->one_byte(), chars);
2568           if (length == 0) {
2569             // This can happen because all case variants are non-Latin1, but we
2570             // know the input is Latin1.
2571             details->set_cannot_match();
2572             pos->determines_perfectly = false;
2573             return;
2574           }
2575           if (length == 1) {
2576             // This letter has no case equivalents, so it's nice and simple
2577             // and the mask-compare will determine definitely whether we have
2578             // a match at this character position.
2579             pos->mask = char_mask;
2580             pos->value = c;
2581             pos->determines_perfectly = true;
2582           } else {
2583             uint32_t common_bits = char_mask;
2584             uint32_t bits = chars[0];
2585             for (int j = 1; j < length; j++) {
2586               uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2587               common_bits ^= differing_bits;
2588               bits &= common_bits;
2589             }
2590             // If length is 2 and common bits has only one zero in it then
2591             // our mask and compare instruction will determine definitely
2592             // whether we have a match at this character position.  Otherwise
2593             // it can only be an approximate check.
2594             uint32_t one_zero = (common_bits | ~char_mask);
2595             if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2596               pos->determines_perfectly = true;
2597             }
2598             pos->mask = common_bits;
2599             pos->value = bits;
2600           }
2601         } else {
2602           // Don't ignore case.  Nice simple case where the mask-compare will
2603           // determine definitely whether we have a match at this character
2604           // position.
2605           if (c > char_mask) {
2606             details->set_cannot_match();
2607             pos->determines_perfectly = false;
2608             return;
2609           }
2610           pos->mask = char_mask;
2611           pos->value = c;
2612           pos->determines_perfectly = true;
2613         }
2614         characters_filled_in++;
2615         DCHECK(characters_filled_in <= details->characters());
2616         if (characters_filled_in == details->characters()) {
2617           return;
2618         }
2619       }
2620     } else {
2621       QuickCheckDetails::Position* pos =
2622           details->positions(characters_filled_in);
2623       RegExpCharacterClass* tree = elm.char_class();
2624       ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2625       if (tree->is_negated()) {
2626         // A quick check uses multi-character mask and compare.  There is no
2627         // useful way to incorporate a negative char class into this scheme
2628         // so we just conservatively create a mask and value that will always
2629         // succeed.
2630         pos->mask = 0;
2631         pos->value = 0;
2632       } else {
2633         int first_range = 0;
2634         while (ranges->at(first_range).from() > char_mask) {
2635           first_range++;
2636           if (first_range == ranges->length()) {
2637             details->set_cannot_match();
2638             pos->determines_perfectly = false;
2639             return;
2640           }
2641         }
2642         CharacterRange range = ranges->at(first_range);
2643         uc16 from = range.from();
2644         uc16 to = range.to();
2645         if (to > char_mask) {
2646           to = char_mask;
2647         }
2648         uint32_t differing_bits = (from ^ to);
2649         // A mask and compare is only perfect if the differing bits form a
2650         // number like 00011111 with one single block of trailing 1s.
2651         if ((differing_bits & (differing_bits + 1)) == 0 &&
2652              from + differing_bits == to) {
2653           pos->determines_perfectly = true;
2654         }
2655         uint32_t common_bits = ~SmearBitsRight(differing_bits);
2656         uint32_t bits = (from & common_bits);
2657         for (int i = first_range + 1; i < ranges->length(); i++) {
2658           CharacterRange range = ranges->at(i);
2659           uc16 from = range.from();
2660           uc16 to = range.to();
2661           if (from > char_mask) continue;
2662           if (to > char_mask) to = char_mask;
2663           // Here we are combining more ranges into the mask and compare
2664           // value.  With each new range the mask becomes more sparse and
2665           // so the chances of a false positive rise.  A character class
2666           // with multiple ranges is assumed never to be equivalent to a
2667           // mask and compare operation.
2668           pos->determines_perfectly = false;
2669           uint32_t new_common_bits = (from ^ to);
2670           new_common_bits = ~SmearBitsRight(new_common_bits);
2671           common_bits &= new_common_bits;
2672           bits &= new_common_bits;
2673           uint32_t differing_bits = (from & common_bits) ^ bits;
2674           common_bits ^= differing_bits;
2675           bits &= common_bits;
2676         }
2677         pos->mask = common_bits;
2678         pos->value = bits;
2679       }
2680       characters_filled_in++;
2681       DCHECK(characters_filled_in <= details->characters());
2682       if (characters_filled_in == details->characters()) {
2683         return;
2684       }
2685     }
2686   }
2687   DCHECK(characters_filled_in != details->characters());
2688   if (!details->cannot_match()) {
2689     on_success()-> GetQuickCheckDetails(details,
2690                                         compiler,
2691                                         characters_filled_in,
2692                                         true);
2693   }
2694 }
2695 
2696 
Clear()2697 void QuickCheckDetails::Clear() {
2698   for (int i = 0; i < characters_; i++) {
2699     positions_[i].mask = 0;
2700     positions_[i].value = 0;
2701     positions_[i].determines_perfectly = false;
2702   }
2703   characters_ = 0;
2704 }
2705 
2706 
Advance(int by,bool one_byte)2707 void QuickCheckDetails::Advance(int by, bool one_byte) {
2708   if (by >= characters_ || by < 0) {
2709     DCHECK_IMPLIES(by < 0, characters_ == 0);
2710     Clear();
2711     return;
2712   }
2713   DCHECK_LE(characters_ - by, 4);
2714   DCHECK_LE(characters_, 4);
2715   for (int i = 0; i < characters_ - by; i++) {
2716     positions_[i] = positions_[by + i];
2717   }
2718   for (int i = characters_ - by; i < characters_; i++) {
2719     positions_[i].mask = 0;
2720     positions_[i].value = 0;
2721     positions_[i].determines_perfectly = false;
2722   }
2723   characters_ -= by;
2724   // We could change mask_ and value_ here but we would never advance unless
2725   // they had already been used in a check and they won't be used again because
2726   // it would gain us nothing.  So there's no point.
2727 }
2728 
2729 
Merge(QuickCheckDetails * other,int from_index)2730 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2731   DCHECK(characters_ == other->characters_);
2732   if (other->cannot_match_) {
2733     return;
2734   }
2735   if (cannot_match_) {
2736     *this = *other;
2737     return;
2738   }
2739   for (int i = from_index; i < characters_; i++) {
2740     QuickCheckDetails::Position* pos = positions(i);
2741     QuickCheckDetails::Position* other_pos = other->positions(i);
2742     if (pos->mask != other_pos->mask ||
2743         pos->value != other_pos->value ||
2744         !other_pos->determines_perfectly) {
2745       // Our mask-compare operation will be approximate unless we have the
2746       // exact same operation on both sides of the alternation.
2747       pos->determines_perfectly = false;
2748     }
2749     pos->mask &= other_pos->mask;
2750     pos->value &= pos->mask;
2751     other_pos->value &= pos->mask;
2752     uc16 differing_bits = (pos->value ^ other_pos->value);
2753     pos->mask &= ~differing_bits;
2754     pos->value &= pos->mask;
2755   }
2756 }
2757 
2758 
2759 class VisitMarker {
2760  public:
VisitMarker(NodeInfo * info)2761   explicit VisitMarker(NodeInfo* info) : info_(info) {
2762     DCHECK(!info->visited);
2763     info->visited = true;
2764   }
~VisitMarker()2765   ~VisitMarker() {
2766     info_->visited = false;
2767   }
2768  private:
2769   NodeInfo* info_;
2770 };
2771 
2772 
FilterOneByte(int depth,bool ignore_case)2773 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2774   if (info()->replacement_calculated) return replacement();
2775   if (depth < 0) return this;
2776   DCHECK(!info()->visited);
2777   VisitMarker marker(info());
2778   return FilterSuccessor(depth - 1, ignore_case);
2779 }
2780 
2781 
FilterSuccessor(int depth,bool ignore_case)2782 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2783   RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2784   if (next == NULL) return set_replacement(NULL);
2785   on_success_ = next;
2786   return set_replacement(this);
2787 }
2788 
2789 
2790 // We need to check for the following characters: 0x39c 0x3bc 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)2791 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2792   // TODO(dcarney): this could be a lot more efficient.
2793   return range.Contains(0x39c) ||
2794       range.Contains(0x3bc) || range.Contains(0x178);
2795 }
2796 
2797 
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)2798 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2799   for (int i = 0; i < ranges->length(); i++) {
2800     // TODO(dcarney): this could be a lot more efficient.
2801     if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2802   }
2803   return false;
2804 }
2805 
2806 
FilterOneByte(int depth,bool ignore_case)2807 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2808   if (info()->replacement_calculated) return replacement();
2809   if (depth < 0) return this;
2810   DCHECK(!info()->visited);
2811   VisitMarker marker(info());
2812   int element_count = elements()->length();
2813   for (int i = 0; i < element_count; i++) {
2814     TextElement elm = elements()->at(i);
2815     if (elm.text_type() == TextElement::ATOM) {
2816       Vector<const uc16> quarks = elm.atom()->data();
2817       for (int j = 0; j < quarks.length(); j++) {
2818         uint16_t c = quarks[j];
2819         if (c <= String::kMaxOneByteCharCode) continue;
2820         if (!ignore_case) return set_replacement(NULL);
2821         // Here, we need to check for characters whose upper and lower cases
2822         // are outside the Latin-1 range.
2823         uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2824         // Character is outside Latin-1 completely
2825         if (converted == 0) return set_replacement(NULL);
2826         // Convert quark to Latin-1 in place.
2827         uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2828         copy[j] = converted;
2829       }
2830     } else {
2831       DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2832       RegExpCharacterClass* cc = elm.char_class();
2833       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2834       CharacterRange::Canonicalize(ranges);
2835       // Now they are in order so we only need to look at the first.
2836       int range_count = ranges->length();
2837       if (cc->is_negated()) {
2838         if (range_count != 0 &&
2839             ranges->at(0).from() == 0 &&
2840             ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2841           // This will be handled in a later filter.
2842           if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2843           return set_replacement(NULL);
2844         }
2845       } else {
2846         if (range_count == 0 ||
2847             ranges->at(0).from() > String::kMaxOneByteCharCode) {
2848           // This will be handled in a later filter.
2849           if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2850           return set_replacement(NULL);
2851         }
2852       }
2853     }
2854   }
2855   return FilterSuccessor(depth - 1, ignore_case);
2856 }
2857 
2858 
FilterOneByte(int depth,bool ignore_case)2859 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2860   if (info()->replacement_calculated) return replacement();
2861   if (depth < 0) return this;
2862   if (info()->visited) return this;
2863   {
2864     VisitMarker marker(info());
2865 
2866     RegExpNode* continue_replacement =
2867         continue_node_->FilterOneByte(depth - 1, ignore_case);
2868     // If we can't continue after the loop then there is no sense in doing the
2869     // loop.
2870     if (continue_replacement == NULL) return set_replacement(NULL);
2871   }
2872 
2873   return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2874 }
2875 
2876 
FilterOneByte(int depth,bool ignore_case)2877 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2878   if (info()->replacement_calculated) return replacement();
2879   if (depth < 0) return this;
2880   if (info()->visited) return this;
2881   VisitMarker marker(info());
2882   int choice_count = alternatives_->length();
2883 
2884   for (int i = 0; i < choice_count; i++) {
2885     GuardedAlternative alternative = alternatives_->at(i);
2886     if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2887       set_replacement(this);
2888       return this;
2889     }
2890   }
2891 
2892   int surviving = 0;
2893   RegExpNode* survivor = NULL;
2894   for (int i = 0; i < choice_count; i++) {
2895     GuardedAlternative alternative = alternatives_->at(i);
2896     RegExpNode* replacement =
2897         alternative.node()->FilterOneByte(depth - 1, ignore_case);
2898     DCHECK(replacement != this);  // No missing EMPTY_MATCH_CHECK.
2899     if (replacement != NULL) {
2900       alternatives_->at(i).set_node(replacement);
2901       surviving++;
2902       survivor = replacement;
2903     }
2904   }
2905   if (surviving < 2) return set_replacement(survivor);
2906 
2907   set_replacement(this);
2908   if (surviving == choice_count) {
2909     return this;
2910   }
2911   // Only some of the nodes survived the filtering.  We need to rebuild the
2912   // alternatives list.
2913   ZoneList<GuardedAlternative>* new_alternatives =
2914       new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2915   for (int i = 0; i < choice_count; i++) {
2916     RegExpNode* replacement =
2917         alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2918     if (replacement != NULL) {
2919       alternatives_->at(i).set_node(replacement);
2920       new_alternatives->Add(alternatives_->at(i), zone());
2921     }
2922   }
2923   alternatives_ = new_alternatives;
2924   return this;
2925 }
2926 
2927 
FilterOneByte(int depth,bool ignore_case)2928 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
2929                                                         bool ignore_case) {
2930   if (info()->replacement_calculated) return replacement();
2931   if (depth < 0) return this;
2932   if (info()->visited) return this;
2933   VisitMarker marker(info());
2934   // Alternative 0 is the negative lookahead, alternative 1 is what comes
2935   // afterwards.
2936   RegExpNode* node = alternatives_->at(1).node();
2937   RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2938   if (replacement == NULL) return set_replacement(NULL);
2939   alternatives_->at(1).set_node(replacement);
2940 
2941   RegExpNode* neg_node = alternatives_->at(0).node();
2942   RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2943   // If the negative lookahead is always going to fail then
2944   // we don't need to check it.
2945   if (neg_replacement == NULL) return set_replacement(replacement);
2946   alternatives_->at(0).set_node(neg_replacement);
2947   return set_replacement(this);
2948 }
2949 
2950 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2951 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2952                                           RegExpCompiler* compiler,
2953                                           int characters_filled_in,
2954                                           bool not_at_start) {
2955   if (body_can_be_zero_length_ || info()->visited) return;
2956   VisitMarker marker(info());
2957   return ChoiceNode::GetQuickCheckDetails(details,
2958                                           compiler,
2959                                           characters_filled_in,
2960                                           not_at_start);
2961 }
2962 
2963 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2964 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2965                                   BoyerMooreLookahead* bm, bool not_at_start) {
2966   if (body_can_be_zero_length_ || budget <= 0) {
2967     bm->SetRest(offset);
2968     SaveBMInfo(bm, not_at_start, offset);
2969     return;
2970   }
2971   ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2972   SaveBMInfo(bm, not_at_start, offset);
2973 }
2974 
2975 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2976 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2977                                       RegExpCompiler* compiler,
2978                                       int characters_filled_in,
2979                                       bool not_at_start) {
2980   not_at_start = (not_at_start || not_at_start_);
2981   int choice_count = alternatives_->length();
2982   DCHECK(choice_count > 0);
2983   alternatives_->at(0).node()->GetQuickCheckDetails(details,
2984                                                     compiler,
2985                                                     characters_filled_in,
2986                                                     not_at_start);
2987   for (int i = 1; i < choice_count; i++) {
2988     QuickCheckDetails new_details(details->characters());
2989     RegExpNode* node = alternatives_->at(i).node();
2990     node->GetQuickCheckDetails(&new_details, compiler,
2991                                characters_filled_in,
2992                                not_at_start);
2993     // Here we merge the quick match details of the two branches.
2994     details->Merge(&new_details, characters_filled_in);
2995   }
2996 }
2997 
2998 
2999 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)3000 static void EmitWordCheck(RegExpMacroAssembler* assembler,
3001                           Label* word,
3002                           Label* non_word,
3003                           bool fall_through_on_word) {
3004   if (assembler->CheckSpecialCharacterClass(
3005           fall_through_on_word ? 'w' : 'W',
3006           fall_through_on_word ? non_word : word)) {
3007     // Optimized implementation available.
3008     return;
3009   }
3010   assembler->CheckCharacterGT('z', non_word);
3011   assembler->CheckCharacterLT('0', non_word);
3012   assembler->CheckCharacterGT('a' - 1, word);
3013   assembler->CheckCharacterLT('9' + 1, word);
3014   assembler->CheckCharacterLT('A', non_word);
3015   assembler->CheckCharacterLT('Z' + 1, word);
3016   if (fall_through_on_word) {
3017     assembler->CheckNotCharacter('_', non_word);
3018   } else {
3019     assembler->CheckCharacter('_', word);
3020   }
3021 }
3022 
3023 
3024 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
3025 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)3026 static void EmitHat(RegExpCompiler* compiler,
3027                     RegExpNode* on_success,
3028                     Trace* trace) {
3029   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3030   // We will be loading the previous character into the current character
3031   // register.
3032   Trace new_trace(*trace);
3033   new_trace.InvalidateCurrentCharacter();
3034 
3035   Label ok;
3036   if (new_trace.cp_offset() == 0) {
3037     // The start of input counts as a newline in this context, so skip to
3038     // ok if we are at the start.
3039     assembler->CheckAtStart(&ok);
3040   }
3041   // We already checked that we are not at the start of input so it must be
3042   // OK to load the previous character.
3043   assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3044                                   new_trace.backtrack(),
3045                                   false);
3046   if (!assembler->CheckSpecialCharacterClass('n',
3047                                              new_trace.backtrack())) {
3048     // Newline means \n, \r, 0x2028 or 0x2029.
3049     if (!compiler->one_byte()) {
3050       assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3051     }
3052     assembler->CheckCharacter('\n', &ok);
3053     assembler->CheckNotCharacter('\r', new_trace.backtrack());
3054   }
3055   assembler->Bind(&ok);
3056   on_success->Emit(compiler, &new_trace);
3057 }
3058 
3059 
3060 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)3061 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3062   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3063   Isolate* isolate = assembler->isolate();
3064   Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3065   bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3066   BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3067   if (lookahead == NULL) {
3068     int eats_at_least =
3069         Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3070                                                     kRecursionBudget,
3071                                                     not_at_start));
3072     if (eats_at_least >= 1) {
3073       BoyerMooreLookahead* bm =
3074           new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3075       FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3076       if (bm->at(0)->is_non_word())
3077         next_is_word_character = Trace::FALSE_VALUE;
3078       if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3079     }
3080   } else {
3081     if (lookahead->at(0)->is_non_word())
3082       next_is_word_character = Trace::FALSE_VALUE;
3083     if (lookahead->at(0)->is_word())
3084       next_is_word_character = Trace::TRUE_VALUE;
3085   }
3086   bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3087   if (next_is_word_character == Trace::UNKNOWN) {
3088     Label before_non_word;
3089     Label before_word;
3090     if (trace->characters_preloaded() != 1) {
3091       assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3092     }
3093     // Fall through on non-word.
3094     EmitWordCheck(assembler, &before_word, &before_non_word, false);
3095     // Next character is not a word character.
3096     assembler->Bind(&before_non_word);
3097     Label ok;
3098     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3099     assembler->GoTo(&ok);
3100 
3101     assembler->Bind(&before_word);
3102     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3103     assembler->Bind(&ok);
3104   } else if (next_is_word_character == Trace::TRUE_VALUE) {
3105     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3106   } else {
3107     DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3108     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3109   }
3110 }
3111 
3112 
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)3113 void AssertionNode::BacktrackIfPrevious(
3114     RegExpCompiler* compiler,
3115     Trace* trace,
3116     AssertionNode::IfPrevious backtrack_if_previous) {
3117   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3118   Trace new_trace(*trace);
3119   new_trace.InvalidateCurrentCharacter();
3120 
3121   Label fall_through, dummy;
3122 
3123   Label* non_word = backtrack_if_previous == kIsNonWord ?
3124                     new_trace.backtrack() :
3125                     &fall_through;
3126   Label* word = backtrack_if_previous == kIsNonWord ?
3127                 &fall_through :
3128                 new_trace.backtrack();
3129 
3130   if (new_trace.cp_offset() == 0) {
3131     // The start of input counts as a non-word character, so the question is
3132     // decided if we are at the start.
3133     assembler->CheckAtStart(non_word);
3134   }
3135   // We already checked that we are not at the start of input so it must be
3136   // OK to load the previous character.
3137   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3138   EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3139 
3140   assembler->Bind(&fall_through);
3141   on_success()->Emit(compiler, &new_trace);
3142 }
3143 
3144 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)3145 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3146                                          RegExpCompiler* compiler,
3147                                          int filled_in,
3148                                          bool not_at_start) {
3149   if (assertion_type_ == AT_START && not_at_start) {
3150     details->set_cannot_match();
3151     return;
3152   }
3153   return on_success()->GetQuickCheckDetails(details,
3154                                             compiler,
3155                                             filled_in,
3156                                             not_at_start);
3157 }
3158 
3159 
Emit(RegExpCompiler * compiler,Trace * trace)3160 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3161   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3162   switch (assertion_type_) {
3163     case AT_END: {
3164       Label ok;
3165       assembler->CheckPosition(trace->cp_offset(), &ok);
3166       assembler->GoTo(trace->backtrack());
3167       assembler->Bind(&ok);
3168       break;
3169     }
3170     case AT_START: {
3171       if (trace->at_start() == Trace::FALSE_VALUE) {
3172         assembler->GoTo(trace->backtrack());
3173         return;
3174       }
3175       if (trace->at_start() == Trace::UNKNOWN) {
3176         assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3177         Trace at_start_trace = *trace;
3178         at_start_trace.set_at_start(Trace::TRUE_VALUE);
3179         on_success()->Emit(compiler, &at_start_trace);
3180         return;
3181       }
3182     }
3183     break;
3184     case AFTER_NEWLINE:
3185       EmitHat(compiler, on_success(), trace);
3186       return;
3187     case AT_BOUNDARY:
3188     case AT_NON_BOUNDARY: {
3189       EmitBoundaryCheck(compiler, trace);
3190       return;
3191     }
3192   }
3193   on_success()->Emit(compiler, trace);
3194 }
3195 
3196 
DeterminedAlready(QuickCheckDetails * quick_check,int offset)3197 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3198   if (quick_check == NULL) return false;
3199   if (offset >= quick_check->characters()) return false;
3200   return quick_check->positions(offset)->determines_perfectly;
3201 }
3202 
3203 
UpdateBoundsCheck(int index,int * checked_up_to)3204 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3205   if (index > *checked_up_to) {
3206     *checked_up_to = index;
3207   }
3208 }
3209 
3210 
3211 // We call this repeatedly to generate code for each pass over the text node.
3212 // The passes are in increasing order of difficulty because we hope one
3213 // of the first passes will fail in which case we are saved the work of the
3214 // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
3215 // we will check the '%' in the first pass, the case independent 'a' in the
3216 // second pass and the character class in the last pass.
3217 //
3218 // The passes are done from right to left, so for example to test for /bar/
3219 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3220 // and then a 'b' with offset 0.  This means we can avoid the end-of-input
3221 // bounds check most of the time.  In the example we only need to check for
3222 // end-of-input when loading the putative 'r'.
3223 //
3224 // A slight complication involves the fact that the first character may already
3225 // be fetched into a register by the previous node.  In this case we want to
3226 // do the test for that character first.  We do this in separate passes.  The
3227 // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
3228 // pass has been performed then subsequent passes will have true in
3229 // first_element_checked to indicate that that character does not need to be
3230 // checked again.
3231 //
3232 // In addition to all this we are passed a Trace, which can
3233 // contain an AlternativeGeneration object.  In this AlternativeGeneration
3234 // object we can see details of any quick check that was already passed in
3235 // order to get to the code we are now generating.  The quick check can involve
3236 // loading characters, which means we do not need to recheck the bounds
3237 // up to the limit the quick check already checked.  In addition the quick
3238 // check can have involved a mask and compare operation which may simplify
3239 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)3240 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3241                             TextEmitPassType pass,
3242                             bool preloaded,
3243                             Trace* trace,
3244                             bool first_element_checked,
3245                             int* checked_up_to) {
3246   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3247   Isolate* isolate = assembler->isolate();
3248   bool one_byte = compiler->one_byte();
3249   Label* backtrack = trace->backtrack();
3250   QuickCheckDetails* quick_check = trace->quick_check_performed();
3251   int element_count = elements()->length();
3252   int backward_offset = read_backward() ? -Length() : 0;
3253   for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3254     TextElement elm = elements()->at(i);
3255     int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3256     if (elm.text_type() == TextElement::ATOM) {
3257       Vector<const uc16> quarks = elm.atom()->data();
3258       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3259         if (first_element_checked && i == 0 && j == 0) continue;
3260         if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3261         EmitCharacterFunction* emit_function = NULL;
3262         switch (pass) {
3263           case NON_LATIN1_MATCH:
3264             DCHECK(one_byte);
3265             if (quarks[j] > String::kMaxOneByteCharCode) {
3266               assembler->GoTo(backtrack);
3267               return;
3268             }
3269             break;
3270           case NON_LETTER_CHARACTER_MATCH:
3271             emit_function = &EmitAtomNonLetter;
3272             break;
3273           case SIMPLE_CHARACTER_MATCH:
3274             emit_function = &EmitSimpleCharacter;
3275             break;
3276           case CASE_CHARACTER_MATCH:
3277             emit_function = &EmitAtomLetter;
3278             break;
3279           default:
3280             break;
3281         }
3282         if (emit_function != NULL) {
3283           bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3284           bool bound_checked =
3285               emit_function(isolate, compiler, quarks[j], backtrack,
3286                             cp_offset + j, bounds_check, preloaded);
3287           if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3288         }
3289       }
3290     } else {
3291       DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3292       if (pass == CHARACTER_CLASS_MATCH) {
3293         if (first_element_checked && i == 0) continue;
3294         if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3295         RegExpCharacterClass* cc = elm.char_class();
3296         bool bounds_check = *checked_up_to < cp_offset || read_backward();
3297         EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3298                       bounds_check, preloaded, zone());
3299         UpdateBoundsCheck(cp_offset, checked_up_to);
3300       }
3301     }
3302   }
3303 }
3304 
3305 
Length()3306 int TextNode::Length() {
3307   TextElement elm = elements()->last();
3308   DCHECK(elm.cp_offset() >= 0);
3309   return elm.cp_offset() + elm.length();
3310 }
3311 
3312 
SkipPass(int int_pass,bool ignore_case)3313 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3314   TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3315   if (ignore_case) {
3316     return pass == SIMPLE_CHARACTER_MATCH;
3317   } else {
3318     return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3319   }
3320 }
3321 
3322 
CreateForCharacterRanges(Zone * zone,ZoneList<CharacterRange> * ranges,bool read_backward,RegExpNode * on_success)3323 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3324                                              ZoneList<CharacterRange>* ranges,
3325                                              bool read_backward,
3326                                              RegExpNode* on_success) {
3327   DCHECK_NOT_NULL(ranges);
3328   ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3329   elms->Add(
3330       TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
3331       zone);
3332   return new (zone) TextNode(elms, read_backward, on_success);
3333 }
3334 
3335 
CreateForSurrogatePair(Zone * zone,CharacterRange lead,CharacterRange trail,bool read_backward,RegExpNode * on_success)3336 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3337                                            CharacterRange trail,
3338                                            bool read_backward,
3339                                            RegExpNode* on_success) {
3340   ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3341   ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3342   ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3343   elms->Add(TextElement::CharClass(
3344                 new (zone) RegExpCharacterClass(lead_ranges, false)),
3345             zone);
3346   elms->Add(TextElement::CharClass(
3347                 new (zone) RegExpCharacterClass(trail_ranges, false)),
3348             zone);
3349   return new (zone) TextNode(elms, read_backward, on_success);
3350 }
3351 
3352 
3353 // This generates the code to match a text node.  A text node can contain
3354 // straight character sequences (possibly to be matched in a case-independent
3355 // way) and character classes.  For efficiency we do not do this in a single
3356 // pass from left to right.  Instead we pass over the text node several times,
3357 // emitting code for some character positions every time.  See the comment on
3358 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)3359 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3360   LimitResult limit_result = LimitVersions(compiler, trace);
3361   if (limit_result == DONE) return;
3362   DCHECK(limit_result == CONTINUE);
3363 
3364   if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3365     compiler->SetRegExpTooBig();
3366     return;
3367   }
3368 
3369   if (compiler->one_byte()) {
3370     int dummy = 0;
3371     TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3372   }
3373 
3374   bool first_elt_done = false;
3375   int bound_checked_to = trace->cp_offset() - 1;
3376   bound_checked_to += trace->bound_checked_up_to();
3377 
3378   // If a character is preloaded into the current character register then
3379   // check that now.
3380   if (trace->characters_preloaded() == 1) {
3381     for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3382       if (!SkipPass(pass, compiler->ignore_case())) {
3383         TextEmitPass(compiler,
3384                      static_cast<TextEmitPassType>(pass),
3385                      true,
3386                      trace,
3387                      false,
3388                      &bound_checked_to);
3389       }
3390     }
3391     first_elt_done = true;
3392   }
3393 
3394   for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3395     if (!SkipPass(pass, compiler->ignore_case())) {
3396       TextEmitPass(compiler,
3397                    static_cast<TextEmitPassType>(pass),
3398                    false,
3399                    trace,
3400                    first_elt_done,
3401                    &bound_checked_to);
3402     }
3403   }
3404 
3405   Trace successor_trace(*trace);
3406   // If we advance backward, we may end up at the start.
3407   successor_trace.AdvanceCurrentPositionInTrace(
3408       read_backward() ? -Length() : Length(), compiler);
3409   successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3410                                                : Trace::FALSE_VALUE);
3411   RecursionCheck rc(compiler);
3412   on_success()->Emit(compiler, &successor_trace);
3413 }
3414 
3415 
InvalidateCurrentCharacter()3416 void Trace::InvalidateCurrentCharacter() {
3417   characters_preloaded_ = 0;
3418 }
3419 
3420 
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)3421 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3422   // We don't have an instruction for shifting the current character register
3423   // down or for using a shifted value for anything so lets just forget that
3424   // we preloaded any characters into it.
3425   characters_preloaded_ = 0;
3426   // Adjust the offsets of the quick check performed information.  This
3427   // information is used to find out what we already determined about the
3428   // characters by means of mask and compare.
3429   quick_check_performed_.Advance(by, compiler->one_byte());
3430   cp_offset_ += by;
3431   if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3432     compiler->SetRegExpTooBig();
3433     cp_offset_ = 0;
3434   }
3435   bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3436 }
3437 
3438 
MakeCaseIndependent(Isolate * isolate,bool is_one_byte)3439 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3440   int element_count = elements()->length();
3441   for (int i = 0; i < element_count; i++) {
3442     TextElement elm = elements()->at(i);
3443     if (elm.text_type() == TextElement::CHAR_CLASS) {
3444       RegExpCharacterClass* cc = elm.char_class();
3445       // None of the standard character classes is different in the case
3446       // independent case and it slows us down if we don't know that.
3447       if (cc->is_standard(zone())) continue;
3448       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3449       CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
3450     }
3451   }
3452 }
3453 
3454 
GreedyLoopTextLength()3455 int TextNode::GreedyLoopTextLength() { return Length(); }
3456 
3457 
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)3458 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3459     RegExpCompiler* compiler) {
3460   if (read_backward()) return NULL;
3461   if (elements()->length() != 1) return NULL;
3462   TextElement elm = elements()->at(0);
3463   if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3464   RegExpCharacterClass* node = elm.char_class();
3465   ZoneList<CharacterRange>* ranges = node->ranges(zone());
3466   CharacterRange::Canonicalize(ranges);
3467   if (node->is_negated()) {
3468     return ranges->length() == 0 ? on_success() : NULL;
3469   }
3470   if (ranges->length() != 1) return NULL;
3471   uint32_t max_char;
3472   if (compiler->one_byte()) {
3473     max_char = String::kMaxOneByteCharCode;
3474   } else {
3475     max_char = String::kMaxUtf16CodeUnit;
3476   }
3477   return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3478 }
3479 
3480 
3481 // Finds the fixed match length of a sequence of nodes that goes from
3482 // this alternative and back to this choice node.  If there are variable
3483 // length nodes or other complications in the way then return a sentinel
3484 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)3485 int ChoiceNode::GreedyLoopTextLengthForAlternative(
3486     GuardedAlternative* alternative) {
3487   int length = 0;
3488   RegExpNode* node = alternative->node();
3489   // Later we will generate code for all these text nodes using recursion
3490   // so we have to limit the max number.
3491   int recursion_depth = 0;
3492   while (node != this) {
3493     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3494       return kNodeIsTooComplexForGreedyLoops;
3495     }
3496     int node_length = node->GreedyLoopTextLength();
3497     if (node_length == kNodeIsTooComplexForGreedyLoops) {
3498       return kNodeIsTooComplexForGreedyLoops;
3499     }
3500     length += node_length;
3501     SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3502     node = seq_node->on_success();
3503   }
3504   return read_backward() ? -length : length;
3505 }
3506 
3507 
AddLoopAlternative(GuardedAlternative alt)3508 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3509   DCHECK_NULL(loop_node_);
3510   AddAlternative(alt);
3511   loop_node_ = alt.node();
3512 }
3513 
3514 
AddContinueAlternative(GuardedAlternative alt)3515 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3516   DCHECK_NULL(continue_node_);
3517   AddAlternative(alt);
3518   continue_node_ = alt.node();
3519 }
3520 
3521 
Emit(RegExpCompiler * compiler,Trace * trace)3522 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3523   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3524   if (trace->stop_node() == this) {
3525     // Back edge of greedy optimized loop node graph.
3526     int text_length =
3527         GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3528     DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3529     // Update the counter-based backtracking info on the stack.  This is an
3530     // optimization for greedy loops (see below).
3531     DCHECK(trace->cp_offset() == text_length);
3532     macro_assembler->AdvanceCurrentPosition(text_length);
3533     macro_assembler->GoTo(trace->loop_label());
3534     return;
3535   }
3536   DCHECK_NULL(trace->stop_node());
3537   if (!trace->is_trivial()) {
3538     trace->Flush(compiler, this);
3539     return;
3540   }
3541   ChoiceNode::Emit(compiler, trace);
3542 }
3543 
3544 
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)3545 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3546                                            int eats_at_least) {
3547   int preload_characters = Min(4, eats_at_least);
3548   if (compiler->macro_assembler()->CanReadUnaligned()) {
3549     bool one_byte = compiler->one_byte();
3550     if (one_byte) {
3551       if (preload_characters > 4) preload_characters = 4;
3552       // We can't preload 3 characters because there is no machine instruction
3553       // to do that.  We can't just load 4 because we could be reading
3554       // beyond the end of the string, which could cause a memory fault.
3555       if (preload_characters == 3) preload_characters = 2;
3556     } else {
3557       if (preload_characters > 2) preload_characters = 2;
3558     }
3559   } else {
3560     if (preload_characters > 1) preload_characters = 1;
3561   }
3562   return preload_characters;
3563 }
3564 
3565 
3566 // This class is used when generating the alternatives in a choice node.  It
3567 // records the way the alternative is being code generated.
3568 class AlternativeGeneration: public Malloced {
3569  public:
AlternativeGeneration()3570   AlternativeGeneration()
3571       : possible_success(),
3572         expects_preload(false),
3573         after(),
3574         quick_check_details() { }
3575   Label possible_success;
3576   bool expects_preload;
3577   Label after;
3578   QuickCheckDetails quick_check_details;
3579 };
3580 
3581 
3582 // Creates a list of AlternativeGenerations.  If the list has a reasonable
3583 // size then it is on the stack, otherwise the excess is on the heap.
3584 class AlternativeGenerationList {
3585  public:
AlternativeGenerationList(int count,Zone * zone)3586   AlternativeGenerationList(int count, Zone* zone)
3587       : alt_gens_(count, zone) {
3588     for (int i = 0; i < count && i < kAFew; i++) {
3589       alt_gens_.Add(a_few_alt_gens_ + i, zone);
3590     }
3591     for (int i = kAFew; i < count; i++) {
3592       alt_gens_.Add(new AlternativeGeneration(), zone);
3593     }
3594   }
~AlternativeGenerationList()3595   ~AlternativeGenerationList() {
3596     for (int i = kAFew; i < alt_gens_.length(); i++) {
3597       delete alt_gens_[i];
3598       alt_gens_[i] = NULL;
3599     }
3600   }
3601 
at(int i)3602   AlternativeGeneration* at(int i) {
3603     return alt_gens_[i];
3604   }
3605 
3606  private:
3607   static const int kAFew = 10;
3608   ZoneList<AlternativeGeneration*> alt_gens_;
3609   AlternativeGeneration a_few_alt_gens_[kAFew];
3610 };
3611 
3612 
3613 static const uc32 kRangeEndMarker = 0x110000;
3614 
3615 // The '2' variant is has inclusive from and exclusive to.
3616 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3617 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3618 static const int kSpaceRanges[] = {
3619     '\t',   '\r' + 1, ' ',    ' ' + 1, 0x00A0, 0x00A1, 0x1680,         0x1681,
3620     0x180E, 0x180F,   0x2000, 0x200B,  0x2028, 0x202A, 0x202F,         0x2030,
3621     0x205F, 0x2060,   0x3000, 0x3001,  0xFEFF, 0xFF00, kRangeEndMarker};
3622 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3623 
3624 static const int kWordRanges[] = {
3625     '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3626 static const int kWordRangeCount = arraysize(kWordRanges);
3627 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3628 static const int kDigitRangeCount = arraysize(kDigitRanges);
3629 static const int kSurrogateRanges[] = {
3630     kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3631 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3632 static const int kLineTerminatorRanges[] = {
3633     0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3634 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3635 
Set(int character)3636 void BoyerMoorePositionInfo::Set(int character) {
3637   SetInterval(Interval(character, character));
3638 }
3639 
3640 
SetInterval(const Interval & interval)3641 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3642   s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3643   w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3644   d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3645   surrogate_ =
3646       AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3647   if (interval.to() - interval.from() >= kMapSize - 1) {
3648     if (map_count_ != kMapSize) {
3649       map_count_ = kMapSize;
3650       for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3651     }
3652     return;
3653   }
3654   for (int i = interval.from(); i <= interval.to(); i++) {
3655     int mod_character = (i & kMask);
3656     if (!map_->at(mod_character)) {
3657       map_count_++;
3658       map_->at(mod_character) = true;
3659     }
3660     if (map_count_ == kMapSize) return;
3661   }
3662 }
3663 
3664 
SetAll()3665 void BoyerMoorePositionInfo::SetAll() {
3666   s_ = w_ = d_ = kLatticeUnknown;
3667   if (map_count_ != kMapSize) {
3668     map_count_ = kMapSize;
3669     for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3670   }
3671 }
3672 
3673 
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)3674 BoyerMooreLookahead::BoyerMooreLookahead(
3675     int length, RegExpCompiler* compiler, Zone* zone)
3676     : length_(length),
3677       compiler_(compiler) {
3678   if (compiler->one_byte()) {
3679     max_char_ = String::kMaxOneByteCharCode;
3680   } else {
3681     max_char_ = String::kMaxUtf16CodeUnit;
3682   }
3683   bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3684   for (int i = 0; i < length; i++) {
3685     bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3686   }
3687 }
3688 
3689 
3690 // Find the longest range of lookahead that has the fewest number of different
3691 // characters that can occur at a given position.  Since we are optimizing two
3692 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)3693 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3694   int biggest_points = 0;
3695   // If more than 32 characters out of 128 can occur it is unlikely that we can
3696   // be lucky enough to step forwards much of the time.
3697   const int kMaxMax = 32;
3698   for (int max_number_of_chars = 4;
3699        max_number_of_chars < kMaxMax;
3700        max_number_of_chars *= 2) {
3701     biggest_points =
3702         FindBestInterval(max_number_of_chars, biggest_points, from, to);
3703   }
3704   if (biggest_points == 0) return false;
3705   return true;
3706 }
3707 
3708 
3709 // Find the highest-points range between 0 and length_ where the character
3710 // information is not too vague.  'Too vague' means that there are more than
3711 // max_number_of_chars that can occur at this position.  Calculates the number
3712 // of points as the product of width-of-the-range and
3713 // probability-of-finding-one-of-the-characters, where the probability is
3714 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)3715 int BoyerMooreLookahead::FindBestInterval(
3716     int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3717   int biggest_points = old_biggest_points;
3718   static const int kSize = RegExpMacroAssembler::kTableSize;
3719   for (int i = 0; i < length_; ) {
3720     while (i < length_ && Count(i) > max_number_of_chars) i++;
3721     if (i == length_) break;
3722     int remembered_from = i;
3723     bool union_map[kSize];
3724     for (int j = 0; j < kSize; j++) union_map[j] = false;
3725     while (i < length_ && Count(i) <= max_number_of_chars) {
3726       BoyerMoorePositionInfo* map = bitmaps_->at(i);
3727       for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3728       i++;
3729     }
3730     int frequency = 0;
3731     for (int j = 0; j < kSize; j++) {
3732       if (union_map[j]) {
3733         // Add 1 to the frequency to give a small per-character boost for
3734         // the cases where our sampling is not good enough and many
3735         // characters have a frequency of zero.  This means the frequency
3736         // can theoretically be up to 2*kSize though we treat it mostly as
3737         // a fraction of kSize.
3738         frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3739       }
3740     }
3741     // We use the probability of skipping times the distance we are skipping to
3742     // judge the effectiveness of this.  Actually we have a cut-off:  By
3743     // dividing by 2 we switch off the skipping if the probability of skipping
3744     // is less than 50%.  This is because the multibyte mask-and-compare
3745     // skipping in quickcheck is more likely to do well on this case.
3746     bool in_quickcheck_range =
3747         ((i - remembered_from < 4) ||
3748          (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3749     // Called 'probability' but it is only a rough estimate and can actually
3750     // be outside the 0-kSize range.
3751     int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3752     int points = (i - remembered_from) * probability;
3753     if (points > biggest_points) {
3754       *from = remembered_from;
3755       *to = i - 1;
3756       biggest_points = points;
3757     }
3758   }
3759   return biggest_points;
3760 }
3761 
3762 
3763 // Take all the characters that will not prevent a successful match if they
3764 // occur in the subject string in the range between min_lookahead and
3765 // max_lookahead (inclusive) measured from the current position.  If the
3766 // character at max_lookahead offset is not one of these characters, then we
3767 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)3768 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3769                                       int max_lookahead,
3770                                       Handle<ByteArray> boolean_skip_table) {
3771   const int kSize = RegExpMacroAssembler::kTableSize;
3772 
3773   const int kSkipArrayEntry = 0;
3774   const int kDontSkipArrayEntry = 1;
3775 
3776   for (int i = 0; i < kSize; i++) {
3777     boolean_skip_table->set(i, kSkipArrayEntry);
3778   }
3779   int skip = max_lookahead + 1 - min_lookahead;
3780 
3781   for (int i = max_lookahead; i >= min_lookahead; i--) {
3782     BoyerMoorePositionInfo* map = bitmaps_->at(i);
3783     for (int j = 0; j < kSize; j++) {
3784       if (map->at(j)) {
3785         boolean_skip_table->set(j, kDontSkipArrayEntry);
3786       }
3787     }
3788   }
3789 
3790   return skip;
3791 }
3792 
3793 
3794 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)3795 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3796   const int kSize = RegExpMacroAssembler::kTableSize;
3797 
3798   int min_lookahead = 0;
3799   int max_lookahead = 0;
3800 
3801   if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3802 
3803   bool found_single_character = false;
3804   int single_character = 0;
3805   for (int i = max_lookahead; i >= min_lookahead; i--) {
3806     BoyerMoorePositionInfo* map = bitmaps_->at(i);
3807     if (map->map_count() > 1 ||
3808         (found_single_character && map->map_count() != 0)) {
3809       found_single_character = false;
3810       break;
3811     }
3812     for (int j = 0; j < kSize; j++) {
3813       if (map->at(j)) {
3814         found_single_character = true;
3815         single_character = j;
3816         break;
3817       }
3818     }
3819   }
3820 
3821   int lookahead_width = max_lookahead + 1 - min_lookahead;
3822 
3823   if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3824     // The mask-compare can probably handle this better.
3825     return;
3826   }
3827 
3828   if (found_single_character) {
3829     Label cont, again;
3830     masm->Bind(&again);
3831     masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3832     if (max_char_ > kSize) {
3833       masm->CheckCharacterAfterAnd(single_character,
3834                                    RegExpMacroAssembler::kTableMask,
3835                                    &cont);
3836     } else {
3837       masm->CheckCharacter(single_character, &cont);
3838     }
3839     masm->AdvanceCurrentPosition(lookahead_width);
3840     masm->GoTo(&again);
3841     masm->Bind(&cont);
3842     return;
3843   }
3844 
3845   Factory* factory = masm->isolate()->factory();
3846   Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3847   int skip_distance = GetSkipTable(
3848       min_lookahead, max_lookahead, boolean_skip_table);
3849   DCHECK(skip_distance != 0);
3850 
3851   Label cont, again;
3852   masm->Bind(&again);
3853   masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3854   masm->CheckBitInTable(boolean_skip_table, &cont);
3855   masm->AdvanceCurrentPosition(skip_distance);
3856   masm->GoTo(&again);
3857   masm->Bind(&cont);
3858 }
3859 
3860 
3861 /* Code generation for choice nodes.
3862  *
3863  * We generate quick checks that do a mask and compare to eliminate a
3864  * choice.  If the quick check succeeds then it jumps to the continuation to
3865  * do slow checks and check subsequent nodes.  If it fails (the common case)
3866  * it falls through to the next choice.
3867  *
3868  * Here is the desired flow graph.  Nodes directly below each other imply
3869  * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
3870  * 3 doesn't have a quick check so we have to call the slow check.
3871  * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
3872  * regexp continuation is generated directly after the Sn node, up to the
3873  * next GoTo if we decide to reuse some already generated code.  Some
3874  * nodes expect preload_characters to be preloaded into the current
3875  * character register.  R nodes do this preloading.  Vertices are marked
3876  * F for failures and S for success (possible success in the case of quick
3877  * nodes).  L, V, < and > are used as arrow heads.
3878  *
3879  * ----------> R
3880  *             |
3881  *             V
3882  *            Q1 -----> S1
3883  *             |   S   /
3884  *            F|      /
3885  *             |    F/
3886  *             |    /
3887  *             |   R
3888  *             |  /
3889  *             V L
3890  *            Q2 -----> S2
3891  *             |   S   /
3892  *            F|      /
3893  *             |    F/
3894  *             |    /
3895  *             |   R
3896  *             |  /
3897  *             V L
3898  *            S3
3899  *             |
3900  *            F|
3901  *             |
3902  *             R
3903  *             |
3904  * backtrack   V
3905  * <----------Q4
3906  *   \    F    |
3907  *    \        |S
3908  *     \   F   V
3909  *      \-----S4
3910  *
3911  * For greedy loops we push the current position, then generate the code that
3912  * eats the input specially in EmitGreedyLoop.  The other choice (the
3913  * continuation) is generated by the normal code in EmitChoices, and steps back
3914  * in the input to the starting position when it fails to match.  The loop code
3915  * looks like this (U is the unwind code that steps back in the greedy loop).
3916  *
3917  *              _____
3918  *             /     \
3919  *             V     |
3920  * ----------> S1    |
3921  *            /|     |
3922  *           / |S    |
3923  *         F/  \_____/
3924  *         /
3925  *        |<-----
3926  *        |      \
3927  *        V       |S
3928  *        Q2 ---> U----->backtrack
3929  *        |  F   /
3930  *       S|     /
3931  *        V  F /
3932  *        S2--/
3933  */
3934 
GreedyLoopState(bool not_at_start)3935 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3936   counter_backtrack_trace_.set_backtrack(&label_);
3937   if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3938 }
3939 
3940 
AssertGuardsMentionRegisters(Trace * trace)3941 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3942 #ifdef DEBUG
3943   int choice_count = alternatives_->length();
3944   for (int i = 0; i < choice_count - 1; i++) {
3945     GuardedAlternative alternative = alternatives_->at(i);
3946     ZoneList<Guard*>* guards = alternative.guards();
3947     int guard_count = (guards == NULL) ? 0 : guards->length();
3948     for (int j = 0; j < guard_count; j++) {
3949       DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3950     }
3951   }
3952 #endif
3953 }
3954 
3955 
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)3956 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3957                               Trace* current_trace,
3958                               PreloadState* state) {
3959     if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3960       // Save some time by looking at most one machine word ahead.
3961       state->eats_at_least_ =
3962           EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3963                       current_trace->at_start() == Trace::FALSE_VALUE);
3964     }
3965     state->preload_characters_ =
3966         CalculatePreloadCharacters(compiler, state->eats_at_least_);
3967 
3968     state->preload_is_current_ =
3969         (current_trace->characters_preloaded() == state->preload_characters_);
3970     state->preload_has_checked_bounds_ = state->preload_is_current_;
3971 }
3972 
3973 
Emit(RegExpCompiler * compiler,Trace * trace)3974 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3975   int choice_count = alternatives_->length();
3976 
3977   if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
3978     alternatives_->at(0).node()->Emit(compiler, trace);
3979     return;
3980   }
3981 
3982   AssertGuardsMentionRegisters(trace);
3983 
3984   LimitResult limit_result = LimitVersions(compiler, trace);
3985   if (limit_result == DONE) return;
3986   DCHECK(limit_result == CONTINUE);
3987 
3988   // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3989   // other choice nodes we only flush if we are out of code size budget.
3990   if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3991     trace->Flush(compiler, this);
3992     return;
3993   }
3994 
3995   RecursionCheck rc(compiler);
3996 
3997   PreloadState preload;
3998   preload.init();
3999   GreedyLoopState greedy_loop_state(not_at_start());
4000 
4001   int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
4002   AlternativeGenerationList alt_gens(choice_count, zone());
4003 
4004   if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
4005     trace = EmitGreedyLoop(compiler,
4006                            trace,
4007                            &alt_gens,
4008                            &preload,
4009                            &greedy_loop_state,
4010                            text_length);
4011   } else {
4012     // TODO(erikcorry): Delete this.  We don't need this label, but it makes us
4013     // match the traces produced pre-cleanup.
4014     Label second_choice;
4015     compiler->macro_assembler()->Bind(&second_choice);
4016 
4017     preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
4018 
4019     EmitChoices(compiler,
4020                 &alt_gens,
4021                 0,
4022                 trace,
4023                 &preload);
4024   }
4025 
4026   // At this point we need to generate slow checks for the alternatives where
4027   // the quick check was inlined.  We can recognize these because the associated
4028   // label was bound.
4029   int new_flush_budget = trace->flush_budget() / choice_count;
4030   for (int i = 0; i < choice_count; i++) {
4031     AlternativeGeneration* alt_gen = alt_gens.at(i);
4032     Trace new_trace(*trace);
4033     // If there are actions to be flushed we have to limit how many times
4034     // they are flushed.  Take the budget of the parent trace and distribute
4035     // it fairly amongst the children.
4036     if (new_trace.actions() != NULL) {
4037       new_trace.set_flush_budget(new_flush_budget);
4038     }
4039     bool next_expects_preload =
4040         i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
4041     EmitOutOfLineContinuation(compiler,
4042                               &new_trace,
4043                               alternatives_->at(i),
4044                               alt_gen,
4045                               preload.preload_characters_,
4046                               next_expects_preload);
4047   }
4048 }
4049 
4050 
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)4051 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
4052                                   Trace* trace,
4053                                   AlternativeGenerationList* alt_gens,
4054                                   PreloadState* preload,
4055                                   GreedyLoopState* greedy_loop_state,
4056                                   int text_length) {
4057   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4058   // Here we have special handling for greedy loops containing only text nodes
4059   // and other simple nodes.  These are handled by pushing the current
4060   // position on the stack and then incrementing the current position each
4061   // time around the switch.  On backtrack we decrement the current position
4062   // and check it against the pushed value.  This avoids pushing backtrack
4063   // information for each iteration of the loop, which could take up a lot of
4064   // space.
4065   DCHECK(trace->stop_node() == NULL);
4066   macro_assembler->PushCurrentPosition();
4067   Label greedy_match_failed;
4068   Trace greedy_match_trace;
4069   if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
4070   greedy_match_trace.set_backtrack(&greedy_match_failed);
4071   Label loop_label;
4072   macro_assembler->Bind(&loop_label);
4073   greedy_match_trace.set_stop_node(this);
4074   greedy_match_trace.set_loop_label(&loop_label);
4075   alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4076   macro_assembler->Bind(&greedy_match_failed);
4077 
4078   Label second_choice;  // For use in greedy matches.
4079   macro_assembler->Bind(&second_choice);
4080 
4081   Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4082 
4083   EmitChoices(compiler,
4084               alt_gens,
4085               1,
4086               new_trace,
4087               preload);
4088 
4089   macro_assembler->Bind(greedy_loop_state->label());
4090   // If we have unwound to the bottom then backtrack.
4091   macro_assembler->CheckGreedyLoop(trace->backtrack());
4092   // Otherwise try the second priority at an earlier position.
4093   macro_assembler->AdvanceCurrentPosition(-text_length);
4094   macro_assembler->GoTo(&second_choice);
4095   return new_trace;
4096 }
4097 
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)4098 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4099                                               Trace* trace) {
4100   int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4101   if (alternatives_->length() != 2) return eats_at_least;
4102 
4103   GuardedAlternative alt1 = alternatives_->at(1);
4104   if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4105     return eats_at_least;
4106   }
4107   RegExpNode* eats_anything_node = alt1.node();
4108   if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4109     return eats_at_least;
4110   }
4111 
4112   // Really we should be creating a new trace when we execute this function,
4113   // but there is no need, because the code it generates cannot backtrack, and
4114   // we always arrive here with a trivial trace (since it's the entry to a
4115   // loop.  That also implies that there are no preloaded characters, which is
4116   // good, because it means we won't be violating any assumptions by
4117   // overwriting those characters with new load instructions.
4118   DCHECK(trace->is_trivial());
4119 
4120   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4121   Isolate* isolate = macro_assembler->isolate();
4122   // At this point we know that we are at a non-greedy loop that will eat
4123   // any character one at a time.  Any non-anchored regexp has such a
4124   // loop prepended to it in order to find where it starts.  We look for
4125   // a pattern of the form ...abc... where we can look 6 characters ahead
4126   // and step forwards 3 if the character is not one of abc.  Abc need
4127   // not be atoms, they can be any reasonably limited character class or
4128   // small alternation.
4129   BoyerMooreLookahead* bm = bm_info(false);
4130   if (bm == NULL) {
4131     eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4132                         EatsAtLeast(kMaxLookaheadForBoyerMoore,
4133                                     kRecursionBudget,
4134                                     false));
4135     if (eats_at_least >= 1) {
4136       bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4137                                            compiler,
4138                                            zone());
4139       GuardedAlternative alt0 = alternatives_->at(0);
4140       alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4141     }
4142   }
4143   if (bm != NULL) {
4144     bm->EmitSkipInstructions(macro_assembler);
4145   }
4146   return eats_at_least;
4147 }
4148 
4149 
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)4150 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4151                              AlternativeGenerationList* alt_gens,
4152                              int first_choice,
4153                              Trace* trace,
4154                              PreloadState* preload) {
4155   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4156   SetUpPreLoad(compiler, trace, preload);
4157 
4158   // For now we just call all choices one after the other.  The idea ultimately
4159   // is to use the Dispatch table to try only the relevant ones.
4160   int choice_count = alternatives_->length();
4161 
4162   int new_flush_budget = trace->flush_budget() / choice_count;
4163 
4164   for (int i = first_choice; i < choice_count; i++) {
4165     bool is_last = i == choice_count - 1;
4166     bool fall_through_on_failure = !is_last;
4167     GuardedAlternative alternative = alternatives_->at(i);
4168     AlternativeGeneration* alt_gen = alt_gens->at(i);
4169     alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4170     ZoneList<Guard*>* guards = alternative.guards();
4171     int guard_count = (guards == NULL) ? 0 : guards->length();
4172     Trace new_trace(*trace);
4173     new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4174                                          preload->preload_characters_ :
4175                                          0);
4176     if (preload->preload_has_checked_bounds_) {
4177       new_trace.set_bound_checked_up_to(preload->preload_characters_);
4178     }
4179     new_trace.quick_check_performed()->Clear();
4180     if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4181     if (!is_last) {
4182       new_trace.set_backtrack(&alt_gen->after);
4183     }
4184     alt_gen->expects_preload = preload->preload_is_current_;
4185     bool generate_full_check_inline = false;
4186     if (compiler->optimize() &&
4187         try_to_emit_quick_check_for_alternative(i == 0) &&
4188         alternative.node()->EmitQuickCheck(
4189             compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4190             &alt_gen->possible_success, &alt_gen->quick_check_details,
4191             fall_through_on_failure)) {
4192       // Quick check was generated for this choice.
4193       preload->preload_is_current_ = true;
4194       preload->preload_has_checked_bounds_ = true;
4195       // If we generated the quick check to fall through on possible success,
4196       // we now need to generate the full check inline.
4197       if (!fall_through_on_failure) {
4198         macro_assembler->Bind(&alt_gen->possible_success);
4199         new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4200         new_trace.set_characters_preloaded(preload->preload_characters_);
4201         new_trace.set_bound_checked_up_to(preload->preload_characters_);
4202         generate_full_check_inline = true;
4203       }
4204     } else if (alt_gen->quick_check_details.cannot_match()) {
4205       if (!fall_through_on_failure) {
4206         macro_assembler->GoTo(trace->backtrack());
4207       }
4208       continue;
4209     } else {
4210       // No quick check was generated.  Put the full code here.
4211       // If this is not the first choice then there could be slow checks from
4212       // previous cases that go here when they fail.  There's no reason to
4213       // insist that they preload characters since the slow check we are about
4214       // to generate probably can't use it.
4215       if (i != first_choice) {
4216         alt_gen->expects_preload = false;
4217         new_trace.InvalidateCurrentCharacter();
4218       }
4219       generate_full_check_inline = true;
4220     }
4221     if (generate_full_check_inline) {
4222       if (new_trace.actions() != NULL) {
4223         new_trace.set_flush_budget(new_flush_budget);
4224       }
4225       for (int j = 0; j < guard_count; j++) {
4226         GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4227       }
4228       alternative.node()->Emit(compiler, &new_trace);
4229       preload->preload_is_current_ = false;
4230     }
4231     macro_assembler->Bind(&alt_gen->after);
4232   }
4233 }
4234 
4235 
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)4236 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4237                                            Trace* trace,
4238                                            GuardedAlternative alternative,
4239                                            AlternativeGeneration* alt_gen,
4240                                            int preload_characters,
4241                                            bool next_expects_preload) {
4242   if (!alt_gen->possible_success.is_linked()) return;
4243 
4244   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4245   macro_assembler->Bind(&alt_gen->possible_success);
4246   Trace out_of_line_trace(*trace);
4247   out_of_line_trace.set_characters_preloaded(preload_characters);
4248   out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4249   if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4250   ZoneList<Guard*>* guards = alternative.guards();
4251   int guard_count = (guards == NULL) ? 0 : guards->length();
4252   if (next_expects_preload) {
4253     Label reload_current_char;
4254     out_of_line_trace.set_backtrack(&reload_current_char);
4255     for (int j = 0; j < guard_count; j++) {
4256       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4257     }
4258     alternative.node()->Emit(compiler, &out_of_line_trace);
4259     macro_assembler->Bind(&reload_current_char);
4260     // Reload the current character, since the next quick check expects that.
4261     // We don't need to check bounds here because we only get into this
4262     // code through a quick check which already did the checked load.
4263     macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4264                                           NULL,
4265                                           false,
4266                                           preload_characters);
4267     macro_assembler->GoTo(&(alt_gen->after));
4268   } else {
4269     out_of_line_trace.set_backtrack(&(alt_gen->after));
4270     for (int j = 0; j < guard_count; j++) {
4271       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4272     }
4273     alternative.node()->Emit(compiler, &out_of_line_trace);
4274   }
4275 }
4276 
4277 
Emit(RegExpCompiler * compiler,Trace * trace)4278 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4279   RegExpMacroAssembler* assembler = compiler->macro_assembler();
4280   LimitResult limit_result = LimitVersions(compiler, trace);
4281   if (limit_result == DONE) return;
4282   DCHECK(limit_result == CONTINUE);
4283 
4284   RecursionCheck rc(compiler);
4285 
4286   switch (action_type_) {
4287     case STORE_POSITION: {
4288       Trace::DeferredCapture
4289           new_capture(data_.u_position_register.reg,
4290                       data_.u_position_register.is_capture,
4291                       trace);
4292       Trace new_trace = *trace;
4293       new_trace.add_action(&new_capture);
4294       on_success()->Emit(compiler, &new_trace);
4295       break;
4296     }
4297     case INCREMENT_REGISTER: {
4298       Trace::DeferredIncrementRegister
4299           new_increment(data_.u_increment_register.reg);
4300       Trace new_trace = *trace;
4301       new_trace.add_action(&new_increment);
4302       on_success()->Emit(compiler, &new_trace);
4303       break;
4304     }
4305     case SET_REGISTER: {
4306       Trace::DeferredSetRegister
4307           new_set(data_.u_store_register.reg, data_.u_store_register.value);
4308       Trace new_trace = *trace;
4309       new_trace.add_action(&new_set);
4310       on_success()->Emit(compiler, &new_trace);
4311       break;
4312     }
4313     case CLEAR_CAPTURES: {
4314       Trace::DeferredClearCaptures
4315         new_capture(Interval(data_.u_clear_captures.range_from,
4316                              data_.u_clear_captures.range_to));
4317       Trace new_trace = *trace;
4318       new_trace.add_action(&new_capture);
4319       on_success()->Emit(compiler, &new_trace);
4320       break;
4321     }
4322     case BEGIN_SUBMATCH:
4323       if (!trace->is_trivial()) {
4324         trace->Flush(compiler, this);
4325       } else {
4326         assembler->WriteCurrentPositionToRegister(
4327             data_.u_submatch.current_position_register, 0);
4328         assembler->WriteStackPointerToRegister(
4329             data_.u_submatch.stack_pointer_register);
4330         on_success()->Emit(compiler, trace);
4331       }
4332       break;
4333     case EMPTY_MATCH_CHECK: {
4334       int start_pos_reg = data_.u_empty_match_check.start_register;
4335       int stored_pos = 0;
4336       int rep_reg = data_.u_empty_match_check.repetition_register;
4337       bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4338       bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4339       if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4340         // If we know we haven't advanced and there is no minimum we
4341         // can just backtrack immediately.
4342         assembler->GoTo(trace->backtrack());
4343       } else if (know_dist && stored_pos < trace->cp_offset()) {
4344         // If we know we've advanced we can generate the continuation
4345         // immediately.
4346         on_success()->Emit(compiler, trace);
4347       } else if (!trace->is_trivial()) {
4348         trace->Flush(compiler, this);
4349       } else {
4350         Label skip_empty_check;
4351         // If we have a minimum number of repetitions we check the current
4352         // number first and skip the empty check if it's not enough.
4353         if (has_minimum) {
4354           int limit = data_.u_empty_match_check.repetition_limit;
4355           assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4356         }
4357         // If the match is empty we bail out, otherwise we fall through
4358         // to the on-success continuation.
4359         assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4360                                    trace->backtrack());
4361         assembler->Bind(&skip_empty_check);
4362         on_success()->Emit(compiler, trace);
4363       }
4364       break;
4365     }
4366     case POSITIVE_SUBMATCH_SUCCESS: {
4367       if (!trace->is_trivial()) {
4368         trace->Flush(compiler, this);
4369         return;
4370       }
4371       assembler->ReadCurrentPositionFromRegister(
4372           data_.u_submatch.current_position_register);
4373       assembler->ReadStackPointerFromRegister(
4374           data_.u_submatch.stack_pointer_register);
4375       int clear_register_count = data_.u_submatch.clear_register_count;
4376       if (clear_register_count == 0) {
4377         on_success()->Emit(compiler, trace);
4378         return;
4379       }
4380       int clear_registers_from = data_.u_submatch.clear_register_from;
4381       Label clear_registers_backtrack;
4382       Trace new_trace = *trace;
4383       new_trace.set_backtrack(&clear_registers_backtrack);
4384       on_success()->Emit(compiler, &new_trace);
4385 
4386       assembler->Bind(&clear_registers_backtrack);
4387       int clear_registers_to = clear_registers_from + clear_register_count - 1;
4388       assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4389 
4390       DCHECK(trace->backtrack() == NULL);
4391       assembler->Backtrack();
4392       return;
4393     }
4394     default:
4395       UNREACHABLE();
4396   }
4397 }
4398 
4399 
Emit(RegExpCompiler * compiler,Trace * trace)4400 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4401   RegExpMacroAssembler* assembler = compiler->macro_assembler();
4402   if (!trace->is_trivial()) {
4403     trace->Flush(compiler, this);
4404     return;
4405   }
4406 
4407   LimitResult limit_result = LimitVersions(compiler, trace);
4408   if (limit_result == DONE) return;
4409   DCHECK(limit_result == CONTINUE);
4410 
4411   RecursionCheck rc(compiler);
4412 
4413   DCHECK_EQ(start_reg_ + 1, end_reg_);
4414   if (compiler->ignore_case()) {
4415     assembler->CheckNotBackReferenceIgnoreCase(
4416         start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
4417   } else {
4418     assembler->CheckNotBackReference(start_reg_, read_backward(),
4419                                      trace->backtrack());
4420   }
4421   // We are going to advance backward, so we may end up at the start.
4422   if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4423 
4424   // Check that the back reference does not end inside a surrogate pair.
4425   if (compiler->unicode() && !compiler->one_byte()) {
4426     assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4427   }
4428   on_success()->Emit(compiler, trace);
4429 }
4430 
4431 
4432 // -------------------------------------------------------------------
4433 // Dot/dotty output
4434 
4435 
4436 #ifdef DEBUG
4437 
4438 
4439 class DotPrinter: public NodeVisitor {
4440  public:
DotPrinter(std::ostream & os,bool ignore_case)4441   DotPrinter(std::ostream& os, bool ignore_case)  // NOLINT
4442       : os_(os),
4443         ignore_case_(ignore_case) {}
4444   void PrintNode(const char* label, RegExpNode* node);
4445   void Visit(RegExpNode* node);
4446   void PrintAttributes(RegExpNode* from);
4447   void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4448 #define DECLARE_VISIT(Type)                                          \
4449   virtual void Visit##Type(Type##Node* that);
4450 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4451 #undef DECLARE_VISIT
4452  private:
4453   std::ostream& os_;
4454   bool ignore_case_;
4455 };
4456 
4457 
PrintNode(const char * label,RegExpNode * node)4458 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4459   os_ << "digraph G {\n  graph [label=\"";
4460   for (int i = 0; label[i]; i++) {
4461     switch (label[i]) {
4462       case '\\':
4463         os_ << "\\\\";
4464         break;
4465       case '"':
4466         os_ << "\"";
4467         break;
4468       default:
4469         os_ << label[i];
4470         break;
4471     }
4472   }
4473   os_ << "\"];\n";
4474   Visit(node);
4475   os_ << "}" << std::endl;
4476 }
4477 
4478 
Visit(RegExpNode * node)4479 void DotPrinter::Visit(RegExpNode* node) {
4480   if (node->info()->visited) return;
4481   node->info()->visited = true;
4482   node->Accept(this);
4483 }
4484 
4485 
PrintOnFailure(RegExpNode * from,RegExpNode * on_failure)4486 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4487   os_ << "  n" << from << " -> n" << on_failure << " [style=dotted];\n";
4488   Visit(on_failure);
4489 }
4490 
4491 
4492 class TableEntryBodyPrinter {
4493  public:
TableEntryBodyPrinter(std::ostream & os,ChoiceNode * choice)4494   TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice)  // NOLINT
4495       : os_(os),
4496         choice_(choice) {}
Call(uc16 from,DispatchTable::Entry entry)4497   void Call(uc16 from, DispatchTable::Entry entry) {
4498     OutSet* out_set = entry.out_set();
4499     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4500       if (out_set->Get(i)) {
4501         os_ << "    n" << choice() << ":s" << from << "o" << i << " -> n"
4502             << choice()->alternatives()->at(i).node() << ";\n";
4503       }
4504     }
4505   }
4506  private:
choice()4507   ChoiceNode* choice() { return choice_; }
4508   std::ostream& os_;
4509   ChoiceNode* choice_;
4510 };
4511 
4512 
4513 class TableEntryHeaderPrinter {
4514  public:
TableEntryHeaderPrinter(std::ostream & os)4515   explicit TableEntryHeaderPrinter(std::ostream& os)  // NOLINT
4516       : first_(true),
4517         os_(os) {}
Call(uc16 from,DispatchTable::Entry entry)4518   void Call(uc16 from, DispatchTable::Entry entry) {
4519     if (first_) {
4520       first_ = false;
4521     } else {
4522       os_ << "|";
4523     }
4524     os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4525     OutSet* out_set = entry.out_set();
4526     int priority = 0;
4527     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4528       if (out_set->Get(i)) {
4529         if (priority > 0) os_ << "|";
4530         os_ << "<s" << from << "o" << i << "> " << priority;
4531         priority++;
4532       }
4533     }
4534     os_ << "}}";
4535   }
4536 
4537  private:
4538   bool first_;
4539   std::ostream& os_;
4540 };
4541 
4542 
4543 class AttributePrinter {
4544  public:
AttributePrinter(std::ostream & os)4545   explicit AttributePrinter(std::ostream& os)  // NOLINT
4546       : os_(os),
4547         first_(true) {}
PrintSeparator()4548   void PrintSeparator() {
4549     if (first_) {
4550       first_ = false;
4551     } else {
4552       os_ << "|";
4553     }
4554   }
PrintBit(const char * name,bool value)4555   void PrintBit(const char* name, bool value) {
4556     if (!value) return;
4557     PrintSeparator();
4558     os_ << "{" << name << "}";
4559   }
PrintPositive(const char * name,int value)4560   void PrintPositive(const char* name, int value) {
4561     if (value < 0) return;
4562     PrintSeparator();
4563     os_ << "{" << name << "|" << value << "}";
4564   }
4565 
4566  private:
4567   std::ostream& os_;
4568   bool first_;
4569 };
4570 
4571 
PrintAttributes(RegExpNode * that)4572 void DotPrinter::PrintAttributes(RegExpNode* that) {
4573   os_ << "  a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4574       << "margin=0.1, fontsize=10, label=\"{";
4575   AttributePrinter printer(os_);
4576   NodeInfo* info = that->info();
4577   printer.PrintBit("NI", info->follows_newline_interest);
4578   printer.PrintBit("WI", info->follows_word_interest);
4579   printer.PrintBit("SI", info->follows_start_interest);
4580   Label* label = that->label();
4581   if (label->is_bound())
4582     printer.PrintPositive("@", label->pos());
4583   os_ << "}\"];\n"
4584       << "  a" << that << " -> n" << that
4585       << " [style=dashed, color=grey, arrowhead=none];\n";
4586 }
4587 
4588 
4589 static const bool kPrintDispatchTable = false;
VisitChoice(ChoiceNode * that)4590 void DotPrinter::VisitChoice(ChoiceNode* that) {
4591   if (kPrintDispatchTable) {
4592     os_ << "  n" << that << " [shape=Mrecord, label=\"";
4593     TableEntryHeaderPrinter header_printer(os_);
4594     that->GetTable(ignore_case_)->ForEach(&header_printer);
4595     os_ << "\"]\n";
4596     PrintAttributes(that);
4597     TableEntryBodyPrinter body_printer(os_, that);
4598     that->GetTable(ignore_case_)->ForEach(&body_printer);
4599   } else {
4600     os_ << "  n" << that << " [shape=Mrecord, label=\"?\"];\n";
4601     for (int i = 0; i < that->alternatives()->length(); i++) {
4602       GuardedAlternative alt = that->alternatives()->at(i);
4603       os_ << "  n" << that << " -> n" << alt.node();
4604     }
4605   }
4606   for (int i = 0; i < that->alternatives()->length(); i++) {
4607     GuardedAlternative alt = that->alternatives()->at(i);
4608     alt.node()->Accept(this);
4609   }
4610 }
4611 
4612 
VisitText(TextNode * that)4613 void DotPrinter::VisitText(TextNode* that) {
4614   Zone* zone = that->zone();
4615   os_ << "  n" << that << " [label=\"";
4616   for (int i = 0; i < that->elements()->length(); i++) {
4617     if (i > 0) os_ << " ";
4618     TextElement elm = that->elements()->at(i);
4619     switch (elm.text_type()) {
4620       case TextElement::ATOM: {
4621         Vector<const uc16> data = elm.atom()->data();
4622         for (int i = 0; i < data.length(); i++) {
4623           os_ << static_cast<char>(data[i]);
4624         }
4625         break;
4626       }
4627       case TextElement::CHAR_CLASS: {
4628         RegExpCharacterClass* node = elm.char_class();
4629         os_ << "[";
4630         if (node->is_negated()) os_ << "^";
4631         for (int j = 0; j < node->ranges(zone)->length(); j++) {
4632           CharacterRange range = node->ranges(zone)->at(j);
4633           os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4634         }
4635         os_ << "]";
4636         break;
4637       }
4638       default:
4639         UNREACHABLE();
4640     }
4641   }
4642   os_ << "\", shape=box, peripheries=2];\n";
4643   PrintAttributes(that);
4644   os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
4645   Visit(that->on_success());
4646 }
4647 
4648 
VisitBackReference(BackReferenceNode * that)4649 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4650   os_ << "  n" << that << " [label=\"$" << that->start_register() << "..$"
4651       << that->end_register() << "\", shape=doubleoctagon];\n";
4652   PrintAttributes(that);
4653   os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
4654   Visit(that->on_success());
4655 }
4656 
4657 
VisitEnd(EndNode * that)4658 void DotPrinter::VisitEnd(EndNode* that) {
4659   os_ << "  n" << that << " [style=bold, shape=point];\n";
4660   PrintAttributes(that);
4661 }
4662 
4663 
VisitAssertion(AssertionNode * that)4664 void DotPrinter::VisitAssertion(AssertionNode* that) {
4665   os_ << "  n" << that << " [";
4666   switch (that->assertion_type()) {
4667     case AssertionNode::AT_END:
4668       os_ << "label=\"$\", shape=septagon";
4669       break;
4670     case AssertionNode::AT_START:
4671       os_ << "label=\"^\", shape=septagon";
4672       break;
4673     case AssertionNode::AT_BOUNDARY:
4674       os_ << "label=\"\\b\", shape=septagon";
4675       break;
4676     case AssertionNode::AT_NON_BOUNDARY:
4677       os_ << "label=\"\\B\", shape=septagon";
4678       break;
4679     case AssertionNode::AFTER_NEWLINE:
4680       os_ << "label=\"(?<=\\n)\", shape=septagon";
4681       break;
4682   }
4683   os_ << "];\n";
4684   PrintAttributes(that);
4685   RegExpNode* successor = that->on_success();
4686   os_ << "  n" << that << " -> n" << successor << ";\n";
4687   Visit(successor);
4688 }
4689 
4690 
VisitAction(ActionNode * that)4691 void DotPrinter::VisitAction(ActionNode* that) {
4692   os_ << "  n" << that << " [";
4693   switch (that->action_type_) {
4694     case ActionNode::SET_REGISTER:
4695       os_ << "label=\"$" << that->data_.u_store_register.reg
4696           << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4697       break;
4698     case ActionNode::INCREMENT_REGISTER:
4699       os_ << "label=\"$" << that->data_.u_increment_register.reg
4700           << "++\", shape=octagon";
4701       break;
4702     case ActionNode::STORE_POSITION:
4703       os_ << "label=\"$" << that->data_.u_position_register.reg
4704           << ":=$pos\", shape=octagon";
4705       break;
4706     case ActionNode::BEGIN_SUBMATCH:
4707       os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4708           << ":=$pos,begin\", shape=septagon";
4709       break;
4710     case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4711       os_ << "label=\"escape\", shape=septagon";
4712       break;
4713     case ActionNode::EMPTY_MATCH_CHECK:
4714       os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4715           << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4716           << "<" << that->data_.u_empty_match_check.repetition_limit
4717           << "?\", shape=septagon";
4718       break;
4719     case ActionNode::CLEAR_CAPTURES: {
4720       os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4721           << " to $" << that->data_.u_clear_captures.range_to
4722           << "\", shape=septagon";
4723       break;
4724     }
4725   }
4726   os_ << "];\n";
4727   PrintAttributes(that);
4728   RegExpNode* successor = that->on_success();
4729   os_ << "  n" << that << " -> n" << successor << ";\n";
4730   Visit(successor);
4731 }
4732 
4733 
4734 class DispatchTableDumper {
4735  public:
DispatchTableDumper(std::ostream & os)4736   explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4737   void Call(uc16 key, DispatchTable::Entry entry);
4738  private:
4739   std::ostream& os_;
4740 };
4741 
4742 
Call(uc16 key,DispatchTable::Entry entry)4743 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4744   os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4745   OutSet* set = entry.out_set();
4746   bool first = true;
4747   for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4748     if (set->Get(i)) {
4749       if (first) {
4750         first = false;
4751       } else {
4752         os_ << ", ";
4753       }
4754       os_ << i;
4755     }
4756   }
4757   os_ << "}\n";
4758 }
4759 
4760 
Dump()4761 void DispatchTable::Dump() {
4762   OFStream os(stderr);
4763   DispatchTableDumper dumper(os);
4764   tree()->ForEach(&dumper);
4765 }
4766 
4767 
DotPrint(const char * label,RegExpNode * node,bool ignore_case)4768 void RegExpEngine::DotPrint(const char* label,
4769                             RegExpNode* node,
4770                             bool ignore_case) {
4771   OFStream os(stdout);
4772   DotPrinter printer(os, ignore_case);
4773   printer.PrintNode(label, node);
4774 }
4775 
4776 
4777 #endif  // DEBUG
4778 
4779 
4780 // -------------------------------------------------------------------
4781 // Tree to graph conversion
4782 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4783 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4784                                RegExpNode* on_success) {
4785   ZoneList<TextElement>* elms =
4786       new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4787   elms->Add(TextElement::Atom(this), compiler->zone());
4788   return new (compiler->zone())
4789       TextNode(elms, compiler->read_backward(), on_success);
4790 }
4791 
4792 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4793 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4794                                RegExpNode* on_success) {
4795   return new (compiler->zone())
4796       TextNode(elements(), compiler->read_backward(), on_success);
4797 }
4798 
4799 
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4800 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4801                                  const int* special_class,
4802                                  int length) {
4803   length--;  // Remove final marker.
4804   DCHECK(special_class[length] == kRangeEndMarker);
4805   DCHECK(ranges->length() != 0);
4806   DCHECK(length != 0);
4807   DCHECK(special_class[0] != 0);
4808   if (ranges->length() != (length >> 1) + 1) {
4809     return false;
4810   }
4811   CharacterRange range = ranges->at(0);
4812   if (range.from() != 0) {
4813     return false;
4814   }
4815   for (int i = 0; i < length; i += 2) {
4816     if (special_class[i] != (range.to() + 1)) {
4817       return false;
4818     }
4819     range = ranges->at((i >> 1) + 1);
4820     if (special_class[i+1] != range.from()) {
4821       return false;
4822     }
4823   }
4824   if (range.to() != String::kMaxCodePoint) {
4825     return false;
4826   }
4827   return true;
4828 }
4829 
4830 
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4831 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4832                           const int* special_class,
4833                           int length) {
4834   length--;  // Remove final marker.
4835   DCHECK(special_class[length] == kRangeEndMarker);
4836   if (ranges->length() * 2 != length) {
4837     return false;
4838   }
4839   for (int i = 0; i < length; i += 2) {
4840     CharacterRange range = ranges->at(i >> 1);
4841     if (range.from() != special_class[i] ||
4842         range.to() != special_class[i + 1] - 1) {
4843       return false;
4844     }
4845   }
4846   return true;
4847 }
4848 
4849 
is_standard(Zone * zone)4850 bool RegExpCharacterClass::is_standard(Zone* zone) {
4851   // TODO(lrn): Remove need for this function, by not throwing away information
4852   // along the way.
4853   if (is_negated_) {
4854     return false;
4855   }
4856   if (set_.is_standard()) {
4857     return true;
4858   }
4859   if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4860     set_.set_standard_set_type('s');
4861     return true;
4862   }
4863   if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4864     set_.set_standard_set_type('S');
4865     return true;
4866   }
4867   if (CompareInverseRanges(set_.ranges(zone),
4868                            kLineTerminatorRanges,
4869                            kLineTerminatorRangeCount)) {
4870     set_.set_standard_set_type('.');
4871     return true;
4872   }
4873   if (CompareRanges(set_.ranges(zone),
4874                     kLineTerminatorRanges,
4875                     kLineTerminatorRangeCount)) {
4876     set_.set_standard_set_type('n');
4877     return true;
4878   }
4879   if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4880     set_.set_standard_set_type('w');
4881     return true;
4882   }
4883   if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4884     set_.set_standard_set_type('W');
4885     return true;
4886   }
4887   return false;
4888 }
4889 
4890 
UnicodeRangeSplitter(Zone * zone,ZoneList<CharacterRange> * base)4891 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4892                                            ZoneList<CharacterRange>* base)
4893     : zone_(zone),
4894       table_(zone),
4895       bmp_(nullptr),
4896       lead_surrogates_(nullptr),
4897       trail_surrogates_(nullptr),
4898       non_bmp_(nullptr) {
4899   // The unicode range splitter categorizes given character ranges into:
4900   // - Code points from the BMP representable by one code unit.
4901   // - Code points outside the BMP that need to be split into surrogate pairs.
4902   // - Lone lead surrogates.
4903   // - Lone trail surrogates.
4904   // Lone surrogates are valid code points, even though no actual characters.
4905   // They require special matching to make sure we do not split surrogate pairs.
4906   // We use the dispatch table to accomplish this. The base range is split up
4907   // by the table by the overlay ranges, and the Call callback is used to
4908   // filter and collect ranges for each category.
4909   for (int i = 0; i < base->length(); i++) {
4910     table_.AddRange(base->at(i), kBase, zone_);
4911   }
4912   // Add overlay ranges.
4913   table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4914                   kBmpCodePoints, zone_);
4915   table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4916                   kLeadSurrogates, zone_);
4917   table_.AddRange(
4918       CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4919       kTrailSurrogates, zone_);
4920   table_.AddRange(
4921       CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4922       kBmpCodePoints, zone_);
4923   table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4924                   kNonBmpCodePoints, zone_);
4925   table_.ForEach(this);
4926 }
4927 
4928 
Call(uc32 from,DispatchTable::Entry entry)4929 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4930   OutSet* outset = entry.out_set();
4931   if (!outset->Get(kBase)) return;
4932   ZoneList<CharacterRange>** target = NULL;
4933   if (outset->Get(kBmpCodePoints)) {
4934     target = &bmp_;
4935   } else if (outset->Get(kLeadSurrogates)) {
4936     target = &lead_surrogates_;
4937   } else if (outset->Get(kTrailSurrogates)) {
4938     target = &trail_surrogates_;
4939   } else {
4940     DCHECK(outset->Get(kNonBmpCodePoints));
4941     target = &non_bmp_;
4942   }
4943   if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4944   (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4945 }
4946 
4947 
AddBmpCharacters(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4948 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4949                       RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4950   ZoneList<CharacterRange>* bmp = splitter->bmp();
4951   if (bmp == nullptr) return;
4952   result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4953       compiler->zone(), bmp, compiler->read_backward(), on_success)));
4954 }
4955 
4956 
AddNonBmpSurrogatePairs(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4957 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4958                              RegExpNode* on_success,
4959                              UnicodeRangeSplitter* splitter) {
4960   ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4961   if (non_bmp == nullptr) return;
4962   DCHECK(compiler->unicode());
4963   DCHECK(!compiler->one_byte());
4964   Zone* zone = compiler->zone();
4965   CharacterRange::Canonicalize(non_bmp);
4966   for (int i = 0; i < non_bmp->length(); i++) {
4967     // Match surrogate pair.
4968     // E.g. [\u10005-\u11005] becomes
4969     //      \ud800[\udc05-\udfff]|
4970     //      [\ud801-\ud803][\udc00-\udfff]|
4971     //      \ud804[\udc00-\udc05]
4972     uc32 from = non_bmp->at(i).from();
4973     uc32 to = non_bmp->at(i).to();
4974     uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4975     uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4976     uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4977     uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4978     if (from_l == to_l) {
4979       // The lead surrogate is the same.
4980       result->AddAlternative(
4981           GuardedAlternative(TextNode::CreateForSurrogatePair(
4982               zone, CharacterRange::Singleton(from_l),
4983               CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4984               on_success)));
4985     } else {
4986       if (from_t != kTrailSurrogateStart) {
4987         // Add [from_l][from_t-\udfff]
4988         result->AddAlternative(
4989             GuardedAlternative(TextNode::CreateForSurrogatePair(
4990                 zone, CharacterRange::Singleton(from_l),
4991                 CharacterRange::Range(from_t, kTrailSurrogateEnd),
4992                 compiler->read_backward(), on_success)));
4993         from_l++;
4994       }
4995       if (to_t != kTrailSurrogateEnd) {
4996         // Add [to_l][\udc00-to_t]
4997         result->AddAlternative(
4998             GuardedAlternative(TextNode::CreateForSurrogatePair(
4999                 zone, CharacterRange::Singleton(to_l),
5000                 CharacterRange::Range(kTrailSurrogateStart, to_t),
5001                 compiler->read_backward(), on_success)));
5002         to_l--;
5003       }
5004       if (from_l <= to_l) {
5005         // Add [from_l-to_l][\udc00-\udfff]
5006         result->AddAlternative(
5007             GuardedAlternative(TextNode::CreateForSurrogatePair(
5008                 zone, CharacterRange::Range(from_l, to_l),
5009                 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
5010                 compiler->read_backward(), on_success)));
5011       }
5012     }
5013   }
5014 }
5015 
5016 
NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler * compiler,ZoneList<CharacterRange> * lookbehind,ZoneList<CharacterRange> * match,RegExpNode * on_success,bool read_backward)5017 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
5018     RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
5019     ZoneList<CharacterRange>* match, RegExpNode* on_success,
5020     bool read_backward) {
5021   Zone* zone = compiler->zone();
5022   RegExpNode* match_node = TextNode::CreateForCharacterRanges(
5023       zone, match, read_backward, on_success);
5024   int stack_register = compiler->UnicodeLookaroundStackRegister();
5025   int position_register = compiler->UnicodeLookaroundPositionRegister();
5026   RegExpLookaround::Builder lookaround(false, match_node, stack_register,
5027                                        position_register);
5028   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5029       zone, lookbehind, !read_backward, lookaround.on_match_success());
5030   return lookaround.ForMatch(negative_match);
5031 }
5032 
5033 
MatchAndNegativeLookaroundInReadDirection(RegExpCompiler * compiler,ZoneList<CharacterRange> * match,ZoneList<CharacterRange> * lookahead,RegExpNode * on_success,bool read_backward)5034 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
5035     RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
5036     ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
5037     bool read_backward) {
5038   Zone* zone = compiler->zone();
5039   int stack_register = compiler->UnicodeLookaroundStackRegister();
5040   int position_register = compiler->UnicodeLookaroundPositionRegister();
5041   RegExpLookaround::Builder lookaround(false, on_success, stack_register,
5042                                        position_register);
5043   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5044       zone, lookahead, read_backward, lookaround.on_match_success());
5045   return TextNode::CreateForCharacterRanges(
5046       zone, match, read_backward, lookaround.ForMatch(negative_match));
5047 }
5048 
5049 
AddLoneLeadSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)5050 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5051                            RegExpNode* on_success,
5052                            UnicodeRangeSplitter* splitter) {
5053   ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
5054   if (lead_surrogates == nullptr) return;
5055   Zone* zone = compiler->zone();
5056   // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
5057   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5058       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5059 
5060   RegExpNode* match;
5061   if (compiler->read_backward()) {
5062     // Reading backward. Assert that reading forward, there is no trail
5063     // surrogate, and then backward match the lead surrogate.
5064     match = NegativeLookaroundAgainstReadDirectionAndMatch(
5065         compiler, trail_surrogates, lead_surrogates, on_success, true);
5066   } else {
5067     // Reading forward. Forward match the lead surrogate and assert that
5068     // no trail surrogate follows.
5069     match = MatchAndNegativeLookaroundInReadDirection(
5070         compiler, lead_surrogates, trail_surrogates, on_success, false);
5071   }
5072   result->AddAlternative(GuardedAlternative(match));
5073 }
5074 
5075 
AddLoneTrailSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)5076 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5077                             RegExpNode* on_success,
5078                             UnicodeRangeSplitter* splitter) {
5079   ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5080   if (trail_surrogates == nullptr) return;
5081   Zone* zone = compiler->zone();
5082   // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5083   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5084       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5085 
5086   RegExpNode* match;
5087   if (compiler->read_backward()) {
5088     // Reading backward. Backward match the trail surrogate and assert that no
5089     // lead surrogate precedes it.
5090     match = MatchAndNegativeLookaroundInReadDirection(
5091         compiler, trail_surrogates, lead_surrogates, on_success, true);
5092   } else {
5093     // Reading forward. Assert that reading backward, there is no lead
5094     // surrogate, and then forward match the trail surrogate.
5095     match = NegativeLookaroundAgainstReadDirectionAndMatch(
5096         compiler, lead_surrogates, trail_surrogates, on_success, false);
5097   }
5098   result->AddAlternative(GuardedAlternative(match));
5099 }
5100 
UnanchoredAdvance(RegExpCompiler * compiler,RegExpNode * on_success)5101 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5102                               RegExpNode* on_success) {
5103   // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5104   DCHECK(!compiler->read_backward());
5105   Zone* zone = compiler->zone();
5106   // Advance any character. If the character happens to be a lead surrogate and
5107   // we advanced into the middle of a surrogate pair, it will work out, as
5108   // nothing will match from there. We will have to advance again, consuming
5109   // the associated trail surrogate.
5110   ZoneList<CharacterRange>* range = CharacterRange::List(
5111       zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5112   return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
5113 }
5114 
5115 
AddUnicodeCaseEquivalents(RegExpCompiler * compiler,ZoneList<CharacterRange> * ranges)5116 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
5117                                ZoneList<CharacterRange>* ranges) {
5118 #ifdef V8_I18N_SUPPORT
5119   // Use ICU to compute the case fold closure over the ranges.
5120   DCHECK(compiler->unicode());
5121   DCHECK(compiler->ignore_case());
5122   USet* set = uset_openEmpty();
5123   for (int i = 0; i < ranges->length(); i++) {
5124     uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
5125   }
5126   ranges->Clear();
5127   uset_closeOver(set, USET_CASE_INSENSITIVE);
5128   // Full case mapping map single characters to multiple characters.
5129   // Those are represented as strings in the set. Remove them so that
5130   // we end up with only simple and common case mappings.
5131   uset_removeAllStrings(set);
5132   int item_count = uset_getItemCount(set);
5133   int item_result = 0;
5134   UErrorCode ec = U_ZERO_ERROR;
5135   Zone* zone = compiler->zone();
5136   for (int i = 0; i < item_count; i++) {
5137     uc32 start = 0;
5138     uc32 end = 0;
5139     item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
5140     ranges->Add(CharacterRange::Range(start, end), zone);
5141   }
5142   // No errors and everything we collected have been ranges.
5143   DCHECK_EQ(U_ZERO_ERROR, ec);
5144   DCHECK_EQ(0, item_result);
5145   uset_close(set);
5146 #else
5147   // Fallback if ICU is not included.
5148   CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
5149                                      ranges, compiler->one_byte());
5150 #endif  // V8_I18N_SUPPORT
5151   CharacterRange::Canonicalize(ranges);
5152 }
5153 
5154 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5155 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5156                                          RegExpNode* on_success) {
5157   set_.Canonicalize();
5158   Zone* zone = compiler->zone();
5159   ZoneList<CharacterRange>* ranges = this->ranges(zone);
5160   if (compiler->unicode() && compiler->ignore_case()) {
5161     AddUnicodeCaseEquivalents(compiler, ranges);
5162   }
5163   if (compiler->unicode() && !compiler->one_byte()) {
5164     if (is_negated()) {
5165       ZoneList<CharacterRange>* negated =
5166           new (zone) ZoneList<CharacterRange>(2, zone);
5167       CharacterRange::Negate(ranges, negated, zone);
5168       ranges = negated;
5169     }
5170     if (ranges->length() == 0) {
5171       ranges->Add(CharacterRange::Everything(), zone);
5172       RegExpCharacterClass* fail =
5173           new (zone) RegExpCharacterClass(ranges, true);
5174       return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5175     }
5176     if (standard_type() == '*') {
5177       return UnanchoredAdvance(compiler, on_success);
5178     } else {
5179       ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5180       UnicodeRangeSplitter splitter(zone, ranges);
5181       AddBmpCharacters(compiler, result, on_success, &splitter);
5182       AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5183       AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5184       AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5185       return result;
5186     }
5187   } else {
5188     return new (zone) TextNode(this, compiler->read_backward(), on_success);
5189   }
5190 }
5191 
5192 
CompareFirstChar(RegExpTree * const * a,RegExpTree * const * b)5193 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5194   RegExpAtom* atom1 = (*a)->AsAtom();
5195   RegExpAtom* atom2 = (*b)->AsAtom();
5196   uc16 character1 = atom1->data().at(0);
5197   uc16 character2 = atom2->data().at(0);
5198   if (character1 < character2) return -1;
5199   if (character1 > character2) return 1;
5200   return 0;
5201 }
5202 
5203 
Canonical(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar c)5204 static unibrow::uchar Canonical(
5205     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5206     unibrow::uchar c) {
5207   unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5208   int length = canonicalize->get(c, '\0', chars);
5209   DCHECK_LE(length, 1);
5210   unibrow::uchar canonical = c;
5211   if (length == 1) canonical = chars[0];
5212   return canonical;
5213 }
5214 
5215 
CompareFirstCharCaseIndependent(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,RegExpTree * const * a,RegExpTree * const * b)5216 int CompareFirstCharCaseIndependent(
5217     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5218     RegExpTree* const* a, RegExpTree* const* b) {
5219   RegExpAtom* atom1 = (*a)->AsAtom();
5220   RegExpAtom* atom2 = (*b)->AsAtom();
5221   unibrow::uchar character1 = atom1->data().at(0);
5222   unibrow::uchar character2 = atom2->data().at(0);
5223   if (character1 == character2) return 0;
5224   if (character1 >= 'a' || character2 >= 'a') {
5225     character1 = Canonical(canonicalize, character1);
5226     character2 = Canonical(canonicalize, character2);
5227   }
5228   return static_cast<int>(character1) - static_cast<int>(character2);
5229 }
5230 
5231 
5232 // We can stable sort runs of atoms, since the order does not matter if they
5233 // start with different characters.
5234 // Returns true if any consecutive atoms were found.
SortConsecutiveAtoms(RegExpCompiler * compiler)5235 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5236   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5237   int length = alternatives->length();
5238   bool found_consecutive_atoms = false;
5239   for (int i = 0; i < length; i++) {
5240     while (i < length) {
5241       RegExpTree* alternative = alternatives->at(i);
5242       if (alternative->IsAtom()) break;
5243       i++;
5244     }
5245     // i is length or it is the index of an atom.
5246     if (i == length) break;
5247     int first_atom = i;
5248     i++;
5249     while (i < length) {
5250       RegExpTree* alternative = alternatives->at(i);
5251       if (!alternative->IsAtom()) break;
5252       i++;
5253     }
5254     // Sort atoms to get ones with common prefixes together.
5255     // This step is more tricky if we are in a case-independent regexp,
5256     // because it would change /is|I/ to /I|is/, and order matters when
5257     // the regexp parts don't match only disjoint starting points. To fix
5258     // this we have a version of CompareFirstChar that uses case-
5259     // independent character classes for comparison.
5260     DCHECK_LT(first_atom, alternatives->length());
5261     DCHECK_LE(i, alternatives->length());
5262     DCHECK_LE(first_atom, i);
5263     if (compiler->ignore_case()) {
5264       unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5265           compiler->isolate()->regexp_macro_assembler_canonicalize();
5266       auto compare_closure =
5267           [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5268             return CompareFirstCharCaseIndependent(canonicalize, a, b);
5269           };
5270       alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5271     } else {
5272       alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5273     }
5274     if (i - first_atom > 1) found_consecutive_atoms = true;
5275   }
5276   return found_consecutive_atoms;
5277 }
5278 
5279 
5280 // Optimizes ab|ac|az to a(?:b|c|d).
RationalizeConsecutiveAtoms(RegExpCompiler * compiler)5281 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5282   Zone* zone = compiler->zone();
5283   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5284   int length = alternatives->length();
5285 
5286   int write_posn = 0;
5287   int i = 0;
5288   while (i < length) {
5289     RegExpTree* alternative = alternatives->at(i);
5290     if (!alternative->IsAtom()) {
5291       alternatives->at(write_posn++) = alternatives->at(i);
5292       i++;
5293       continue;
5294     }
5295     RegExpAtom* atom = alternative->AsAtom();
5296     unibrow::uchar common_prefix = atom->data().at(0);
5297     int first_with_prefix = i;
5298     int prefix_length = atom->length();
5299     i++;
5300     while (i < length) {
5301       alternative = alternatives->at(i);
5302       if (!alternative->IsAtom()) break;
5303       atom = alternative->AsAtom();
5304       unibrow::uchar new_prefix = atom->data().at(0);
5305       if (new_prefix != common_prefix) {
5306         if (!compiler->ignore_case()) break;
5307         unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5308             compiler->isolate()->regexp_macro_assembler_canonicalize();
5309         new_prefix = Canonical(canonicalize, new_prefix);
5310         common_prefix = Canonical(canonicalize, common_prefix);
5311         if (new_prefix != common_prefix) break;
5312       }
5313       prefix_length = Min(prefix_length, atom->length());
5314       i++;
5315     }
5316     if (i > first_with_prefix + 2) {
5317       // Found worthwhile run of alternatives with common prefix of at least one
5318       // character.  The sorting function above did not sort on more than one
5319       // character for reasons of correctness, but there may still be a longer
5320       // common prefix if the terms were similar or presorted in the input.
5321       // Find out how long the common prefix is.
5322       int run_length = i - first_with_prefix;
5323       atom = alternatives->at(first_with_prefix)->AsAtom();
5324       for (int j = 1; j < run_length && prefix_length > 1; j++) {
5325         RegExpAtom* old_atom =
5326             alternatives->at(j + first_with_prefix)->AsAtom();
5327         for (int k = 1; k < prefix_length; k++) {
5328           if (atom->data().at(k) != old_atom->data().at(k)) {
5329             prefix_length = k;
5330             break;
5331           }
5332         }
5333       }
5334       RegExpAtom* prefix =
5335           new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
5336       ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5337       pair->Add(prefix, zone);
5338       ZoneList<RegExpTree*>* suffixes =
5339           new (zone) ZoneList<RegExpTree*>(run_length, zone);
5340       for (int j = 0; j < run_length; j++) {
5341         RegExpAtom* old_atom =
5342             alternatives->at(j + first_with_prefix)->AsAtom();
5343         int len = old_atom->length();
5344         if (len == prefix_length) {
5345           suffixes->Add(new (zone) RegExpEmpty(), zone);
5346         } else {
5347           RegExpTree* suffix = new (zone) RegExpAtom(
5348               old_atom->data().SubVector(prefix_length, old_atom->length()));
5349           suffixes->Add(suffix, zone);
5350         }
5351       }
5352       pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5353       alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5354     } else {
5355       // Just copy any non-worthwhile alternatives.
5356       for (int j = first_with_prefix; j < i; j++) {
5357         alternatives->at(write_posn++) = alternatives->at(j);
5358       }
5359     }
5360   }
5361   alternatives->Rewind(write_posn);  // Trim end of array.
5362 }
5363 
5364 
5365 // Optimizes b|c|z to [bcz].
FixSingleCharacterDisjunctions(RegExpCompiler * compiler)5366 void RegExpDisjunction::FixSingleCharacterDisjunctions(
5367     RegExpCompiler* compiler) {
5368   Zone* zone = compiler->zone();
5369   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5370   int length = alternatives->length();
5371 
5372   int write_posn = 0;
5373   int i = 0;
5374   while (i < length) {
5375     RegExpTree* alternative = alternatives->at(i);
5376     if (!alternative->IsAtom()) {
5377       alternatives->at(write_posn++) = alternatives->at(i);
5378       i++;
5379       continue;
5380     }
5381     RegExpAtom* atom = alternative->AsAtom();
5382     if (atom->length() != 1) {
5383       alternatives->at(write_posn++) = alternatives->at(i);
5384       i++;
5385       continue;
5386     }
5387     int first_in_run = i;
5388     i++;
5389     while (i < length) {
5390       alternative = alternatives->at(i);
5391       if (!alternative->IsAtom()) break;
5392       atom = alternative->AsAtom();
5393       if (atom->length() != 1) break;
5394       i++;
5395     }
5396     if (i > first_in_run + 1) {
5397       // Found non-trivial run of single-character alternatives.
5398       int run_length = i - first_in_run;
5399       ZoneList<CharacterRange>* ranges =
5400           new (zone) ZoneList<CharacterRange>(2, zone);
5401       for (int j = 0; j < run_length; j++) {
5402         RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5403         DCHECK_EQ(old_atom->length(), 1);
5404         ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5405       }
5406       alternatives->at(write_posn++) =
5407           new (zone) RegExpCharacterClass(ranges, false);
5408     } else {
5409       // Just copy any trivial alternatives.
5410       for (int j = first_in_run; j < i; j++) {
5411         alternatives->at(write_posn++) = alternatives->at(j);
5412       }
5413     }
5414   }
5415   alternatives->Rewind(write_posn);  // Trim end of array.
5416 }
5417 
5418 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5419 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5420                                       RegExpNode* on_success) {
5421   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5422 
5423   if (alternatives->length() > 2) {
5424     bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5425     if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5426     FixSingleCharacterDisjunctions(compiler);
5427     if (alternatives->length() == 1) {
5428       return alternatives->at(0)->ToNode(compiler, on_success);
5429     }
5430   }
5431 
5432   int length = alternatives->length();
5433 
5434   ChoiceNode* result =
5435       new(compiler->zone()) ChoiceNode(length, compiler->zone());
5436   for (int i = 0; i < length; i++) {
5437     GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5438                                                                on_success));
5439     result->AddAlternative(alternative);
5440   }
5441   return result;
5442 }
5443 
5444 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5445 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5446                                      RegExpNode* on_success) {
5447   return ToNode(min(),
5448                 max(),
5449                 is_greedy(),
5450                 body(),
5451                 compiler,
5452                 on_success);
5453 }
5454 
5455 
5456 // Scoped object to keep track of how much we unroll quantifier loops in the
5457 // regexp graph generator.
5458 class RegExpExpansionLimiter {
5459  public:
5460   static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)5461   RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5462       : compiler_(compiler),
5463         saved_expansion_factor_(compiler->current_expansion_factor()),
5464         ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5465     DCHECK(factor > 0);
5466     if (ok_to_expand_) {
5467       if (factor > kMaxExpansionFactor) {
5468         // Avoid integer overflow of the current expansion factor.
5469         ok_to_expand_ = false;
5470         compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5471       } else {
5472         int new_factor = saved_expansion_factor_ * factor;
5473         ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5474         compiler->set_current_expansion_factor(new_factor);
5475       }
5476     }
5477   }
5478 
~RegExpExpansionLimiter()5479   ~RegExpExpansionLimiter() {
5480     compiler_->set_current_expansion_factor(saved_expansion_factor_);
5481   }
5482 
ok_to_expand()5483   bool ok_to_expand() { return ok_to_expand_; }
5484 
5485  private:
5486   RegExpCompiler* compiler_;
5487   int saved_expansion_factor_;
5488   bool ok_to_expand_;
5489 
5490   DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5491 };
5492 
5493 
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)5494 RegExpNode* RegExpQuantifier::ToNode(int min,
5495                                      int max,
5496                                      bool is_greedy,
5497                                      RegExpTree* body,
5498                                      RegExpCompiler* compiler,
5499                                      RegExpNode* on_success,
5500                                      bool not_at_start) {
5501   // x{f, t} becomes this:
5502   //
5503   //             (r++)<-.
5504   //               |     `
5505   //               |     (x)
5506   //               v     ^
5507   //      (r=0)-->(?)---/ [if r < t]
5508   //               |
5509   //   [if r >= f] \----> ...
5510   //
5511 
5512   // 15.10.2.5 RepeatMatcher algorithm.
5513   // The parser has already eliminated the case where max is 0.  In the case
5514   // where max_match is zero the parser has removed the quantifier if min was
5515   // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
5516 
5517   // If we know that we cannot match zero length then things are a little
5518   // simpler since we don't need to make the special zero length match check
5519   // from step 2.1.  If the min and max are small we can unroll a little in
5520   // this case.
5521   static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
5522   static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
5523   if (max == 0) return on_success;  // This can happen due to recursion.
5524   bool body_can_be_empty = (body->min_match() == 0);
5525   int body_start_reg = RegExpCompiler::kNoRegister;
5526   Interval capture_registers = body->CaptureRegisters();
5527   bool needs_capture_clearing = !capture_registers.is_empty();
5528   Zone* zone = compiler->zone();
5529 
5530   if (body_can_be_empty) {
5531     body_start_reg = compiler->AllocateRegister();
5532   } else if (compiler->optimize() && !needs_capture_clearing) {
5533     // Only unroll if there are no captures and the body can't be
5534     // empty.
5535     {
5536       RegExpExpansionLimiter limiter(
5537           compiler, min + ((max != min) ? 1 : 0));
5538       if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5539         int new_max = (max == kInfinity) ? max : max - min;
5540         // Recurse once to get the loop or optional matches after the fixed
5541         // ones.
5542         RegExpNode* answer = ToNode(
5543             0, new_max, is_greedy, body, compiler, on_success, true);
5544         // Unroll the forced matches from 0 to min.  This can cause chains of
5545         // TextNodes (which the parser does not generate).  These should be
5546         // combined if it turns out they hinder good code generation.
5547         for (int i = 0; i < min; i++) {
5548           answer = body->ToNode(compiler, answer);
5549         }
5550         return answer;
5551       }
5552     }
5553     if (max <= kMaxUnrolledMaxMatches && min == 0) {
5554       DCHECK(max > 0);  // Due to the 'if' above.
5555       RegExpExpansionLimiter limiter(compiler, max);
5556       if (limiter.ok_to_expand()) {
5557         // Unroll the optional matches up to max.
5558         RegExpNode* answer = on_success;
5559         for (int i = 0; i < max; i++) {
5560           ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5561           if (is_greedy) {
5562             alternation->AddAlternative(
5563                 GuardedAlternative(body->ToNode(compiler, answer)));
5564             alternation->AddAlternative(GuardedAlternative(on_success));
5565           } else {
5566             alternation->AddAlternative(GuardedAlternative(on_success));
5567             alternation->AddAlternative(
5568                 GuardedAlternative(body->ToNode(compiler, answer)));
5569           }
5570           answer = alternation;
5571           if (not_at_start && !compiler->read_backward()) {
5572             alternation->set_not_at_start();
5573           }
5574         }
5575         return answer;
5576       }
5577     }
5578   }
5579   bool has_min = min > 0;
5580   bool has_max = max < RegExpTree::kInfinity;
5581   bool needs_counter = has_min || has_max;
5582   int reg_ctr = needs_counter
5583       ? compiler->AllocateRegister()
5584       : RegExpCompiler::kNoRegister;
5585   LoopChoiceNode* center = new (zone)
5586       LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5587   if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5588   RegExpNode* loop_return = needs_counter
5589       ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5590       : static_cast<RegExpNode*>(center);
5591   if (body_can_be_empty) {
5592     // If the body can be empty we need to check if it was and then
5593     // backtrack.
5594     loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5595                                               reg_ctr,
5596                                               min,
5597                                               loop_return);
5598   }
5599   RegExpNode* body_node = body->ToNode(compiler, loop_return);
5600   if (body_can_be_empty) {
5601     // If the body can be empty we need to store the start position
5602     // so we can bail out if it was empty.
5603     body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5604   }
5605   if (needs_capture_clearing) {
5606     // Before entering the body of this loop we need to clear captures.
5607     body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5608   }
5609   GuardedAlternative body_alt(body_node);
5610   if (has_max) {
5611     Guard* body_guard =
5612         new(zone) Guard(reg_ctr, Guard::LT, max);
5613     body_alt.AddGuard(body_guard, zone);
5614   }
5615   GuardedAlternative rest_alt(on_success);
5616   if (has_min) {
5617     Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5618     rest_alt.AddGuard(rest_guard, zone);
5619   }
5620   if (is_greedy) {
5621     center->AddLoopAlternative(body_alt);
5622     center->AddContinueAlternative(rest_alt);
5623   } else {
5624     center->AddContinueAlternative(rest_alt);
5625     center->AddLoopAlternative(body_alt);
5626   }
5627   if (needs_counter) {
5628     return ActionNode::SetRegister(reg_ctr, 0, center);
5629   } else {
5630     return center;
5631   }
5632 }
5633 
5634 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5635 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5636                                     RegExpNode* on_success) {
5637   NodeInfo info;
5638   Zone* zone = compiler->zone();
5639 
5640   switch (assertion_type()) {
5641     case START_OF_LINE:
5642       return AssertionNode::AfterNewline(on_success);
5643     case START_OF_INPUT:
5644       return AssertionNode::AtStart(on_success);
5645     case BOUNDARY:
5646       return AssertionNode::AtBoundary(on_success);
5647     case NON_BOUNDARY:
5648       return AssertionNode::AtNonBoundary(on_success);
5649     case END_OF_INPUT:
5650       return AssertionNode::AtEnd(on_success);
5651     case END_OF_LINE: {
5652       // Compile $ in multiline regexps as an alternation with a positive
5653       // lookahead in one side and an end-of-input on the other side.
5654       // We need two registers for the lookahead.
5655       int stack_pointer_register = compiler->AllocateRegister();
5656       int position_register = compiler->AllocateRegister();
5657       // The ChoiceNode to distinguish between a newline and end-of-input.
5658       ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5659       // Create a newline atom.
5660       ZoneList<CharacterRange>* newline_ranges =
5661           new(zone) ZoneList<CharacterRange>(3, zone);
5662       CharacterRange::AddClassEscape('n', newline_ranges, zone);
5663       RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
5664       TextNode* newline_matcher = new (zone) TextNode(
5665           newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5666                                    stack_pointer_register, position_register,
5667                                    0,   // No captures inside.
5668                                    -1,  // Ignored if no captures.
5669                                    on_success));
5670       // Create an end-of-input matcher.
5671       RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5672           stack_pointer_register,
5673           position_register,
5674           newline_matcher);
5675       // Add the two alternatives to the ChoiceNode.
5676       GuardedAlternative eol_alternative(end_of_line);
5677       result->AddAlternative(eol_alternative);
5678       GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5679       result->AddAlternative(end_alternative);
5680       return result;
5681     }
5682     default:
5683       UNREACHABLE();
5684   }
5685   return on_success;
5686 }
5687 
5688 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5689 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5690                                         RegExpNode* on_success) {
5691   return new (compiler->zone())
5692       BackReferenceNode(RegExpCapture::StartRegister(index()),
5693                         RegExpCapture::EndRegister(index()),
5694                         compiler->read_backward(), on_success);
5695 }
5696 
5697 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5698 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5699                                 RegExpNode* on_success) {
5700   return on_success;
5701 }
5702 
5703 
Builder(bool is_positive,RegExpNode * on_success,int stack_pointer_register,int position_register,int capture_register_count,int capture_register_start)5704 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5705                                    int stack_pointer_register,
5706                                    int position_register,
5707                                    int capture_register_count,
5708                                    int capture_register_start)
5709     : is_positive_(is_positive),
5710       on_success_(on_success),
5711       stack_pointer_register_(stack_pointer_register),
5712       position_register_(position_register) {
5713   if (is_positive_) {
5714     on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5715         stack_pointer_register, position_register, capture_register_count,
5716         capture_register_start, on_success_);
5717   } else {
5718     Zone* zone = on_success_->zone();
5719     on_match_success_ = new (zone) NegativeSubmatchSuccess(
5720         stack_pointer_register, position_register, capture_register_count,
5721         capture_register_start, zone);
5722   }
5723 }
5724 
5725 
ForMatch(RegExpNode * match)5726 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5727   if (is_positive_) {
5728     return ActionNode::BeginSubmatch(stack_pointer_register_,
5729                                      position_register_, match);
5730   } else {
5731     Zone* zone = on_success_->zone();
5732     // We use a ChoiceNode to represent the negative lookaround. The first
5733     // alternative is the negative match. On success, the end node backtracks.
5734     // On failure, the second alternative is tried and leads to success.
5735     // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5736     // first exit when calculating quick checks.
5737     ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5738         GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5739     return ActionNode::BeginSubmatch(stack_pointer_register_,
5740                                      position_register_, choice_node);
5741   }
5742 }
5743 
5744 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5745 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5746                                      RegExpNode* on_success) {
5747   int stack_pointer_register = compiler->AllocateRegister();
5748   int position_register = compiler->AllocateRegister();
5749 
5750   const int registers_per_capture = 2;
5751   const int register_of_first_capture = 2;
5752   int register_count = capture_count_ * registers_per_capture;
5753   int register_start =
5754     register_of_first_capture + capture_from_ * registers_per_capture;
5755 
5756   RegExpNode* result;
5757   bool was_reading_backward = compiler->read_backward();
5758   compiler->set_read_backward(type() == LOOKBEHIND);
5759   Builder builder(is_positive(), on_success, stack_pointer_register,
5760                   position_register, register_count, register_start);
5761   RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5762   result = builder.ForMatch(match);
5763   compiler->set_read_backward(was_reading_backward);
5764   return result;
5765 }
5766 
5767 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5768 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5769                                   RegExpNode* on_success) {
5770   return ToNode(body(), index(), compiler, on_success);
5771 }
5772 
5773 
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)5774 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5775                                   int index,
5776                                   RegExpCompiler* compiler,
5777                                   RegExpNode* on_success) {
5778   DCHECK_NOT_NULL(body);
5779   int start_reg = RegExpCapture::StartRegister(index);
5780   int end_reg = RegExpCapture::EndRegister(index);
5781   if (compiler->read_backward()) std::swap(start_reg, end_reg);
5782   RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5783   RegExpNode* body_node = body->ToNode(compiler, store_end);
5784   return ActionNode::StorePosition(start_reg, true, body_node);
5785 }
5786 
5787 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5788 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5789                                       RegExpNode* on_success) {
5790   ZoneList<RegExpTree*>* children = nodes();
5791   RegExpNode* current = on_success;
5792   if (compiler->read_backward()) {
5793     for (int i = 0; i < children->length(); i++) {
5794       current = children->at(i)->ToNode(compiler, current);
5795     }
5796   } else {
5797     for (int i = children->length() - 1; i >= 0; i--) {
5798       current = children->at(i)->ToNode(compiler, current);
5799     }
5800   }
5801   return current;
5802 }
5803 
5804 
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5805 static void AddClass(const int* elmv,
5806                      int elmc,
5807                      ZoneList<CharacterRange>* ranges,
5808                      Zone* zone) {
5809   elmc--;
5810   DCHECK(elmv[elmc] == kRangeEndMarker);
5811   for (int i = 0; i < elmc; i += 2) {
5812     DCHECK(elmv[i] < elmv[i + 1]);
5813     ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5814   }
5815 }
5816 
5817 
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5818 static void AddClassNegated(const int *elmv,
5819                             int elmc,
5820                             ZoneList<CharacterRange>* ranges,
5821                             Zone* zone) {
5822   elmc--;
5823   DCHECK(elmv[elmc] == kRangeEndMarker);
5824   DCHECK(elmv[0] != 0x0000);
5825   DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
5826   uc16 last = 0x0000;
5827   for (int i = 0; i < elmc; i += 2) {
5828     DCHECK(last <= elmv[i] - 1);
5829     DCHECK(elmv[i] < elmv[i + 1]);
5830     ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5831     last = elmv[i + 1];
5832   }
5833   ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5834 }
5835 
5836 
AddClassEscape(uc16 type,ZoneList<CharacterRange> * ranges,Zone * zone)5837 void CharacterRange::AddClassEscape(uc16 type,
5838                                     ZoneList<CharacterRange>* ranges,
5839                                     Zone* zone) {
5840   switch (type) {
5841     case 's':
5842       AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5843       break;
5844     case 'S':
5845       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5846       break;
5847     case 'w':
5848       AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5849       break;
5850     case 'W':
5851       AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5852       break;
5853     case 'd':
5854       AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5855       break;
5856     case 'D':
5857       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5858       break;
5859     case '.':
5860       AddClassNegated(kLineTerminatorRanges,
5861                       kLineTerminatorRangeCount,
5862                       ranges,
5863                       zone);
5864       break;
5865     // This is not a character range as defined by the spec but a
5866     // convenient shorthand for a character class that matches any
5867     // character.
5868     case '*':
5869       ranges->Add(CharacterRange::Everything(), zone);
5870       break;
5871     // This is the set of characters matched by the $ and ^ symbols
5872     // in multiline mode.
5873     case 'n':
5874       AddClass(kLineTerminatorRanges,
5875                kLineTerminatorRangeCount,
5876                ranges,
5877                zone);
5878       break;
5879     default:
5880       UNREACHABLE();
5881   }
5882 }
5883 
5884 
GetWordBounds()5885 Vector<const int> CharacterRange::GetWordBounds() {
5886   return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5887 }
5888 
5889 
AddCaseEquivalents(Isolate * isolate,Zone * zone,ZoneList<CharacterRange> * ranges,bool is_one_byte)5890 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5891                                         ZoneList<CharacterRange>* ranges,
5892                                         bool is_one_byte) {
5893   CharacterRange::Canonicalize(ranges);
5894   int range_count = ranges->length();
5895   for (int i = 0; i < range_count; i++) {
5896     CharacterRange range = ranges->at(i);
5897     uc32 bottom = range.from();
5898     if (bottom > String::kMaxUtf16CodeUnit) return;
5899     uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5900     // Nothing to be done for surrogates.
5901     if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
5902     if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5903       if (bottom > String::kMaxOneByteCharCode) return;
5904       if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5905     }
5906     unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5907     if (top == bottom) {
5908       // If this is a singleton we just expand the one character.
5909       int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5910       for (int i = 0; i < length; i++) {
5911         uc32 chr = chars[i];
5912         if (chr != bottom) {
5913           ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5914         }
5915       }
5916     } else {
5917       // If this is a range we expand the characters block by block, expanding
5918       // contiguous subranges (blocks) one at a time.  The approach is as
5919       // follows.  For a given start character we look up the remainder of the
5920       // block that contains it (represented by the end point), for instance we
5921       // find 'z' if the character is 'c'.  A block is characterized by the
5922       // property that all characters uncanonicalize in the same way, except
5923       // that each entry in the result is incremented by the distance from the
5924       // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',
5925       // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once
5926       // we've found the end point we look up its uncanonicalization and
5927       // produce a range for each element.  For instance for [c-f] we look up
5928       // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if
5929       // it is not already contained in the input, so [c-f] will be skipped but
5930       // [C-F] will be added.  If this range is not completely contained in a
5931       // block we do this for all the blocks covered by the range (handling
5932       // characters that is not in a block as a "singleton block").
5933       unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5934       int pos = bottom;
5935       while (pos <= top) {
5936         int length =
5937             isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5938         uc32 block_end;
5939         if (length == 0) {
5940           block_end = pos;
5941         } else {
5942           DCHECK_EQ(1, length);
5943           block_end = equivalents[0];
5944         }
5945         int end = (block_end > top) ? top : block_end;
5946         length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5947                                                          equivalents);
5948         for (int i = 0; i < length; i++) {
5949           uc32 c = equivalents[i];
5950           uc32 range_from = c - (block_end - pos);
5951           uc32 range_to = c - (block_end - end);
5952           if (!(bottom <= range_from && range_to <= top)) {
5953             ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5954           }
5955         }
5956         pos = end + 1;
5957       }
5958     }
5959   }
5960 }
5961 
5962 
IsCanonical(ZoneList<CharacterRange> * ranges)5963 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5964   DCHECK_NOT_NULL(ranges);
5965   int n = ranges->length();
5966   if (n <= 1) return true;
5967   int max = ranges->at(0).to();
5968   for (int i = 1; i < n; i++) {
5969     CharacterRange next_range = ranges->at(i);
5970     if (next_range.from() <= max + 1) return false;
5971     max = next_range.to();
5972   }
5973   return true;
5974 }
5975 
5976 
ranges(Zone * zone)5977 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5978   if (ranges_ == NULL) {
5979     ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5980     CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5981   }
5982   return ranges_;
5983 }
5984 
5985 
5986 // Move a number of elements in a zonelist to another position
5987 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)5988 static void MoveRanges(ZoneList<CharacterRange>* list,
5989                        int from,
5990                        int to,
5991                        int count) {
5992   // Ranges are potentially overlapping.
5993   if (from < to) {
5994     for (int i = count - 1; i >= 0; i--) {
5995       list->at(to + i) = list->at(from + i);
5996     }
5997   } else {
5998     for (int i = 0; i < count; i++) {
5999       list->at(to + i) = list->at(from + i);
6000     }
6001   }
6002 }
6003 
6004 
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)6005 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
6006                                       int count,
6007                                       CharacterRange insert) {
6008   // Inserts a range into list[0..count[, which must be sorted
6009   // by from value and non-overlapping and non-adjacent, using at most
6010   // list[0..count] for the result. Returns the number of resulting
6011   // canonicalized ranges. Inserting a range may collapse existing ranges into
6012   // fewer ranges, so the return value can be anything in the range 1..count+1.
6013   uc32 from = insert.from();
6014   uc32 to = insert.to();
6015   int start_pos = 0;
6016   int end_pos = count;
6017   for (int i = count - 1; i >= 0; i--) {
6018     CharacterRange current = list->at(i);
6019     if (current.from() > to + 1) {
6020       end_pos = i;
6021     } else if (current.to() + 1 < from) {
6022       start_pos = i + 1;
6023       break;
6024     }
6025   }
6026 
6027   // Inserted range overlaps, or is adjacent to, ranges at positions
6028   // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6029   // not affected by the insertion.
6030   // If start_pos == end_pos, the range must be inserted before start_pos.
6031   // if start_pos < end_pos, the entire range from start_pos to end_pos
6032   // must be merged with the insert range.
6033 
6034   if (start_pos == end_pos) {
6035     // Insert between existing ranges at position start_pos.
6036     if (start_pos < count) {
6037       MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6038     }
6039     list->at(start_pos) = insert;
6040     return count + 1;
6041   }
6042   if (start_pos + 1 == end_pos) {
6043     // Replace single existing range at position start_pos.
6044     CharacterRange to_replace = list->at(start_pos);
6045     int new_from = Min(to_replace.from(), from);
6046     int new_to = Max(to_replace.to(), to);
6047     list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6048     return count;
6049   }
6050   // Replace a number of existing ranges from start_pos to end_pos - 1.
6051   // Move the remaining ranges down.
6052 
6053   int new_from = Min(list->at(start_pos).from(), from);
6054   int new_to = Max(list->at(end_pos - 1).to(), to);
6055   if (end_pos < count) {
6056     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6057   }
6058   list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6059   return count - (end_pos - start_pos) + 1;
6060 }
6061 
6062 
Canonicalize()6063 void CharacterSet::Canonicalize() {
6064   // Special/default classes are always considered canonical. The result
6065   // of calling ranges() will be sorted.
6066   if (ranges_ == NULL) return;
6067   CharacterRange::Canonicalize(ranges_);
6068 }
6069 
6070 
Canonicalize(ZoneList<CharacterRange> * character_ranges)6071 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6072   if (character_ranges->length() <= 1) return;
6073   // Check whether ranges are already canonical (increasing, non-overlapping,
6074   // non-adjacent).
6075   int n = character_ranges->length();
6076   int max = character_ranges->at(0).to();
6077   int i = 1;
6078   while (i < n) {
6079     CharacterRange current = character_ranges->at(i);
6080     if (current.from() <= max + 1) {
6081       break;
6082     }
6083     max = current.to();
6084     i++;
6085   }
6086   // Canonical until the i'th range. If that's all of them, we are done.
6087   if (i == n) return;
6088 
6089   // The ranges at index i and forward are not canonicalized. Make them so by
6090   // doing the equivalent of insertion sort (inserting each into the previous
6091   // list, in order).
6092   // Notice that inserting a range can reduce the number of ranges in the
6093   // result due to combining of adjacent and overlapping ranges.
6094   int read = i;  // Range to insert.
6095   int num_canonical = i;  // Length of canonicalized part of list.
6096   do {
6097     num_canonical = InsertRangeInCanonicalList(character_ranges,
6098                                                num_canonical,
6099                                                character_ranges->at(read));
6100     read++;
6101   } while (read < n);
6102   character_ranges->Rewind(num_canonical);
6103 
6104   DCHECK(CharacterRange::IsCanonical(character_ranges));
6105 }
6106 
6107 
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)6108 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6109                             ZoneList<CharacterRange>* negated_ranges,
6110                             Zone* zone) {
6111   DCHECK(CharacterRange::IsCanonical(ranges));
6112   DCHECK_EQ(0, negated_ranges->length());
6113   int range_count = ranges->length();
6114   uc32 from = 0;
6115   int i = 0;
6116   if (range_count > 0 && ranges->at(0).from() == 0) {
6117     from = ranges->at(0).to() + 1;
6118     i = 1;
6119   }
6120   while (i < range_count) {
6121     CharacterRange range = ranges->at(i);
6122     negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6123     from = range.to() + 1;
6124     i++;
6125   }
6126   if (from < String::kMaxCodePoint) {
6127     negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6128                         zone);
6129   }
6130 }
6131 
6132 
6133 // -------------------------------------------------------------------
6134 // Splay tree
6135 
6136 
Extend(unsigned value,Zone * zone)6137 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6138   if (Get(value))
6139     return this;
6140   if (successors(zone) != NULL) {
6141     for (int i = 0; i < successors(zone)->length(); i++) {
6142       OutSet* successor = successors(zone)->at(i);
6143       if (successor->Get(value))
6144         return successor;
6145     }
6146   } else {
6147     successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6148   }
6149   OutSet* result = new(zone) OutSet(first_, remaining_);
6150   result->Set(value, zone);
6151   successors(zone)->Add(result, zone);
6152   return result;
6153 }
6154 
6155 
Set(unsigned value,Zone * zone)6156 void OutSet::Set(unsigned value, Zone *zone) {
6157   if (value < kFirstLimit) {
6158     first_ |= (1 << value);
6159   } else {
6160     if (remaining_ == NULL)
6161       remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6162     if (remaining_->is_empty() || !remaining_->Contains(value))
6163       remaining_->Add(value, zone);
6164   }
6165 }
6166 
6167 
Get(unsigned value) const6168 bool OutSet::Get(unsigned value) const {
6169   if (value < kFirstLimit) {
6170     return (first_ & (1 << value)) != 0;
6171   } else if (remaining_ == NULL) {
6172     return false;
6173   } else {
6174     return remaining_->Contains(value);
6175   }
6176 }
6177 
6178 
6179 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6180 
6181 
AddRange(CharacterRange full_range,int value,Zone * zone)6182 void DispatchTable::AddRange(CharacterRange full_range, int value,
6183                              Zone* zone) {
6184   CharacterRange current = full_range;
6185   if (tree()->is_empty()) {
6186     // If this is the first range we just insert into the table.
6187     ZoneSplayTree<Config>::Locator loc;
6188     bool inserted = tree()->Insert(current.from(), &loc);
6189     DCHECK(inserted);
6190     USE(inserted);
6191     loc.set_value(Entry(current.from(), current.to(),
6192                         empty()->Extend(value, zone)));
6193     return;
6194   }
6195   // First see if there is a range to the left of this one that
6196   // overlaps.
6197   ZoneSplayTree<Config>::Locator loc;
6198   if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6199     Entry* entry = &loc.value();
6200     // If we've found a range that overlaps with this one, and it
6201     // starts strictly to the left of this one, we have to fix it
6202     // because the following code only handles ranges that start on
6203     // or after the start point of the range we're adding.
6204     if (entry->from() < current.from() && entry->to() >= current.from()) {
6205       // Snap the overlapping range in half around the start point of
6206       // the range we're adding.
6207       CharacterRange left =
6208           CharacterRange::Range(entry->from(), current.from() - 1);
6209       CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6210       // The left part of the overlapping range doesn't overlap.
6211       // Truncate the whole entry to be just the left part.
6212       entry->set_to(left.to());
6213       // The right part is the one that overlaps.  We add this part
6214       // to the map and let the next step deal with merging it with
6215       // the range we're adding.
6216       ZoneSplayTree<Config>::Locator loc;
6217       bool inserted = tree()->Insert(right.from(), &loc);
6218       DCHECK(inserted);
6219       USE(inserted);
6220       loc.set_value(Entry(right.from(),
6221                           right.to(),
6222                           entry->out_set()));
6223     }
6224   }
6225   while (current.is_valid()) {
6226     if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6227         (loc.value().from() <= current.to()) &&
6228         (loc.value().to() >= current.from())) {
6229       Entry* entry = &loc.value();
6230       // We have overlap.  If there is space between the start point of
6231       // the range we're adding and where the overlapping range starts
6232       // then we have to add a range covering just that space.
6233       if (current.from() < entry->from()) {
6234         ZoneSplayTree<Config>::Locator ins;
6235         bool inserted = tree()->Insert(current.from(), &ins);
6236         DCHECK(inserted);
6237         USE(inserted);
6238         ins.set_value(Entry(current.from(),
6239                             entry->from() - 1,
6240                             empty()->Extend(value, zone)));
6241         current.set_from(entry->from());
6242       }
6243       DCHECK_EQ(current.from(), entry->from());
6244       // If the overlapping range extends beyond the one we want to add
6245       // we have to snap the right part off and add it separately.
6246       if (entry->to() > current.to()) {
6247         ZoneSplayTree<Config>::Locator ins;
6248         bool inserted = tree()->Insert(current.to() + 1, &ins);
6249         DCHECK(inserted);
6250         USE(inserted);
6251         ins.set_value(Entry(current.to() + 1,
6252                             entry->to(),
6253                             entry->out_set()));
6254         entry->set_to(current.to());
6255       }
6256       DCHECK(entry->to() <= current.to());
6257       // The overlapping range is now completely contained by the range
6258       // we're adding so we can just update it and move the start point
6259       // of the range we're adding just past it.
6260       entry->AddValue(value, zone);
6261       DCHECK(entry->to() + 1 > current.from());
6262       current.set_from(entry->to() + 1);
6263     } else {
6264       // There is no overlap so we can just add the range
6265       ZoneSplayTree<Config>::Locator ins;
6266       bool inserted = tree()->Insert(current.from(), &ins);
6267       DCHECK(inserted);
6268       USE(inserted);
6269       ins.set_value(Entry(current.from(),
6270                           current.to(),
6271                           empty()->Extend(value, zone)));
6272       break;
6273     }
6274   }
6275 }
6276 
6277 
Get(uc32 value)6278 OutSet* DispatchTable::Get(uc32 value) {
6279   ZoneSplayTree<Config>::Locator loc;
6280   if (!tree()->FindGreatestLessThan(value, &loc))
6281     return empty();
6282   Entry* entry = &loc.value();
6283   if (value <= entry->to())
6284     return entry->out_set();
6285   else
6286     return empty();
6287 }
6288 
6289 
6290 // -------------------------------------------------------------------
6291 // Analysis
6292 
6293 
EnsureAnalyzed(RegExpNode * that)6294 void Analysis::EnsureAnalyzed(RegExpNode* that) {
6295   StackLimitCheck check(isolate());
6296   if (check.HasOverflowed()) {
6297     fail("Stack overflow");
6298     return;
6299   }
6300   if (that->info()->been_analyzed || that->info()->being_analyzed)
6301     return;
6302   that->info()->being_analyzed = true;
6303   that->Accept(this);
6304   that->info()->being_analyzed = false;
6305   that->info()->been_analyzed = true;
6306 }
6307 
6308 
VisitEnd(EndNode * that)6309 void Analysis::VisitEnd(EndNode* that) {
6310   // nothing to do
6311 }
6312 
6313 
CalculateOffsets()6314 void TextNode::CalculateOffsets() {
6315   int element_count = elements()->length();
6316   // Set up the offsets of the elements relative to the start.  This is a fixed
6317   // quantity since a TextNode can only contain fixed-width things.
6318   int cp_offset = 0;
6319   for (int i = 0; i < element_count; i++) {
6320     TextElement& elm = elements()->at(i);
6321     elm.set_cp_offset(cp_offset);
6322     cp_offset += elm.length();
6323   }
6324 }
6325 
6326 
VisitText(TextNode * that)6327 void Analysis::VisitText(TextNode* that) {
6328   if (ignore_case()) {
6329     that->MakeCaseIndependent(isolate(), is_one_byte_);
6330   }
6331   EnsureAnalyzed(that->on_success());
6332   if (!has_failed()) {
6333     that->CalculateOffsets();
6334   }
6335 }
6336 
6337 
VisitAction(ActionNode * that)6338 void Analysis::VisitAction(ActionNode* that) {
6339   RegExpNode* target = that->on_success();
6340   EnsureAnalyzed(target);
6341   if (!has_failed()) {
6342     // If the next node is interested in what it follows then this node
6343     // has to be interested too so it can pass the information on.
6344     that->info()->AddFromFollowing(target->info());
6345   }
6346 }
6347 
6348 
VisitChoice(ChoiceNode * that)6349 void Analysis::VisitChoice(ChoiceNode* that) {
6350   NodeInfo* info = that->info();
6351   for (int i = 0; i < that->alternatives()->length(); i++) {
6352     RegExpNode* node = that->alternatives()->at(i).node();
6353     EnsureAnalyzed(node);
6354     if (has_failed()) return;
6355     // Anything the following nodes need to know has to be known by
6356     // this node also, so it can pass it on.
6357     info->AddFromFollowing(node->info());
6358   }
6359 }
6360 
6361 
VisitLoopChoice(LoopChoiceNode * that)6362 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6363   NodeInfo* info = that->info();
6364   for (int i = 0; i < that->alternatives()->length(); i++) {
6365     RegExpNode* node = that->alternatives()->at(i).node();
6366     if (node != that->loop_node()) {
6367       EnsureAnalyzed(node);
6368       if (has_failed()) return;
6369       info->AddFromFollowing(node->info());
6370     }
6371   }
6372   // Check the loop last since it may need the value of this node
6373   // to get a correct result.
6374   EnsureAnalyzed(that->loop_node());
6375   if (!has_failed()) {
6376     info->AddFromFollowing(that->loop_node()->info());
6377   }
6378 }
6379 
6380 
VisitBackReference(BackReferenceNode * that)6381 void Analysis::VisitBackReference(BackReferenceNode* that) {
6382   EnsureAnalyzed(that->on_success());
6383 }
6384 
6385 
VisitAssertion(AssertionNode * that)6386 void Analysis::VisitAssertion(AssertionNode* that) {
6387   EnsureAnalyzed(that->on_success());
6388 }
6389 
6390 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6391 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6392                                      BoyerMooreLookahead* bm,
6393                                      bool not_at_start) {
6394   // Working out the set of characters that a backreference can match is too
6395   // hard, so we just say that any character can match.
6396   bm->SetRest(offset);
6397   SaveBMInfo(bm, not_at_start, offset);
6398 }
6399 
6400 
6401 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6402               RegExpMacroAssembler::kTableSize);
6403 
6404 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6405 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6406                               BoyerMooreLookahead* bm, bool not_at_start) {
6407   ZoneList<GuardedAlternative>* alts = alternatives();
6408   budget = (budget - 1) / alts->length();
6409   for (int i = 0; i < alts->length(); i++) {
6410     GuardedAlternative& alt = alts->at(i);
6411     if (alt.guards() != NULL && alt.guards()->length() != 0) {
6412       bm->SetRest(offset);  // Give up trying to fill in info.
6413       SaveBMInfo(bm, not_at_start, offset);
6414       return;
6415     }
6416     alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6417   }
6418   SaveBMInfo(bm, not_at_start, offset);
6419 }
6420 
6421 
FillInBMInfo(Isolate * isolate,int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6422 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6423                             BoyerMooreLookahead* bm, bool not_at_start) {
6424   if (initial_offset >= bm->length()) return;
6425   int offset = initial_offset;
6426   int max_char = bm->max_char();
6427   for (int i = 0; i < elements()->length(); i++) {
6428     if (offset >= bm->length()) {
6429       if (initial_offset == 0) set_bm_info(not_at_start, bm);
6430       return;
6431     }
6432     TextElement text = elements()->at(i);
6433     if (text.text_type() == TextElement::ATOM) {
6434       RegExpAtom* atom = text.atom();
6435       for (int j = 0; j < atom->length(); j++, offset++) {
6436         if (offset >= bm->length()) {
6437           if (initial_offset == 0) set_bm_info(not_at_start, bm);
6438           return;
6439         }
6440         uc16 character = atom->data()[j];
6441         if (bm->compiler()->ignore_case()) {
6442           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6443           int length = GetCaseIndependentLetters(
6444               isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6445               chars);
6446           for (int j = 0; j < length; j++) {
6447             bm->Set(offset, chars[j]);
6448           }
6449         } else {
6450           if (character <= max_char) bm->Set(offset, character);
6451         }
6452       }
6453     } else {
6454       DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6455       RegExpCharacterClass* char_class = text.char_class();
6456       ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6457       if (char_class->is_negated()) {
6458         bm->SetAll(offset);
6459       } else {
6460         for (int k = 0; k < ranges->length(); k++) {
6461           CharacterRange& range = ranges->at(k);
6462           if (range.from() > max_char) continue;
6463           int to = Min(max_char, static_cast<int>(range.to()));
6464           bm->SetInterval(offset, Interval(range.from(), to));
6465         }
6466       }
6467       offset++;
6468     }
6469   }
6470   if (offset >= bm->length()) {
6471     if (initial_offset == 0) set_bm_info(not_at_start, bm);
6472     return;
6473   }
6474   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6475                              true);  // Not at start after a text node.
6476   if (initial_offset == 0) set_bm_info(not_at_start, bm);
6477 }
6478 
6479 
6480 // -------------------------------------------------------------------
6481 // Dispatch table construction
6482 
6483 
VisitEnd(EndNode * that)6484 void DispatchTableConstructor::VisitEnd(EndNode* that) {
6485   AddRange(CharacterRange::Everything());
6486 }
6487 
6488 
BuildTable(ChoiceNode * node)6489 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6490   node->set_being_calculated(true);
6491   ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6492   for (int i = 0; i < alternatives->length(); i++) {
6493     set_choice_index(i);
6494     alternatives->at(i).node()->Accept(this);
6495   }
6496   node->set_being_calculated(false);
6497 }
6498 
6499 
6500 class AddDispatchRange {
6501  public:
AddDispatchRange(DispatchTableConstructor * constructor)6502   explicit AddDispatchRange(DispatchTableConstructor* constructor)
6503     : constructor_(constructor) { }
6504   void Call(uc32 from, DispatchTable::Entry entry);
6505  private:
6506   DispatchTableConstructor* constructor_;
6507 };
6508 
6509 
Call(uc32 from,DispatchTable::Entry entry)6510 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6511   constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6512 }
6513 
6514 
VisitChoice(ChoiceNode * node)6515 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6516   if (node->being_calculated())
6517     return;
6518   DispatchTable* table = node->GetTable(ignore_case_);
6519   AddDispatchRange adder(this);
6520   table->ForEach(&adder);
6521 }
6522 
6523 
VisitBackReference(BackReferenceNode * that)6524 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6525   // TODO(160): Find the node that we refer back to and propagate its start
6526   // set back to here.  For now we just accept anything.
6527   AddRange(CharacterRange::Everything());
6528 }
6529 
6530 
VisitAssertion(AssertionNode * that)6531 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6532   RegExpNode* target = that->on_success();
6533   target->Accept(this);
6534 }
6535 
6536 
CompareRangeByFrom(const CharacterRange * a,const CharacterRange * b)6537 static int CompareRangeByFrom(const CharacterRange* a,
6538                               const CharacterRange* b) {
6539   return Compare<uc16>(a->from(), b->from());
6540 }
6541 
6542 
AddInverse(ZoneList<CharacterRange> * ranges)6543 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6544   ranges->Sort(CompareRangeByFrom);
6545   uc16 last = 0;
6546   for (int i = 0; i < ranges->length(); i++) {
6547     CharacterRange range = ranges->at(i);
6548     if (last < range.from())
6549       AddRange(CharacterRange::Range(last, range.from() - 1));
6550     if (range.to() >= last) {
6551       if (range.to() == String::kMaxCodePoint) {
6552         return;
6553       } else {
6554         last = range.to() + 1;
6555       }
6556     }
6557   }
6558   AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6559 }
6560 
6561 
VisitText(TextNode * that)6562 void DispatchTableConstructor::VisitText(TextNode* that) {
6563   TextElement elm = that->elements()->at(0);
6564   switch (elm.text_type()) {
6565     case TextElement::ATOM: {
6566       uc16 c = elm.atom()->data()[0];
6567       AddRange(CharacterRange::Range(c, c));
6568       break;
6569     }
6570     case TextElement::CHAR_CLASS: {
6571       RegExpCharacterClass* tree = elm.char_class();
6572       ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6573       if (tree->is_negated()) {
6574         AddInverse(ranges);
6575       } else {
6576         for (int i = 0; i < ranges->length(); i++)
6577           AddRange(ranges->at(i));
6578       }
6579       break;
6580     }
6581     default: {
6582       UNIMPLEMENTED();
6583     }
6584   }
6585 }
6586 
6587 
VisitAction(ActionNode * that)6588 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6589   RegExpNode* target = that->on_success();
6590   target->Accept(this);
6591 }
6592 
6593 
OptionallyStepBackToLeadSurrogate(RegExpCompiler * compiler,RegExpNode * on_success)6594 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6595                                               RegExpNode* on_success) {
6596   // If the regexp matching starts within a surrogate pair, step back
6597   // to the lead surrogate and start matching from there.
6598   DCHECK(!compiler->read_backward());
6599   Zone* zone = compiler->zone();
6600   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6601       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6602   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6603       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6604 
6605   ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6606 
6607   int stack_register = compiler->UnicodeLookaroundStackRegister();
6608   int position_register = compiler->UnicodeLookaroundPositionRegister();
6609   RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6610       zone, lead_surrogates, true, on_success);
6611   RegExpLookaround::Builder builder(true, step_back, stack_register,
6612                                     position_register);
6613   RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6614       zone, trail_surrogates, false, builder.on_match_success());
6615 
6616   optional_step_back->AddAlternative(
6617       GuardedAlternative(builder.ForMatch(match_trail)));
6618   optional_step_back->AddAlternative(GuardedAlternative(on_success));
6619 
6620   return optional_step_back;
6621 }
6622 
6623 
Compile(Isolate * isolate,Zone * zone,RegExpCompileData * data,JSRegExp::Flags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte)6624 RegExpEngine::CompilationResult RegExpEngine::Compile(
6625     Isolate* isolate, Zone* zone, RegExpCompileData* data,
6626     JSRegExp::Flags flags, Handle<String> pattern,
6627     Handle<String> sample_subject, bool is_one_byte) {
6628   if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6629     return IrregexpRegExpTooBig(isolate);
6630   }
6631   bool ignore_case = flags & JSRegExp::kIgnoreCase;
6632   bool is_sticky = flags & JSRegExp::kSticky;
6633   bool is_global = flags & JSRegExp::kGlobal;
6634   bool is_unicode = flags & JSRegExp::kUnicode;
6635   RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
6636                           is_one_byte);
6637 
6638   if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6639 
6640   // Sample some characters from the middle of the string.
6641   static const int kSampleSize = 128;
6642 
6643   sample_subject = String::Flatten(sample_subject);
6644   int chars_sampled = 0;
6645   int half_way = (sample_subject->length() - kSampleSize) / 2;
6646   for (int i = Max(0, half_way);
6647        i < sample_subject->length() && chars_sampled < kSampleSize;
6648        i++, chars_sampled++) {
6649     compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6650   }
6651 
6652   // Wrap the body of the regexp in capture #0.
6653   RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6654                                                     0,
6655                                                     &compiler,
6656                                                     compiler.accept());
6657   RegExpNode* node = captured_body;
6658   bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6659   bool is_start_anchored = data->tree->IsAnchoredAtStart();
6660   int max_length = data->tree->max_match();
6661   if (!is_start_anchored && !is_sticky) {
6662     // Add a .*? at the beginning, outside the body capture, unless
6663     // this expression is anchored at the beginning or sticky.
6664     RegExpNode* loop_node = RegExpQuantifier::ToNode(
6665         0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
6666         &compiler, captured_body, data->contains_anchor);
6667 
6668     if (data->contains_anchor) {
6669       // Unroll loop once, to take care of the case that might start
6670       // at the start of input.
6671       ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6672       first_step_node->AddAlternative(GuardedAlternative(captured_body));
6673       first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6674           new (zone) RegExpCharacterClass('*'), false, loop_node)));
6675       node = first_step_node;
6676     } else {
6677       node = loop_node;
6678     }
6679   }
6680   if (is_one_byte) {
6681     node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6682     // Do it again to propagate the new nodes to places where they were not
6683     // put because they had not been calculated yet.
6684     if (node != NULL) {
6685       node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6686     }
6687   } else if (compiler.unicode() && (is_global || is_sticky)) {
6688     node = OptionallyStepBackToLeadSurrogate(&compiler, node);
6689   }
6690 
6691   if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6692   data->node = node;
6693   Analysis analysis(isolate, flags, is_one_byte);
6694   analysis.EnsureAnalyzed(node);
6695   if (analysis.has_failed()) {
6696     const char* error_message = analysis.error_message();
6697     return CompilationResult(isolate, error_message);
6698   }
6699 
6700   // Create the correct assembler for the architecture.
6701 #ifndef V8_INTERPRETED_REGEXP
6702   // Native regexp implementation.
6703 
6704   NativeRegExpMacroAssembler::Mode mode =
6705       is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6706                   : NativeRegExpMacroAssembler::UC16;
6707 
6708 #if V8_TARGET_ARCH_IA32
6709   RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6710                                            (data->capture_count + 1) * 2);
6711 #elif V8_TARGET_ARCH_X64
6712   RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6713                                           (data->capture_count + 1) * 2);
6714 #elif V8_TARGET_ARCH_ARM
6715   RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6716                                           (data->capture_count + 1) * 2);
6717 #elif V8_TARGET_ARCH_ARM64
6718   RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6719                                             (data->capture_count + 1) * 2);
6720 #elif V8_TARGET_ARCH_S390
6721   RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6722                                            (data->capture_count + 1) * 2);
6723 #elif V8_TARGET_ARCH_PPC
6724   RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6725                                           (data->capture_count + 1) * 2);
6726 #elif V8_TARGET_ARCH_MIPS
6727   RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6728                                            (data->capture_count + 1) * 2);
6729 #elif V8_TARGET_ARCH_MIPS64
6730   RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6731                                            (data->capture_count + 1) * 2);
6732 #elif V8_TARGET_ARCH_X87
6733   RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
6734                                           (data->capture_count + 1) * 2);
6735 #else
6736 #error "Unsupported architecture"
6737 #endif
6738 
6739 #else  // V8_INTERPRETED_REGEXP
6740   // Interpreted regexp implementation.
6741   EmbeddedVector<byte, 1024> codes;
6742   RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6743 #endif  // V8_INTERPRETED_REGEXP
6744 
6745   macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
6746 
6747   // Inserted here, instead of in Assembler, because it depends on information
6748   // in the AST that isn't replicated in the Node structure.
6749   static const int kMaxBacksearchLimit = 1024;
6750   if (is_end_anchored &&
6751       !is_start_anchored &&
6752       max_length < kMaxBacksearchLimit) {
6753     macro_assembler.SetCurrentPositionFromEnd(max_length);
6754   }
6755 
6756   if (is_global) {
6757     RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6758     if (data->tree->min_match() > 0) {
6759       mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6760     } else if (is_unicode) {
6761       mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6762     }
6763     macro_assembler.set_global_mode(mode);
6764   }
6765 
6766   return compiler.Assemble(&macro_assembler,
6767                            node,
6768                            data->capture_count,
6769                            pattern);
6770 }
6771 
6772 
TooMuchRegExpCode(Handle<String> pattern)6773 bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
6774   Heap* heap = pattern->GetHeap();
6775   bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6776   if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
6777       heap->memory_allocator()->SizeExecutable() >
6778           RegExpImpl::kRegExpExecutableMemoryLimit) {
6779     too_much = true;
6780   }
6781   return too_much;
6782 }
6783 
6784 
Lookup(Heap * heap,String * key_string,Object * key_pattern,FixedArray ** last_match_cache,ResultsCacheType type)6785 Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6786                                    Object* key_pattern,
6787                                    FixedArray** last_match_cache,
6788                                    ResultsCacheType type) {
6789   FixedArray* cache;
6790   if (!key_string->IsInternalizedString()) return Smi::FromInt(0);
6791   if (type == STRING_SPLIT_SUBSTRINGS) {
6792     DCHECK(key_pattern->IsString());
6793     if (!key_pattern->IsInternalizedString()) return Smi::FromInt(0);
6794     cache = heap->string_split_cache();
6795   } else {
6796     DCHECK(type == REGEXP_MULTIPLE_INDICES);
6797     DCHECK(key_pattern->IsFixedArray());
6798     cache = heap->regexp_multiple_cache();
6799   }
6800 
6801   uint32_t hash = key_string->Hash();
6802   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6803                     ~(kArrayEntriesPerCacheEntry - 1));
6804   if (cache->get(index + kStringOffset) != key_string ||
6805       cache->get(index + kPatternOffset) != key_pattern) {
6806     index =
6807         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6808     if (cache->get(index + kStringOffset) != key_string ||
6809         cache->get(index + kPatternOffset) != key_pattern) {
6810       return Smi::FromInt(0);
6811     }
6812   }
6813 
6814   *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6815   return cache->get(index + kArrayOffset);
6816 }
6817 
6818 
Enter(Isolate * isolate,Handle<String> key_string,Handle<Object> key_pattern,Handle<FixedArray> value_array,Handle<FixedArray> last_match_cache,ResultsCacheType type)6819 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6820                                Handle<Object> key_pattern,
6821                                Handle<FixedArray> value_array,
6822                                Handle<FixedArray> last_match_cache,
6823                                ResultsCacheType type) {
6824   Factory* factory = isolate->factory();
6825   Handle<FixedArray> cache;
6826   if (!key_string->IsInternalizedString()) return;
6827   if (type == STRING_SPLIT_SUBSTRINGS) {
6828     DCHECK(key_pattern->IsString());
6829     if (!key_pattern->IsInternalizedString()) return;
6830     cache = factory->string_split_cache();
6831   } else {
6832     DCHECK(type == REGEXP_MULTIPLE_INDICES);
6833     DCHECK(key_pattern->IsFixedArray());
6834     cache = factory->regexp_multiple_cache();
6835   }
6836 
6837   uint32_t hash = key_string->Hash();
6838   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6839                     ~(kArrayEntriesPerCacheEntry - 1));
6840   if (cache->get(index + kStringOffset) == Smi::FromInt(0)) {
6841     cache->set(index + kStringOffset, *key_string);
6842     cache->set(index + kPatternOffset, *key_pattern);
6843     cache->set(index + kArrayOffset, *value_array);
6844     cache->set(index + kLastMatchOffset, *last_match_cache);
6845   } else {
6846     uint32_t index2 =
6847         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6848     if (cache->get(index2 + kStringOffset) == Smi::FromInt(0)) {
6849       cache->set(index2 + kStringOffset, *key_string);
6850       cache->set(index2 + kPatternOffset, *key_pattern);
6851       cache->set(index2 + kArrayOffset, *value_array);
6852       cache->set(index2 + kLastMatchOffset, *last_match_cache);
6853     } else {
6854       cache->set(index2 + kStringOffset, Smi::FromInt(0));
6855       cache->set(index2 + kPatternOffset, Smi::FromInt(0));
6856       cache->set(index2 + kArrayOffset, Smi::FromInt(0));
6857       cache->set(index2 + kLastMatchOffset, Smi::FromInt(0));
6858       cache->set(index + kStringOffset, *key_string);
6859       cache->set(index + kPatternOffset, *key_pattern);
6860       cache->set(index + kArrayOffset, *value_array);
6861       cache->set(index + kLastMatchOffset, *last_match_cache);
6862     }
6863   }
6864   // If the array is a reasonably short list of substrings, convert it into a
6865   // list of internalized strings.
6866   if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6867     for (int i = 0; i < value_array->length(); i++) {
6868       Handle<String> str(String::cast(value_array->get(i)), isolate);
6869       Handle<String> internalized_str = factory->InternalizeString(str);
6870       value_array->set(i, *internalized_str);
6871     }
6872   }
6873   // Convert backing store to a copy-on-write array.
6874   value_array->set_map_no_write_barrier(*factory->fixed_cow_array_map());
6875 }
6876 
6877 
Clear(FixedArray * cache)6878 void RegExpResultsCache::Clear(FixedArray* cache) {
6879   for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6880     cache->set(i, Smi::FromInt(0));
6881   }
6882 }
6883 
6884 }  // namespace internal
6885 }  // namespace v8
6886