• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/jsregexp.h"
6 
7 #include <memory>
8 
9 #include "src/base/platform/platform.h"
10 #include "src/compilation-cache.h"
11 #include "src/elements.h"
12 #include "src/execution.h"
13 #include "src/factory.h"
14 #include "src/isolate-inl.h"
15 #include "src/messages.h"
16 #include "src/ostreams.h"
17 #include "src/regexp/interpreter-irregexp.h"
18 #include "src/regexp/jsregexp-inl.h"
19 #include "src/regexp/regexp-macro-assembler-irregexp.h"
20 #include "src/regexp/regexp-macro-assembler-tracer.h"
21 #include "src/regexp/regexp-macro-assembler.h"
22 #include "src/regexp/regexp-parser.h"
23 #include "src/regexp/regexp-stack.h"
24 #include "src/runtime/runtime.h"
25 #include "src/splay-tree-inl.h"
26 #include "src/string-search.h"
27 #include "src/unicode-decoder.h"
28 
29 #ifdef V8_I18N_SUPPORT
30 #include "unicode/uniset.h"
31 #include "unicode/utypes.h"
32 #endif  // V8_I18N_SUPPORT
33 
34 #ifndef V8_INTERPRETED_REGEXP
35 #if V8_TARGET_ARCH_IA32
36 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
37 #elif V8_TARGET_ARCH_X64
38 #include "src/regexp/x64/regexp-macro-assembler-x64.h"
39 #elif V8_TARGET_ARCH_ARM64
40 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
41 #elif V8_TARGET_ARCH_ARM
42 #include "src/regexp/arm/regexp-macro-assembler-arm.h"
43 #elif V8_TARGET_ARCH_PPC
44 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
45 #elif V8_TARGET_ARCH_S390
46 #include "src/regexp/s390/regexp-macro-assembler-s390.h"
47 #elif V8_TARGET_ARCH_MIPS
48 #include "src/regexp/mips/regexp-macro-assembler-mips.h"
49 #elif V8_TARGET_ARCH_MIPS64
50 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
51 #elif V8_TARGET_ARCH_X87
52 #include "src/regexp/x87/regexp-macro-assembler-x87.h"
53 #else
54 #error Unsupported target architecture.
55 #endif
56 #endif
57 
58 
59 namespace v8 {
60 namespace internal {
61 
62 MUST_USE_RESULT
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> error_text)63 static inline MaybeHandle<Object> ThrowRegExpException(
64     Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
65   Isolate* isolate = re->GetIsolate();
66   THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
67                                           pattern, error_text),
68                   Object);
69 }
70 
71 
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> error_text)72 inline void ThrowRegExpException(Handle<JSRegExp> re,
73                                  Handle<String> error_text) {
74   USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
75 }
76 
77 
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)78 ContainedInLattice AddRange(ContainedInLattice containment,
79                             const int* ranges,
80                             int ranges_length,
81                             Interval new_range) {
82   DCHECK((ranges_length & 1) == 1);
83   DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
84   if (containment == kLatticeUnknown) return containment;
85   bool inside = false;
86   int last = 0;
87   for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
88     // Consider the range from last to ranges[i].
89     // We haven't got to the new range yet.
90     if (ranges[i] <= new_range.from()) continue;
91     // New range is wholly inside last-ranges[i].  Note that new_range.to() is
92     // inclusive, but the values in ranges are not.
93     if (last <= new_range.from() && new_range.to() < ranges[i]) {
94       return Combine(containment, inside ? kLatticeIn : kLatticeOut);
95     }
96     return kLatticeUnknown;
97   }
98   return containment;
99 }
100 
101 
102 // More makes code generation slower, less makes V8 benchmark score lower.
103 const int kMaxLookaheadForBoyerMoore = 8;
104 // In a 3-character pattern you can maximally step forwards 3 characters
105 // at a time, which is not always enough to pay for the extra logic.
106 const int kPatternTooShortForBoyerMoore = 2;
107 
108 
109 // Identifies the sort of regexps where the regexp engine is faster
110 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)111 static bool HasFewDifferentCharacters(Handle<String> pattern) {
112   int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
113   if (length <= kPatternTooShortForBoyerMoore) return false;
114   const int kMod = 128;
115   bool character_found[kMod];
116   int different = 0;
117   memset(&character_found[0], 0, sizeof(character_found));
118   for (int i = 0; i < length; i++) {
119     int ch = (pattern->Get(i) & (kMod - 1));
120     if (!character_found[ch]) {
121       character_found[ch] = true;
122       different++;
123       // We declare a regexp low-alphabet if it has at least 3 times as many
124       // characters as it has different characters.
125       if (different * 3 > length) return false;
126     }
127   }
128   return true;
129 }
130 
131 
132 // Generic RegExp methods. Dispatches to implementation specific methods.
133 
134 
Compile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags)135 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
136                                         Handle<String> pattern,
137                                         JSRegExp::Flags flags) {
138   Isolate* isolate = re->GetIsolate();
139   Zone zone(isolate->allocator(), ZONE_NAME);
140   CompilationCache* compilation_cache = isolate->compilation_cache();
141   MaybeHandle<FixedArray> maybe_cached =
142       compilation_cache->LookupRegExp(pattern, flags);
143   Handle<FixedArray> cached;
144   if (maybe_cached.ToHandle(&cached)) {
145     re->set_data(*cached);
146     return re;
147   }
148   pattern = String::Flatten(pattern);
149   PostponeInterruptsScope postpone(isolate);
150   RegExpCompileData parse_result;
151   FlatStringReader reader(isolate, pattern);
152   if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
153                                  &parse_result)) {
154     // Throw an exception if we fail to parse the pattern.
155     return ThrowRegExpException(re, pattern, parse_result.error);
156   }
157 
158   bool has_been_compiled = false;
159 
160   if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
161       !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
162     // Parse-tree is a single atom that is equal to the pattern.
163     AtomCompile(re, pattern, flags, pattern);
164     has_been_compiled = true;
165   } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
166              !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
167     RegExpAtom* atom = parse_result.tree->AsAtom();
168     Vector<const uc16> atom_pattern = atom->data();
169     Handle<String> atom_string;
170     ASSIGN_RETURN_ON_EXCEPTION(
171         isolate, atom_string,
172         isolate->factory()->NewStringFromTwoByte(atom_pattern),
173         Object);
174     if (!HasFewDifferentCharacters(atom_string)) {
175       AtomCompile(re, pattern, flags, atom_string);
176       has_been_compiled = true;
177     }
178   }
179   if (!has_been_compiled) {
180     IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
181   }
182   DCHECK(re->data()->IsFixedArray());
183   // Compilation succeeded so the data is set on the regexp
184   // and we can store it in the cache.
185   Handle<FixedArray> data(FixedArray::cast(re->data()));
186   compilation_cache->PutRegExp(pattern, flags, data);
187 
188   return re;
189 }
190 
Exec(Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)191 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
192                                      Handle<String> subject, int index,
193                                      Handle<RegExpMatchInfo> last_match_info) {
194   switch (regexp->TypeTag()) {
195     case JSRegExp::ATOM:
196       return AtomExec(regexp, subject, index, last_match_info);
197     case JSRegExp::IRREGEXP: {
198       return IrregexpExec(regexp, subject, index, last_match_info);
199     }
200     default:
201       UNREACHABLE();
202       return MaybeHandle<Object>();
203   }
204 }
205 
206 
207 // RegExp Atom implementation: Simple string search using indexOf.
208 
209 
AtomCompile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)210 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
211                              Handle<String> pattern,
212                              JSRegExp::Flags flags,
213                              Handle<String> match_pattern) {
214   re->GetIsolate()->factory()->SetRegExpAtomData(re,
215                                                  JSRegExp::ATOM,
216                                                  pattern,
217                                                  flags,
218                                                  match_pattern);
219 }
220 
SetAtomLastCapture(Handle<RegExpMatchInfo> last_match_info,String * subject,int from,int to)221 static void SetAtomLastCapture(Handle<RegExpMatchInfo> last_match_info,
222                                String* subject, int from, int to) {
223   SealHandleScope shs(last_match_info->GetIsolate());
224   last_match_info->SetNumberOfCaptureRegisters(2);
225   last_match_info->SetLastSubject(subject);
226   last_match_info->SetLastInput(subject);
227   last_match_info->SetCapture(0, from);
228   last_match_info->SetCapture(1, to);
229 }
230 
231 
AtomExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)232 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
233                             Handle<String> subject,
234                             int index,
235                             int32_t* output,
236                             int output_size) {
237   Isolate* isolate = regexp->GetIsolate();
238 
239   DCHECK(0 <= index);
240   DCHECK(index <= subject->length());
241 
242   subject = String::Flatten(subject);
243   DisallowHeapAllocation no_gc;  // ensure vectors stay valid
244 
245   String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
246   int needle_len = needle->length();
247   DCHECK(needle->IsFlat());
248   DCHECK_LT(0, needle_len);
249 
250   if (index + needle_len > subject->length()) {
251     return RegExpImpl::RE_FAILURE;
252   }
253 
254   for (int i = 0; i < output_size; i += 2) {
255     String::FlatContent needle_content = needle->GetFlatContent();
256     String::FlatContent subject_content = subject->GetFlatContent();
257     DCHECK(needle_content.IsFlat());
258     DCHECK(subject_content.IsFlat());
259     // dispatch on type of strings
260     index =
261         (needle_content.IsOneByte()
262              ? (subject_content.IsOneByte()
263                     ? SearchString(isolate, subject_content.ToOneByteVector(),
264                                    needle_content.ToOneByteVector(), index)
265                     : SearchString(isolate, subject_content.ToUC16Vector(),
266                                    needle_content.ToOneByteVector(), index))
267              : (subject_content.IsOneByte()
268                     ? SearchString(isolate, subject_content.ToOneByteVector(),
269                                    needle_content.ToUC16Vector(), index)
270                     : SearchString(isolate, subject_content.ToUC16Vector(),
271                                    needle_content.ToUC16Vector(), index)));
272     if (index == -1) {
273       return i / 2;  // Return number of matches.
274     } else {
275       output[i] = index;
276       output[i+1] = index + needle_len;
277       index += needle_len;
278     }
279   }
280   return output_size / 2;
281 }
282 
AtomExec(Handle<JSRegExp> re,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)283 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, Handle<String> subject,
284                                     int index,
285                                     Handle<RegExpMatchInfo> last_match_info) {
286   Isolate* isolate = re->GetIsolate();
287 
288   static const int kNumRegisters = 2;
289   STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
290   int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
291 
292   int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
293 
294   if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
295 
296   DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
297   SealHandleScope shs(isolate);
298   SetAtomLastCapture(last_match_info, *subject, output_registers[0],
299                      output_registers[1]);
300   return last_match_info;
301 }
302 
303 
304 // Irregexp implementation.
305 
306 // Ensures that the regexp object contains a compiled version of the
307 // source for either one-byte or two-byte subject strings.
308 // If the compiled version doesn't already exist, it is compiled
309 // from the source pattern.
310 // If compilation fails, an exception is thrown and this function
311 // returns false.
EnsureCompiledIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)312 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
313                                         Handle<String> sample_subject,
314                                         bool is_one_byte) {
315   Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
316 #ifdef V8_INTERPRETED_REGEXP
317   if (compiled_code->IsByteArray()) return true;
318 #else  // V8_INTERPRETED_REGEXP (RegExp native code)
319   if (compiled_code->IsCode()) return true;
320 #endif
321   // We could potentially have marked this as flushable, but have kept
322   // a saved version if we did not flush it yet.
323   Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
324   if (saved_code->IsCode()) {
325     // Reinstate the code in the original place.
326     re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
327     DCHECK(compiled_code->IsSmi());
328     return true;
329   }
330   return CompileIrregexp(re, sample_subject, is_one_byte);
331 }
332 
333 
CompileIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)334 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
335                                  Handle<String> sample_subject,
336                                  bool is_one_byte) {
337   // Compile the RegExp.
338   Isolate* isolate = re->GetIsolate();
339   Zone zone(isolate->allocator(), ZONE_NAME);
340   PostponeInterruptsScope postpone(isolate);
341   // If we had a compilation error the last time this is saved at the
342   // saved code index.
343   Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
344   // When arriving here entry can only be a smi, either representing an
345   // uncompiled regexp, a previous compilation error, or code that has
346   // been flushed.
347   DCHECK(entry->IsSmi());
348   int entry_value = Smi::cast(entry)->value();
349   DCHECK(entry_value == JSRegExp::kUninitializedValue ||
350          entry_value == JSRegExp::kCompilationErrorValue ||
351          (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
352 
353   if (entry_value == JSRegExp::kCompilationErrorValue) {
354     // A previous compilation failed and threw an error which we store in
355     // the saved code index (we store the error message, not the actual
356     // error). Recreate the error object and throw it.
357     Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
358     DCHECK(error_string->IsString());
359     Handle<String> error_message(String::cast(error_string));
360     ThrowRegExpException(re, error_message);
361     return false;
362   }
363 
364   JSRegExp::Flags flags = re->GetFlags();
365 
366   Handle<String> pattern(re->Pattern());
367   pattern = String::Flatten(pattern);
368   RegExpCompileData compile_data;
369   FlatStringReader reader(isolate, pattern);
370   if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
371                                  &compile_data)) {
372     // Throw an exception if we fail to parse the pattern.
373     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
374     USE(ThrowRegExpException(re, pattern, compile_data.error));
375     return false;
376   }
377   RegExpEngine::CompilationResult result =
378       RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
379                             sample_subject, is_one_byte);
380   if (result.error_message != NULL) {
381     // Unable to compile regexp.
382     Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
383         CStrVector(result.error_message)).ToHandleChecked();
384     ThrowRegExpException(re, error_message);
385     return false;
386   }
387 
388   Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
389   data->set(JSRegExp::code_index(is_one_byte), result.code);
390   SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
391   int register_max = IrregexpMaxRegisterCount(*data);
392   if (result.num_registers > register_max) {
393     SetIrregexpMaxRegisterCount(*data, result.num_registers);
394   }
395 
396   return true;
397 }
398 
399 
IrregexpMaxRegisterCount(FixedArray * re)400 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
401   return Smi::cast(
402       re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
403 }
404 
405 
SetIrregexpMaxRegisterCount(FixedArray * re,int value)406 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
407   re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
408 }
409 
SetIrregexpCaptureNameMap(FixedArray * re,Handle<FixedArray> value)410 void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
411                                            Handle<FixedArray> value) {
412   if (value.is_null()) {
413     re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
414   } else {
415     re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
416   }
417 }
418 
IrregexpNumberOfCaptures(FixedArray * re)419 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
420   return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
421 }
422 
423 
IrregexpNumberOfRegisters(FixedArray * re)424 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
425   return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
426 }
427 
428 
IrregexpByteCode(FixedArray * re,bool is_one_byte)429 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
430   return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
431 }
432 
433 
IrregexpNativeCode(FixedArray * re,bool is_one_byte)434 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
435   return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
436 }
437 
438 
IrregexpInitialize(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count)439 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
440                                     Handle<String> pattern,
441                                     JSRegExp::Flags flags,
442                                     int capture_count) {
443   // Initialize compiled code entries to null.
444   re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
445                                                      JSRegExp::IRREGEXP,
446                                                      pattern,
447                                                      flags,
448                                                      capture_count);
449 }
450 
451 
IrregexpPrepare(Handle<JSRegExp> regexp,Handle<String> subject)452 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
453                                 Handle<String> subject) {
454   DCHECK(subject->IsFlat());
455 
456   // Check representation of the underlying storage.
457   bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
458   if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
459 
460 #ifdef V8_INTERPRETED_REGEXP
461   // Byte-code regexp needs space allocated for all its registers.
462   // The result captures are copied to the start of the registers array
463   // if the match succeeds.  This way those registers are not clobbered
464   // when we set the last match info from last successful match.
465   return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
466          (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
467 #else  // V8_INTERPRETED_REGEXP
468   // Native regexp only needs room to output captures. Registers are handled
469   // internally.
470   return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
471 #endif  // V8_INTERPRETED_REGEXP
472 }
473 
474 
IrregexpExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)475 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
476                                 Handle<String> subject,
477                                 int index,
478                                 int32_t* output,
479                                 int output_size) {
480   Isolate* isolate = regexp->GetIsolate();
481 
482   Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
483 
484   DCHECK(index >= 0);
485   DCHECK(index <= subject->length());
486   DCHECK(subject->IsFlat());
487 
488   bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
489 
490 #ifndef V8_INTERPRETED_REGEXP
491   DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
492   do {
493     EnsureCompiledIrregexp(regexp, subject, is_one_byte);
494     Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
495     // The stack is used to allocate registers for the compiled regexp code.
496     // This means that in case of failure, the output registers array is left
497     // untouched and contains the capture results from the previous successful
498     // match.  We can use that to set the last match info lazily.
499     NativeRegExpMacroAssembler::Result res =
500         NativeRegExpMacroAssembler::Match(code,
501                                           subject,
502                                           output,
503                                           output_size,
504                                           index,
505                                           isolate);
506     if (res != NativeRegExpMacroAssembler::RETRY) {
507       DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
508              isolate->has_pending_exception());
509       STATIC_ASSERT(
510           static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
511       STATIC_ASSERT(
512           static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
513       STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
514                     == RE_EXCEPTION);
515       return static_cast<IrregexpResult>(res);
516     }
517     // If result is RETRY, the string has changed representation, and we
518     // must restart from scratch.
519     // In this case, it means we must make sure we are prepared to handle
520     // the, potentially, different subject (the string can switch between
521     // being internal and external, and even between being Latin1 and UC16,
522     // but the characters are always the same).
523     IrregexpPrepare(regexp, subject);
524     is_one_byte = subject->IsOneByteRepresentationUnderneath();
525   } while (true);
526   UNREACHABLE();
527   return RE_EXCEPTION;
528 #else  // V8_INTERPRETED_REGEXP
529 
530   DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
531   // We must have done EnsureCompiledIrregexp, so we can get the number of
532   // registers.
533   int number_of_capture_registers =
534       (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
535   int32_t* raw_output = &output[number_of_capture_registers];
536   // We do not touch the actual capture result registers until we know there
537   // has been a match so that we can use those capture results to set the
538   // last match info.
539   for (int i = number_of_capture_registers - 1; i >= 0; i--) {
540     raw_output[i] = -1;
541   }
542   Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
543                                isolate);
544 
545   IrregexpResult result = IrregexpInterpreter::Match(isolate,
546                                                      byte_codes,
547                                                      subject,
548                                                      raw_output,
549                                                      index);
550   if (result == RE_SUCCESS) {
551     // Copy capture results to the start of the registers array.
552     MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
553   }
554   if (result == RE_EXCEPTION) {
555     DCHECK(!isolate->has_pending_exception());
556     isolate->StackOverflow();
557   }
558   return result;
559 #endif  // V8_INTERPRETED_REGEXP
560 }
561 
IrregexpExec(Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<RegExpMatchInfo> last_match_info)562 MaybeHandle<Object> RegExpImpl::IrregexpExec(
563     Handle<JSRegExp> regexp, Handle<String> subject, int previous_index,
564     Handle<RegExpMatchInfo> last_match_info) {
565   Isolate* isolate = regexp->GetIsolate();
566   DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
567 
568   subject = String::Flatten(subject);
569 
570   // Prepare space for the return values.
571 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
572   if (FLAG_trace_regexp_bytecodes) {
573     String* pattern = regexp->Pattern();
574     PrintF("\n\nRegexp match:   /%s/\n\n", pattern->ToCString().get());
575     PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
576   }
577 #endif
578   int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
579   if (required_registers < 0) {
580     // Compiling failed with an exception.
581     DCHECK(isolate->has_pending_exception());
582     return MaybeHandle<Object>();
583   }
584 
585   int32_t* output_registers = NULL;
586   if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
587     output_registers = NewArray<int32_t>(required_registers);
588   }
589   std::unique_ptr<int32_t[]> auto_release(output_registers);
590   if (output_registers == NULL) {
591     output_registers = isolate->jsregexp_static_offsets_vector();
592   }
593 
594   int res = RegExpImpl::IrregexpExecRaw(
595       regexp, subject, previous_index, output_registers, required_registers);
596   if (res == RE_SUCCESS) {
597     int capture_count =
598         IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
599     return SetLastMatchInfo(
600         last_match_info, subject, capture_count, output_registers);
601   }
602   if (res == RE_EXCEPTION) {
603     DCHECK(isolate->has_pending_exception());
604     return MaybeHandle<Object>();
605   }
606   DCHECK(res == RE_FAILURE);
607   return isolate->factory()->null_value();
608 }
609 
SetLastMatchInfo(Handle<RegExpMatchInfo> last_match_info,Handle<String> subject,int capture_count,int32_t * match)610 Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
611     Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
612     int capture_count, int32_t* match) {
613   // This is the only place where match infos can grow. If, after executing the
614   // regexp, RegExpExecStub finds that the match info is too small, it restarts
615   // execution in RegExpImpl::Exec, which finally grows the match info right
616   // here.
617 
618   int capture_register_count = (capture_count + 1) * 2;
619   Handle<RegExpMatchInfo> result =
620       RegExpMatchInfo::ReserveCaptures(last_match_info, capture_register_count);
621   result->SetNumberOfCaptureRegisters(capture_register_count);
622 
623   if (*result != *last_match_info) {
624     // The match info has been reallocated, update the corresponding reference
625     // on the native context.
626     Isolate* isolate = last_match_info->GetIsolate();
627     if (*last_match_info == *isolate->regexp_last_match_info()) {
628       isolate->native_context()->set_regexp_last_match_info(*result);
629     } else if (*last_match_info == *isolate->regexp_internal_match_info()) {
630       isolate->native_context()->set_regexp_internal_match_info(*result);
631     }
632   }
633 
634   DisallowHeapAllocation no_allocation;
635   if (match != NULL) {
636     for (int i = 0; i < capture_register_count; i += 2) {
637       result->SetCapture(i, match[i]);
638       result->SetCapture(i + 1, match[i + 1]);
639     }
640   }
641   result->SetLastSubject(*subject);
642   result->SetLastInput(*subject);
643   return result;
644 }
645 
646 
GlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,Isolate * isolate)647 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
648                                      Handle<String> subject,
649                                      Isolate* isolate)
650   : register_array_(NULL),
651     register_array_size_(0),
652     regexp_(regexp),
653     subject_(subject) {
654 #ifdef V8_INTERPRETED_REGEXP
655   bool interpreted = true;
656 #else
657   bool interpreted = false;
658 #endif  // V8_INTERPRETED_REGEXP
659 
660   if (regexp_->TypeTag() == JSRegExp::ATOM) {
661     static const int kAtomRegistersPerMatch = 2;
662     registers_per_match_ = kAtomRegistersPerMatch;
663     // There is no distinction between interpreted and native for atom regexps.
664     interpreted = false;
665   } else {
666     registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
667     if (registers_per_match_ < 0) {
668       num_matches_ = -1;  // Signal exception.
669       return;
670     }
671   }
672 
673   DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
674   if (!interpreted) {
675     register_array_size_ =
676         Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
677     max_matches_ = register_array_size_ / registers_per_match_;
678   } else {
679     // Global loop in interpreted regexp is not implemented.  We choose
680     // the size of the offsets vector so that it can only store one match.
681     register_array_size_ = registers_per_match_;
682     max_matches_ = 1;
683   }
684 
685   if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
686     register_array_ = NewArray<int32_t>(register_array_size_);
687   } else {
688     register_array_ = isolate->jsregexp_static_offsets_vector();
689   }
690 
691   // Set state so that fetching the results the first time triggers a call
692   // to the compiled regexp.
693   current_match_index_ = max_matches_ - 1;
694   num_matches_ = max_matches_;
695   DCHECK(registers_per_match_ >= 2);  // Each match has at least one capture.
696   DCHECK_GE(register_array_size_, registers_per_match_);
697   int32_t* last_match =
698       &register_array_[current_match_index_ * registers_per_match_];
699   last_match[0] = -1;
700   last_match[1] = 0;
701 }
702 
AdvanceZeroLength(int last_index)703 int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
704   if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
705       last_index + 1 < subject_->length() &&
706       unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
707       unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
708     // Advance over the surrogate pair.
709     return last_index + 2;
710   }
711   return last_index + 1;
712 }
713 
714 // -------------------------------------------------------------------
715 // Implementation of the Irregexp regular expression engine.
716 //
717 // The Irregexp regular expression engine is intended to be a complete
718 // implementation of ECMAScript regular expressions.  It generates either
719 // bytecodes or native code.
720 
721 //   The Irregexp regexp engine is structured in three steps.
722 //   1) The parser generates an abstract syntax tree.  See ast.cc.
723 //   2) From the AST a node network is created.  The nodes are all
724 //      subclasses of RegExpNode.  The nodes represent states when
725 //      executing a regular expression.  Several optimizations are
726 //      performed on the node network.
727 //   3) From the nodes we generate either byte codes or native code
728 //      that can actually execute the regular expression (perform
729 //      the search).  The code generation step is described in more
730 //      detail below.
731 
732 // Code generation.
733 //
734 //   The nodes are divided into four main categories.
735 //   * Choice nodes
736 //        These represent places where the regular expression can
737 //        match in more than one way.  For example on entry to an
738 //        alternation (foo|bar) or a repetition (*, +, ? or {}).
739 //   * Action nodes
740 //        These represent places where some action should be
741 //        performed.  Examples include recording the current position
742 //        in the input string to a register (in order to implement
743 //        captures) or other actions on register for example in order
744 //        to implement the counters needed for {} repetitions.
745 //   * Matching nodes
746 //        These attempt to match some element part of the input string.
747 //        Examples of elements include character classes, plain strings
748 //        or back references.
749 //   * End nodes
750 //        These are used to implement the actions required on finding
751 //        a successful match or failing to find a match.
752 //
753 //   The code generated (whether as byte codes or native code) maintains
754 //   some state as it runs.  This consists of the following elements:
755 //
756 //   * The capture registers.  Used for string captures.
757 //   * Other registers.  Used for counters etc.
758 //   * The current position.
759 //   * The stack of backtracking information.  Used when a matching node
760 //     fails to find a match and needs to try an alternative.
761 //
762 // Conceptual regular expression execution model:
763 //
764 //   There is a simple conceptual model of regular expression execution
765 //   which will be presented first.  The actual code generated is a more
766 //   efficient simulation of the simple conceptual model:
767 //
768 //   * Choice nodes are implemented as follows:
769 //     For each choice except the last {
770 //       push current position
771 //       push backtrack code location
772 //       <generate code to test for choice>
773 //       backtrack code location:
774 //       pop current position
775 //     }
776 //     <generate code to test for last choice>
777 //
778 //   * Actions nodes are generated as follows
779 //     <push affected registers on backtrack stack>
780 //     <generate code to perform action>
781 //     push backtrack code location
782 //     <generate code to test for following nodes>
783 //     backtrack code location:
784 //     <pop affected registers to restore their state>
785 //     <pop backtrack location from stack and go to it>
786 //
787 //   * Matching nodes are generated as follows:
788 //     if input string matches at current position
789 //       update current position
790 //       <generate code to test for following nodes>
791 //     else
792 //       <pop backtrack location from stack and go to it>
793 //
794 //   Thus it can be seen that the current position is saved and restored
795 //   by the choice nodes, whereas the registers are saved and restored by
796 //   by the action nodes that manipulate them.
797 //
798 //   The other interesting aspect of this model is that nodes are generated
799 //   at the point where they are needed by a recursive call to Emit().  If
800 //   the node has already been code generated then the Emit() call will
801 //   generate a jump to the previously generated code instead.  In order to
802 //   limit recursion it is possible for the Emit() function to put the node
803 //   on a work list for later generation and instead generate a jump.  The
804 //   destination of the jump is resolved later when the code is generated.
805 //
806 // Actual regular expression code generation.
807 //
808 //   Code generation is actually more complicated than the above.  In order
809 //   to improve the efficiency of the generated code some optimizations are
810 //   performed
811 //
812 //   * Choice nodes have 1-character lookahead.
813 //     A choice node looks at the following character and eliminates some of
814 //     the choices immediately based on that character.  This is not yet
815 //     implemented.
816 //   * Simple greedy loops store reduced backtracking information.
817 //     A quantifier like /.*foo/m will greedily match the whole input.  It will
818 //     then need to backtrack to a point where it can match "foo".  The naive
819 //     implementation of this would push each character position onto the
820 //     backtracking stack, then pop them off one by one.  This would use space
821 //     proportional to the length of the input string.  However since the "."
822 //     can only match in one way and always has a constant length (in this case
823 //     of 1) it suffices to store the current position on the top of the stack
824 //     once.  Matching now becomes merely incrementing the current position and
825 //     backtracking becomes decrementing the current position and checking the
826 //     result against the stored current position.  This is faster and saves
827 //     space.
828 //   * The current state is virtualized.
829 //     This is used to defer expensive operations until it is clear that they
830 //     are needed and to generate code for a node more than once, allowing
831 //     specialized an efficient versions of the code to be created. This is
832 //     explained in the section below.
833 //
834 // Execution state virtualization.
835 //
836 //   Instead of emitting code, nodes that manipulate the state can record their
837 //   manipulation in an object called the Trace.  The Trace object can record a
838 //   current position offset, an optional backtrack code location on the top of
839 //   the virtualized backtrack stack and some register changes.  When a node is
840 //   to be emitted it can flush the Trace or update it.  Flushing the Trace
841 //   will emit code to bring the actual state into line with the virtual state.
842 //   Avoiding flushing the state can postpone some work (e.g. updates of capture
843 //   registers).  Postponing work can save time when executing the regular
844 //   expression since it may be found that the work never has to be done as a
845 //   failure to match can occur.  In addition it is much faster to jump to a
846 //   known backtrack code location than it is to pop an unknown backtrack
847 //   location from the stack and jump there.
848 //
849 //   The virtual state found in the Trace affects code generation.  For example
850 //   the virtual state contains the difference between the actual current
851 //   position and the virtual current position, and matching code needs to use
852 //   this offset to attempt a match in the correct location of the input
853 //   string.  Therefore code generated for a non-trivial trace is specialized
854 //   to that trace.  The code generator therefore has the ability to generate
855 //   code for each node several times.  In order to limit the size of the
856 //   generated code there is an arbitrary limit on how many specialized sets of
857 //   code may be generated for a given node.  If the limit is reached, the
858 //   trace is flushed and a generic version of the code for a node is emitted.
859 //   This is subsequently used for that node.  The code emitted for non-generic
860 //   trace is not recorded in the node and so it cannot currently be reused in
861 //   the event that code generation is requested for an identical trace.
862 
863 
AppendToText(RegExpText * text,Zone * zone)864 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
865   UNREACHABLE();
866 }
867 
868 
AppendToText(RegExpText * text,Zone * zone)869 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
870   text->AddElement(TextElement::Atom(this), zone);
871 }
872 
873 
AppendToText(RegExpText * text,Zone * zone)874 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
875   text->AddElement(TextElement::CharClass(this), zone);
876 }
877 
878 
AppendToText(RegExpText * text,Zone * zone)879 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
880   for (int i = 0; i < elements()->length(); i++)
881     text->AddElement(elements()->at(i), zone);
882 }
883 
884 
Atom(RegExpAtom * atom)885 TextElement TextElement::Atom(RegExpAtom* atom) {
886   return TextElement(ATOM, atom);
887 }
888 
889 
CharClass(RegExpCharacterClass * char_class)890 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
891   return TextElement(CHAR_CLASS, char_class);
892 }
893 
894 
length() const895 int TextElement::length() const {
896   switch (text_type()) {
897     case ATOM:
898       return atom()->length();
899 
900     case CHAR_CLASS:
901       return 1;
902   }
903   UNREACHABLE();
904   return 0;
905 }
906 
907 
GetTable(bool ignore_case)908 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
909   if (table_ == NULL) {
910     table_ = new(zone()) DispatchTable(zone());
911     DispatchTableConstructor cons(table_, ignore_case, zone());
912     cons.BuildTable(this);
913   }
914   return table_;
915 }
916 
917 
918 class FrequencyCollator {
919  public:
FrequencyCollator()920   FrequencyCollator() : total_samples_(0) {
921     for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
922       frequencies_[i] = CharacterFrequency(i);
923     }
924   }
925 
CountCharacter(int character)926   void CountCharacter(int character) {
927     int index = (character & RegExpMacroAssembler::kTableMask);
928     frequencies_[index].Increment();
929     total_samples_++;
930   }
931 
932   // Does not measure in percent, but rather per-128 (the table size from the
933   // regexp macro assembler).
Frequency(int in_character)934   int Frequency(int in_character) {
935     DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
936     if (total_samples_ < 1) return 1;  // Division by zero.
937     int freq_in_per128 =
938         (frequencies_[in_character].counter() * 128) / total_samples_;
939     return freq_in_per128;
940   }
941 
942  private:
943   class CharacterFrequency {
944    public:
CharacterFrequency()945     CharacterFrequency() : counter_(0), character_(-1) { }
CharacterFrequency(int character)946     explicit CharacterFrequency(int character)
947         : counter_(0), character_(character) { }
948 
Increment()949     void Increment() { counter_++; }
counter()950     int counter() { return counter_; }
character()951     int character() { return character_; }
952 
953    private:
954     int counter_;
955     int character_;
956   };
957 
958 
959  private:
960   CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
961   int total_samples_;
962 };
963 
964 
965 class RegExpCompiler {
966  public:
967   RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
968                  JSRegExp::Flags flags, bool is_one_byte);
969 
AllocateRegister()970   int AllocateRegister() {
971     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
972       reg_exp_too_big_ = true;
973       return next_register_;
974     }
975     return next_register_++;
976   }
977 
978   // Lookarounds to match lone surrogates for unicode character class matches
979   // are never nested. We can therefore reuse registers.
UnicodeLookaroundStackRegister()980   int UnicodeLookaroundStackRegister() {
981     if (unicode_lookaround_stack_register_ == kNoRegister) {
982       unicode_lookaround_stack_register_ = AllocateRegister();
983     }
984     return unicode_lookaround_stack_register_;
985   }
986 
UnicodeLookaroundPositionRegister()987   int UnicodeLookaroundPositionRegister() {
988     if (unicode_lookaround_position_register_ == kNoRegister) {
989       unicode_lookaround_position_register_ = AllocateRegister();
990     }
991     return unicode_lookaround_position_register_;
992   }
993 
994   RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
995                                            RegExpNode* start,
996                                            int capture_count,
997                                            Handle<String> pattern);
998 
AddWork(RegExpNode * node)999   inline void AddWork(RegExpNode* node) {
1000     if (!node->on_work_list() && !node->label()->is_bound()) {
1001       node->set_on_work_list(true);
1002       work_list_->Add(node);
1003     }
1004   }
1005 
1006   static const int kImplementationOffset = 0;
1007   static const int kNumberOfRegistersOffset = 0;
1008   static const int kCodeOffset = 1;
1009 
macro_assembler()1010   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()1011   EndNode* accept() { return accept_; }
1012 
1013   static const int kMaxRecursion = 100;
recursion_depth()1014   inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()1015   inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()1016   inline void DecrementRecursionDepth() { recursion_depth_--; }
1017 
SetRegExpTooBig()1018   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1019 
ignore_case()1020   inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
unicode()1021   inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
one_byte()1022   inline bool one_byte() { return one_byte_; }
optimize()1023   inline bool optimize() { return optimize_; }
set_optimize(bool value)1024   inline void set_optimize(bool value) { optimize_ = value; }
limiting_recursion()1025   inline bool limiting_recursion() { return limiting_recursion_; }
set_limiting_recursion(bool value)1026   inline void set_limiting_recursion(bool value) {
1027     limiting_recursion_ = value;
1028   }
read_backward()1029   bool read_backward() { return read_backward_; }
set_read_backward(bool value)1030   void set_read_backward(bool value) { read_backward_ = value; }
frequency_collator()1031   FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1032 
current_expansion_factor()1033   int current_expansion_factor() { return current_expansion_factor_; }
set_current_expansion_factor(int value)1034   void set_current_expansion_factor(int value) {
1035     current_expansion_factor_ = value;
1036   }
1037 
isolate() const1038   Isolate* isolate() const { return isolate_; }
zone() const1039   Zone* zone() const { return zone_; }
1040 
1041   static const int kNoRegister = -1;
1042 
1043  private:
1044   EndNode* accept_;
1045   int next_register_;
1046   int unicode_lookaround_stack_register_;
1047   int unicode_lookaround_position_register_;
1048   List<RegExpNode*>* work_list_;
1049   int recursion_depth_;
1050   RegExpMacroAssembler* macro_assembler_;
1051   JSRegExp::Flags flags_;
1052   bool one_byte_;
1053   bool reg_exp_too_big_;
1054   bool limiting_recursion_;
1055   bool optimize_;
1056   bool read_backward_;
1057   int current_expansion_factor_;
1058   FrequencyCollator frequency_collator_;
1059   Isolate* isolate_;
1060   Zone* zone_;
1061 };
1062 
1063 
1064 class RecursionCheck {
1065  public:
RecursionCheck(RegExpCompiler * compiler)1066   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1067     compiler->IncrementRecursionDepth();
1068   }
~RecursionCheck()1069   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1070  private:
1071   RegExpCompiler* compiler_;
1072 };
1073 
1074 
IrregexpRegExpTooBig(Isolate * isolate)1075 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1076   return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1077 }
1078 
1079 
1080 // Attempts to compile the regexp using an Irregexp code generator.  Returns
1081 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(Isolate * isolate,Zone * zone,int capture_count,JSRegExp::Flags flags,bool one_byte)1082 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1083                                JSRegExp::Flags flags, bool one_byte)
1084     : next_register_(2 * (capture_count + 1)),
1085       unicode_lookaround_stack_register_(kNoRegister),
1086       unicode_lookaround_position_register_(kNoRegister),
1087       work_list_(NULL),
1088       recursion_depth_(0),
1089       flags_(flags),
1090       one_byte_(one_byte),
1091       reg_exp_too_big_(false),
1092       limiting_recursion_(false),
1093       optimize_(FLAG_regexp_optimization),
1094       read_backward_(false),
1095       current_expansion_factor_(1),
1096       frequency_collator_(),
1097       isolate_(isolate),
1098       zone_(zone) {
1099   accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1100   DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1101 }
1102 
1103 
Assemble(RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)1104 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1105     RegExpMacroAssembler* macro_assembler,
1106     RegExpNode* start,
1107     int capture_count,
1108     Handle<String> pattern) {
1109   Isolate* isolate = pattern->GetHeap()->isolate();
1110 
1111 #ifdef DEBUG
1112   if (FLAG_trace_regexp_assembler)
1113     macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
1114   else
1115 #endif
1116     macro_assembler_ = macro_assembler;
1117 
1118   List <RegExpNode*> work_list(0);
1119   work_list_ = &work_list;
1120   Label fail;
1121   macro_assembler_->PushBacktrack(&fail);
1122   Trace new_trace;
1123   start->Emit(this, &new_trace);
1124   macro_assembler_->Bind(&fail);
1125   macro_assembler_->Fail();
1126   while (!work_list.is_empty()) {
1127     RegExpNode* node = work_list.RemoveLast();
1128     node->set_on_work_list(false);
1129     if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1130   }
1131   if (reg_exp_too_big_) {
1132     macro_assembler_->AbortedCodeGeneration();
1133     return IrregexpRegExpTooBig(isolate_);
1134   }
1135 
1136   Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1137   isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
1138   work_list_ = NULL;
1139 #ifdef ENABLE_DISASSEMBLER
1140   if (FLAG_print_code) {
1141     CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1142     OFStream os(trace_scope.file());
1143     Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1144   }
1145 #endif
1146 #ifdef DEBUG
1147   if (FLAG_trace_regexp_assembler) {
1148     delete macro_assembler_;
1149   }
1150 #endif
1151   return RegExpEngine::CompilationResult(*code, next_register_);
1152 }
1153 
1154 
Mentions(int that)1155 bool Trace::DeferredAction::Mentions(int that) {
1156   if (action_type() == ActionNode::CLEAR_CAPTURES) {
1157     Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1158     return range.Contains(that);
1159   } else {
1160     return reg() == that;
1161   }
1162 }
1163 
1164 
mentions_reg(int reg)1165 bool Trace::mentions_reg(int reg) {
1166   for (DeferredAction* action = actions_;
1167        action != NULL;
1168        action = action->next()) {
1169     if (action->Mentions(reg))
1170       return true;
1171   }
1172   return false;
1173 }
1174 
1175 
GetStoredPosition(int reg,int * cp_offset)1176 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1177   DCHECK_EQ(0, *cp_offset);
1178   for (DeferredAction* action = actions_;
1179        action != NULL;
1180        action = action->next()) {
1181     if (action->Mentions(reg)) {
1182       if (action->action_type() == ActionNode::STORE_POSITION) {
1183         *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1184         return true;
1185       } else {
1186         return false;
1187       }
1188     }
1189   }
1190   return false;
1191 }
1192 
1193 
FindAffectedRegisters(OutSet * affected_registers,Zone * zone)1194 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1195                                  Zone* zone) {
1196   int max_register = RegExpCompiler::kNoRegister;
1197   for (DeferredAction* action = actions_;
1198        action != NULL;
1199        action = action->next()) {
1200     if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1201       Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1202       for (int i = range.from(); i <= range.to(); i++)
1203         affected_registers->Set(i, zone);
1204       if (range.to() > max_register) max_register = range.to();
1205     } else {
1206       affected_registers->Set(action->reg(), zone);
1207       if (action->reg() > max_register) max_register = action->reg();
1208     }
1209   }
1210   return max_register;
1211 }
1212 
1213 
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const OutSet & registers_to_pop,const OutSet & registers_to_clear)1214 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1215                                      int max_register,
1216                                      const OutSet& registers_to_pop,
1217                                      const OutSet& registers_to_clear) {
1218   for (int reg = max_register; reg >= 0; reg--) {
1219     if (registers_to_pop.Get(reg)) {
1220       assembler->PopRegister(reg);
1221     } else if (registers_to_clear.Get(reg)) {
1222       int clear_to = reg;
1223       while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1224         reg--;
1225       }
1226       assembler->ClearRegisters(reg, clear_to);
1227     }
1228   }
1229 }
1230 
1231 
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const OutSet & affected_registers,OutSet * registers_to_pop,OutSet * registers_to_clear,Zone * zone)1232 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1233                                    int max_register,
1234                                    const OutSet& affected_registers,
1235                                    OutSet* registers_to_pop,
1236                                    OutSet* registers_to_clear,
1237                                    Zone* zone) {
1238   // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1239   const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1240 
1241   // Count pushes performed to force a stack limit check occasionally.
1242   int pushes = 0;
1243 
1244   for (int reg = 0; reg <= max_register; reg++) {
1245     if (!affected_registers.Get(reg)) {
1246       continue;
1247     }
1248 
1249     // The chronologically first deferred action in the trace
1250     // is used to infer the action needed to restore a register
1251     // to its previous state (or not, if it's safe to ignore it).
1252     enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1253     DeferredActionUndoType undo_action = IGNORE;
1254 
1255     int value = 0;
1256     bool absolute = false;
1257     bool clear = false;
1258     static const int kNoStore = kMinInt;
1259     int store_position = kNoStore;
1260     // This is a little tricky because we are scanning the actions in reverse
1261     // historical order (newest first).
1262     for (DeferredAction* action = actions_;
1263          action != NULL;
1264          action = action->next()) {
1265       if (action->Mentions(reg)) {
1266         switch (action->action_type()) {
1267           case ActionNode::SET_REGISTER: {
1268             Trace::DeferredSetRegister* psr =
1269                 static_cast<Trace::DeferredSetRegister*>(action);
1270             if (!absolute) {
1271               value += psr->value();
1272               absolute = true;
1273             }
1274             // SET_REGISTER is currently only used for newly introduced loop
1275             // counters. They can have a significant previous value if they
1276             // occour in a loop. TODO(lrn): Propagate this information, so
1277             // we can set undo_action to IGNORE if we know there is no value to
1278             // restore.
1279             undo_action = RESTORE;
1280             DCHECK_EQ(store_position, kNoStore);
1281             DCHECK(!clear);
1282             break;
1283           }
1284           case ActionNode::INCREMENT_REGISTER:
1285             if (!absolute) {
1286               value++;
1287             }
1288             DCHECK_EQ(store_position, kNoStore);
1289             DCHECK(!clear);
1290             undo_action = RESTORE;
1291             break;
1292           case ActionNode::STORE_POSITION: {
1293             Trace::DeferredCapture* pc =
1294                 static_cast<Trace::DeferredCapture*>(action);
1295             if (!clear && store_position == kNoStore) {
1296               store_position = pc->cp_offset();
1297             }
1298 
1299             // For captures we know that stores and clears alternate.
1300             // Other register, are never cleared, and if the occur
1301             // inside a loop, they might be assigned more than once.
1302             if (reg <= 1) {
1303               // Registers zero and one, aka "capture zero", is
1304               // always set correctly if we succeed. There is no
1305               // need to undo a setting on backtrack, because we
1306               // will set it again or fail.
1307               undo_action = IGNORE;
1308             } else {
1309               undo_action = pc->is_capture() ? CLEAR : RESTORE;
1310             }
1311             DCHECK(!absolute);
1312             DCHECK_EQ(value, 0);
1313             break;
1314           }
1315           case ActionNode::CLEAR_CAPTURES: {
1316             // Since we're scanning in reverse order, if we've already
1317             // set the position we have to ignore historically earlier
1318             // clearing operations.
1319             if (store_position == kNoStore) {
1320               clear = true;
1321             }
1322             undo_action = RESTORE;
1323             DCHECK(!absolute);
1324             DCHECK_EQ(value, 0);
1325             break;
1326           }
1327           default:
1328             UNREACHABLE();
1329             break;
1330         }
1331       }
1332     }
1333     // Prepare for the undo-action (e.g., push if it's going to be popped).
1334     if (undo_action == RESTORE) {
1335       pushes++;
1336       RegExpMacroAssembler::StackCheckFlag stack_check =
1337           RegExpMacroAssembler::kNoStackLimitCheck;
1338       if (pushes == push_limit) {
1339         stack_check = RegExpMacroAssembler::kCheckStackLimit;
1340         pushes = 0;
1341       }
1342 
1343       assembler->PushRegister(reg, stack_check);
1344       registers_to_pop->Set(reg, zone);
1345     } else if (undo_action == CLEAR) {
1346       registers_to_clear->Set(reg, zone);
1347     }
1348     // Perform the chronologically last action (or accumulated increment)
1349     // for the register.
1350     if (store_position != kNoStore) {
1351       assembler->WriteCurrentPositionToRegister(reg, store_position);
1352     } else if (clear) {
1353       assembler->ClearRegisters(reg, reg);
1354     } else if (absolute) {
1355       assembler->SetRegister(reg, value);
1356     } else if (value != 0) {
1357       assembler->AdvanceRegister(reg, value);
1358     }
1359   }
1360 }
1361 
1362 
1363 // This is called as we come into a loop choice node and some other tricky
1364 // nodes.  It normalizes the state of the code generator to ensure we can
1365 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)1366 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1367   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1368 
1369   DCHECK(!is_trivial());
1370 
1371   if (actions_ == NULL && backtrack() == NULL) {
1372     // Here we just have some deferred cp advances to fix and we are back to
1373     // a normal situation.  We may also have to forget some information gained
1374     // through a quick check that was already performed.
1375     if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1376     // Create a new trivial state and generate the node with that.
1377     Trace new_state;
1378     successor->Emit(compiler, &new_state);
1379     return;
1380   }
1381 
1382   // Generate deferred actions here along with code to undo them again.
1383   OutSet affected_registers;
1384 
1385   if (backtrack() != NULL) {
1386     // Here we have a concrete backtrack location.  These are set up by choice
1387     // nodes and so they indicate that we have a deferred save of the current
1388     // position which we may need to emit here.
1389     assembler->PushCurrentPosition();
1390   }
1391 
1392   int max_register = FindAffectedRegisters(&affected_registers,
1393                                            compiler->zone());
1394   OutSet registers_to_pop;
1395   OutSet registers_to_clear;
1396   PerformDeferredActions(assembler,
1397                          max_register,
1398                          affected_registers,
1399                          &registers_to_pop,
1400                          &registers_to_clear,
1401                          compiler->zone());
1402   if (cp_offset_ != 0) {
1403     assembler->AdvanceCurrentPosition(cp_offset_);
1404   }
1405 
1406   // Create a new trivial state and generate the node with that.
1407   Label undo;
1408   assembler->PushBacktrack(&undo);
1409   if (successor->KeepRecursing(compiler)) {
1410     Trace new_state;
1411     successor->Emit(compiler, &new_state);
1412   } else {
1413     compiler->AddWork(successor);
1414     assembler->GoTo(successor->label());
1415   }
1416 
1417   // On backtrack we need to restore state.
1418   assembler->Bind(&undo);
1419   RestoreAffectedRegisters(assembler,
1420                            max_register,
1421                            registers_to_pop,
1422                            registers_to_clear);
1423   if (backtrack() == NULL) {
1424     assembler->Backtrack();
1425   } else {
1426     assembler->PopCurrentPosition();
1427     assembler->GoTo(backtrack());
1428   }
1429 }
1430 
1431 
Emit(RegExpCompiler * compiler,Trace * trace)1432 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1433   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1434 
1435   // Omit flushing the trace. We discard the entire stack frame anyway.
1436 
1437   if (!label()->is_bound()) {
1438     // We are completely independent of the trace, since we ignore it,
1439     // so this code can be used as the generic version.
1440     assembler->Bind(label());
1441   }
1442 
1443   // Throw away everything on the backtrack stack since the start
1444   // of the negative submatch and restore the character position.
1445   assembler->ReadCurrentPositionFromRegister(current_position_register_);
1446   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1447   if (clear_capture_count_ > 0) {
1448     // Clear any captures that might have been performed during the success
1449     // of the body of the negative look-ahead.
1450     int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1451     assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1452   }
1453   // Now that we have unwound the stack we find at the top of the stack the
1454   // backtrack that the BeginSubmatch node got.
1455   assembler->Backtrack();
1456 }
1457 
1458 
Emit(RegExpCompiler * compiler,Trace * trace)1459 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1460   if (!trace->is_trivial()) {
1461     trace->Flush(compiler, this);
1462     return;
1463   }
1464   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1465   if (!label()->is_bound()) {
1466     assembler->Bind(label());
1467   }
1468   switch (action_) {
1469     case ACCEPT:
1470       assembler->Succeed();
1471       return;
1472     case BACKTRACK:
1473       assembler->GoTo(trace->backtrack());
1474       return;
1475     case NEGATIVE_SUBMATCH_SUCCESS:
1476       // This case is handled in a different virtual method.
1477       UNREACHABLE();
1478   }
1479   UNIMPLEMENTED();
1480 }
1481 
1482 
AddGuard(Guard * guard,Zone * zone)1483 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1484   if (guards_ == NULL)
1485     guards_ = new(zone) ZoneList<Guard*>(1, zone);
1486   guards_->Add(guard, zone);
1487 }
1488 
1489 
SetRegister(int reg,int val,RegExpNode * on_success)1490 ActionNode* ActionNode::SetRegister(int reg,
1491                                     int val,
1492                                     RegExpNode* on_success) {
1493   ActionNode* result =
1494       new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1495   result->data_.u_store_register.reg = reg;
1496   result->data_.u_store_register.value = val;
1497   return result;
1498 }
1499 
1500 
IncrementRegister(int reg,RegExpNode * on_success)1501 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1502   ActionNode* result =
1503       new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1504   result->data_.u_increment_register.reg = reg;
1505   return result;
1506 }
1507 
1508 
StorePosition(int reg,bool is_capture,RegExpNode * on_success)1509 ActionNode* ActionNode::StorePosition(int reg,
1510                                       bool is_capture,
1511                                       RegExpNode* on_success) {
1512   ActionNode* result =
1513       new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1514   result->data_.u_position_register.reg = reg;
1515   result->data_.u_position_register.is_capture = is_capture;
1516   return result;
1517 }
1518 
1519 
ClearCaptures(Interval range,RegExpNode * on_success)1520 ActionNode* ActionNode::ClearCaptures(Interval range,
1521                                       RegExpNode* on_success) {
1522   ActionNode* result =
1523       new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1524   result->data_.u_clear_captures.range_from = range.from();
1525   result->data_.u_clear_captures.range_to = range.to();
1526   return result;
1527 }
1528 
1529 
BeginSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)1530 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1531                                       int position_reg,
1532                                       RegExpNode* on_success) {
1533   ActionNode* result =
1534       new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1535   result->data_.u_submatch.stack_pointer_register = stack_reg;
1536   result->data_.u_submatch.current_position_register = position_reg;
1537   return result;
1538 }
1539 
1540 
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)1541 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1542                                                 int position_reg,
1543                                                 int clear_register_count,
1544                                                 int clear_register_from,
1545                                                 RegExpNode* on_success) {
1546   ActionNode* result =
1547       new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1548   result->data_.u_submatch.stack_pointer_register = stack_reg;
1549   result->data_.u_submatch.current_position_register = position_reg;
1550   result->data_.u_submatch.clear_register_count = clear_register_count;
1551   result->data_.u_submatch.clear_register_from = clear_register_from;
1552   return result;
1553 }
1554 
1555 
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)1556 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1557                                         int repetition_register,
1558                                         int repetition_limit,
1559                                         RegExpNode* on_success) {
1560   ActionNode* result =
1561       new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1562   result->data_.u_empty_match_check.start_register = start_register;
1563   result->data_.u_empty_match_check.repetition_register = repetition_register;
1564   result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1565   return result;
1566 }
1567 
1568 
1569 #define DEFINE_ACCEPT(Type)                                          \
1570   void Type##Node::Accept(NodeVisitor* visitor) {                    \
1571     visitor->Visit##Type(this);                                      \
1572   }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)1573 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1574 #undef DEFINE_ACCEPT
1575 
1576 
1577 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1578   visitor->VisitLoopChoice(this);
1579 }
1580 
1581 
1582 // -------------------------------------------------------------------
1583 // Emit code.
1584 
1585 
GenerateGuard(RegExpMacroAssembler * macro_assembler,Guard * guard,Trace * trace)1586 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1587                                Guard* guard,
1588                                Trace* trace) {
1589   switch (guard->op()) {
1590     case Guard::LT:
1591       DCHECK(!trace->mentions_reg(guard->reg()));
1592       macro_assembler->IfRegisterGE(guard->reg(),
1593                                     guard->value(),
1594                                     trace->backtrack());
1595       break;
1596     case Guard::GEQ:
1597       DCHECK(!trace->mentions_reg(guard->reg()));
1598       macro_assembler->IfRegisterLT(guard->reg(),
1599                                     guard->value(),
1600                                     trace->backtrack());
1601       break;
1602   }
1603 }
1604 
1605 
1606 // Returns the number of characters in the equivalence class, omitting those
1607 // that cannot occur in the source string because it is Latin1.
GetCaseIndependentLetters(Isolate * isolate,uc16 character,bool one_byte_subject,unibrow::uchar * letters)1608 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1609                                      bool one_byte_subject,
1610                                      unibrow::uchar* letters) {
1611   int length =
1612       isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1613   // Unibrow returns 0 or 1 for characters where case independence is
1614   // trivial.
1615   if (length == 0) {
1616     letters[0] = character;
1617     length = 1;
1618   }
1619 
1620   if (one_byte_subject) {
1621     int new_length = 0;
1622     for (int i = 0; i < length; i++) {
1623       if (letters[i] <= String::kMaxOneByteCharCode) {
1624         letters[new_length++] = letters[i];
1625       }
1626     }
1627     length = new_length;
1628   }
1629 
1630   return length;
1631 }
1632 
1633 
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1634 static inline bool EmitSimpleCharacter(Isolate* isolate,
1635                                        RegExpCompiler* compiler,
1636                                        uc16 c,
1637                                        Label* on_failure,
1638                                        int cp_offset,
1639                                        bool check,
1640                                        bool preloaded) {
1641   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1642   bool bound_checked = false;
1643   if (!preloaded) {
1644     assembler->LoadCurrentCharacter(
1645         cp_offset,
1646         on_failure,
1647         check);
1648     bound_checked = true;
1649   }
1650   assembler->CheckNotCharacter(c, on_failure);
1651   return bound_checked;
1652 }
1653 
1654 
1655 // Only emits non-letters (things that don't have case).  Only used for case
1656 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1657 static inline bool EmitAtomNonLetter(Isolate* isolate,
1658                                      RegExpCompiler* compiler,
1659                                      uc16 c,
1660                                      Label* on_failure,
1661                                      int cp_offset,
1662                                      bool check,
1663                                      bool preloaded) {
1664   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1665   bool one_byte = compiler->one_byte();
1666   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1667   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1668   if (length < 1) {
1669     // This can't match.  Must be an one-byte subject and a non-one-byte
1670     // character.  We do not need to do anything since the one-byte pass
1671     // already handled this.
1672     return false;  // Bounds not checked.
1673   }
1674   bool checked = false;
1675   // We handle the length > 1 case in a later pass.
1676   if (length == 1) {
1677     if (one_byte && c > String::kMaxOneByteCharCodeU) {
1678       // Can't match - see above.
1679       return false;  // Bounds not checked.
1680     }
1681     if (!preloaded) {
1682       macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1683       checked = check;
1684     }
1685     macro_assembler->CheckNotCharacter(c, on_failure);
1686   }
1687   return checked;
1688 }
1689 
1690 
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,uc16 c1,uc16 c2,Label * on_failure)1691 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1692                                       bool one_byte, uc16 c1, uc16 c2,
1693                                       Label* on_failure) {
1694   uc16 char_mask;
1695   if (one_byte) {
1696     char_mask = String::kMaxOneByteCharCode;
1697   } else {
1698     char_mask = String::kMaxUtf16CodeUnit;
1699   }
1700   uc16 exor = c1 ^ c2;
1701   // Check whether exor has only one bit set.
1702   if (((exor - 1) & exor) == 0) {
1703     // If c1 and c2 differ only by one bit.
1704     // Ecma262UnCanonicalize always gives the highest number last.
1705     DCHECK(c2 > c1);
1706     uc16 mask = char_mask ^ exor;
1707     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1708     return true;
1709   }
1710   DCHECK(c2 > c1);
1711   uc16 diff = c2 - c1;
1712   if (((diff - 1) & diff) == 0 && c1 >= diff) {
1713     // If the characters differ by 2^n but don't differ by one bit then
1714     // subtract the difference from the found character, then do the or
1715     // trick.  We avoid the theoretical case where negative numbers are
1716     // involved in order to simplify code generation.
1717     uc16 mask = char_mask ^ diff;
1718     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1719                                                     diff,
1720                                                     mask,
1721                                                     on_failure);
1722     return true;
1723   }
1724   return false;
1725 }
1726 
1727 
1728 typedef bool EmitCharacterFunction(Isolate* isolate,
1729                                    RegExpCompiler* compiler,
1730                                    uc16 c,
1731                                    Label* on_failure,
1732                                    int cp_offset,
1733                                    bool check,
1734                                    bool preloaded);
1735 
1736 // Only emits letters (things that have case).  Only used for case independent
1737 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1738 static inline bool EmitAtomLetter(Isolate* isolate,
1739                                   RegExpCompiler* compiler,
1740                                   uc16 c,
1741                                   Label* on_failure,
1742                                   int cp_offset,
1743                                   bool check,
1744                                   bool preloaded) {
1745   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1746   bool one_byte = compiler->one_byte();
1747   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1748   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1749   if (length <= 1) return false;
1750   // We may not need to check against the end of the input string
1751   // if this character lies before a character that matched.
1752   if (!preloaded) {
1753     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1754   }
1755   Label ok;
1756   DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1757   switch (length) {
1758     case 2: {
1759       if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1760                                     chars[1], on_failure)) {
1761       } else {
1762         macro_assembler->CheckCharacter(chars[0], &ok);
1763         macro_assembler->CheckNotCharacter(chars[1], on_failure);
1764         macro_assembler->Bind(&ok);
1765       }
1766       break;
1767     }
1768     case 4:
1769       macro_assembler->CheckCharacter(chars[3], &ok);
1770       // Fall through!
1771     case 3:
1772       macro_assembler->CheckCharacter(chars[0], &ok);
1773       macro_assembler->CheckCharacter(chars[1], &ok);
1774       macro_assembler->CheckNotCharacter(chars[2], on_failure);
1775       macro_assembler->Bind(&ok);
1776       break;
1777     default:
1778       UNREACHABLE();
1779       break;
1780   }
1781   return true;
1782 }
1783 
1784 
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)1785 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1786                              int border,
1787                              Label* fall_through,
1788                              Label* above_or_equal,
1789                              Label* below) {
1790   if (below != fall_through) {
1791     masm->CheckCharacterLT(border, below);
1792     if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1793   } else {
1794     masm->CheckCharacterGT(border - 1, above_or_equal);
1795   }
1796 }
1797 
1798 
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)1799 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1800                                    int first,
1801                                    int last,
1802                                    Label* fall_through,
1803                                    Label* in_range,
1804                                    Label* out_of_range) {
1805   if (in_range == fall_through) {
1806     if (first == last) {
1807       masm->CheckNotCharacter(first, out_of_range);
1808     } else {
1809       masm->CheckCharacterNotInRange(first, last, out_of_range);
1810     }
1811   } else {
1812     if (first == last) {
1813       masm->CheckCharacter(first, in_range);
1814     } else {
1815       masm->CheckCharacterInRange(first, last, in_range);
1816     }
1817     if (out_of_range != fall_through) masm->GoTo(out_of_range);
1818   }
1819 }
1820 
1821 
1822 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1823 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int min_char,Label * fall_through,Label * even_label,Label * odd_label)1824 static void EmitUseLookupTable(
1825     RegExpMacroAssembler* masm,
1826     ZoneList<int>* ranges,
1827     int start_index,
1828     int end_index,
1829     int min_char,
1830     Label* fall_through,
1831     Label* even_label,
1832     Label* odd_label) {
1833   static const int kSize = RegExpMacroAssembler::kTableSize;
1834   static const int kMask = RegExpMacroAssembler::kTableMask;
1835 
1836   int base = (min_char & ~kMask);
1837   USE(base);
1838 
1839   // Assert that everything is on one kTableSize page.
1840   for (int i = start_index; i <= end_index; i++) {
1841     DCHECK_EQ(ranges->at(i) & ~kMask, base);
1842   }
1843   DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1844 
1845   char templ[kSize];
1846   Label* on_bit_set;
1847   Label* on_bit_clear;
1848   int bit;
1849   if (even_label == fall_through) {
1850     on_bit_set = odd_label;
1851     on_bit_clear = even_label;
1852     bit = 1;
1853   } else {
1854     on_bit_set = even_label;
1855     on_bit_clear = odd_label;
1856     bit = 0;
1857   }
1858   for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1859     templ[i] = bit;
1860   }
1861   int j = 0;
1862   bit ^= 1;
1863   for (int i = start_index; i < end_index; i++) {
1864     for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1865       templ[j] = bit;
1866     }
1867     bit ^= 1;
1868   }
1869   for (int i = j; i < kSize; i++) {
1870     templ[i] = bit;
1871   }
1872   Factory* factory = masm->isolate()->factory();
1873   // TODO(erikcorry): Cache these.
1874   Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1875   for (int i = 0; i < kSize; i++) {
1876     ba->set(i, templ[i]);
1877   }
1878   masm->CheckBitInTable(ba, on_bit_set);
1879   if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1880 }
1881 
1882 
CutOutRange(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int cut_index,Label * even_label,Label * odd_label)1883 static void CutOutRange(RegExpMacroAssembler* masm,
1884                         ZoneList<int>* ranges,
1885                         int start_index,
1886                         int end_index,
1887                         int cut_index,
1888                         Label* even_label,
1889                         Label* odd_label) {
1890   bool odd = (((cut_index - start_index) & 1) == 1);
1891   Label* in_range_label = odd ? odd_label : even_label;
1892   Label dummy;
1893   EmitDoubleBoundaryTest(masm,
1894                          ranges->at(cut_index),
1895                          ranges->at(cut_index + 1) - 1,
1896                          &dummy,
1897                          in_range_label,
1898                          &dummy);
1899   DCHECK(!dummy.is_linked());
1900   // Cut out the single range by rewriting the array.  This creates a new
1901   // range that is a merger of the two ranges on either side of the one we
1902   // are cutting out.  The oddity of the labels is preserved.
1903   for (int j = cut_index; j > start_index; j--) {
1904     ranges->at(j) = ranges->at(j - 1);
1905   }
1906   for (int j = cut_index + 1; j < end_index; j++) {
1907     ranges->at(j) = ranges->at(j + 1);
1908   }
1909 }
1910 
1911 
1912 // Unicode case.  Split the search space into kSize spaces that are handled
1913 // with recursion.
SplitSearchSpace(ZoneList<int> * ranges,int start_index,int end_index,int * new_start_index,int * new_end_index,int * border)1914 static void SplitSearchSpace(ZoneList<int>* ranges,
1915                              int start_index,
1916                              int end_index,
1917                              int* new_start_index,
1918                              int* new_end_index,
1919                              int* border) {
1920   static const int kSize = RegExpMacroAssembler::kTableSize;
1921   static const int kMask = RegExpMacroAssembler::kTableMask;
1922 
1923   int first = ranges->at(start_index);
1924   int last = ranges->at(end_index) - 1;
1925 
1926   *new_start_index = start_index;
1927   *border = (ranges->at(start_index) & ~kMask) + kSize;
1928   while (*new_start_index < end_index) {
1929     if (ranges->at(*new_start_index) > *border) break;
1930     (*new_start_index)++;
1931   }
1932   // new_start_index is the index of the first edge that is beyond the
1933   // current kSize space.
1934 
1935   // For very large search spaces we do a binary chop search of the non-Latin1
1936   // space instead of just going to the end of the current kSize space.  The
1937   // heuristics are complicated a little by the fact that any 128-character
1938   // encoding space can be quickly tested with a table lookup, so we don't
1939   // wish to do binary chop search at a smaller granularity than that.  A
1940   // 128-character space can take up a lot of space in the ranges array if,
1941   // for example, we only want to match every second character (eg. the lower
1942   // case characters on some Unicode pages).
1943   int binary_chop_index = (end_index + start_index) / 2;
1944   // The first test ensures that we get to the code that handles the Latin1
1945   // range with a single not-taken branch, speeding up this important
1946   // character range (even non-Latin1 charset-based text has spaces and
1947   // punctuation).
1948   if (*border - 1 > String::kMaxOneByteCharCode &&  // Latin1 case.
1949       end_index - start_index > (*new_start_index - start_index) * 2 &&
1950       last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1951       ranges->at(binary_chop_index) >= first + 2 * kSize) {
1952     int scan_forward_for_section_border = binary_chop_index;;
1953     int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1954 
1955     while (scan_forward_for_section_border < end_index) {
1956       if (ranges->at(scan_forward_for_section_border) > new_border) {
1957         *new_start_index = scan_forward_for_section_border;
1958         *border = new_border;
1959         break;
1960       }
1961       scan_forward_for_section_border++;
1962     }
1963   }
1964 
1965   DCHECK(*new_start_index > start_index);
1966   *new_end_index = *new_start_index - 1;
1967   if (ranges->at(*new_end_index) == *border) {
1968     (*new_end_index)--;
1969   }
1970   if (*border >= ranges->at(end_index)) {
1971     *border = ranges->at(end_index);
1972     *new_start_index = end_index;  // Won't be used.
1973     *new_end_index = end_index - 1;
1974   }
1975 }
1976 
1977 
1978 // Gets a series of segment boundaries representing a character class.  If the
1979 // character is in the range between an even and an odd boundary (counting from
1980 // start_index) then go to even_label, otherwise go to odd_label.  We already
1981 // know that the character is in the range of min_char to max_char inclusive.
1982 // Either label can be NULL indicating backtracking.  Either label can also be
1983 // equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,uc32 min_char,uc32 max_char,Label * fall_through,Label * even_label,Label * odd_label)1984 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1985                              int start_index, int end_index, uc32 min_char,
1986                              uc32 max_char, Label* fall_through,
1987                              Label* even_label, Label* odd_label) {
1988   DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1989   DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1990 
1991   int first = ranges->at(start_index);
1992   int last = ranges->at(end_index) - 1;
1993 
1994   DCHECK_LT(min_char, first);
1995 
1996   // Just need to test if the character is before or on-or-after
1997   // a particular character.
1998   if (start_index == end_index) {
1999     EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
2000     return;
2001   }
2002 
2003   // Another almost trivial case:  There is one interval in the middle that is
2004   // different from the end intervals.
2005   if (start_index + 1 == end_index) {
2006     EmitDoubleBoundaryTest(
2007         masm, first, last, fall_through, even_label, odd_label);
2008     return;
2009   }
2010 
2011   // It's not worth using table lookup if there are very few intervals in the
2012   // character class.
2013   if (end_index - start_index <= 6) {
2014     // It is faster to test for individual characters, so we look for those
2015     // first, then try arbitrary ranges in the second round.
2016     static int kNoCutIndex = -1;
2017     int cut = kNoCutIndex;
2018     for (int i = start_index; i < end_index; i++) {
2019       if (ranges->at(i) == ranges->at(i + 1) - 1) {
2020         cut = i;
2021         break;
2022       }
2023     }
2024     if (cut == kNoCutIndex) cut = start_index;
2025     CutOutRange(
2026         masm, ranges, start_index, end_index, cut, even_label, odd_label);
2027     DCHECK_GE(end_index - start_index, 2);
2028     GenerateBranches(masm,
2029                      ranges,
2030                      start_index + 1,
2031                      end_index - 1,
2032                      min_char,
2033                      max_char,
2034                      fall_through,
2035                      even_label,
2036                      odd_label);
2037     return;
2038   }
2039 
2040   // If there are a lot of intervals in the regexp, then we will use tables to
2041   // determine whether the character is inside or outside the character class.
2042   static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2043 
2044   if ((max_char >> kBits) == (min_char >> kBits)) {
2045     EmitUseLookupTable(masm,
2046                        ranges,
2047                        start_index,
2048                        end_index,
2049                        min_char,
2050                        fall_through,
2051                        even_label,
2052                        odd_label);
2053     return;
2054   }
2055 
2056   if ((min_char >> kBits) != (first >> kBits)) {
2057     masm->CheckCharacterLT(first, odd_label);
2058     GenerateBranches(masm,
2059                      ranges,
2060                      start_index + 1,
2061                      end_index,
2062                      first,
2063                      max_char,
2064                      fall_through,
2065                      odd_label,
2066                      even_label);
2067     return;
2068   }
2069 
2070   int new_start_index = 0;
2071   int new_end_index = 0;
2072   int border = 0;
2073 
2074   SplitSearchSpace(ranges,
2075                    start_index,
2076                    end_index,
2077                    &new_start_index,
2078                    &new_end_index,
2079                    &border);
2080 
2081   Label handle_rest;
2082   Label* above = &handle_rest;
2083   if (border == last + 1) {
2084     // We didn't find any section that started after the limit, so everything
2085     // above the border is one of the terminal labels.
2086     above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2087     DCHECK(new_end_index == end_index - 1);
2088   }
2089 
2090   DCHECK_LE(start_index, new_end_index);
2091   DCHECK_LE(new_start_index, end_index);
2092   DCHECK_LT(start_index, new_start_index);
2093   DCHECK_LT(new_end_index, end_index);
2094   DCHECK(new_end_index + 1 == new_start_index ||
2095          (new_end_index + 2 == new_start_index &&
2096           border == ranges->at(new_end_index + 1)));
2097   DCHECK_LT(min_char, border - 1);
2098   DCHECK_LT(border, max_char);
2099   DCHECK_LT(ranges->at(new_end_index), border);
2100   DCHECK(border < ranges->at(new_start_index) ||
2101          (border == ranges->at(new_start_index) &&
2102           new_start_index == end_index &&
2103           new_end_index == end_index - 1 &&
2104           border == last + 1));
2105   DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2106 
2107   masm->CheckCharacterGT(border - 1, above);
2108   Label dummy;
2109   GenerateBranches(masm,
2110                    ranges,
2111                    start_index,
2112                    new_end_index,
2113                    min_char,
2114                    border - 1,
2115                    &dummy,
2116                    even_label,
2117                    odd_label);
2118   if (handle_rest.is_linked()) {
2119     masm->Bind(&handle_rest);
2120     bool flip = (new_start_index & 1) != (start_index & 1);
2121     GenerateBranches(masm,
2122                      ranges,
2123                      new_start_index,
2124                      end_index,
2125                      border,
2126                      max_char,
2127                      &dummy,
2128                      flip ? odd_label : even_label,
2129                      flip ? even_label : odd_label);
2130   }
2131 }
2132 
2133 
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)2134 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2135                           RegExpCharacterClass* cc, bool one_byte,
2136                           Label* on_failure, int cp_offset, bool check_offset,
2137                           bool preloaded, Zone* zone) {
2138   ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2139   CharacterRange::Canonicalize(ranges);
2140 
2141   int max_char;
2142   if (one_byte) {
2143     max_char = String::kMaxOneByteCharCode;
2144   } else {
2145     max_char = String::kMaxUtf16CodeUnit;
2146   }
2147 
2148   int range_count = ranges->length();
2149 
2150   int last_valid_range = range_count - 1;
2151   while (last_valid_range >= 0) {
2152     CharacterRange& range = ranges->at(last_valid_range);
2153     if (range.from() <= max_char) {
2154       break;
2155     }
2156     last_valid_range--;
2157   }
2158 
2159   if (last_valid_range < 0) {
2160     if (!cc->is_negated()) {
2161       macro_assembler->GoTo(on_failure);
2162     }
2163     if (check_offset) {
2164       macro_assembler->CheckPosition(cp_offset, on_failure);
2165     }
2166     return;
2167   }
2168 
2169   if (last_valid_range == 0 &&
2170       ranges->at(0).IsEverything(max_char)) {
2171     if (cc->is_negated()) {
2172       macro_assembler->GoTo(on_failure);
2173     } else {
2174       // This is a common case hit by non-anchored expressions.
2175       if (check_offset) {
2176         macro_assembler->CheckPosition(cp_offset, on_failure);
2177       }
2178     }
2179     return;
2180   }
2181 
2182   if (!preloaded) {
2183     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2184   }
2185 
2186   if (cc->is_standard(zone) &&
2187       macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2188                                                   on_failure)) {
2189       return;
2190   }
2191 
2192 
2193   // A new list with ascending entries.  Each entry is a code unit
2194   // where there is a boundary between code units that are part of
2195   // the class and code units that are not.  Normally we insert an
2196   // entry at zero which goes to the failure label, but if there
2197   // was already one there we fall through for success on that entry.
2198   // Subsequent entries have alternating meaning (success/failure).
2199   ZoneList<int>* range_boundaries =
2200       new(zone) ZoneList<int>(last_valid_range, zone);
2201 
2202   bool zeroth_entry_is_failure = !cc->is_negated();
2203 
2204   for (int i = 0; i <= last_valid_range; i++) {
2205     CharacterRange& range = ranges->at(i);
2206     if (range.from() == 0) {
2207       DCHECK_EQ(i, 0);
2208       zeroth_entry_is_failure = !zeroth_entry_is_failure;
2209     } else {
2210       range_boundaries->Add(range.from(), zone);
2211     }
2212     range_boundaries->Add(range.to() + 1, zone);
2213   }
2214   int end_index = range_boundaries->length() - 1;
2215   if (range_boundaries->at(end_index) > max_char) {
2216     end_index--;
2217   }
2218 
2219   Label fall_through;
2220   GenerateBranches(macro_assembler,
2221                    range_boundaries,
2222                    0,  // start_index.
2223                    end_index,
2224                    0,  // min_char.
2225                    max_char,
2226                    &fall_through,
2227                    zeroth_entry_is_failure ? &fall_through : on_failure,
2228                    zeroth_entry_is_failure ? on_failure : &fall_through);
2229   macro_assembler->Bind(&fall_through);
2230 }
2231 
2232 
~RegExpNode()2233 RegExpNode::~RegExpNode() {
2234 }
2235 
2236 
LimitVersions(RegExpCompiler * compiler,Trace * trace)2237 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2238                                                   Trace* trace) {
2239   // If we are generating a greedy loop then don't stop and don't reuse code.
2240   if (trace->stop_node() != NULL) {
2241     return CONTINUE;
2242   }
2243 
2244   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2245   if (trace->is_trivial()) {
2246     if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2247       // If a generic version is already scheduled to be generated or we have
2248       // recursed too deeply then just generate a jump to that code.
2249       macro_assembler->GoTo(&label_);
2250       // This will queue it up for generation of a generic version if it hasn't
2251       // already been queued.
2252       compiler->AddWork(this);
2253       return DONE;
2254     }
2255     // Generate generic version of the node and bind the label for later use.
2256     macro_assembler->Bind(&label_);
2257     return CONTINUE;
2258   }
2259 
2260   // We are being asked to make a non-generic version.  Keep track of how many
2261   // non-generic versions we generate so as not to overdo it.
2262   trace_count_++;
2263   if (KeepRecursing(compiler) && compiler->optimize() &&
2264       trace_count_ < kMaxCopiesCodeGenerated) {
2265     return CONTINUE;
2266   }
2267 
2268   // If we get here code has been generated for this node too many times or
2269   // recursion is too deep.  Time to switch to a generic version.  The code for
2270   // generic versions above can handle deep recursion properly.
2271   bool was_limiting = compiler->limiting_recursion();
2272   compiler->set_limiting_recursion(true);
2273   trace->Flush(compiler, this);
2274   compiler->set_limiting_recursion(was_limiting);
2275   return DONE;
2276 }
2277 
2278 
KeepRecursing(RegExpCompiler * compiler)2279 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2280   return !compiler->limiting_recursion() &&
2281          compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2282 }
2283 
2284 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2285 int ActionNode::EatsAtLeast(int still_to_find,
2286                             int budget,
2287                             bool not_at_start) {
2288   if (budget <= 0) return 0;
2289   if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
2290   return on_success()->EatsAtLeast(still_to_find,
2291                                    budget - 1,
2292                                    not_at_start);
2293 }
2294 
2295 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2296 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2297                               BoyerMooreLookahead* bm, bool not_at_start) {
2298   if (action_type_ == BEGIN_SUBMATCH) {
2299     bm->SetRest(offset);
2300   } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2301     on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2302   }
2303   SaveBMInfo(bm, not_at_start, offset);
2304 }
2305 
2306 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2307 int AssertionNode::EatsAtLeast(int still_to_find,
2308                                int budget,
2309                                bool not_at_start) {
2310   if (budget <= 0) return 0;
2311   // If we know we are not at the start and we are asked "how many characters
2312   // will you match if you succeed?" then we can answer anything since false
2313   // implies false.  So lets just return the max answer (still_to_find) since
2314   // that won't prevent us from preloading a lot of characters for the other
2315   // branches in the node graph.
2316   if (assertion_type() == AT_START && not_at_start) return still_to_find;
2317   return on_success()->EatsAtLeast(still_to_find,
2318                                    budget - 1,
2319                                    not_at_start);
2320 }
2321 
2322 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2323 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2324                                  BoyerMooreLookahead* bm, bool not_at_start) {
2325   // Match the behaviour of EatsAtLeast on this node.
2326   if (assertion_type() == AT_START && not_at_start) return;
2327   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2328   SaveBMInfo(bm, not_at_start, offset);
2329 }
2330 
2331 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2332 int BackReferenceNode::EatsAtLeast(int still_to_find,
2333                                    int budget,
2334                                    bool not_at_start) {
2335   if (read_backward()) return 0;
2336   if (budget <= 0) return 0;
2337   return on_success()->EatsAtLeast(still_to_find,
2338                                    budget - 1,
2339                                    not_at_start);
2340 }
2341 
2342 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2343 int TextNode::EatsAtLeast(int still_to_find,
2344                           int budget,
2345                           bool not_at_start) {
2346   if (read_backward()) return 0;
2347   int answer = Length();
2348   if (answer >= still_to_find) return answer;
2349   if (budget <= 0) return answer;
2350   // We are not at start after this node so we set the last argument to 'true'.
2351   return answer + on_success()->EatsAtLeast(still_to_find - answer,
2352                                             budget - 1,
2353                                             true);
2354 }
2355 
2356 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2357 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2358                                               bool not_at_start) {
2359   if (budget <= 0) return 0;
2360   // Alternative 0 is the negative lookahead, alternative 1 is what comes
2361   // afterwards.
2362   RegExpNode* node = alternatives_->at(1).node();
2363   return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2364 }
2365 
2366 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2367 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2368     QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2369     bool not_at_start) {
2370   // Alternative 0 is the negative lookahead, alternative 1 is what comes
2371   // afterwards.
2372   RegExpNode* node = alternatives_->at(1).node();
2373   return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2374 }
2375 
2376 
EatsAtLeastHelper(int still_to_find,int budget,RegExpNode * ignore_this_node,bool not_at_start)2377 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2378                                   int budget,
2379                                   RegExpNode* ignore_this_node,
2380                                   bool not_at_start) {
2381   if (budget <= 0) return 0;
2382   int min = 100;
2383   int choice_count = alternatives_->length();
2384   budget = (budget - 1) / choice_count;
2385   for (int i = 0; i < choice_count; i++) {
2386     RegExpNode* node = alternatives_->at(i).node();
2387     if (node == ignore_this_node) continue;
2388     int node_eats_at_least =
2389         node->EatsAtLeast(still_to_find, budget, not_at_start);
2390     if (node_eats_at_least < min) min = node_eats_at_least;
2391     if (min == 0) return 0;
2392   }
2393   return min;
2394 }
2395 
2396 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2397 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2398                                 int budget,
2399                                 bool not_at_start) {
2400   return EatsAtLeastHelper(still_to_find,
2401                            budget - 1,
2402                            loop_node_,
2403                            not_at_start);
2404 }
2405 
2406 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2407 int ChoiceNode::EatsAtLeast(int still_to_find,
2408                             int budget,
2409                             bool not_at_start) {
2410   return EatsAtLeastHelper(still_to_find,
2411                            budget,
2412                            NULL,
2413                            not_at_start);
2414 }
2415 
2416 
2417 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)2418 static inline uint32_t SmearBitsRight(uint32_t v) {
2419   v |= v >> 1;
2420   v |= v >> 2;
2421   v |= v >> 4;
2422   v |= v >> 8;
2423   v |= v >> 16;
2424   return v;
2425 }
2426 
2427 
Rationalize(bool asc)2428 bool QuickCheckDetails::Rationalize(bool asc) {
2429   bool found_useful_op = false;
2430   uint32_t char_mask;
2431   if (asc) {
2432     char_mask = String::kMaxOneByteCharCode;
2433   } else {
2434     char_mask = String::kMaxUtf16CodeUnit;
2435   }
2436   mask_ = 0;
2437   value_ = 0;
2438   int char_shift = 0;
2439   for (int i = 0; i < characters_; i++) {
2440     Position* pos = &positions_[i];
2441     if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2442       found_useful_op = true;
2443     }
2444     mask_ |= (pos->mask & char_mask) << char_shift;
2445     value_ |= (pos->value & char_mask) << char_shift;
2446     char_shift += asc ? 8 : 16;
2447   }
2448   return found_useful_op;
2449 }
2450 
2451 
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure)2452 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2453                                 Trace* bounds_check_trace,
2454                                 Trace* trace,
2455                                 bool preload_has_checked_bounds,
2456                                 Label* on_possible_success,
2457                                 QuickCheckDetails* details,
2458                                 bool fall_through_on_failure) {
2459   if (details->characters() == 0) return false;
2460   GetQuickCheckDetails(
2461       details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2462   if (details->cannot_match()) return false;
2463   if (!details->Rationalize(compiler->one_byte())) return false;
2464   DCHECK(details->characters() == 1 ||
2465          compiler->macro_assembler()->CanReadUnaligned());
2466   uint32_t mask = details->mask();
2467   uint32_t value = details->value();
2468 
2469   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2470 
2471   if (trace->characters_preloaded() != details->characters()) {
2472     DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2473     // We are attempting to preload the minimum number of characters
2474     // any choice would eat, so if the bounds check fails, then none of the
2475     // choices can succeed, so we can just immediately backtrack, rather
2476     // than go to the next choice.
2477     assembler->LoadCurrentCharacter(trace->cp_offset(),
2478                                     bounds_check_trace->backtrack(),
2479                                     !preload_has_checked_bounds,
2480                                     details->characters());
2481   }
2482 
2483 
2484   bool need_mask = true;
2485 
2486   if (details->characters() == 1) {
2487     // If number of characters preloaded is 1 then we used a byte or 16 bit
2488     // load so the value is already masked down.
2489     uint32_t char_mask;
2490     if (compiler->one_byte()) {
2491       char_mask = String::kMaxOneByteCharCode;
2492     } else {
2493       char_mask = String::kMaxUtf16CodeUnit;
2494     }
2495     if ((mask & char_mask) == char_mask) need_mask = false;
2496     mask &= char_mask;
2497   } else {
2498     // For 2-character preloads in one-byte mode or 1-character preloads in
2499     // two-byte mode we also use a 16 bit load with zero extend.
2500     static const uint32_t kTwoByteMask = 0xffff;
2501     static const uint32_t kFourByteMask = 0xffffffff;
2502     if (details->characters() == 2 && compiler->one_byte()) {
2503       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2504     } else if (details->characters() == 1 && !compiler->one_byte()) {
2505       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2506     } else {
2507       if (mask == kFourByteMask) need_mask = false;
2508     }
2509   }
2510 
2511   if (fall_through_on_failure) {
2512     if (need_mask) {
2513       assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2514     } else {
2515       assembler->CheckCharacter(value, on_possible_success);
2516     }
2517   } else {
2518     if (need_mask) {
2519       assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2520     } else {
2521       assembler->CheckNotCharacter(value, trace->backtrack());
2522     }
2523   }
2524   return true;
2525 }
2526 
2527 
2528 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2529 // super-class in the .h file).
2530 //
2531 // We iterate along the text object, building up for each character a
2532 // mask and value that can be used to test for a quick failure to match.
2533 // The masks and values for the positions will be combined into a single
2534 // machine word for the current character width in order to be used in
2535 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2536 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2537                                     RegExpCompiler* compiler,
2538                                     int characters_filled_in,
2539                                     bool not_at_start) {
2540   // Do not collect any quick check details if the text node reads backward,
2541   // since it reads in the opposite direction than we use for quick checks.
2542   if (read_backward()) return;
2543   Isolate* isolate = compiler->macro_assembler()->isolate();
2544   DCHECK(characters_filled_in < details->characters());
2545   int characters = details->characters();
2546   int char_mask;
2547   if (compiler->one_byte()) {
2548     char_mask = String::kMaxOneByteCharCode;
2549   } else {
2550     char_mask = String::kMaxUtf16CodeUnit;
2551   }
2552   for (int k = 0; k < elements()->length(); k++) {
2553     TextElement elm = elements()->at(k);
2554     if (elm.text_type() == TextElement::ATOM) {
2555       Vector<const uc16> quarks = elm.atom()->data();
2556       for (int i = 0; i < characters && i < quarks.length(); i++) {
2557         QuickCheckDetails::Position* pos =
2558             details->positions(characters_filled_in);
2559         uc16 c = quarks[i];
2560         if (compiler->ignore_case()) {
2561           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2562           int length = GetCaseIndependentLetters(isolate, c,
2563                                                  compiler->one_byte(), chars);
2564           if (length == 0) {
2565             // This can happen because all case variants are non-Latin1, but we
2566             // know the input is Latin1.
2567             details->set_cannot_match();
2568             pos->determines_perfectly = false;
2569             return;
2570           }
2571           if (length == 1) {
2572             // This letter has no case equivalents, so it's nice and simple
2573             // and the mask-compare will determine definitely whether we have
2574             // a match at this character position.
2575             pos->mask = char_mask;
2576             pos->value = c;
2577             pos->determines_perfectly = true;
2578           } else {
2579             uint32_t common_bits = char_mask;
2580             uint32_t bits = chars[0];
2581             for (int j = 1; j < length; j++) {
2582               uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2583               common_bits ^= differing_bits;
2584               bits &= common_bits;
2585             }
2586             // If length is 2 and common bits has only one zero in it then
2587             // our mask and compare instruction will determine definitely
2588             // whether we have a match at this character position.  Otherwise
2589             // it can only be an approximate check.
2590             uint32_t one_zero = (common_bits | ~char_mask);
2591             if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2592               pos->determines_perfectly = true;
2593             }
2594             pos->mask = common_bits;
2595             pos->value = bits;
2596           }
2597         } else {
2598           // Don't ignore case.  Nice simple case where the mask-compare will
2599           // determine definitely whether we have a match at this character
2600           // position.
2601           if (c > char_mask) {
2602             details->set_cannot_match();
2603             pos->determines_perfectly = false;
2604             return;
2605           }
2606           pos->mask = char_mask;
2607           pos->value = c;
2608           pos->determines_perfectly = true;
2609         }
2610         characters_filled_in++;
2611         DCHECK(characters_filled_in <= details->characters());
2612         if (characters_filled_in == details->characters()) {
2613           return;
2614         }
2615       }
2616     } else {
2617       QuickCheckDetails::Position* pos =
2618           details->positions(characters_filled_in);
2619       RegExpCharacterClass* tree = elm.char_class();
2620       ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2621       if (tree->is_negated()) {
2622         // A quick check uses multi-character mask and compare.  There is no
2623         // useful way to incorporate a negative char class into this scheme
2624         // so we just conservatively create a mask and value that will always
2625         // succeed.
2626         pos->mask = 0;
2627         pos->value = 0;
2628       } else {
2629         int first_range = 0;
2630         while (ranges->at(first_range).from() > char_mask) {
2631           first_range++;
2632           if (first_range == ranges->length()) {
2633             details->set_cannot_match();
2634             pos->determines_perfectly = false;
2635             return;
2636           }
2637         }
2638         CharacterRange range = ranges->at(first_range);
2639         uc16 from = range.from();
2640         uc16 to = range.to();
2641         if (to > char_mask) {
2642           to = char_mask;
2643         }
2644         uint32_t differing_bits = (from ^ to);
2645         // A mask and compare is only perfect if the differing bits form a
2646         // number like 00011111 with one single block of trailing 1s.
2647         if ((differing_bits & (differing_bits + 1)) == 0 &&
2648              from + differing_bits == to) {
2649           pos->determines_perfectly = true;
2650         }
2651         uint32_t common_bits = ~SmearBitsRight(differing_bits);
2652         uint32_t bits = (from & common_bits);
2653         for (int i = first_range + 1; i < ranges->length(); i++) {
2654           CharacterRange range = ranges->at(i);
2655           uc16 from = range.from();
2656           uc16 to = range.to();
2657           if (from > char_mask) continue;
2658           if (to > char_mask) to = char_mask;
2659           // Here we are combining more ranges into the mask and compare
2660           // value.  With each new range the mask becomes more sparse and
2661           // so the chances of a false positive rise.  A character class
2662           // with multiple ranges is assumed never to be equivalent to a
2663           // mask and compare operation.
2664           pos->determines_perfectly = false;
2665           uint32_t new_common_bits = (from ^ to);
2666           new_common_bits = ~SmearBitsRight(new_common_bits);
2667           common_bits &= new_common_bits;
2668           bits &= new_common_bits;
2669           uint32_t differing_bits = (from & common_bits) ^ bits;
2670           common_bits ^= differing_bits;
2671           bits &= common_bits;
2672         }
2673         pos->mask = common_bits;
2674         pos->value = bits;
2675       }
2676       characters_filled_in++;
2677       DCHECK(characters_filled_in <= details->characters());
2678       if (characters_filled_in == details->characters()) {
2679         return;
2680       }
2681     }
2682   }
2683   DCHECK(characters_filled_in != details->characters());
2684   if (!details->cannot_match()) {
2685     on_success()-> GetQuickCheckDetails(details,
2686                                         compiler,
2687                                         characters_filled_in,
2688                                         true);
2689   }
2690 }
2691 
2692 
Clear()2693 void QuickCheckDetails::Clear() {
2694   for (int i = 0; i < characters_; i++) {
2695     positions_[i].mask = 0;
2696     positions_[i].value = 0;
2697     positions_[i].determines_perfectly = false;
2698   }
2699   characters_ = 0;
2700 }
2701 
2702 
Advance(int by,bool one_byte)2703 void QuickCheckDetails::Advance(int by, bool one_byte) {
2704   if (by >= characters_ || by < 0) {
2705     DCHECK_IMPLIES(by < 0, characters_ == 0);
2706     Clear();
2707     return;
2708   }
2709   DCHECK_LE(characters_ - by, 4);
2710   DCHECK_LE(characters_, 4);
2711   for (int i = 0; i < characters_ - by; i++) {
2712     positions_[i] = positions_[by + i];
2713   }
2714   for (int i = characters_ - by; i < characters_; i++) {
2715     positions_[i].mask = 0;
2716     positions_[i].value = 0;
2717     positions_[i].determines_perfectly = false;
2718   }
2719   characters_ -= by;
2720   // We could change mask_ and value_ here but we would never advance unless
2721   // they had already been used in a check and they won't be used again because
2722   // it would gain us nothing.  So there's no point.
2723 }
2724 
2725 
Merge(QuickCheckDetails * other,int from_index)2726 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2727   DCHECK(characters_ == other->characters_);
2728   if (other->cannot_match_) {
2729     return;
2730   }
2731   if (cannot_match_) {
2732     *this = *other;
2733     return;
2734   }
2735   for (int i = from_index; i < characters_; i++) {
2736     QuickCheckDetails::Position* pos = positions(i);
2737     QuickCheckDetails::Position* other_pos = other->positions(i);
2738     if (pos->mask != other_pos->mask ||
2739         pos->value != other_pos->value ||
2740         !other_pos->determines_perfectly) {
2741       // Our mask-compare operation will be approximate unless we have the
2742       // exact same operation on both sides of the alternation.
2743       pos->determines_perfectly = false;
2744     }
2745     pos->mask &= other_pos->mask;
2746     pos->value &= pos->mask;
2747     other_pos->value &= pos->mask;
2748     uc16 differing_bits = (pos->value ^ other_pos->value);
2749     pos->mask &= ~differing_bits;
2750     pos->value &= pos->mask;
2751   }
2752 }
2753 
2754 
2755 class VisitMarker {
2756  public:
VisitMarker(NodeInfo * info)2757   explicit VisitMarker(NodeInfo* info) : info_(info) {
2758     DCHECK(!info->visited);
2759     info->visited = true;
2760   }
~VisitMarker()2761   ~VisitMarker() {
2762     info_->visited = false;
2763   }
2764  private:
2765   NodeInfo* info_;
2766 };
2767 
2768 
FilterOneByte(int depth,bool ignore_case)2769 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2770   if (info()->replacement_calculated) return replacement();
2771   if (depth < 0) return this;
2772   DCHECK(!info()->visited);
2773   VisitMarker marker(info());
2774   return FilterSuccessor(depth - 1, ignore_case);
2775 }
2776 
2777 
FilterSuccessor(int depth,bool ignore_case)2778 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2779   RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2780   if (next == NULL) return set_replacement(NULL);
2781   on_success_ = next;
2782   return set_replacement(this);
2783 }
2784 
2785 
2786 // We need to check for the following characters: 0x39c 0x3bc 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)2787 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2788   // TODO(dcarney): this could be a lot more efficient.
2789   return range.Contains(0x39c) ||
2790       range.Contains(0x3bc) || range.Contains(0x178);
2791 }
2792 
2793 
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)2794 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2795   for (int i = 0; i < ranges->length(); i++) {
2796     // TODO(dcarney): this could be a lot more efficient.
2797     if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2798   }
2799   return false;
2800 }
2801 
2802 
FilterOneByte(int depth,bool ignore_case)2803 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2804   if (info()->replacement_calculated) return replacement();
2805   if (depth < 0) return this;
2806   DCHECK(!info()->visited);
2807   VisitMarker marker(info());
2808   int element_count = elements()->length();
2809   for (int i = 0; i < element_count; i++) {
2810     TextElement elm = elements()->at(i);
2811     if (elm.text_type() == TextElement::ATOM) {
2812       Vector<const uc16> quarks = elm.atom()->data();
2813       for (int j = 0; j < quarks.length(); j++) {
2814         uint16_t c = quarks[j];
2815         if (c <= String::kMaxOneByteCharCode) continue;
2816         if (!ignore_case) return set_replacement(NULL);
2817         // Here, we need to check for characters whose upper and lower cases
2818         // are outside the Latin-1 range.
2819         uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2820         // Character is outside Latin-1 completely
2821         if (converted == 0) return set_replacement(NULL);
2822         // Convert quark to Latin-1 in place.
2823         uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2824         copy[j] = converted;
2825       }
2826     } else {
2827       DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2828       RegExpCharacterClass* cc = elm.char_class();
2829       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2830       CharacterRange::Canonicalize(ranges);
2831       // Now they are in order so we only need to look at the first.
2832       int range_count = ranges->length();
2833       if (cc->is_negated()) {
2834         if (range_count != 0 &&
2835             ranges->at(0).from() == 0 &&
2836             ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2837           // This will be handled in a later filter.
2838           if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2839           return set_replacement(NULL);
2840         }
2841       } else {
2842         if (range_count == 0 ||
2843             ranges->at(0).from() > String::kMaxOneByteCharCode) {
2844           // This will be handled in a later filter.
2845           if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2846           return set_replacement(NULL);
2847         }
2848       }
2849     }
2850   }
2851   return FilterSuccessor(depth - 1, ignore_case);
2852 }
2853 
2854 
FilterOneByte(int depth,bool ignore_case)2855 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2856   if (info()->replacement_calculated) return replacement();
2857   if (depth < 0) return this;
2858   if (info()->visited) return this;
2859   {
2860     VisitMarker marker(info());
2861 
2862     RegExpNode* continue_replacement =
2863         continue_node_->FilterOneByte(depth - 1, ignore_case);
2864     // If we can't continue after the loop then there is no sense in doing the
2865     // loop.
2866     if (continue_replacement == NULL) return set_replacement(NULL);
2867   }
2868 
2869   return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2870 }
2871 
2872 
FilterOneByte(int depth,bool ignore_case)2873 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2874   if (info()->replacement_calculated) return replacement();
2875   if (depth < 0) return this;
2876   if (info()->visited) return this;
2877   VisitMarker marker(info());
2878   int choice_count = alternatives_->length();
2879 
2880   for (int i = 0; i < choice_count; i++) {
2881     GuardedAlternative alternative = alternatives_->at(i);
2882     if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2883       set_replacement(this);
2884       return this;
2885     }
2886   }
2887 
2888   int surviving = 0;
2889   RegExpNode* survivor = NULL;
2890   for (int i = 0; i < choice_count; i++) {
2891     GuardedAlternative alternative = alternatives_->at(i);
2892     RegExpNode* replacement =
2893         alternative.node()->FilterOneByte(depth - 1, ignore_case);
2894     DCHECK(replacement != this);  // No missing EMPTY_MATCH_CHECK.
2895     if (replacement != NULL) {
2896       alternatives_->at(i).set_node(replacement);
2897       surviving++;
2898       survivor = replacement;
2899     }
2900   }
2901   if (surviving < 2) return set_replacement(survivor);
2902 
2903   set_replacement(this);
2904   if (surviving == choice_count) {
2905     return this;
2906   }
2907   // Only some of the nodes survived the filtering.  We need to rebuild the
2908   // alternatives list.
2909   ZoneList<GuardedAlternative>* new_alternatives =
2910       new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2911   for (int i = 0; i < choice_count; i++) {
2912     RegExpNode* replacement =
2913         alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2914     if (replacement != NULL) {
2915       alternatives_->at(i).set_node(replacement);
2916       new_alternatives->Add(alternatives_->at(i), zone());
2917     }
2918   }
2919   alternatives_ = new_alternatives;
2920   return this;
2921 }
2922 
2923 
FilterOneByte(int depth,bool ignore_case)2924 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
2925                                                         bool ignore_case) {
2926   if (info()->replacement_calculated) return replacement();
2927   if (depth < 0) return this;
2928   if (info()->visited) return this;
2929   VisitMarker marker(info());
2930   // Alternative 0 is the negative lookahead, alternative 1 is what comes
2931   // afterwards.
2932   RegExpNode* node = alternatives_->at(1).node();
2933   RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2934   if (replacement == NULL) return set_replacement(NULL);
2935   alternatives_->at(1).set_node(replacement);
2936 
2937   RegExpNode* neg_node = alternatives_->at(0).node();
2938   RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2939   // If the negative lookahead is always going to fail then
2940   // we don't need to check it.
2941   if (neg_replacement == NULL) return set_replacement(replacement);
2942   alternatives_->at(0).set_node(neg_replacement);
2943   return set_replacement(this);
2944 }
2945 
2946 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2947 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2948                                           RegExpCompiler* compiler,
2949                                           int characters_filled_in,
2950                                           bool not_at_start) {
2951   if (body_can_be_zero_length_ || info()->visited) return;
2952   VisitMarker marker(info());
2953   return ChoiceNode::GetQuickCheckDetails(details,
2954                                           compiler,
2955                                           characters_filled_in,
2956                                           not_at_start);
2957 }
2958 
2959 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2960 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2961                                   BoyerMooreLookahead* bm, bool not_at_start) {
2962   if (body_can_be_zero_length_ || budget <= 0) {
2963     bm->SetRest(offset);
2964     SaveBMInfo(bm, not_at_start, offset);
2965     return;
2966   }
2967   ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2968   SaveBMInfo(bm, not_at_start, offset);
2969 }
2970 
2971 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2972 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2973                                       RegExpCompiler* compiler,
2974                                       int characters_filled_in,
2975                                       bool not_at_start) {
2976   not_at_start = (not_at_start || not_at_start_);
2977   int choice_count = alternatives_->length();
2978   DCHECK(choice_count > 0);
2979   alternatives_->at(0).node()->GetQuickCheckDetails(details,
2980                                                     compiler,
2981                                                     characters_filled_in,
2982                                                     not_at_start);
2983   for (int i = 1; i < choice_count; i++) {
2984     QuickCheckDetails new_details(details->characters());
2985     RegExpNode* node = alternatives_->at(i).node();
2986     node->GetQuickCheckDetails(&new_details, compiler,
2987                                characters_filled_in,
2988                                not_at_start);
2989     // Here we merge the quick match details of the two branches.
2990     details->Merge(&new_details, characters_filled_in);
2991   }
2992 }
2993 
2994 
2995 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2996 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2997                           Label* word,
2998                           Label* non_word,
2999                           bool fall_through_on_word) {
3000   if (assembler->CheckSpecialCharacterClass(
3001           fall_through_on_word ? 'w' : 'W',
3002           fall_through_on_word ? non_word : word)) {
3003     // Optimized implementation available.
3004     return;
3005   }
3006   assembler->CheckCharacterGT('z', non_word);
3007   assembler->CheckCharacterLT('0', non_word);
3008   assembler->CheckCharacterGT('a' - 1, word);
3009   assembler->CheckCharacterLT('9' + 1, word);
3010   assembler->CheckCharacterLT('A', non_word);
3011   assembler->CheckCharacterLT('Z' + 1, word);
3012   if (fall_through_on_word) {
3013     assembler->CheckNotCharacter('_', non_word);
3014   } else {
3015     assembler->CheckCharacter('_', word);
3016   }
3017 }
3018 
3019 
3020 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
3021 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)3022 static void EmitHat(RegExpCompiler* compiler,
3023                     RegExpNode* on_success,
3024                     Trace* trace) {
3025   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3026   // We will be loading the previous character into the current character
3027   // register.
3028   Trace new_trace(*trace);
3029   new_trace.InvalidateCurrentCharacter();
3030 
3031   Label ok;
3032   if (new_trace.cp_offset() == 0) {
3033     // The start of input counts as a newline in this context, so skip to
3034     // ok if we are at the start.
3035     assembler->CheckAtStart(&ok);
3036   }
3037   // We already checked that we are not at the start of input so it must be
3038   // OK to load the previous character.
3039   assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3040                                   new_trace.backtrack(),
3041                                   false);
3042   if (!assembler->CheckSpecialCharacterClass('n',
3043                                              new_trace.backtrack())) {
3044     // Newline means \n, \r, 0x2028 or 0x2029.
3045     if (!compiler->one_byte()) {
3046       assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3047     }
3048     assembler->CheckCharacter('\n', &ok);
3049     assembler->CheckNotCharacter('\r', new_trace.backtrack());
3050   }
3051   assembler->Bind(&ok);
3052   on_success->Emit(compiler, &new_trace);
3053 }
3054 
3055 
3056 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)3057 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3058   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3059   Isolate* isolate = assembler->isolate();
3060   Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3061   bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3062   BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3063   if (lookahead == NULL) {
3064     int eats_at_least =
3065         Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3066                                                     kRecursionBudget,
3067                                                     not_at_start));
3068     if (eats_at_least >= 1) {
3069       BoyerMooreLookahead* bm =
3070           new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3071       FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3072       if (bm->at(0)->is_non_word())
3073         next_is_word_character = Trace::FALSE_VALUE;
3074       if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3075     }
3076   } else {
3077     if (lookahead->at(0)->is_non_word())
3078       next_is_word_character = Trace::FALSE_VALUE;
3079     if (lookahead->at(0)->is_word())
3080       next_is_word_character = Trace::TRUE_VALUE;
3081   }
3082   bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3083   if (next_is_word_character == Trace::UNKNOWN) {
3084     Label before_non_word;
3085     Label before_word;
3086     if (trace->characters_preloaded() != 1) {
3087       assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3088     }
3089     // Fall through on non-word.
3090     EmitWordCheck(assembler, &before_word, &before_non_word, false);
3091     // Next character is not a word character.
3092     assembler->Bind(&before_non_word);
3093     Label ok;
3094     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3095     assembler->GoTo(&ok);
3096 
3097     assembler->Bind(&before_word);
3098     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3099     assembler->Bind(&ok);
3100   } else if (next_is_word_character == Trace::TRUE_VALUE) {
3101     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3102   } else {
3103     DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3104     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3105   }
3106 }
3107 
3108 
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)3109 void AssertionNode::BacktrackIfPrevious(
3110     RegExpCompiler* compiler,
3111     Trace* trace,
3112     AssertionNode::IfPrevious backtrack_if_previous) {
3113   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3114   Trace new_trace(*trace);
3115   new_trace.InvalidateCurrentCharacter();
3116 
3117   Label fall_through, dummy;
3118 
3119   Label* non_word = backtrack_if_previous == kIsNonWord ?
3120                     new_trace.backtrack() :
3121                     &fall_through;
3122   Label* word = backtrack_if_previous == kIsNonWord ?
3123                 &fall_through :
3124                 new_trace.backtrack();
3125 
3126   if (new_trace.cp_offset() == 0) {
3127     // The start of input counts as a non-word character, so the question is
3128     // decided if we are at the start.
3129     assembler->CheckAtStart(non_word);
3130   }
3131   // We already checked that we are not at the start of input so it must be
3132   // OK to load the previous character.
3133   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3134   EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3135 
3136   assembler->Bind(&fall_through);
3137   on_success()->Emit(compiler, &new_trace);
3138 }
3139 
3140 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)3141 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3142                                          RegExpCompiler* compiler,
3143                                          int filled_in,
3144                                          bool not_at_start) {
3145   if (assertion_type_ == AT_START && not_at_start) {
3146     details->set_cannot_match();
3147     return;
3148   }
3149   return on_success()->GetQuickCheckDetails(details,
3150                                             compiler,
3151                                             filled_in,
3152                                             not_at_start);
3153 }
3154 
3155 
Emit(RegExpCompiler * compiler,Trace * trace)3156 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3157   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3158   switch (assertion_type_) {
3159     case AT_END: {
3160       Label ok;
3161       assembler->CheckPosition(trace->cp_offset(), &ok);
3162       assembler->GoTo(trace->backtrack());
3163       assembler->Bind(&ok);
3164       break;
3165     }
3166     case AT_START: {
3167       if (trace->at_start() == Trace::FALSE_VALUE) {
3168         assembler->GoTo(trace->backtrack());
3169         return;
3170       }
3171       if (trace->at_start() == Trace::UNKNOWN) {
3172         assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3173         Trace at_start_trace = *trace;
3174         at_start_trace.set_at_start(Trace::TRUE_VALUE);
3175         on_success()->Emit(compiler, &at_start_trace);
3176         return;
3177       }
3178     }
3179     break;
3180     case AFTER_NEWLINE:
3181       EmitHat(compiler, on_success(), trace);
3182       return;
3183     case AT_BOUNDARY:
3184     case AT_NON_BOUNDARY: {
3185       EmitBoundaryCheck(compiler, trace);
3186       return;
3187     }
3188   }
3189   on_success()->Emit(compiler, trace);
3190 }
3191 
3192 
DeterminedAlready(QuickCheckDetails * quick_check,int offset)3193 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3194   if (quick_check == NULL) return false;
3195   if (offset >= quick_check->characters()) return false;
3196   return quick_check->positions(offset)->determines_perfectly;
3197 }
3198 
3199 
UpdateBoundsCheck(int index,int * checked_up_to)3200 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3201   if (index > *checked_up_to) {
3202     *checked_up_to = index;
3203   }
3204 }
3205 
3206 
3207 // We call this repeatedly to generate code for each pass over the text node.
3208 // The passes are in increasing order of difficulty because we hope one
3209 // of the first passes will fail in which case we are saved the work of the
3210 // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
3211 // we will check the '%' in the first pass, the case independent 'a' in the
3212 // second pass and the character class in the last pass.
3213 //
3214 // The passes are done from right to left, so for example to test for /bar/
3215 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3216 // and then a 'b' with offset 0.  This means we can avoid the end-of-input
3217 // bounds check most of the time.  In the example we only need to check for
3218 // end-of-input when loading the putative 'r'.
3219 //
3220 // A slight complication involves the fact that the first character may already
3221 // be fetched into a register by the previous node.  In this case we want to
3222 // do the test for that character first.  We do this in separate passes.  The
3223 // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
3224 // pass has been performed then subsequent passes will have true in
3225 // first_element_checked to indicate that that character does not need to be
3226 // checked again.
3227 //
3228 // In addition to all this we are passed a Trace, which can
3229 // contain an AlternativeGeneration object.  In this AlternativeGeneration
3230 // object we can see details of any quick check that was already passed in
3231 // order to get to the code we are now generating.  The quick check can involve
3232 // loading characters, which means we do not need to recheck the bounds
3233 // up to the limit the quick check already checked.  In addition the quick
3234 // check can have involved a mask and compare operation which may simplify
3235 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)3236 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3237                             TextEmitPassType pass,
3238                             bool preloaded,
3239                             Trace* trace,
3240                             bool first_element_checked,
3241                             int* checked_up_to) {
3242   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3243   Isolate* isolate = assembler->isolate();
3244   bool one_byte = compiler->one_byte();
3245   Label* backtrack = trace->backtrack();
3246   QuickCheckDetails* quick_check = trace->quick_check_performed();
3247   int element_count = elements()->length();
3248   int backward_offset = read_backward() ? -Length() : 0;
3249   for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3250     TextElement elm = elements()->at(i);
3251     int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3252     if (elm.text_type() == TextElement::ATOM) {
3253       Vector<const uc16> quarks = elm.atom()->data();
3254       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3255         if (first_element_checked && i == 0 && j == 0) continue;
3256         if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3257         EmitCharacterFunction* emit_function = NULL;
3258         switch (pass) {
3259           case NON_LATIN1_MATCH:
3260             DCHECK(one_byte);
3261             if (quarks[j] > String::kMaxOneByteCharCode) {
3262               assembler->GoTo(backtrack);
3263               return;
3264             }
3265             break;
3266           case NON_LETTER_CHARACTER_MATCH:
3267             emit_function = &EmitAtomNonLetter;
3268             break;
3269           case SIMPLE_CHARACTER_MATCH:
3270             emit_function = &EmitSimpleCharacter;
3271             break;
3272           case CASE_CHARACTER_MATCH:
3273             emit_function = &EmitAtomLetter;
3274             break;
3275           default:
3276             break;
3277         }
3278         if (emit_function != NULL) {
3279           bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3280           bool bound_checked =
3281               emit_function(isolate, compiler, quarks[j], backtrack,
3282                             cp_offset + j, bounds_check, preloaded);
3283           if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3284         }
3285       }
3286     } else {
3287       DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3288       if (pass == CHARACTER_CLASS_MATCH) {
3289         if (first_element_checked && i == 0) continue;
3290         if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3291         RegExpCharacterClass* cc = elm.char_class();
3292         bool bounds_check = *checked_up_to < cp_offset || read_backward();
3293         EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3294                       bounds_check, preloaded, zone());
3295         UpdateBoundsCheck(cp_offset, checked_up_to);
3296       }
3297     }
3298   }
3299 }
3300 
3301 
Length()3302 int TextNode::Length() {
3303   TextElement elm = elements()->last();
3304   DCHECK(elm.cp_offset() >= 0);
3305   return elm.cp_offset() + elm.length();
3306 }
3307 
3308 
SkipPass(int int_pass,bool ignore_case)3309 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3310   TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3311   if (ignore_case) {
3312     return pass == SIMPLE_CHARACTER_MATCH;
3313   } else {
3314     return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3315   }
3316 }
3317 
3318 
CreateForCharacterRanges(Zone * zone,ZoneList<CharacterRange> * ranges,bool read_backward,RegExpNode * on_success)3319 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3320                                              ZoneList<CharacterRange>* ranges,
3321                                              bool read_backward,
3322                                              RegExpNode* on_success) {
3323   DCHECK_NOT_NULL(ranges);
3324   ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3325   elms->Add(
3326       TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
3327       zone);
3328   return new (zone) TextNode(elms, read_backward, on_success);
3329 }
3330 
3331 
CreateForSurrogatePair(Zone * zone,CharacterRange lead,CharacterRange trail,bool read_backward,RegExpNode * on_success)3332 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3333                                            CharacterRange trail,
3334                                            bool read_backward,
3335                                            RegExpNode* on_success) {
3336   ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3337   ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3338   ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3339   elms->Add(TextElement::CharClass(
3340                 new (zone) RegExpCharacterClass(lead_ranges, false)),
3341             zone);
3342   elms->Add(TextElement::CharClass(
3343                 new (zone) RegExpCharacterClass(trail_ranges, false)),
3344             zone);
3345   return new (zone) TextNode(elms, read_backward, on_success);
3346 }
3347 
3348 
3349 // This generates the code to match a text node.  A text node can contain
3350 // straight character sequences (possibly to be matched in a case-independent
3351 // way) and character classes.  For efficiency we do not do this in a single
3352 // pass from left to right.  Instead we pass over the text node several times,
3353 // emitting code for some character positions every time.  See the comment on
3354 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)3355 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3356   LimitResult limit_result = LimitVersions(compiler, trace);
3357   if (limit_result == DONE) return;
3358   DCHECK(limit_result == CONTINUE);
3359 
3360   if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3361     compiler->SetRegExpTooBig();
3362     return;
3363   }
3364 
3365   if (compiler->one_byte()) {
3366     int dummy = 0;
3367     TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3368   }
3369 
3370   bool first_elt_done = false;
3371   int bound_checked_to = trace->cp_offset() - 1;
3372   bound_checked_to += trace->bound_checked_up_to();
3373 
3374   // If a character is preloaded into the current character register then
3375   // check that now.
3376   if (trace->characters_preloaded() == 1) {
3377     for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3378       if (!SkipPass(pass, compiler->ignore_case())) {
3379         TextEmitPass(compiler,
3380                      static_cast<TextEmitPassType>(pass),
3381                      true,
3382                      trace,
3383                      false,
3384                      &bound_checked_to);
3385       }
3386     }
3387     first_elt_done = true;
3388   }
3389 
3390   for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3391     if (!SkipPass(pass, compiler->ignore_case())) {
3392       TextEmitPass(compiler,
3393                    static_cast<TextEmitPassType>(pass),
3394                    false,
3395                    trace,
3396                    first_elt_done,
3397                    &bound_checked_to);
3398     }
3399   }
3400 
3401   Trace successor_trace(*trace);
3402   // If we advance backward, we may end up at the start.
3403   successor_trace.AdvanceCurrentPositionInTrace(
3404       read_backward() ? -Length() : Length(), compiler);
3405   successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3406                                                : Trace::FALSE_VALUE);
3407   RecursionCheck rc(compiler);
3408   on_success()->Emit(compiler, &successor_trace);
3409 }
3410 
3411 
InvalidateCurrentCharacter()3412 void Trace::InvalidateCurrentCharacter() {
3413   characters_preloaded_ = 0;
3414 }
3415 
3416 
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)3417 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3418   // We don't have an instruction for shifting the current character register
3419   // down or for using a shifted value for anything so lets just forget that
3420   // we preloaded any characters into it.
3421   characters_preloaded_ = 0;
3422   // Adjust the offsets of the quick check performed information.  This
3423   // information is used to find out what we already determined about the
3424   // characters by means of mask and compare.
3425   quick_check_performed_.Advance(by, compiler->one_byte());
3426   cp_offset_ += by;
3427   if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3428     compiler->SetRegExpTooBig();
3429     cp_offset_ = 0;
3430   }
3431   bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3432 }
3433 
3434 
MakeCaseIndependent(Isolate * isolate,bool is_one_byte)3435 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3436   int element_count = elements()->length();
3437   for (int i = 0; i < element_count; i++) {
3438     TextElement elm = elements()->at(i);
3439     if (elm.text_type() == TextElement::CHAR_CLASS) {
3440       RegExpCharacterClass* cc = elm.char_class();
3441       // None of the standard character classes is different in the case
3442       // independent case and it slows us down if we don't know that.
3443       if (cc->is_standard(zone())) continue;
3444       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3445       CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
3446     }
3447   }
3448 }
3449 
3450 
GreedyLoopTextLength()3451 int TextNode::GreedyLoopTextLength() { return Length(); }
3452 
3453 
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)3454 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3455     RegExpCompiler* compiler) {
3456   if (read_backward()) return NULL;
3457   if (elements()->length() != 1) return NULL;
3458   TextElement elm = elements()->at(0);
3459   if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3460   RegExpCharacterClass* node = elm.char_class();
3461   ZoneList<CharacterRange>* ranges = node->ranges(zone());
3462   CharacterRange::Canonicalize(ranges);
3463   if (node->is_negated()) {
3464     return ranges->length() == 0 ? on_success() : NULL;
3465   }
3466   if (ranges->length() != 1) return NULL;
3467   uint32_t max_char;
3468   if (compiler->one_byte()) {
3469     max_char = String::kMaxOneByteCharCode;
3470   } else {
3471     max_char = String::kMaxUtf16CodeUnit;
3472   }
3473   return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3474 }
3475 
3476 
3477 // Finds the fixed match length of a sequence of nodes that goes from
3478 // this alternative and back to this choice node.  If there are variable
3479 // length nodes or other complications in the way then return a sentinel
3480 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)3481 int ChoiceNode::GreedyLoopTextLengthForAlternative(
3482     GuardedAlternative* alternative) {
3483   int length = 0;
3484   RegExpNode* node = alternative->node();
3485   // Later we will generate code for all these text nodes using recursion
3486   // so we have to limit the max number.
3487   int recursion_depth = 0;
3488   while (node != this) {
3489     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3490       return kNodeIsTooComplexForGreedyLoops;
3491     }
3492     int node_length = node->GreedyLoopTextLength();
3493     if (node_length == kNodeIsTooComplexForGreedyLoops) {
3494       return kNodeIsTooComplexForGreedyLoops;
3495     }
3496     length += node_length;
3497     SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3498     node = seq_node->on_success();
3499   }
3500   return read_backward() ? -length : length;
3501 }
3502 
3503 
AddLoopAlternative(GuardedAlternative alt)3504 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3505   DCHECK_NULL(loop_node_);
3506   AddAlternative(alt);
3507   loop_node_ = alt.node();
3508 }
3509 
3510 
AddContinueAlternative(GuardedAlternative alt)3511 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3512   DCHECK_NULL(continue_node_);
3513   AddAlternative(alt);
3514   continue_node_ = alt.node();
3515 }
3516 
3517 
Emit(RegExpCompiler * compiler,Trace * trace)3518 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3519   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3520   if (trace->stop_node() == this) {
3521     // Back edge of greedy optimized loop node graph.
3522     int text_length =
3523         GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3524     DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3525     // Update the counter-based backtracking info on the stack.  This is an
3526     // optimization for greedy loops (see below).
3527     DCHECK(trace->cp_offset() == text_length);
3528     macro_assembler->AdvanceCurrentPosition(text_length);
3529     macro_assembler->GoTo(trace->loop_label());
3530     return;
3531   }
3532   DCHECK_NULL(trace->stop_node());
3533   if (!trace->is_trivial()) {
3534     trace->Flush(compiler, this);
3535     return;
3536   }
3537   ChoiceNode::Emit(compiler, trace);
3538 }
3539 
3540 
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)3541 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3542                                            int eats_at_least) {
3543   int preload_characters = Min(4, eats_at_least);
3544   if (compiler->macro_assembler()->CanReadUnaligned()) {
3545     bool one_byte = compiler->one_byte();
3546     if (one_byte) {
3547       if (preload_characters > 4) preload_characters = 4;
3548       // We can't preload 3 characters because there is no machine instruction
3549       // to do that.  We can't just load 4 because we could be reading
3550       // beyond the end of the string, which could cause a memory fault.
3551       if (preload_characters == 3) preload_characters = 2;
3552     } else {
3553       if (preload_characters > 2) preload_characters = 2;
3554     }
3555   } else {
3556     if (preload_characters > 1) preload_characters = 1;
3557   }
3558   return preload_characters;
3559 }
3560 
3561 
3562 // This class is used when generating the alternatives in a choice node.  It
3563 // records the way the alternative is being code generated.
3564 class AlternativeGeneration: public Malloced {
3565  public:
AlternativeGeneration()3566   AlternativeGeneration()
3567       : possible_success(),
3568         expects_preload(false),
3569         after(),
3570         quick_check_details() { }
3571   Label possible_success;
3572   bool expects_preload;
3573   Label after;
3574   QuickCheckDetails quick_check_details;
3575 };
3576 
3577 
3578 // Creates a list of AlternativeGenerations.  If the list has a reasonable
3579 // size then it is on the stack, otherwise the excess is on the heap.
3580 class AlternativeGenerationList {
3581  public:
AlternativeGenerationList(int count,Zone * zone)3582   AlternativeGenerationList(int count, Zone* zone)
3583       : alt_gens_(count, zone) {
3584     for (int i = 0; i < count && i < kAFew; i++) {
3585       alt_gens_.Add(a_few_alt_gens_ + i, zone);
3586     }
3587     for (int i = kAFew; i < count; i++) {
3588       alt_gens_.Add(new AlternativeGeneration(), zone);
3589     }
3590   }
~AlternativeGenerationList()3591   ~AlternativeGenerationList() {
3592     for (int i = kAFew; i < alt_gens_.length(); i++) {
3593       delete alt_gens_[i];
3594       alt_gens_[i] = NULL;
3595     }
3596   }
3597 
at(int i)3598   AlternativeGeneration* at(int i) {
3599     return alt_gens_[i];
3600   }
3601 
3602  private:
3603   static const int kAFew = 10;
3604   ZoneList<AlternativeGeneration*> alt_gens_;
3605   AlternativeGeneration a_few_alt_gens_[kAFew];
3606 };
3607 
3608 
3609 static const uc32 kRangeEndMarker = 0x110000;
3610 
3611 // The '2' variant is has inclusive from and exclusive to.
3612 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3613 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3614 static const int kSpaceRanges[] = {
3615     '\t',   '\r' + 1, ' ',    ' ' + 1, 0x00A0, 0x00A1, 0x1680,         0x1681,
3616     0x180E, 0x180F,   0x2000, 0x200B,  0x2028, 0x202A, 0x202F,         0x2030,
3617     0x205F, 0x2060,   0x3000, 0x3001,  0xFEFF, 0xFF00, kRangeEndMarker};
3618 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3619 
3620 static const int kWordRanges[] = {
3621     '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3622 static const int kWordRangeCount = arraysize(kWordRanges);
3623 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3624 static const int kDigitRangeCount = arraysize(kDigitRanges);
3625 static const int kSurrogateRanges[] = {
3626     kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3627 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3628 static const int kLineTerminatorRanges[] = {
3629     0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3630 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3631 
Set(int character)3632 void BoyerMoorePositionInfo::Set(int character) {
3633   SetInterval(Interval(character, character));
3634 }
3635 
3636 
SetInterval(const Interval & interval)3637 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3638   s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3639   w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3640   d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3641   surrogate_ =
3642       AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3643   if (interval.to() - interval.from() >= kMapSize - 1) {
3644     if (map_count_ != kMapSize) {
3645       map_count_ = kMapSize;
3646       for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3647     }
3648     return;
3649   }
3650   for (int i = interval.from(); i <= interval.to(); i++) {
3651     int mod_character = (i & kMask);
3652     if (!map_->at(mod_character)) {
3653       map_count_++;
3654       map_->at(mod_character) = true;
3655     }
3656     if (map_count_ == kMapSize) return;
3657   }
3658 }
3659 
3660 
SetAll()3661 void BoyerMoorePositionInfo::SetAll() {
3662   s_ = w_ = d_ = kLatticeUnknown;
3663   if (map_count_ != kMapSize) {
3664     map_count_ = kMapSize;
3665     for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3666   }
3667 }
3668 
3669 
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)3670 BoyerMooreLookahead::BoyerMooreLookahead(
3671     int length, RegExpCompiler* compiler, Zone* zone)
3672     : length_(length),
3673       compiler_(compiler) {
3674   if (compiler->one_byte()) {
3675     max_char_ = String::kMaxOneByteCharCode;
3676   } else {
3677     max_char_ = String::kMaxUtf16CodeUnit;
3678   }
3679   bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3680   for (int i = 0; i < length; i++) {
3681     bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3682   }
3683 }
3684 
3685 
3686 // Find the longest range of lookahead that has the fewest number of different
3687 // characters that can occur at a given position.  Since we are optimizing two
3688 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)3689 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3690   int biggest_points = 0;
3691   // If more than 32 characters out of 128 can occur it is unlikely that we can
3692   // be lucky enough to step forwards much of the time.
3693   const int kMaxMax = 32;
3694   for (int max_number_of_chars = 4;
3695        max_number_of_chars < kMaxMax;
3696        max_number_of_chars *= 2) {
3697     biggest_points =
3698         FindBestInterval(max_number_of_chars, biggest_points, from, to);
3699   }
3700   if (biggest_points == 0) return false;
3701   return true;
3702 }
3703 
3704 
3705 // Find the highest-points range between 0 and length_ where the character
3706 // information is not too vague.  'Too vague' means that there are more than
3707 // max_number_of_chars that can occur at this position.  Calculates the number
3708 // of points as the product of width-of-the-range and
3709 // probability-of-finding-one-of-the-characters, where the probability is
3710 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)3711 int BoyerMooreLookahead::FindBestInterval(
3712     int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3713   int biggest_points = old_biggest_points;
3714   static const int kSize = RegExpMacroAssembler::kTableSize;
3715   for (int i = 0; i < length_; ) {
3716     while (i < length_ && Count(i) > max_number_of_chars) i++;
3717     if (i == length_) break;
3718     int remembered_from = i;
3719     bool union_map[kSize];
3720     for (int j = 0; j < kSize; j++) union_map[j] = false;
3721     while (i < length_ && Count(i) <= max_number_of_chars) {
3722       BoyerMoorePositionInfo* map = bitmaps_->at(i);
3723       for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3724       i++;
3725     }
3726     int frequency = 0;
3727     for (int j = 0; j < kSize; j++) {
3728       if (union_map[j]) {
3729         // Add 1 to the frequency to give a small per-character boost for
3730         // the cases where our sampling is not good enough and many
3731         // characters have a frequency of zero.  This means the frequency
3732         // can theoretically be up to 2*kSize though we treat it mostly as
3733         // a fraction of kSize.
3734         frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3735       }
3736     }
3737     // We use the probability of skipping times the distance we are skipping to
3738     // judge the effectiveness of this.  Actually we have a cut-off:  By
3739     // dividing by 2 we switch off the skipping if the probability of skipping
3740     // is less than 50%.  This is because the multibyte mask-and-compare
3741     // skipping in quickcheck is more likely to do well on this case.
3742     bool in_quickcheck_range =
3743         ((i - remembered_from < 4) ||
3744          (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3745     // Called 'probability' but it is only a rough estimate and can actually
3746     // be outside the 0-kSize range.
3747     int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3748     int points = (i - remembered_from) * probability;
3749     if (points > biggest_points) {
3750       *from = remembered_from;
3751       *to = i - 1;
3752       biggest_points = points;
3753     }
3754   }
3755   return biggest_points;
3756 }
3757 
3758 
3759 // Take all the characters that will not prevent a successful match if they
3760 // occur in the subject string in the range between min_lookahead and
3761 // max_lookahead (inclusive) measured from the current position.  If the
3762 // character at max_lookahead offset is not one of these characters, then we
3763 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)3764 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3765                                       int max_lookahead,
3766                                       Handle<ByteArray> boolean_skip_table) {
3767   const int kSize = RegExpMacroAssembler::kTableSize;
3768 
3769   const int kSkipArrayEntry = 0;
3770   const int kDontSkipArrayEntry = 1;
3771 
3772   for (int i = 0; i < kSize; i++) {
3773     boolean_skip_table->set(i, kSkipArrayEntry);
3774   }
3775   int skip = max_lookahead + 1 - min_lookahead;
3776 
3777   for (int i = max_lookahead; i >= min_lookahead; i--) {
3778     BoyerMoorePositionInfo* map = bitmaps_->at(i);
3779     for (int j = 0; j < kSize; j++) {
3780       if (map->at(j)) {
3781         boolean_skip_table->set(j, kDontSkipArrayEntry);
3782       }
3783     }
3784   }
3785 
3786   return skip;
3787 }
3788 
3789 
3790 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)3791 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3792   const int kSize = RegExpMacroAssembler::kTableSize;
3793 
3794   int min_lookahead = 0;
3795   int max_lookahead = 0;
3796 
3797   if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3798 
3799   bool found_single_character = false;
3800   int single_character = 0;
3801   for (int i = max_lookahead; i >= min_lookahead; i--) {
3802     BoyerMoorePositionInfo* map = bitmaps_->at(i);
3803     if (map->map_count() > 1 ||
3804         (found_single_character && map->map_count() != 0)) {
3805       found_single_character = false;
3806       break;
3807     }
3808     for (int j = 0; j < kSize; j++) {
3809       if (map->at(j)) {
3810         found_single_character = true;
3811         single_character = j;
3812         break;
3813       }
3814     }
3815   }
3816 
3817   int lookahead_width = max_lookahead + 1 - min_lookahead;
3818 
3819   if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3820     // The mask-compare can probably handle this better.
3821     return;
3822   }
3823 
3824   if (found_single_character) {
3825     Label cont, again;
3826     masm->Bind(&again);
3827     masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3828     if (max_char_ > kSize) {
3829       masm->CheckCharacterAfterAnd(single_character,
3830                                    RegExpMacroAssembler::kTableMask,
3831                                    &cont);
3832     } else {
3833       masm->CheckCharacter(single_character, &cont);
3834     }
3835     masm->AdvanceCurrentPosition(lookahead_width);
3836     masm->GoTo(&again);
3837     masm->Bind(&cont);
3838     return;
3839   }
3840 
3841   Factory* factory = masm->isolate()->factory();
3842   Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3843   int skip_distance = GetSkipTable(
3844       min_lookahead, max_lookahead, boolean_skip_table);
3845   DCHECK(skip_distance != 0);
3846 
3847   Label cont, again;
3848   masm->Bind(&again);
3849   masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3850   masm->CheckBitInTable(boolean_skip_table, &cont);
3851   masm->AdvanceCurrentPosition(skip_distance);
3852   masm->GoTo(&again);
3853   masm->Bind(&cont);
3854 }
3855 
3856 
3857 /* Code generation for choice nodes.
3858  *
3859  * We generate quick checks that do a mask and compare to eliminate a
3860  * choice.  If the quick check succeeds then it jumps to the continuation to
3861  * do slow checks and check subsequent nodes.  If it fails (the common case)
3862  * it falls through to the next choice.
3863  *
3864  * Here is the desired flow graph.  Nodes directly below each other imply
3865  * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
3866  * 3 doesn't have a quick check so we have to call the slow check.
3867  * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
3868  * regexp continuation is generated directly after the Sn node, up to the
3869  * next GoTo if we decide to reuse some already generated code.  Some
3870  * nodes expect preload_characters to be preloaded into the current
3871  * character register.  R nodes do this preloading.  Vertices are marked
3872  * F for failures and S for success (possible success in the case of quick
3873  * nodes).  L, V, < and > are used as arrow heads.
3874  *
3875  * ----------> R
3876  *             |
3877  *             V
3878  *            Q1 -----> S1
3879  *             |   S   /
3880  *            F|      /
3881  *             |    F/
3882  *             |    /
3883  *             |   R
3884  *             |  /
3885  *             V L
3886  *            Q2 -----> S2
3887  *             |   S   /
3888  *            F|      /
3889  *             |    F/
3890  *             |    /
3891  *             |   R
3892  *             |  /
3893  *             V L
3894  *            S3
3895  *             |
3896  *            F|
3897  *             |
3898  *             R
3899  *             |
3900  * backtrack   V
3901  * <----------Q4
3902  *   \    F    |
3903  *    \        |S
3904  *     \   F   V
3905  *      \-----S4
3906  *
3907  * For greedy loops we push the current position, then generate the code that
3908  * eats the input specially in EmitGreedyLoop.  The other choice (the
3909  * continuation) is generated by the normal code in EmitChoices, and steps back
3910  * in the input to the starting position when it fails to match.  The loop code
3911  * looks like this (U is the unwind code that steps back in the greedy loop).
3912  *
3913  *              _____
3914  *             /     \
3915  *             V     |
3916  * ----------> S1    |
3917  *            /|     |
3918  *           / |S    |
3919  *         F/  \_____/
3920  *         /
3921  *        |<-----
3922  *        |      \
3923  *        V       |S
3924  *        Q2 ---> U----->backtrack
3925  *        |  F   /
3926  *       S|     /
3927  *        V  F /
3928  *        S2--/
3929  */
3930 
GreedyLoopState(bool not_at_start)3931 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3932   counter_backtrack_trace_.set_backtrack(&label_);
3933   if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3934 }
3935 
3936 
AssertGuardsMentionRegisters(Trace * trace)3937 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3938 #ifdef DEBUG
3939   int choice_count = alternatives_->length();
3940   for (int i = 0; i < choice_count - 1; i++) {
3941     GuardedAlternative alternative = alternatives_->at(i);
3942     ZoneList<Guard*>* guards = alternative.guards();
3943     int guard_count = (guards == NULL) ? 0 : guards->length();
3944     for (int j = 0; j < guard_count; j++) {
3945       DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3946     }
3947   }
3948 #endif
3949 }
3950 
3951 
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)3952 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3953                               Trace* current_trace,
3954                               PreloadState* state) {
3955     if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3956       // Save some time by looking at most one machine word ahead.
3957       state->eats_at_least_ =
3958           EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3959                       current_trace->at_start() == Trace::FALSE_VALUE);
3960     }
3961     state->preload_characters_ =
3962         CalculatePreloadCharacters(compiler, state->eats_at_least_);
3963 
3964     state->preload_is_current_ =
3965         (current_trace->characters_preloaded() == state->preload_characters_);
3966     state->preload_has_checked_bounds_ = state->preload_is_current_;
3967 }
3968 
3969 
Emit(RegExpCompiler * compiler,Trace * trace)3970 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3971   int choice_count = alternatives_->length();
3972 
3973   if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
3974     alternatives_->at(0).node()->Emit(compiler, trace);
3975     return;
3976   }
3977 
3978   AssertGuardsMentionRegisters(trace);
3979 
3980   LimitResult limit_result = LimitVersions(compiler, trace);
3981   if (limit_result == DONE) return;
3982   DCHECK(limit_result == CONTINUE);
3983 
3984   // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3985   // other choice nodes we only flush if we are out of code size budget.
3986   if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3987     trace->Flush(compiler, this);
3988     return;
3989   }
3990 
3991   RecursionCheck rc(compiler);
3992 
3993   PreloadState preload;
3994   preload.init();
3995   GreedyLoopState greedy_loop_state(not_at_start());
3996 
3997   int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3998   AlternativeGenerationList alt_gens(choice_count, zone());
3999 
4000   if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
4001     trace = EmitGreedyLoop(compiler,
4002                            trace,
4003                            &alt_gens,
4004                            &preload,
4005                            &greedy_loop_state,
4006                            text_length);
4007   } else {
4008     // TODO(erikcorry): Delete this.  We don't need this label, but it makes us
4009     // match the traces produced pre-cleanup.
4010     Label second_choice;
4011     compiler->macro_assembler()->Bind(&second_choice);
4012 
4013     preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
4014 
4015     EmitChoices(compiler,
4016                 &alt_gens,
4017                 0,
4018                 trace,
4019                 &preload);
4020   }
4021 
4022   // At this point we need to generate slow checks for the alternatives where
4023   // the quick check was inlined.  We can recognize these because the associated
4024   // label was bound.
4025   int new_flush_budget = trace->flush_budget() / choice_count;
4026   for (int i = 0; i < choice_count; i++) {
4027     AlternativeGeneration* alt_gen = alt_gens.at(i);
4028     Trace new_trace(*trace);
4029     // If there are actions to be flushed we have to limit how many times
4030     // they are flushed.  Take the budget of the parent trace and distribute
4031     // it fairly amongst the children.
4032     if (new_trace.actions() != NULL) {
4033       new_trace.set_flush_budget(new_flush_budget);
4034     }
4035     bool next_expects_preload =
4036         i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
4037     EmitOutOfLineContinuation(compiler,
4038                               &new_trace,
4039                               alternatives_->at(i),
4040                               alt_gen,
4041                               preload.preload_characters_,
4042                               next_expects_preload);
4043   }
4044 }
4045 
4046 
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)4047 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
4048                                   Trace* trace,
4049                                   AlternativeGenerationList* alt_gens,
4050                                   PreloadState* preload,
4051                                   GreedyLoopState* greedy_loop_state,
4052                                   int text_length) {
4053   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4054   // Here we have special handling for greedy loops containing only text nodes
4055   // and other simple nodes.  These are handled by pushing the current
4056   // position on the stack and then incrementing the current position each
4057   // time around the switch.  On backtrack we decrement the current position
4058   // and check it against the pushed value.  This avoids pushing backtrack
4059   // information for each iteration of the loop, which could take up a lot of
4060   // space.
4061   DCHECK(trace->stop_node() == NULL);
4062   macro_assembler->PushCurrentPosition();
4063   Label greedy_match_failed;
4064   Trace greedy_match_trace;
4065   if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
4066   greedy_match_trace.set_backtrack(&greedy_match_failed);
4067   Label loop_label;
4068   macro_assembler->Bind(&loop_label);
4069   greedy_match_trace.set_stop_node(this);
4070   greedy_match_trace.set_loop_label(&loop_label);
4071   alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4072   macro_assembler->Bind(&greedy_match_failed);
4073 
4074   Label second_choice;  // For use in greedy matches.
4075   macro_assembler->Bind(&second_choice);
4076 
4077   Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4078 
4079   EmitChoices(compiler,
4080               alt_gens,
4081               1,
4082               new_trace,
4083               preload);
4084 
4085   macro_assembler->Bind(greedy_loop_state->label());
4086   // If we have unwound to the bottom then backtrack.
4087   macro_assembler->CheckGreedyLoop(trace->backtrack());
4088   // Otherwise try the second priority at an earlier position.
4089   macro_assembler->AdvanceCurrentPosition(-text_length);
4090   macro_assembler->GoTo(&second_choice);
4091   return new_trace;
4092 }
4093 
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)4094 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4095                                               Trace* trace) {
4096   int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4097   if (alternatives_->length() != 2) return eats_at_least;
4098 
4099   GuardedAlternative alt1 = alternatives_->at(1);
4100   if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4101     return eats_at_least;
4102   }
4103   RegExpNode* eats_anything_node = alt1.node();
4104   if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4105     return eats_at_least;
4106   }
4107 
4108   // Really we should be creating a new trace when we execute this function,
4109   // but there is no need, because the code it generates cannot backtrack, and
4110   // we always arrive here with a trivial trace (since it's the entry to a
4111   // loop.  That also implies that there are no preloaded characters, which is
4112   // good, because it means we won't be violating any assumptions by
4113   // overwriting those characters with new load instructions.
4114   DCHECK(trace->is_trivial());
4115 
4116   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4117   Isolate* isolate = macro_assembler->isolate();
4118   // At this point we know that we are at a non-greedy loop that will eat
4119   // any character one at a time.  Any non-anchored regexp has such a
4120   // loop prepended to it in order to find where it starts.  We look for
4121   // a pattern of the form ...abc... where we can look 6 characters ahead
4122   // and step forwards 3 if the character is not one of abc.  Abc need
4123   // not be atoms, they can be any reasonably limited character class or
4124   // small alternation.
4125   BoyerMooreLookahead* bm = bm_info(false);
4126   if (bm == NULL) {
4127     eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4128                         EatsAtLeast(kMaxLookaheadForBoyerMoore,
4129                                     kRecursionBudget,
4130                                     false));
4131     if (eats_at_least >= 1) {
4132       bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4133                                            compiler,
4134                                            zone());
4135       GuardedAlternative alt0 = alternatives_->at(0);
4136       alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4137     }
4138   }
4139   if (bm != NULL) {
4140     bm->EmitSkipInstructions(macro_assembler);
4141   }
4142   return eats_at_least;
4143 }
4144 
4145 
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)4146 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4147                              AlternativeGenerationList* alt_gens,
4148                              int first_choice,
4149                              Trace* trace,
4150                              PreloadState* preload) {
4151   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4152   SetUpPreLoad(compiler, trace, preload);
4153 
4154   // For now we just call all choices one after the other.  The idea ultimately
4155   // is to use the Dispatch table to try only the relevant ones.
4156   int choice_count = alternatives_->length();
4157 
4158   int new_flush_budget = trace->flush_budget() / choice_count;
4159 
4160   for (int i = first_choice; i < choice_count; i++) {
4161     bool is_last = i == choice_count - 1;
4162     bool fall_through_on_failure = !is_last;
4163     GuardedAlternative alternative = alternatives_->at(i);
4164     AlternativeGeneration* alt_gen = alt_gens->at(i);
4165     alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4166     ZoneList<Guard*>* guards = alternative.guards();
4167     int guard_count = (guards == NULL) ? 0 : guards->length();
4168     Trace new_trace(*trace);
4169     new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4170                                          preload->preload_characters_ :
4171                                          0);
4172     if (preload->preload_has_checked_bounds_) {
4173       new_trace.set_bound_checked_up_to(preload->preload_characters_);
4174     }
4175     new_trace.quick_check_performed()->Clear();
4176     if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4177     if (!is_last) {
4178       new_trace.set_backtrack(&alt_gen->after);
4179     }
4180     alt_gen->expects_preload = preload->preload_is_current_;
4181     bool generate_full_check_inline = false;
4182     if (compiler->optimize() &&
4183         try_to_emit_quick_check_for_alternative(i == 0) &&
4184         alternative.node()->EmitQuickCheck(
4185             compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4186             &alt_gen->possible_success, &alt_gen->quick_check_details,
4187             fall_through_on_failure)) {
4188       // Quick check was generated for this choice.
4189       preload->preload_is_current_ = true;
4190       preload->preload_has_checked_bounds_ = true;
4191       // If we generated the quick check to fall through on possible success,
4192       // we now need to generate the full check inline.
4193       if (!fall_through_on_failure) {
4194         macro_assembler->Bind(&alt_gen->possible_success);
4195         new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4196         new_trace.set_characters_preloaded(preload->preload_characters_);
4197         new_trace.set_bound_checked_up_to(preload->preload_characters_);
4198         generate_full_check_inline = true;
4199       }
4200     } else if (alt_gen->quick_check_details.cannot_match()) {
4201       if (!fall_through_on_failure) {
4202         macro_assembler->GoTo(trace->backtrack());
4203       }
4204       continue;
4205     } else {
4206       // No quick check was generated.  Put the full code here.
4207       // If this is not the first choice then there could be slow checks from
4208       // previous cases that go here when they fail.  There's no reason to
4209       // insist that they preload characters since the slow check we are about
4210       // to generate probably can't use it.
4211       if (i != first_choice) {
4212         alt_gen->expects_preload = false;
4213         new_trace.InvalidateCurrentCharacter();
4214       }
4215       generate_full_check_inline = true;
4216     }
4217     if (generate_full_check_inline) {
4218       if (new_trace.actions() != NULL) {
4219         new_trace.set_flush_budget(new_flush_budget);
4220       }
4221       for (int j = 0; j < guard_count; j++) {
4222         GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4223       }
4224       alternative.node()->Emit(compiler, &new_trace);
4225       preload->preload_is_current_ = false;
4226     }
4227     macro_assembler->Bind(&alt_gen->after);
4228   }
4229 }
4230 
4231 
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)4232 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4233                                            Trace* trace,
4234                                            GuardedAlternative alternative,
4235                                            AlternativeGeneration* alt_gen,
4236                                            int preload_characters,
4237                                            bool next_expects_preload) {
4238   if (!alt_gen->possible_success.is_linked()) return;
4239 
4240   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4241   macro_assembler->Bind(&alt_gen->possible_success);
4242   Trace out_of_line_trace(*trace);
4243   out_of_line_trace.set_characters_preloaded(preload_characters);
4244   out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4245   if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4246   ZoneList<Guard*>* guards = alternative.guards();
4247   int guard_count = (guards == NULL) ? 0 : guards->length();
4248   if (next_expects_preload) {
4249     Label reload_current_char;
4250     out_of_line_trace.set_backtrack(&reload_current_char);
4251     for (int j = 0; j < guard_count; j++) {
4252       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4253     }
4254     alternative.node()->Emit(compiler, &out_of_line_trace);
4255     macro_assembler->Bind(&reload_current_char);
4256     // Reload the current character, since the next quick check expects that.
4257     // We don't need to check bounds here because we only get into this
4258     // code through a quick check which already did the checked load.
4259     macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4260                                           NULL,
4261                                           false,
4262                                           preload_characters);
4263     macro_assembler->GoTo(&(alt_gen->after));
4264   } else {
4265     out_of_line_trace.set_backtrack(&(alt_gen->after));
4266     for (int j = 0; j < guard_count; j++) {
4267       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4268     }
4269     alternative.node()->Emit(compiler, &out_of_line_trace);
4270   }
4271 }
4272 
4273 
Emit(RegExpCompiler * compiler,Trace * trace)4274 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4275   RegExpMacroAssembler* assembler = compiler->macro_assembler();
4276   LimitResult limit_result = LimitVersions(compiler, trace);
4277   if (limit_result == DONE) return;
4278   DCHECK(limit_result == CONTINUE);
4279 
4280   RecursionCheck rc(compiler);
4281 
4282   switch (action_type_) {
4283     case STORE_POSITION: {
4284       Trace::DeferredCapture
4285           new_capture(data_.u_position_register.reg,
4286                       data_.u_position_register.is_capture,
4287                       trace);
4288       Trace new_trace = *trace;
4289       new_trace.add_action(&new_capture);
4290       on_success()->Emit(compiler, &new_trace);
4291       break;
4292     }
4293     case INCREMENT_REGISTER: {
4294       Trace::DeferredIncrementRegister
4295           new_increment(data_.u_increment_register.reg);
4296       Trace new_trace = *trace;
4297       new_trace.add_action(&new_increment);
4298       on_success()->Emit(compiler, &new_trace);
4299       break;
4300     }
4301     case SET_REGISTER: {
4302       Trace::DeferredSetRegister
4303           new_set(data_.u_store_register.reg, data_.u_store_register.value);
4304       Trace new_trace = *trace;
4305       new_trace.add_action(&new_set);
4306       on_success()->Emit(compiler, &new_trace);
4307       break;
4308     }
4309     case CLEAR_CAPTURES: {
4310       Trace::DeferredClearCaptures
4311         new_capture(Interval(data_.u_clear_captures.range_from,
4312                              data_.u_clear_captures.range_to));
4313       Trace new_trace = *trace;
4314       new_trace.add_action(&new_capture);
4315       on_success()->Emit(compiler, &new_trace);
4316       break;
4317     }
4318     case BEGIN_SUBMATCH:
4319       if (!trace->is_trivial()) {
4320         trace->Flush(compiler, this);
4321       } else {
4322         assembler->WriteCurrentPositionToRegister(
4323             data_.u_submatch.current_position_register, 0);
4324         assembler->WriteStackPointerToRegister(
4325             data_.u_submatch.stack_pointer_register);
4326         on_success()->Emit(compiler, trace);
4327       }
4328       break;
4329     case EMPTY_MATCH_CHECK: {
4330       int start_pos_reg = data_.u_empty_match_check.start_register;
4331       int stored_pos = 0;
4332       int rep_reg = data_.u_empty_match_check.repetition_register;
4333       bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4334       bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4335       if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4336         // If we know we haven't advanced and there is no minimum we
4337         // can just backtrack immediately.
4338         assembler->GoTo(trace->backtrack());
4339       } else if (know_dist && stored_pos < trace->cp_offset()) {
4340         // If we know we've advanced we can generate the continuation
4341         // immediately.
4342         on_success()->Emit(compiler, trace);
4343       } else if (!trace->is_trivial()) {
4344         trace->Flush(compiler, this);
4345       } else {
4346         Label skip_empty_check;
4347         // If we have a minimum number of repetitions we check the current
4348         // number first and skip the empty check if it's not enough.
4349         if (has_minimum) {
4350           int limit = data_.u_empty_match_check.repetition_limit;
4351           assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4352         }
4353         // If the match is empty we bail out, otherwise we fall through
4354         // to the on-success continuation.
4355         assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4356                                    trace->backtrack());
4357         assembler->Bind(&skip_empty_check);
4358         on_success()->Emit(compiler, trace);
4359       }
4360       break;
4361     }
4362     case POSITIVE_SUBMATCH_SUCCESS: {
4363       if (!trace->is_trivial()) {
4364         trace->Flush(compiler, this);
4365         return;
4366       }
4367       assembler->ReadCurrentPositionFromRegister(
4368           data_.u_submatch.current_position_register);
4369       assembler->ReadStackPointerFromRegister(
4370           data_.u_submatch.stack_pointer_register);
4371       int clear_register_count = data_.u_submatch.clear_register_count;
4372       if (clear_register_count == 0) {
4373         on_success()->Emit(compiler, trace);
4374         return;
4375       }
4376       int clear_registers_from = data_.u_submatch.clear_register_from;
4377       Label clear_registers_backtrack;
4378       Trace new_trace = *trace;
4379       new_trace.set_backtrack(&clear_registers_backtrack);
4380       on_success()->Emit(compiler, &new_trace);
4381 
4382       assembler->Bind(&clear_registers_backtrack);
4383       int clear_registers_to = clear_registers_from + clear_register_count - 1;
4384       assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4385 
4386       DCHECK(trace->backtrack() == NULL);
4387       assembler->Backtrack();
4388       return;
4389     }
4390     default:
4391       UNREACHABLE();
4392   }
4393 }
4394 
4395 
Emit(RegExpCompiler * compiler,Trace * trace)4396 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4397   RegExpMacroAssembler* assembler = compiler->macro_assembler();
4398   if (!trace->is_trivial()) {
4399     trace->Flush(compiler, this);
4400     return;
4401   }
4402 
4403   LimitResult limit_result = LimitVersions(compiler, trace);
4404   if (limit_result == DONE) return;
4405   DCHECK(limit_result == CONTINUE);
4406 
4407   RecursionCheck rc(compiler);
4408 
4409   DCHECK_EQ(start_reg_ + 1, end_reg_);
4410   if (compiler->ignore_case()) {
4411     assembler->CheckNotBackReferenceIgnoreCase(
4412         start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
4413   } else {
4414     assembler->CheckNotBackReference(start_reg_, read_backward(),
4415                                      trace->backtrack());
4416   }
4417   // We are going to advance backward, so we may end up at the start.
4418   if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4419 
4420   // Check that the back reference does not end inside a surrogate pair.
4421   if (compiler->unicode() && !compiler->one_byte()) {
4422     assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4423   }
4424   on_success()->Emit(compiler, trace);
4425 }
4426 
4427 
4428 // -------------------------------------------------------------------
4429 // Dot/dotty output
4430 
4431 
4432 #ifdef DEBUG
4433 
4434 
4435 class DotPrinter: public NodeVisitor {
4436  public:
DotPrinter(std::ostream & os,bool ignore_case)4437   DotPrinter(std::ostream& os, bool ignore_case)  // NOLINT
4438       : os_(os),
4439         ignore_case_(ignore_case) {}
4440   void PrintNode(const char* label, RegExpNode* node);
4441   void Visit(RegExpNode* node);
4442   void PrintAttributes(RegExpNode* from);
4443   void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4444 #define DECLARE_VISIT(Type)                                          \
4445   virtual void Visit##Type(Type##Node* that);
4446 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4447 #undef DECLARE_VISIT
4448  private:
4449   std::ostream& os_;
4450   bool ignore_case_;
4451 };
4452 
4453 
PrintNode(const char * label,RegExpNode * node)4454 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4455   os_ << "digraph G {\n  graph [label=\"";
4456   for (int i = 0; label[i]; i++) {
4457     switch (label[i]) {
4458       case '\\':
4459         os_ << "\\\\";
4460         break;
4461       case '"':
4462         os_ << "\"";
4463         break;
4464       default:
4465         os_ << label[i];
4466         break;
4467     }
4468   }
4469   os_ << "\"];\n";
4470   Visit(node);
4471   os_ << "}" << std::endl;
4472 }
4473 
4474 
Visit(RegExpNode * node)4475 void DotPrinter::Visit(RegExpNode* node) {
4476   if (node->info()->visited) return;
4477   node->info()->visited = true;
4478   node->Accept(this);
4479 }
4480 
4481 
PrintOnFailure(RegExpNode * from,RegExpNode * on_failure)4482 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4483   os_ << "  n" << from << " -> n" << on_failure << " [style=dotted];\n";
4484   Visit(on_failure);
4485 }
4486 
4487 
4488 class TableEntryBodyPrinter {
4489  public:
TableEntryBodyPrinter(std::ostream & os,ChoiceNode * choice)4490   TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice)  // NOLINT
4491       : os_(os),
4492         choice_(choice) {}
Call(uc16 from,DispatchTable::Entry entry)4493   void Call(uc16 from, DispatchTable::Entry entry) {
4494     OutSet* out_set = entry.out_set();
4495     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4496       if (out_set->Get(i)) {
4497         os_ << "    n" << choice() << ":s" << from << "o" << i << " -> n"
4498             << choice()->alternatives()->at(i).node() << ";\n";
4499       }
4500     }
4501   }
4502  private:
choice()4503   ChoiceNode* choice() { return choice_; }
4504   std::ostream& os_;
4505   ChoiceNode* choice_;
4506 };
4507 
4508 
4509 class TableEntryHeaderPrinter {
4510  public:
TableEntryHeaderPrinter(std::ostream & os)4511   explicit TableEntryHeaderPrinter(std::ostream& os)  // NOLINT
4512       : first_(true),
4513         os_(os) {}
Call(uc16 from,DispatchTable::Entry entry)4514   void Call(uc16 from, DispatchTable::Entry entry) {
4515     if (first_) {
4516       first_ = false;
4517     } else {
4518       os_ << "|";
4519     }
4520     os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4521     OutSet* out_set = entry.out_set();
4522     int priority = 0;
4523     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4524       if (out_set->Get(i)) {
4525         if (priority > 0) os_ << "|";
4526         os_ << "<s" << from << "o" << i << "> " << priority;
4527         priority++;
4528       }
4529     }
4530     os_ << "}}";
4531   }
4532 
4533  private:
4534   bool first_;
4535   std::ostream& os_;
4536 };
4537 
4538 
4539 class AttributePrinter {
4540  public:
AttributePrinter(std::ostream & os)4541   explicit AttributePrinter(std::ostream& os)  // NOLINT
4542       : os_(os),
4543         first_(true) {}
PrintSeparator()4544   void PrintSeparator() {
4545     if (first_) {
4546       first_ = false;
4547     } else {
4548       os_ << "|";
4549     }
4550   }
PrintBit(const char * name,bool value)4551   void PrintBit(const char* name, bool value) {
4552     if (!value) return;
4553     PrintSeparator();
4554     os_ << "{" << name << "}";
4555   }
PrintPositive(const char * name,int value)4556   void PrintPositive(const char* name, int value) {
4557     if (value < 0) return;
4558     PrintSeparator();
4559     os_ << "{" << name << "|" << value << "}";
4560   }
4561 
4562  private:
4563   std::ostream& os_;
4564   bool first_;
4565 };
4566 
4567 
PrintAttributes(RegExpNode * that)4568 void DotPrinter::PrintAttributes(RegExpNode* that) {
4569   os_ << "  a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4570       << "margin=0.1, fontsize=10, label=\"{";
4571   AttributePrinter printer(os_);
4572   NodeInfo* info = that->info();
4573   printer.PrintBit("NI", info->follows_newline_interest);
4574   printer.PrintBit("WI", info->follows_word_interest);
4575   printer.PrintBit("SI", info->follows_start_interest);
4576   Label* label = that->label();
4577   if (label->is_bound())
4578     printer.PrintPositive("@", label->pos());
4579   os_ << "}\"];\n"
4580       << "  a" << that << " -> n" << that
4581       << " [style=dashed, color=grey, arrowhead=none];\n";
4582 }
4583 
4584 
4585 static const bool kPrintDispatchTable = false;
VisitChoice(ChoiceNode * that)4586 void DotPrinter::VisitChoice(ChoiceNode* that) {
4587   if (kPrintDispatchTable) {
4588     os_ << "  n" << that << " [shape=Mrecord, label=\"";
4589     TableEntryHeaderPrinter header_printer(os_);
4590     that->GetTable(ignore_case_)->ForEach(&header_printer);
4591     os_ << "\"]\n";
4592     PrintAttributes(that);
4593     TableEntryBodyPrinter body_printer(os_, that);
4594     that->GetTable(ignore_case_)->ForEach(&body_printer);
4595   } else {
4596     os_ << "  n" << that << " [shape=Mrecord, label=\"?\"];\n";
4597     for (int i = 0; i < that->alternatives()->length(); i++) {
4598       GuardedAlternative alt = that->alternatives()->at(i);
4599       os_ << "  n" << that << " -> n" << alt.node();
4600     }
4601   }
4602   for (int i = 0; i < that->alternatives()->length(); i++) {
4603     GuardedAlternative alt = that->alternatives()->at(i);
4604     alt.node()->Accept(this);
4605   }
4606 }
4607 
4608 
VisitText(TextNode * that)4609 void DotPrinter::VisitText(TextNode* that) {
4610   Zone* zone = that->zone();
4611   os_ << "  n" << that << " [label=\"";
4612   for (int i = 0; i < that->elements()->length(); i++) {
4613     if (i > 0) os_ << " ";
4614     TextElement elm = that->elements()->at(i);
4615     switch (elm.text_type()) {
4616       case TextElement::ATOM: {
4617         Vector<const uc16> data = elm.atom()->data();
4618         for (int i = 0; i < data.length(); i++) {
4619           os_ << static_cast<char>(data[i]);
4620         }
4621         break;
4622       }
4623       case TextElement::CHAR_CLASS: {
4624         RegExpCharacterClass* node = elm.char_class();
4625         os_ << "[";
4626         if (node->is_negated()) os_ << "^";
4627         for (int j = 0; j < node->ranges(zone)->length(); j++) {
4628           CharacterRange range = node->ranges(zone)->at(j);
4629           os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4630         }
4631         os_ << "]";
4632         break;
4633       }
4634       default:
4635         UNREACHABLE();
4636     }
4637   }
4638   os_ << "\", shape=box, peripheries=2];\n";
4639   PrintAttributes(that);
4640   os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
4641   Visit(that->on_success());
4642 }
4643 
4644 
VisitBackReference(BackReferenceNode * that)4645 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4646   os_ << "  n" << that << " [label=\"$" << that->start_register() << "..$"
4647       << that->end_register() << "\", shape=doubleoctagon];\n";
4648   PrintAttributes(that);
4649   os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
4650   Visit(that->on_success());
4651 }
4652 
4653 
VisitEnd(EndNode * that)4654 void DotPrinter::VisitEnd(EndNode* that) {
4655   os_ << "  n" << that << " [style=bold, shape=point];\n";
4656   PrintAttributes(that);
4657 }
4658 
4659 
VisitAssertion(AssertionNode * that)4660 void DotPrinter::VisitAssertion(AssertionNode* that) {
4661   os_ << "  n" << that << " [";
4662   switch (that->assertion_type()) {
4663     case AssertionNode::AT_END:
4664       os_ << "label=\"$\", shape=septagon";
4665       break;
4666     case AssertionNode::AT_START:
4667       os_ << "label=\"^\", shape=septagon";
4668       break;
4669     case AssertionNode::AT_BOUNDARY:
4670       os_ << "label=\"\\b\", shape=septagon";
4671       break;
4672     case AssertionNode::AT_NON_BOUNDARY:
4673       os_ << "label=\"\\B\", shape=septagon";
4674       break;
4675     case AssertionNode::AFTER_NEWLINE:
4676       os_ << "label=\"(?<=\\n)\", shape=septagon";
4677       break;
4678   }
4679   os_ << "];\n";
4680   PrintAttributes(that);
4681   RegExpNode* successor = that->on_success();
4682   os_ << "  n" << that << " -> n" << successor << ";\n";
4683   Visit(successor);
4684 }
4685 
4686 
VisitAction(ActionNode * that)4687 void DotPrinter::VisitAction(ActionNode* that) {
4688   os_ << "  n" << that << " [";
4689   switch (that->action_type_) {
4690     case ActionNode::SET_REGISTER:
4691       os_ << "label=\"$" << that->data_.u_store_register.reg
4692           << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4693       break;
4694     case ActionNode::INCREMENT_REGISTER:
4695       os_ << "label=\"$" << that->data_.u_increment_register.reg
4696           << "++\", shape=octagon";
4697       break;
4698     case ActionNode::STORE_POSITION:
4699       os_ << "label=\"$" << that->data_.u_position_register.reg
4700           << ":=$pos\", shape=octagon";
4701       break;
4702     case ActionNode::BEGIN_SUBMATCH:
4703       os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4704           << ":=$pos,begin\", shape=septagon";
4705       break;
4706     case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4707       os_ << "label=\"escape\", shape=septagon";
4708       break;
4709     case ActionNode::EMPTY_MATCH_CHECK:
4710       os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4711           << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4712           << "<" << that->data_.u_empty_match_check.repetition_limit
4713           << "?\", shape=septagon";
4714       break;
4715     case ActionNode::CLEAR_CAPTURES: {
4716       os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4717           << " to $" << that->data_.u_clear_captures.range_to
4718           << "\", shape=septagon";
4719       break;
4720     }
4721   }
4722   os_ << "];\n";
4723   PrintAttributes(that);
4724   RegExpNode* successor = that->on_success();
4725   os_ << "  n" << that << " -> n" << successor << ";\n";
4726   Visit(successor);
4727 }
4728 
4729 
4730 class DispatchTableDumper {
4731  public:
DispatchTableDumper(std::ostream & os)4732   explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4733   void Call(uc16 key, DispatchTable::Entry entry);
4734  private:
4735   std::ostream& os_;
4736 };
4737 
4738 
Call(uc16 key,DispatchTable::Entry entry)4739 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4740   os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4741   OutSet* set = entry.out_set();
4742   bool first = true;
4743   for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4744     if (set->Get(i)) {
4745       if (first) {
4746         first = false;
4747       } else {
4748         os_ << ", ";
4749       }
4750       os_ << i;
4751     }
4752   }
4753   os_ << "}\n";
4754 }
4755 
4756 
Dump()4757 void DispatchTable::Dump() {
4758   OFStream os(stderr);
4759   DispatchTableDumper dumper(os);
4760   tree()->ForEach(&dumper);
4761 }
4762 
4763 
DotPrint(const char * label,RegExpNode * node,bool ignore_case)4764 void RegExpEngine::DotPrint(const char* label,
4765                             RegExpNode* node,
4766                             bool ignore_case) {
4767   OFStream os(stdout);
4768   DotPrinter printer(os, ignore_case);
4769   printer.PrintNode(label, node);
4770 }
4771 
4772 
4773 #endif  // DEBUG
4774 
4775 
4776 // -------------------------------------------------------------------
4777 // Tree to graph conversion
4778 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4779 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4780                                RegExpNode* on_success) {
4781   ZoneList<TextElement>* elms =
4782       new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4783   elms->Add(TextElement::Atom(this), compiler->zone());
4784   return new (compiler->zone())
4785       TextNode(elms, compiler->read_backward(), on_success);
4786 }
4787 
4788 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4789 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4790                                RegExpNode* on_success) {
4791   return new (compiler->zone())
4792       TextNode(elements(), compiler->read_backward(), on_success);
4793 }
4794 
4795 
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4796 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4797                                  const int* special_class,
4798                                  int length) {
4799   length--;  // Remove final marker.
4800   DCHECK(special_class[length] == kRangeEndMarker);
4801   DCHECK(ranges->length() != 0);
4802   DCHECK(length != 0);
4803   DCHECK(special_class[0] != 0);
4804   if (ranges->length() != (length >> 1) + 1) {
4805     return false;
4806   }
4807   CharacterRange range = ranges->at(0);
4808   if (range.from() != 0) {
4809     return false;
4810   }
4811   for (int i = 0; i < length; i += 2) {
4812     if (special_class[i] != (range.to() + 1)) {
4813       return false;
4814     }
4815     range = ranges->at((i >> 1) + 1);
4816     if (special_class[i+1] != range.from()) {
4817       return false;
4818     }
4819   }
4820   if (range.to() != String::kMaxCodePoint) {
4821     return false;
4822   }
4823   return true;
4824 }
4825 
4826 
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4827 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4828                           const int* special_class,
4829                           int length) {
4830   length--;  // Remove final marker.
4831   DCHECK(special_class[length] == kRangeEndMarker);
4832   if (ranges->length() * 2 != length) {
4833     return false;
4834   }
4835   for (int i = 0; i < length; i += 2) {
4836     CharacterRange range = ranges->at(i >> 1);
4837     if (range.from() != special_class[i] ||
4838         range.to() != special_class[i + 1] - 1) {
4839       return false;
4840     }
4841   }
4842   return true;
4843 }
4844 
4845 
is_standard(Zone * zone)4846 bool RegExpCharacterClass::is_standard(Zone* zone) {
4847   // TODO(lrn): Remove need for this function, by not throwing away information
4848   // along the way.
4849   if (is_negated_) {
4850     return false;
4851   }
4852   if (set_.is_standard()) {
4853     return true;
4854   }
4855   if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4856     set_.set_standard_set_type('s');
4857     return true;
4858   }
4859   if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4860     set_.set_standard_set_type('S');
4861     return true;
4862   }
4863   if (CompareInverseRanges(set_.ranges(zone),
4864                            kLineTerminatorRanges,
4865                            kLineTerminatorRangeCount)) {
4866     set_.set_standard_set_type('.');
4867     return true;
4868   }
4869   if (CompareRanges(set_.ranges(zone),
4870                     kLineTerminatorRanges,
4871                     kLineTerminatorRangeCount)) {
4872     set_.set_standard_set_type('n');
4873     return true;
4874   }
4875   if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4876     set_.set_standard_set_type('w');
4877     return true;
4878   }
4879   if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4880     set_.set_standard_set_type('W');
4881     return true;
4882   }
4883   return false;
4884 }
4885 
4886 
UnicodeRangeSplitter(Zone * zone,ZoneList<CharacterRange> * base)4887 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4888                                            ZoneList<CharacterRange>* base)
4889     : zone_(zone),
4890       table_(zone),
4891       bmp_(nullptr),
4892       lead_surrogates_(nullptr),
4893       trail_surrogates_(nullptr),
4894       non_bmp_(nullptr) {
4895   // The unicode range splitter categorizes given character ranges into:
4896   // - Code points from the BMP representable by one code unit.
4897   // - Code points outside the BMP that need to be split into surrogate pairs.
4898   // - Lone lead surrogates.
4899   // - Lone trail surrogates.
4900   // Lone surrogates are valid code points, even though no actual characters.
4901   // They require special matching to make sure we do not split surrogate pairs.
4902   // We use the dispatch table to accomplish this. The base range is split up
4903   // by the table by the overlay ranges, and the Call callback is used to
4904   // filter and collect ranges for each category.
4905   for (int i = 0; i < base->length(); i++) {
4906     table_.AddRange(base->at(i), kBase, zone_);
4907   }
4908   // Add overlay ranges.
4909   table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4910                   kBmpCodePoints, zone_);
4911   table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4912                   kLeadSurrogates, zone_);
4913   table_.AddRange(
4914       CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4915       kTrailSurrogates, zone_);
4916   table_.AddRange(
4917       CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4918       kBmpCodePoints, zone_);
4919   table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4920                   kNonBmpCodePoints, zone_);
4921   table_.ForEach(this);
4922 }
4923 
4924 
Call(uc32 from,DispatchTable::Entry entry)4925 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4926   OutSet* outset = entry.out_set();
4927   if (!outset->Get(kBase)) return;
4928   ZoneList<CharacterRange>** target = NULL;
4929   if (outset->Get(kBmpCodePoints)) {
4930     target = &bmp_;
4931   } else if (outset->Get(kLeadSurrogates)) {
4932     target = &lead_surrogates_;
4933   } else if (outset->Get(kTrailSurrogates)) {
4934     target = &trail_surrogates_;
4935   } else {
4936     DCHECK(outset->Get(kNonBmpCodePoints));
4937     target = &non_bmp_;
4938   }
4939   if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4940   (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4941 }
4942 
4943 
AddBmpCharacters(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4944 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4945                       RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4946   ZoneList<CharacterRange>* bmp = splitter->bmp();
4947   if (bmp == nullptr) return;
4948   result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4949       compiler->zone(), bmp, compiler->read_backward(), on_success)));
4950 }
4951 
4952 
AddNonBmpSurrogatePairs(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4953 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4954                              RegExpNode* on_success,
4955                              UnicodeRangeSplitter* splitter) {
4956   ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4957   if (non_bmp == nullptr) return;
4958   DCHECK(compiler->unicode());
4959   DCHECK(!compiler->one_byte());
4960   Zone* zone = compiler->zone();
4961   CharacterRange::Canonicalize(non_bmp);
4962   for (int i = 0; i < non_bmp->length(); i++) {
4963     // Match surrogate pair.
4964     // E.g. [\u10005-\u11005] becomes
4965     //      \ud800[\udc05-\udfff]|
4966     //      [\ud801-\ud803][\udc00-\udfff]|
4967     //      \ud804[\udc00-\udc05]
4968     uc32 from = non_bmp->at(i).from();
4969     uc32 to = non_bmp->at(i).to();
4970     uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4971     uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4972     uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4973     uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4974     if (from_l == to_l) {
4975       // The lead surrogate is the same.
4976       result->AddAlternative(
4977           GuardedAlternative(TextNode::CreateForSurrogatePair(
4978               zone, CharacterRange::Singleton(from_l),
4979               CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4980               on_success)));
4981     } else {
4982       if (from_t != kTrailSurrogateStart) {
4983         // Add [from_l][from_t-\udfff]
4984         result->AddAlternative(
4985             GuardedAlternative(TextNode::CreateForSurrogatePair(
4986                 zone, CharacterRange::Singleton(from_l),
4987                 CharacterRange::Range(from_t, kTrailSurrogateEnd),
4988                 compiler->read_backward(), on_success)));
4989         from_l++;
4990       }
4991       if (to_t != kTrailSurrogateEnd) {
4992         // Add [to_l][\udc00-to_t]
4993         result->AddAlternative(
4994             GuardedAlternative(TextNode::CreateForSurrogatePair(
4995                 zone, CharacterRange::Singleton(to_l),
4996                 CharacterRange::Range(kTrailSurrogateStart, to_t),
4997                 compiler->read_backward(), on_success)));
4998         to_l--;
4999       }
5000       if (from_l <= to_l) {
5001         // Add [from_l-to_l][\udc00-\udfff]
5002         result->AddAlternative(
5003             GuardedAlternative(TextNode::CreateForSurrogatePair(
5004                 zone, CharacterRange::Range(from_l, to_l),
5005                 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
5006                 compiler->read_backward(), on_success)));
5007       }
5008     }
5009   }
5010 }
5011 
5012 
NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler * compiler,ZoneList<CharacterRange> * lookbehind,ZoneList<CharacterRange> * match,RegExpNode * on_success,bool read_backward)5013 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
5014     RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
5015     ZoneList<CharacterRange>* match, RegExpNode* on_success,
5016     bool read_backward) {
5017   Zone* zone = compiler->zone();
5018   RegExpNode* match_node = TextNode::CreateForCharacterRanges(
5019       zone, match, read_backward, on_success);
5020   int stack_register = compiler->UnicodeLookaroundStackRegister();
5021   int position_register = compiler->UnicodeLookaroundPositionRegister();
5022   RegExpLookaround::Builder lookaround(false, match_node, stack_register,
5023                                        position_register);
5024   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5025       zone, lookbehind, !read_backward, lookaround.on_match_success());
5026   return lookaround.ForMatch(negative_match);
5027 }
5028 
5029 
MatchAndNegativeLookaroundInReadDirection(RegExpCompiler * compiler,ZoneList<CharacterRange> * match,ZoneList<CharacterRange> * lookahead,RegExpNode * on_success,bool read_backward)5030 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
5031     RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
5032     ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
5033     bool read_backward) {
5034   Zone* zone = compiler->zone();
5035   int stack_register = compiler->UnicodeLookaroundStackRegister();
5036   int position_register = compiler->UnicodeLookaroundPositionRegister();
5037   RegExpLookaround::Builder lookaround(false, on_success, stack_register,
5038                                        position_register);
5039   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5040       zone, lookahead, read_backward, lookaround.on_match_success());
5041   return TextNode::CreateForCharacterRanges(
5042       zone, match, read_backward, lookaround.ForMatch(negative_match));
5043 }
5044 
5045 
AddLoneLeadSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)5046 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5047                            RegExpNode* on_success,
5048                            UnicodeRangeSplitter* splitter) {
5049   ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
5050   if (lead_surrogates == nullptr) return;
5051   Zone* zone = compiler->zone();
5052   // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
5053   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5054       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5055 
5056   RegExpNode* match;
5057   if (compiler->read_backward()) {
5058     // Reading backward. Assert that reading forward, there is no trail
5059     // surrogate, and then backward match the lead surrogate.
5060     match = NegativeLookaroundAgainstReadDirectionAndMatch(
5061         compiler, trail_surrogates, lead_surrogates, on_success, true);
5062   } else {
5063     // Reading forward. Forward match the lead surrogate and assert that
5064     // no trail surrogate follows.
5065     match = MatchAndNegativeLookaroundInReadDirection(
5066         compiler, lead_surrogates, trail_surrogates, on_success, false);
5067   }
5068   result->AddAlternative(GuardedAlternative(match));
5069 }
5070 
5071 
AddLoneTrailSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)5072 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5073                             RegExpNode* on_success,
5074                             UnicodeRangeSplitter* splitter) {
5075   ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5076   if (trail_surrogates == nullptr) return;
5077   Zone* zone = compiler->zone();
5078   // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5079   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5080       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5081 
5082   RegExpNode* match;
5083   if (compiler->read_backward()) {
5084     // Reading backward. Backward match the trail surrogate and assert that no
5085     // lead surrogate precedes it.
5086     match = MatchAndNegativeLookaroundInReadDirection(
5087         compiler, trail_surrogates, lead_surrogates, on_success, true);
5088   } else {
5089     // Reading forward. Assert that reading backward, there is no lead
5090     // surrogate, and then forward match the trail surrogate.
5091     match = NegativeLookaroundAgainstReadDirectionAndMatch(
5092         compiler, lead_surrogates, trail_surrogates, on_success, false);
5093   }
5094   result->AddAlternative(GuardedAlternative(match));
5095 }
5096 
UnanchoredAdvance(RegExpCompiler * compiler,RegExpNode * on_success)5097 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5098                               RegExpNode* on_success) {
5099   // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5100   DCHECK(!compiler->read_backward());
5101   Zone* zone = compiler->zone();
5102   // Advance any character. If the character happens to be a lead surrogate and
5103   // we advanced into the middle of a surrogate pair, it will work out, as
5104   // nothing will match from there. We will have to advance again, consuming
5105   // the associated trail surrogate.
5106   ZoneList<CharacterRange>* range = CharacterRange::List(
5107       zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5108   return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
5109 }
5110 
5111 
AddUnicodeCaseEquivalents(RegExpCompiler * compiler,ZoneList<CharacterRange> * ranges)5112 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
5113                                ZoneList<CharacterRange>* ranges) {
5114 #ifdef V8_I18N_SUPPORT
5115   // Use ICU to compute the case fold closure over the ranges.
5116   DCHECK(compiler->unicode());
5117   DCHECK(compiler->ignore_case());
5118   icu::UnicodeSet set;
5119   for (int i = 0; i < ranges->length(); i++) {
5120     set.add(ranges->at(i).from(), ranges->at(i).to());
5121   }
5122   ranges->Clear();
5123   set.closeOver(USET_CASE_INSENSITIVE);
5124   // Full case mapping map single characters to multiple characters.
5125   // Those are represented as strings in the set. Remove them so that
5126   // we end up with only simple and common case mappings.
5127   set.removeAllStrings();
5128   Zone* zone = compiler->zone();
5129   for (int i = 0; i < set.getRangeCount(); i++) {
5130     ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
5131                 zone);
5132   }
5133   // No errors and everything we collected have been ranges.
5134 #else
5135   // Fallback if ICU is not included.
5136   CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
5137                                      ranges, compiler->one_byte());
5138 #endif  // V8_I18N_SUPPORT
5139   CharacterRange::Canonicalize(ranges);
5140 }
5141 
5142 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5143 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5144                                          RegExpNode* on_success) {
5145   set_.Canonicalize();
5146   Zone* zone = compiler->zone();
5147   ZoneList<CharacterRange>* ranges = this->ranges(zone);
5148   if (compiler->unicode() && compiler->ignore_case()) {
5149     AddUnicodeCaseEquivalents(compiler, ranges);
5150   }
5151   if (compiler->unicode() && !compiler->one_byte()) {
5152     if (is_negated()) {
5153       ZoneList<CharacterRange>* negated =
5154           new (zone) ZoneList<CharacterRange>(2, zone);
5155       CharacterRange::Negate(ranges, negated, zone);
5156       ranges = negated;
5157     }
5158     if (ranges->length() == 0) {
5159       ranges->Add(CharacterRange::Everything(), zone);
5160       RegExpCharacterClass* fail =
5161           new (zone) RegExpCharacterClass(ranges, true);
5162       return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5163     }
5164     if (standard_type() == '*') {
5165       return UnanchoredAdvance(compiler, on_success);
5166     } else {
5167       ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5168       UnicodeRangeSplitter splitter(zone, ranges);
5169       AddBmpCharacters(compiler, result, on_success, &splitter);
5170       AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5171       AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5172       AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5173       return result;
5174     }
5175   } else {
5176     return new (zone) TextNode(this, compiler->read_backward(), on_success);
5177   }
5178 }
5179 
5180 
CompareFirstChar(RegExpTree * const * a,RegExpTree * const * b)5181 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5182   RegExpAtom* atom1 = (*a)->AsAtom();
5183   RegExpAtom* atom2 = (*b)->AsAtom();
5184   uc16 character1 = atom1->data().at(0);
5185   uc16 character2 = atom2->data().at(0);
5186   if (character1 < character2) return -1;
5187   if (character1 > character2) return 1;
5188   return 0;
5189 }
5190 
5191 
Canonical(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar c)5192 static unibrow::uchar Canonical(
5193     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5194     unibrow::uchar c) {
5195   unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5196   int length = canonicalize->get(c, '\0', chars);
5197   DCHECK_LE(length, 1);
5198   unibrow::uchar canonical = c;
5199   if (length == 1) canonical = chars[0];
5200   return canonical;
5201 }
5202 
5203 
CompareFirstCharCaseIndependent(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,RegExpTree * const * a,RegExpTree * const * b)5204 int CompareFirstCharCaseIndependent(
5205     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5206     RegExpTree* const* a, RegExpTree* const* b) {
5207   RegExpAtom* atom1 = (*a)->AsAtom();
5208   RegExpAtom* atom2 = (*b)->AsAtom();
5209   unibrow::uchar character1 = atom1->data().at(0);
5210   unibrow::uchar character2 = atom2->data().at(0);
5211   if (character1 == character2) return 0;
5212   if (character1 >= 'a' || character2 >= 'a') {
5213     character1 = Canonical(canonicalize, character1);
5214     character2 = Canonical(canonicalize, character2);
5215   }
5216   return static_cast<int>(character1) - static_cast<int>(character2);
5217 }
5218 
5219 
5220 // We can stable sort runs of atoms, since the order does not matter if they
5221 // start with different characters.
5222 // Returns true if any consecutive atoms were found.
SortConsecutiveAtoms(RegExpCompiler * compiler)5223 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5224   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5225   int length = alternatives->length();
5226   bool found_consecutive_atoms = false;
5227   for (int i = 0; i < length; i++) {
5228     while (i < length) {
5229       RegExpTree* alternative = alternatives->at(i);
5230       if (alternative->IsAtom()) break;
5231       i++;
5232     }
5233     // i is length or it is the index of an atom.
5234     if (i == length) break;
5235     int first_atom = i;
5236     i++;
5237     while (i < length) {
5238       RegExpTree* alternative = alternatives->at(i);
5239       if (!alternative->IsAtom()) break;
5240       i++;
5241     }
5242     // Sort atoms to get ones with common prefixes together.
5243     // This step is more tricky if we are in a case-independent regexp,
5244     // because it would change /is|I/ to /I|is/, and order matters when
5245     // the regexp parts don't match only disjoint starting points. To fix
5246     // this we have a version of CompareFirstChar that uses case-
5247     // independent character classes for comparison.
5248     DCHECK_LT(first_atom, alternatives->length());
5249     DCHECK_LE(i, alternatives->length());
5250     DCHECK_LE(first_atom, i);
5251     if (compiler->ignore_case()) {
5252       unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5253           compiler->isolate()->regexp_macro_assembler_canonicalize();
5254       auto compare_closure =
5255           [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5256             return CompareFirstCharCaseIndependent(canonicalize, a, b);
5257           };
5258       alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5259     } else {
5260       alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5261     }
5262     if (i - first_atom > 1) found_consecutive_atoms = true;
5263   }
5264   return found_consecutive_atoms;
5265 }
5266 
5267 
5268 // Optimizes ab|ac|az to a(?:b|c|d).
RationalizeConsecutiveAtoms(RegExpCompiler * compiler)5269 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5270   Zone* zone = compiler->zone();
5271   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5272   int length = alternatives->length();
5273 
5274   int write_posn = 0;
5275   int i = 0;
5276   while (i < length) {
5277     RegExpTree* alternative = alternatives->at(i);
5278     if (!alternative->IsAtom()) {
5279       alternatives->at(write_posn++) = alternatives->at(i);
5280       i++;
5281       continue;
5282     }
5283     RegExpAtom* atom = alternative->AsAtom();
5284     unibrow::uchar common_prefix = atom->data().at(0);
5285     int first_with_prefix = i;
5286     int prefix_length = atom->length();
5287     i++;
5288     while (i < length) {
5289       alternative = alternatives->at(i);
5290       if (!alternative->IsAtom()) break;
5291       atom = alternative->AsAtom();
5292       unibrow::uchar new_prefix = atom->data().at(0);
5293       if (new_prefix != common_prefix) {
5294         if (!compiler->ignore_case()) break;
5295         unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5296             compiler->isolate()->regexp_macro_assembler_canonicalize();
5297         new_prefix = Canonical(canonicalize, new_prefix);
5298         common_prefix = Canonical(canonicalize, common_prefix);
5299         if (new_prefix != common_prefix) break;
5300       }
5301       prefix_length = Min(prefix_length, atom->length());
5302       i++;
5303     }
5304     if (i > first_with_prefix + 2) {
5305       // Found worthwhile run of alternatives with common prefix of at least one
5306       // character.  The sorting function above did not sort on more than one
5307       // character for reasons of correctness, but there may still be a longer
5308       // common prefix if the terms were similar or presorted in the input.
5309       // Find out how long the common prefix is.
5310       int run_length = i - first_with_prefix;
5311       atom = alternatives->at(first_with_prefix)->AsAtom();
5312       for (int j = 1; j < run_length && prefix_length > 1; j++) {
5313         RegExpAtom* old_atom =
5314             alternatives->at(j + first_with_prefix)->AsAtom();
5315         for (int k = 1; k < prefix_length; k++) {
5316           if (atom->data().at(k) != old_atom->data().at(k)) {
5317             prefix_length = k;
5318             break;
5319           }
5320         }
5321       }
5322       RegExpAtom* prefix =
5323           new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
5324       ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5325       pair->Add(prefix, zone);
5326       ZoneList<RegExpTree*>* suffixes =
5327           new (zone) ZoneList<RegExpTree*>(run_length, zone);
5328       for (int j = 0; j < run_length; j++) {
5329         RegExpAtom* old_atom =
5330             alternatives->at(j + first_with_prefix)->AsAtom();
5331         int len = old_atom->length();
5332         if (len == prefix_length) {
5333           suffixes->Add(new (zone) RegExpEmpty(), zone);
5334         } else {
5335           RegExpTree* suffix = new (zone) RegExpAtom(
5336               old_atom->data().SubVector(prefix_length, old_atom->length()));
5337           suffixes->Add(suffix, zone);
5338         }
5339       }
5340       pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5341       alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5342     } else {
5343       // Just copy any non-worthwhile alternatives.
5344       for (int j = first_with_prefix; j < i; j++) {
5345         alternatives->at(write_posn++) = alternatives->at(j);
5346       }
5347     }
5348   }
5349   alternatives->Rewind(write_posn);  // Trim end of array.
5350 }
5351 
5352 
5353 // Optimizes b|c|z to [bcz].
FixSingleCharacterDisjunctions(RegExpCompiler * compiler)5354 void RegExpDisjunction::FixSingleCharacterDisjunctions(
5355     RegExpCompiler* compiler) {
5356   Zone* zone = compiler->zone();
5357   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5358   int length = alternatives->length();
5359 
5360   int write_posn = 0;
5361   int i = 0;
5362   while (i < length) {
5363     RegExpTree* alternative = alternatives->at(i);
5364     if (!alternative->IsAtom()) {
5365       alternatives->at(write_posn++) = alternatives->at(i);
5366       i++;
5367       continue;
5368     }
5369     RegExpAtom* atom = alternative->AsAtom();
5370     if (atom->length() != 1) {
5371       alternatives->at(write_posn++) = alternatives->at(i);
5372       i++;
5373       continue;
5374     }
5375     int first_in_run = i;
5376     i++;
5377     while (i < length) {
5378       alternative = alternatives->at(i);
5379       if (!alternative->IsAtom()) break;
5380       atom = alternative->AsAtom();
5381       if (atom->length() != 1) break;
5382       i++;
5383     }
5384     if (i > first_in_run + 1) {
5385       // Found non-trivial run of single-character alternatives.
5386       int run_length = i - first_in_run;
5387       ZoneList<CharacterRange>* ranges =
5388           new (zone) ZoneList<CharacterRange>(2, zone);
5389       for (int j = 0; j < run_length; j++) {
5390         RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5391         DCHECK_EQ(old_atom->length(), 1);
5392         ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5393       }
5394       alternatives->at(write_posn++) =
5395           new (zone) RegExpCharacterClass(ranges, false);
5396     } else {
5397       // Just copy any trivial alternatives.
5398       for (int j = first_in_run; j < i; j++) {
5399         alternatives->at(write_posn++) = alternatives->at(j);
5400       }
5401     }
5402   }
5403   alternatives->Rewind(write_posn);  // Trim end of array.
5404 }
5405 
5406 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5407 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5408                                       RegExpNode* on_success) {
5409   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5410 
5411   if (alternatives->length() > 2) {
5412     bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5413     if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5414     FixSingleCharacterDisjunctions(compiler);
5415     if (alternatives->length() == 1) {
5416       return alternatives->at(0)->ToNode(compiler, on_success);
5417     }
5418   }
5419 
5420   int length = alternatives->length();
5421 
5422   ChoiceNode* result =
5423       new(compiler->zone()) ChoiceNode(length, compiler->zone());
5424   for (int i = 0; i < length; i++) {
5425     GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5426                                                                on_success));
5427     result->AddAlternative(alternative);
5428   }
5429   return result;
5430 }
5431 
5432 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5433 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5434                                      RegExpNode* on_success) {
5435   return ToNode(min(),
5436                 max(),
5437                 is_greedy(),
5438                 body(),
5439                 compiler,
5440                 on_success);
5441 }
5442 
5443 
5444 // Scoped object to keep track of how much we unroll quantifier loops in the
5445 // regexp graph generator.
5446 class RegExpExpansionLimiter {
5447  public:
5448   static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)5449   RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5450       : compiler_(compiler),
5451         saved_expansion_factor_(compiler->current_expansion_factor()),
5452         ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5453     DCHECK(factor > 0);
5454     if (ok_to_expand_) {
5455       if (factor > kMaxExpansionFactor) {
5456         // Avoid integer overflow of the current expansion factor.
5457         ok_to_expand_ = false;
5458         compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5459       } else {
5460         int new_factor = saved_expansion_factor_ * factor;
5461         ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5462         compiler->set_current_expansion_factor(new_factor);
5463       }
5464     }
5465   }
5466 
~RegExpExpansionLimiter()5467   ~RegExpExpansionLimiter() {
5468     compiler_->set_current_expansion_factor(saved_expansion_factor_);
5469   }
5470 
ok_to_expand()5471   bool ok_to_expand() { return ok_to_expand_; }
5472 
5473  private:
5474   RegExpCompiler* compiler_;
5475   int saved_expansion_factor_;
5476   bool ok_to_expand_;
5477 
5478   DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5479 };
5480 
5481 
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)5482 RegExpNode* RegExpQuantifier::ToNode(int min,
5483                                      int max,
5484                                      bool is_greedy,
5485                                      RegExpTree* body,
5486                                      RegExpCompiler* compiler,
5487                                      RegExpNode* on_success,
5488                                      bool not_at_start) {
5489   // x{f, t} becomes this:
5490   //
5491   //             (r++)<-.
5492   //               |     `
5493   //               |     (x)
5494   //               v     ^
5495   //      (r=0)-->(?)---/ [if r < t]
5496   //               |
5497   //   [if r >= f] \----> ...
5498   //
5499 
5500   // 15.10.2.5 RepeatMatcher algorithm.
5501   // The parser has already eliminated the case where max is 0.  In the case
5502   // where max_match is zero the parser has removed the quantifier if min was
5503   // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
5504 
5505   // If we know that we cannot match zero length then things are a little
5506   // simpler since we don't need to make the special zero length match check
5507   // from step 2.1.  If the min and max are small we can unroll a little in
5508   // this case.
5509   static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
5510   static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
5511   if (max == 0) return on_success;  // This can happen due to recursion.
5512   bool body_can_be_empty = (body->min_match() == 0);
5513   int body_start_reg = RegExpCompiler::kNoRegister;
5514   Interval capture_registers = body->CaptureRegisters();
5515   bool needs_capture_clearing = !capture_registers.is_empty();
5516   Zone* zone = compiler->zone();
5517 
5518   if (body_can_be_empty) {
5519     body_start_reg = compiler->AllocateRegister();
5520   } else if (compiler->optimize() && !needs_capture_clearing) {
5521     // Only unroll if there are no captures and the body can't be
5522     // empty.
5523     {
5524       RegExpExpansionLimiter limiter(
5525           compiler, min + ((max != min) ? 1 : 0));
5526       if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5527         int new_max = (max == kInfinity) ? max : max - min;
5528         // Recurse once to get the loop or optional matches after the fixed
5529         // ones.
5530         RegExpNode* answer = ToNode(
5531             0, new_max, is_greedy, body, compiler, on_success, true);
5532         // Unroll the forced matches from 0 to min.  This can cause chains of
5533         // TextNodes (which the parser does not generate).  These should be
5534         // combined if it turns out they hinder good code generation.
5535         for (int i = 0; i < min; i++) {
5536           answer = body->ToNode(compiler, answer);
5537         }
5538         return answer;
5539       }
5540     }
5541     if (max <= kMaxUnrolledMaxMatches && min == 0) {
5542       DCHECK(max > 0);  // Due to the 'if' above.
5543       RegExpExpansionLimiter limiter(compiler, max);
5544       if (limiter.ok_to_expand()) {
5545         // Unroll the optional matches up to max.
5546         RegExpNode* answer = on_success;
5547         for (int i = 0; i < max; i++) {
5548           ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5549           if (is_greedy) {
5550             alternation->AddAlternative(
5551                 GuardedAlternative(body->ToNode(compiler, answer)));
5552             alternation->AddAlternative(GuardedAlternative(on_success));
5553           } else {
5554             alternation->AddAlternative(GuardedAlternative(on_success));
5555             alternation->AddAlternative(
5556                 GuardedAlternative(body->ToNode(compiler, answer)));
5557           }
5558           answer = alternation;
5559           if (not_at_start && !compiler->read_backward()) {
5560             alternation->set_not_at_start();
5561           }
5562         }
5563         return answer;
5564       }
5565     }
5566   }
5567   bool has_min = min > 0;
5568   bool has_max = max < RegExpTree::kInfinity;
5569   bool needs_counter = has_min || has_max;
5570   int reg_ctr = needs_counter
5571       ? compiler->AllocateRegister()
5572       : RegExpCompiler::kNoRegister;
5573   LoopChoiceNode* center = new (zone)
5574       LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5575   if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5576   RegExpNode* loop_return = needs_counter
5577       ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5578       : static_cast<RegExpNode*>(center);
5579   if (body_can_be_empty) {
5580     // If the body can be empty we need to check if it was and then
5581     // backtrack.
5582     loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5583                                               reg_ctr,
5584                                               min,
5585                                               loop_return);
5586   }
5587   RegExpNode* body_node = body->ToNode(compiler, loop_return);
5588   if (body_can_be_empty) {
5589     // If the body can be empty we need to store the start position
5590     // so we can bail out if it was empty.
5591     body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5592   }
5593   if (needs_capture_clearing) {
5594     // Before entering the body of this loop we need to clear captures.
5595     body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5596   }
5597   GuardedAlternative body_alt(body_node);
5598   if (has_max) {
5599     Guard* body_guard =
5600         new(zone) Guard(reg_ctr, Guard::LT, max);
5601     body_alt.AddGuard(body_guard, zone);
5602   }
5603   GuardedAlternative rest_alt(on_success);
5604   if (has_min) {
5605     Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5606     rest_alt.AddGuard(rest_guard, zone);
5607   }
5608   if (is_greedy) {
5609     center->AddLoopAlternative(body_alt);
5610     center->AddContinueAlternative(rest_alt);
5611   } else {
5612     center->AddContinueAlternative(rest_alt);
5613     center->AddLoopAlternative(body_alt);
5614   }
5615   if (needs_counter) {
5616     return ActionNode::SetRegister(reg_ctr, 0, center);
5617   } else {
5618     return center;
5619   }
5620 }
5621 
5622 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5623 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5624                                     RegExpNode* on_success) {
5625   NodeInfo info;
5626   Zone* zone = compiler->zone();
5627 
5628   switch (assertion_type()) {
5629     case START_OF_LINE:
5630       return AssertionNode::AfterNewline(on_success);
5631     case START_OF_INPUT:
5632       return AssertionNode::AtStart(on_success);
5633     case BOUNDARY:
5634       return AssertionNode::AtBoundary(on_success);
5635     case NON_BOUNDARY:
5636       return AssertionNode::AtNonBoundary(on_success);
5637     case END_OF_INPUT:
5638       return AssertionNode::AtEnd(on_success);
5639     case END_OF_LINE: {
5640       // Compile $ in multiline regexps as an alternation with a positive
5641       // lookahead in one side and an end-of-input on the other side.
5642       // We need two registers for the lookahead.
5643       int stack_pointer_register = compiler->AllocateRegister();
5644       int position_register = compiler->AllocateRegister();
5645       // The ChoiceNode to distinguish between a newline and end-of-input.
5646       ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5647       // Create a newline atom.
5648       ZoneList<CharacterRange>* newline_ranges =
5649           new(zone) ZoneList<CharacterRange>(3, zone);
5650       CharacterRange::AddClassEscape('n', newline_ranges, zone);
5651       RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
5652       TextNode* newline_matcher = new (zone) TextNode(
5653           newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5654                                    stack_pointer_register, position_register,
5655                                    0,   // No captures inside.
5656                                    -1,  // Ignored if no captures.
5657                                    on_success));
5658       // Create an end-of-input matcher.
5659       RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5660           stack_pointer_register,
5661           position_register,
5662           newline_matcher);
5663       // Add the two alternatives to the ChoiceNode.
5664       GuardedAlternative eol_alternative(end_of_line);
5665       result->AddAlternative(eol_alternative);
5666       GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5667       result->AddAlternative(end_alternative);
5668       return result;
5669     }
5670     default:
5671       UNREACHABLE();
5672   }
5673   return on_success;
5674 }
5675 
5676 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5677 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5678                                         RegExpNode* on_success) {
5679   return new (compiler->zone())
5680       BackReferenceNode(RegExpCapture::StartRegister(index()),
5681                         RegExpCapture::EndRegister(index()),
5682                         compiler->read_backward(), on_success);
5683 }
5684 
5685 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5686 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5687                                 RegExpNode* on_success) {
5688   return on_success;
5689 }
5690 
5691 
Builder(bool is_positive,RegExpNode * on_success,int stack_pointer_register,int position_register,int capture_register_count,int capture_register_start)5692 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5693                                    int stack_pointer_register,
5694                                    int position_register,
5695                                    int capture_register_count,
5696                                    int capture_register_start)
5697     : is_positive_(is_positive),
5698       on_success_(on_success),
5699       stack_pointer_register_(stack_pointer_register),
5700       position_register_(position_register) {
5701   if (is_positive_) {
5702     on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5703         stack_pointer_register, position_register, capture_register_count,
5704         capture_register_start, on_success_);
5705   } else {
5706     Zone* zone = on_success_->zone();
5707     on_match_success_ = new (zone) NegativeSubmatchSuccess(
5708         stack_pointer_register, position_register, capture_register_count,
5709         capture_register_start, zone);
5710   }
5711 }
5712 
5713 
ForMatch(RegExpNode * match)5714 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5715   if (is_positive_) {
5716     return ActionNode::BeginSubmatch(stack_pointer_register_,
5717                                      position_register_, match);
5718   } else {
5719     Zone* zone = on_success_->zone();
5720     // We use a ChoiceNode to represent the negative lookaround. The first
5721     // alternative is the negative match. On success, the end node backtracks.
5722     // On failure, the second alternative is tried and leads to success.
5723     // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5724     // first exit when calculating quick checks.
5725     ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5726         GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5727     return ActionNode::BeginSubmatch(stack_pointer_register_,
5728                                      position_register_, choice_node);
5729   }
5730 }
5731 
5732 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5733 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5734                                      RegExpNode* on_success) {
5735   int stack_pointer_register = compiler->AllocateRegister();
5736   int position_register = compiler->AllocateRegister();
5737 
5738   const int registers_per_capture = 2;
5739   const int register_of_first_capture = 2;
5740   int register_count = capture_count_ * registers_per_capture;
5741   int register_start =
5742     register_of_first_capture + capture_from_ * registers_per_capture;
5743 
5744   RegExpNode* result;
5745   bool was_reading_backward = compiler->read_backward();
5746   compiler->set_read_backward(type() == LOOKBEHIND);
5747   Builder builder(is_positive(), on_success, stack_pointer_register,
5748                   position_register, register_count, register_start);
5749   RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5750   result = builder.ForMatch(match);
5751   compiler->set_read_backward(was_reading_backward);
5752   return result;
5753 }
5754 
5755 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5756 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5757                                   RegExpNode* on_success) {
5758   return ToNode(body(), index(), compiler, on_success);
5759 }
5760 
5761 
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)5762 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5763                                   int index,
5764                                   RegExpCompiler* compiler,
5765                                   RegExpNode* on_success) {
5766   DCHECK_NOT_NULL(body);
5767   int start_reg = RegExpCapture::StartRegister(index);
5768   int end_reg = RegExpCapture::EndRegister(index);
5769   if (compiler->read_backward()) std::swap(start_reg, end_reg);
5770   RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5771   RegExpNode* body_node = body->ToNode(compiler, store_end);
5772   return ActionNode::StorePosition(start_reg, true, body_node);
5773 }
5774 
5775 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5776 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5777                                       RegExpNode* on_success) {
5778   ZoneList<RegExpTree*>* children = nodes();
5779   RegExpNode* current = on_success;
5780   if (compiler->read_backward()) {
5781     for (int i = 0; i < children->length(); i++) {
5782       current = children->at(i)->ToNode(compiler, current);
5783     }
5784   } else {
5785     for (int i = children->length() - 1; i >= 0; i--) {
5786       current = children->at(i)->ToNode(compiler, current);
5787     }
5788   }
5789   return current;
5790 }
5791 
5792 
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5793 static void AddClass(const int* elmv,
5794                      int elmc,
5795                      ZoneList<CharacterRange>* ranges,
5796                      Zone* zone) {
5797   elmc--;
5798   DCHECK(elmv[elmc] == kRangeEndMarker);
5799   for (int i = 0; i < elmc; i += 2) {
5800     DCHECK(elmv[i] < elmv[i + 1]);
5801     ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5802   }
5803 }
5804 
5805 
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5806 static void AddClassNegated(const int *elmv,
5807                             int elmc,
5808                             ZoneList<CharacterRange>* ranges,
5809                             Zone* zone) {
5810   elmc--;
5811   DCHECK(elmv[elmc] == kRangeEndMarker);
5812   DCHECK(elmv[0] != 0x0000);
5813   DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
5814   uc16 last = 0x0000;
5815   for (int i = 0; i < elmc; i += 2) {
5816     DCHECK(last <= elmv[i] - 1);
5817     DCHECK(elmv[i] < elmv[i + 1]);
5818     ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5819     last = elmv[i + 1];
5820   }
5821   ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5822 }
5823 
5824 
AddClassEscape(uc16 type,ZoneList<CharacterRange> * ranges,Zone * zone)5825 void CharacterRange::AddClassEscape(uc16 type,
5826                                     ZoneList<CharacterRange>* ranges,
5827                                     Zone* zone) {
5828   switch (type) {
5829     case 's':
5830       AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5831       break;
5832     case 'S':
5833       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5834       break;
5835     case 'w':
5836       AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5837       break;
5838     case 'W':
5839       AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5840       break;
5841     case 'd':
5842       AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5843       break;
5844     case 'D':
5845       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5846       break;
5847     case '.':
5848       AddClassNegated(kLineTerminatorRanges,
5849                       kLineTerminatorRangeCount,
5850                       ranges,
5851                       zone);
5852       break;
5853     // This is not a character range as defined by the spec but a
5854     // convenient shorthand for a character class that matches any
5855     // character.
5856     case '*':
5857       ranges->Add(CharacterRange::Everything(), zone);
5858       break;
5859     // This is the set of characters matched by the $ and ^ symbols
5860     // in multiline mode.
5861     case 'n':
5862       AddClass(kLineTerminatorRanges,
5863                kLineTerminatorRangeCount,
5864                ranges,
5865                zone);
5866       break;
5867     default:
5868       UNREACHABLE();
5869   }
5870 }
5871 
5872 
GetWordBounds()5873 Vector<const int> CharacterRange::GetWordBounds() {
5874   return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5875 }
5876 
5877 
AddCaseEquivalents(Isolate * isolate,Zone * zone,ZoneList<CharacterRange> * ranges,bool is_one_byte)5878 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5879                                         ZoneList<CharacterRange>* ranges,
5880                                         bool is_one_byte) {
5881   CharacterRange::Canonicalize(ranges);
5882   int range_count = ranges->length();
5883   for (int i = 0; i < range_count; i++) {
5884     CharacterRange range = ranges->at(i);
5885     uc32 bottom = range.from();
5886     if (bottom > String::kMaxUtf16CodeUnit) return;
5887     uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5888     // Nothing to be done for surrogates.
5889     if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
5890     if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5891       if (bottom > String::kMaxOneByteCharCode) return;
5892       if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5893     }
5894     unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5895     if (top == bottom) {
5896       // If this is a singleton we just expand the one character.
5897       int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5898       for (int i = 0; i < length; i++) {
5899         uc32 chr = chars[i];
5900         if (chr != bottom) {
5901           ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5902         }
5903       }
5904     } else {
5905       // If this is a range we expand the characters block by block, expanding
5906       // contiguous subranges (blocks) one at a time.  The approach is as
5907       // follows.  For a given start character we look up the remainder of the
5908       // block that contains it (represented by the end point), for instance we
5909       // find 'z' if the character is 'c'.  A block is characterized by the
5910       // property that all characters uncanonicalize in the same way, except
5911       // that each entry in the result is incremented by the distance from the
5912       // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',
5913       // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once
5914       // we've found the end point we look up its uncanonicalization and
5915       // produce a range for each element.  For instance for [c-f] we look up
5916       // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if
5917       // it is not already contained in the input, so [c-f] will be skipped but
5918       // [C-F] will be added.  If this range is not completely contained in a
5919       // block we do this for all the blocks covered by the range (handling
5920       // characters that is not in a block as a "singleton block").
5921       unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5922       int pos = bottom;
5923       while (pos <= top) {
5924         int length =
5925             isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5926         uc32 block_end;
5927         if (length == 0) {
5928           block_end = pos;
5929         } else {
5930           DCHECK_EQ(1, length);
5931           block_end = equivalents[0];
5932         }
5933         int end = (block_end > top) ? top : block_end;
5934         length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5935                                                          equivalents);
5936         for (int i = 0; i < length; i++) {
5937           uc32 c = equivalents[i];
5938           uc32 range_from = c - (block_end - pos);
5939           uc32 range_to = c - (block_end - end);
5940           if (!(bottom <= range_from && range_to <= top)) {
5941             ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5942           }
5943         }
5944         pos = end + 1;
5945       }
5946     }
5947   }
5948 }
5949 
5950 
IsCanonical(ZoneList<CharacterRange> * ranges)5951 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5952   DCHECK_NOT_NULL(ranges);
5953   int n = ranges->length();
5954   if (n <= 1) return true;
5955   int max = ranges->at(0).to();
5956   for (int i = 1; i < n; i++) {
5957     CharacterRange next_range = ranges->at(i);
5958     if (next_range.from() <= max + 1) return false;
5959     max = next_range.to();
5960   }
5961   return true;
5962 }
5963 
5964 
ranges(Zone * zone)5965 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5966   if (ranges_ == NULL) {
5967     ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5968     CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5969   }
5970   return ranges_;
5971 }
5972 
5973 
5974 // Move a number of elements in a zonelist to another position
5975 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)5976 static void MoveRanges(ZoneList<CharacterRange>* list,
5977                        int from,
5978                        int to,
5979                        int count) {
5980   // Ranges are potentially overlapping.
5981   if (from < to) {
5982     for (int i = count - 1; i >= 0; i--) {
5983       list->at(to + i) = list->at(from + i);
5984     }
5985   } else {
5986     for (int i = 0; i < count; i++) {
5987       list->at(to + i) = list->at(from + i);
5988     }
5989   }
5990 }
5991 
5992 
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)5993 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5994                                       int count,
5995                                       CharacterRange insert) {
5996   // Inserts a range into list[0..count[, which must be sorted
5997   // by from value and non-overlapping and non-adjacent, using at most
5998   // list[0..count] for the result. Returns the number of resulting
5999   // canonicalized ranges. Inserting a range may collapse existing ranges into
6000   // fewer ranges, so the return value can be anything in the range 1..count+1.
6001   uc32 from = insert.from();
6002   uc32 to = insert.to();
6003   int start_pos = 0;
6004   int end_pos = count;
6005   for (int i = count - 1; i >= 0; i--) {
6006     CharacterRange current = list->at(i);
6007     if (current.from() > to + 1) {
6008       end_pos = i;
6009     } else if (current.to() + 1 < from) {
6010       start_pos = i + 1;
6011       break;
6012     }
6013   }
6014 
6015   // Inserted range overlaps, or is adjacent to, ranges at positions
6016   // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6017   // not affected by the insertion.
6018   // If start_pos == end_pos, the range must be inserted before start_pos.
6019   // if start_pos < end_pos, the entire range from start_pos to end_pos
6020   // must be merged with the insert range.
6021 
6022   if (start_pos == end_pos) {
6023     // Insert between existing ranges at position start_pos.
6024     if (start_pos < count) {
6025       MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6026     }
6027     list->at(start_pos) = insert;
6028     return count + 1;
6029   }
6030   if (start_pos + 1 == end_pos) {
6031     // Replace single existing range at position start_pos.
6032     CharacterRange to_replace = list->at(start_pos);
6033     int new_from = Min(to_replace.from(), from);
6034     int new_to = Max(to_replace.to(), to);
6035     list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6036     return count;
6037   }
6038   // Replace a number of existing ranges from start_pos to end_pos - 1.
6039   // Move the remaining ranges down.
6040 
6041   int new_from = Min(list->at(start_pos).from(), from);
6042   int new_to = Max(list->at(end_pos - 1).to(), to);
6043   if (end_pos < count) {
6044     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6045   }
6046   list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6047   return count - (end_pos - start_pos) + 1;
6048 }
6049 
6050 
Canonicalize()6051 void CharacterSet::Canonicalize() {
6052   // Special/default classes are always considered canonical. The result
6053   // of calling ranges() will be sorted.
6054   if (ranges_ == NULL) return;
6055   CharacterRange::Canonicalize(ranges_);
6056 }
6057 
6058 
Canonicalize(ZoneList<CharacterRange> * character_ranges)6059 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6060   if (character_ranges->length() <= 1) return;
6061   // Check whether ranges are already canonical (increasing, non-overlapping,
6062   // non-adjacent).
6063   int n = character_ranges->length();
6064   int max = character_ranges->at(0).to();
6065   int i = 1;
6066   while (i < n) {
6067     CharacterRange current = character_ranges->at(i);
6068     if (current.from() <= max + 1) {
6069       break;
6070     }
6071     max = current.to();
6072     i++;
6073   }
6074   // Canonical until the i'th range. If that's all of them, we are done.
6075   if (i == n) return;
6076 
6077   // The ranges at index i and forward are not canonicalized. Make them so by
6078   // doing the equivalent of insertion sort (inserting each into the previous
6079   // list, in order).
6080   // Notice that inserting a range can reduce the number of ranges in the
6081   // result due to combining of adjacent and overlapping ranges.
6082   int read = i;  // Range to insert.
6083   int num_canonical = i;  // Length of canonicalized part of list.
6084   do {
6085     num_canonical = InsertRangeInCanonicalList(character_ranges,
6086                                                num_canonical,
6087                                                character_ranges->at(read));
6088     read++;
6089   } while (read < n);
6090   character_ranges->Rewind(num_canonical);
6091 
6092   DCHECK(CharacterRange::IsCanonical(character_ranges));
6093 }
6094 
6095 
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)6096 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6097                             ZoneList<CharacterRange>* negated_ranges,
6098                             Zone* zone) {
6099   DCHECK(CharacterRange::IsCanonical(ranges));
6100   DCHECK_EQ(0, negated_ranges->length());
6101   int range_count = ranges->length();
6102   uc32 from = 0;
6103   int i = 0;
6104   if (range_count > 0 && ranges->at(0).from() == 0) {
6105     from = ranges->at(0).to() + 1;
6106     i = 1;
6107   }
6108   while (i < range_count) {
6109     CharacterRange range = ranges->at(i);
6110     negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6111     from = range.to() + 1;
6112     i++;
6113   }
6114   if (from < String::kMaxCodePoint) {
6115     negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6116                         zone);
6117   }
6118 }
6119 
6120 
6121 // -------------------------------------------------------------------
6122 // Splay tree
6123 
6124 
Extend(unsigned value,Zone * zone)6125 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6126   if (Get(value))
6127     return this;
6128   if (successors(zone) != NULL) {
6129     for (int i = 0; i < successors(zone)->length(); i++) {
6130       OutSet* successor = successors(zone)->at(i);
6131       if (successor->Get(value))
6132         return successor;
6133     }
6134   } else {
6135     successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6136   }
6137   OutSet* result = new(zone) OutSet(first_, remaining_);
6138   result->Set(value, zone);
6139   successors(zone)->Add(result, zone);
6140   return result;
6141 }
6142 
6143 
Set(unsigned value,Zone * zone)6144 void OutSet::Set(unsigned value, Zone *zone) {
6145   if (value < kFirstLimit) {
6146     first_ |= (1 << value);
6147   } else {
6148     if (remaining_ == NULL)
6149       remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6150     if (remaining_->is_empty() || !remaining_->Contains(value))
6151       remaining_->Add(value, zone);
6152   }
6153 }
6154 
6155 
Get(unsigned value) const6156 bool OutSet::Get(unsigned value) const {
6157   if (value < kFirstLimit) {
6158     return (first_ & (1 << value)) != 0;
6159   } else if (remaining_ == NULL) {
6160     return false;
6161   } else {
6162     return remaining_->Contains(value);
6163   }
6164 }
6165 
6166 
6167 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6168 
6169 
AddRange(CharacterRange full_range,int value,Zone * zone)6170 void DispatchTable::AddRange(CharacterRange full_range, int value,
6171                              Zone* zone) {
6172   CharacterRange current = full_range;
6173   if (tree()->is_empty()) {
6174     // If this is the first range we just insert into the table.
6175     ZoneSplayTree<Config>::Locator loc;
6176     bool inserted = tree()->Insert(current.from(), &loc);
6177     DCHECK(inserted);
6178     USE(inserted);
6179     loc.set_value(Entry(current.from(), current.to(),
6180                         empty()->Extend(value, zone)));
6181     return;
6182   }
6183   // First see if there is a range to the left of this one that
6184   // overlaps.
6185   ZoneSplayTree<Config>::Locator loc;
6186   if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6187     Entry* entry = &loc.value();
6188     // If we've found a range that overlaps with this one, and it
6189     // starts strictly to the left of this one, we have to fix it
6190     // because the following code only handles ranges that start on
6191     // or after the start point of the range we're adding.
6192     if (entry->from() < current.from() && entry->to() >= current.from()) {
6193       // Snap the overlapping range in half around the start point of
6194       // the range we're adding.
6195       CharacterRange left =
6196           CharacterRange::Range(entry->from(), current.from() - 1);
6197       CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6198       // The left part of the overlapping range doesn't overlap.
6199       // Truncate the whole entry to be just the left part.
6200       entry->set_to(left.to());
6201       // The right part is the one that overlaps.  We add this part
6202       // to the map and let the next step deal with merging it with
6203       // the range we're adding.
6204       ZoneSplayTree<Config>::Locator loc;
6205       bool inserted = tree()->Insert(right.from(), &loc);
6206       DCHECK(inserted);
6207       USE(inserted);
6208       loc.set_value(Entry(right.from(),
6209                           right.to(),
6210                           entry->out_set()));
6211     }
6212   }
6213   while (current.is_valid()) {
6214     if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6215         (loc.value().from() <= current.to()) &&
6216         (loc.value().to() >= current.from())) {
6217       Entry* entry = &loc.value();
6218       // We have overlap.  If there is space between the start point of
6219       // the range we're adding and where the overlapping range starts
6220       // then we have to add a range covering just that space.
6221       if (current.from() < entry->from()) {
6222         ZoneSplayTree<Config>::Locator ins;
6223         bool inserted = tree()->Insert(current.from(), &ins);
6224         DCHECK(inserted);
6225         USE(inserted);
6226         ins.set_value(Entry(current.from(),
6227                             entry->from() - 1,
6228                             empty()->Extend(value, zone)));
6229         current.set_from(entry->from());
6230       }
6231       DCHECK_EQ(current.from(), entry->from());
6232       // If the overlapping range extends beyond the one we want to add
6233       // we have to snap the right part off and add it separately.
6234       if (entry->to() > current.to()) {
6235         ZoneSplayTree<Config>::Locator ins;
6236         bool inserted = tree()->Insert(current.to() + 1, &ins);
6237         DCHECK(inserted);
6238         USE(inserted);
6239         ins.set_value(Entry(current.to() + 1,
6240                             entry->to(),
6241                             entry->out_set()));
6242         entry->set_to(current.to());
6243       }
6244       DCHECK(entry->to() <= current.to());
6245       // The overlapping range is now completely contained by the range
6246       // we're adding so we can just update it and move the start point
6247       // of the range we're adding just past it.
6248       entry->AddValue(value, zone);
6249       DCHECK(entry->to() + 1 > current.from());
6250       current.set_from(entry->to() + 1);
6251     } else {
6252       // There is no overlap so we can just add the range
6253       ZoneSplayTree<Config>::Locator ins;
6254       bool inserted = tree()->Insert(current.from(), &ins);
6255       DCHECK(inserted);
6256       USE(inserted);
6257       ins.set_value(Entry(current.from(),
6258                           current.to(),
6259                           empty()->Extend(value, zone)));
6260       break;
6261     }
6262   }
6263 }
6264 
6265 
Get(uc32 value)6266 OutSet* DispatchTable::Get(uc32 value) {
6267   ZoneSplayTree<Config>::Locator loc;
6268   if (!tree()->FindGreatestLessThan(value, &loc))
6269     return empty();
6270   Entry* entry = &loc.value();
6271   if (value <= entry->to())
6272     return entry->out_set();
6273   else
6274     return empty();
6275 }
6276 
6277 
6278 // -------------------------------------------------------------------
6279 // Analysis
6280 
6281 
EnsureAnalyzed(RegExpNode * that)6282 void Analysis::EnsureAnalyzed(RegExpNode* that) {
6283   StackLimitCheck check(isolate());
6284   if (check.HasOverflowed()) {
6285     fail("Stack overflow");
6286     return;
6287   }
6288   if (that->info()->been_analyzed || that->info()->being_analyzed)
6289     return;
6290   that->info()->being_analyzed = true;
6291   that->Accept(this);
6292   that->info()->being_analyzed = false;
6293   that->info()->been_analyzed = true;
6294 }
6295 
6296 
VisitEnd(EndNode * that)6297 void Analysis::VisitEnd(EndNode* that) {
6298   // nothing to do
6299 }
6300 
6301 
CalculateOffsets()6302 void TextNode::CalculateOffsets() {
6303   int element_count = elements()->length();
6304   // Set up the offsets of the elements relative to the start.  This is a fixed
6305   // quantity since a TextNode can only contain fixed-width things.
6306   int cp_offset = 0;
6307   for (int i = 0; i < element_count; i++) {
6308     TextElement& elm = elements()->at(i);
6309     elm.set_cp_offset(cp_offset);
6310     cp_offset += elm.length();
6311   }
6312 }
6313 
6314 
VisitText(TextNode * that)6315 void Analysis::VisitText(TextNode* that) {
6316   if (ignore_case()) {
6317     that->MakeCaseIndependent(isolate(), is_one_byte_);
6318   }
6319   EnsureAnalyzed(that->on_success());
6320   if (!has_failed()) {
6321     that->CalculateOffsets();
6322   }
6323 }
6324 
6325 
VisitAction(ActionNode * that)6326 void Analysis::VisitAction(ActionNode* that) {
6327   RegExpNode* target = that->on_success();
6328   EnsureAnalyzed(target);
6329   if (!has_failed()) {
6330     // If the next node is interested in what it follows then this node
6331     // has to be interested too so it can pass the information on.
6332     that->info()->AddFromFollowing(target->info());
6333   }
6334 }
6335 
6336 
VisitChoice(ChoiceNode * that)6337 void Analysis::VisitChoice(ChoiceNode* that) {
6338   NodeInfo* info = that->info();
6339   for (int i = 0; i < that->alternatives()->length(); i++) {
6340     RegExpNode* node = that->alternatives()->at(i).node();
6341     EnsureAnalyzed(node);
6342     if (has_failed()) return;
6343     // Anything the following nodes need to know has to be known by
6344     // this node also, so it can pass it on.
6345     info->AddFromFollowing(node->info());
6346   }
6347 }
6348 
6349 
VisitLoopChoice(LoopChoiceNode * that)6350 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6351   NodeInfo* info = that->info();
6352   for (int i = 0; i < that->alternatives()->length(); i++) {
6353     RegExpNode* node = that->alternatives()->at(i).node();
6354     if (node != that->loop_node()) {
6355       EnsureAnalyzed(node);
6356       if (has_failed()) return;
6357       info->AddFromFollowing(node->info());
6358     }
6359   }
6360   // Check the loop last since it may need the value of this node
6361   // to get a correct result.
6362   EnsureAnalyzed(that->loop_node());
6363   if (!has_failed()) {
6364     info->AddFromFollowing(that->loop_node()->info());
6365   }
6366 }
6367 
6368 
VisitBackReference(BackReferenceNode * that)6369 void Analysis::VisitBackReference(BackReferenceNode* that) {
6370   EnsureAnalyzed(that->on_success());
6371 }
6372 
6373 
VisitAssertion(AssertionNode * that)6374 void Analysis::VisitAssertion(AssertionNode* that) {
6375   EnsureAnalyzed(that->on_success());
6376 }
6377 
6378 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6379 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6380                                      BoyerMooreLookahead* bm,
6381                                      bool not_at_start) {
6382   // Working out the set of characters that a backreference can match is too
6383   // hard, so we just say that any character can match.
6384   bm->SetRest(offset);
6385   SaveBMInfo(bm, not_at_start, offset);
6386 }
6387 
6388 
6389 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6390               RegExpMacroAssembler::kTableSize);
6391 
6392 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6393 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6394                               BoyerMooreLookahead* bm, bool not_at_start) {
6395   ZoneList<GuardedAlternative>* alts = alternatives();
6396   budget = (budget - 1) / alts->length();
6397   for (int i = 0; i < alts->length(); i++) {
6398     GuardedAlternative& alt = alts->at(i);
6399     if (alt.guards() != NULL && alt.guards()->length() != 0) {
6400       bm->SetRest(offset);  // Give up trying to fill in info.
6401       SaveBMInfo(bm, not_at_start, offset);
6402       return;
6403     }
6404     alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6405   }
6406   SaveBMInfo(bm, not_at_start, offset);
6407 }
6408 
6409 
FillInBMInfo(Isolate * isolate,int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6410 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6411                             BoyerMooreLookahead* bm, bool not_at_start) {
6412   if (initial_offset >= bm->length()) return;
6413   int offset = initial_offset;
6414   int max_char = bm->max_char();
6415   for (int i = 0; i < elements()->length(); i++) {
6416     if (offset >= bm->length()) {
6417       if (initial_offset == 0) set_bm_info(not_at_start, bm);
6418       return;
6419     }
6420     TextElement text = elements()->at(i);
6421     if (text.text_type() == TextElement::ATOM) {
6422       RegExpAtom* atom = text.atom();
6423       for (int j = 0; j < atom->length(); j++, offset++) {
6424         if (offset >= bm->length()) {
6425           if (initial_offset == 0) set_bm_info(not_at_start, bm);
6426           return;
6427         }
6428         uc16 character = atom->data()[j];
6429         if (bm->compiler()->ignore_case()) {
6430           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6431           int length = GetCaseIndependentLetters(
6432               isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6433               chars);
6434           for (int j = 0; j < length; j++) {
6435             bm->Set(offset, chars[j]);
6436           }
6437         } else {
6438           if (character <= max_char) bm->Set(offset, character);
6439         }
6440       }
6441     } else {
6442       DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6443       RegExpCharacterClass* char_class = text.char_class();
6444       ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6445       if (char_class->is_negated()) {
6446         bm->SetAll(offset);
6447       } else {
6448         for (int k = 0; k < ranges->length(); k++) {
6449           CharacterRange& range = ranges->at(k);
6450           if (range.from() > max_char) continue;
6451           int to = Min(max_char, static_cast<int>(range.to()));
6452           bm->SetInterval(offset, Interval(range.from(), to));
6453         }
6454       }
6455       offset++;
6456     }
6457   }
6458   if (offset >= bm->length()) {
6459     if (initial_offset == 0) set_bm_info(not_at_start, bm);
6460     return;
6461   }
6462   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6463                              true);  // Not at start after a text node.
6464   if (initial_offset == 0) set_bm_info(not_at_start, bm);
6465 }
6466 
6467 
6468 // -------------------------------------------------------------------
6469 // Dispatch table construction
6470 
6471 
VisitEnd(EndNode * that)6472 void DispatchTableConstructor::VisitEnd(EndNode* that) {
6473   AddRange(CharacterRange::Everything());
6474 }
6475 
6476 
BuildTable(ChoiceNode * node)6477 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6478   node->set_being_calculated(true);
6479   ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6480   for (int i = 0; i < alternatives->length(); i++) {
6481     set_choice_index(i);
6482     alternatives->at(i).node()->Accept(this);
6483   }
6484   node->set_being_calculated(false);
6485 }
6486 
6487 
6488 class AddDispatchRange {
6489  public:
AddDispatchRange(DispatchTableConstructor * constructor)6490   explicit AddDispatchRange(DispatchTableConstructor* constructor)
6491     : constructor_(constructor) { }
6492   void Call(uc32 from, DispatchTable::Entry entry);
6493  private:
6494   DispatchTableConstructor* constructor_;
6495 };
6496 
6497 
Call(uc32 from,DispatchTable::Entry entry)6498 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6499   constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6500 }
6501 
6502 
VisitChoice(ChoiceNode * node)6503 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6504   if (node->being_calculated())
6505     return;
6506   DispatchTable* table = node->GetTable(ignore_case_);
6507   AddDispatchRange adder(this);
6508   table->ForEach(&adder);
6509 }
6510 
6511 
VisitBackReference(BackReferenceNode * that)6512 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6513   // TODO(160): Find the node that we refer back to and propagate its start
6514   // set back to here.  For now we just accept anything.
6515   AddRange(CharacterRange::Everything());
6516 }
6517 
6518 
VisitAssertion(AssertionNode * that)6519 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6520   RegExpNode* target = that->on_success();
6521   target->Accept(this);
6522 }
6523 
6524 
CompareRangeByFrom(const CharacterRange * a,const CharacterRange * b)6525 static int CompareRangeByFrom(const CharacterRange* a,
6526                               const CharacterRange* b) {
6527   return Compare<uc16>(a->from(), b->from());
6528 }
6529 
6530 
AddInverse(ZoneList<CharacterRange> * ranges)6531 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6532   ranges->Sort(CompareRangeByFrom);
6533   uc16 last = 0;
6534   for (int i = 0; i < ranges->length(); i++) {
6535     CharacterRange range = ranges->at(i);
6536     if (last < range.from())
6537       AddRange(CharacterRange::Range(last, range.from() - 1));
6538     if (range.to() >= last) {
6539       if (range.to() == String::kMaxCodePoint) {
6540         return;
6541       } else {
6542         last = range.to() + 1;
6543       }
6544     }
6545   }
6546   AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6547 }
6548 
6549 
VisitText(TextNode * that)6550 void DispatchTableConstructor::VisitText(TextNode* that) {
6551   TextElement elm = that->elements()->at(0);
6552   switch (elm.text_type()) {
6553     case TextElement::ATOM: {
6554       uc16 c = elm.atom()->data()[0];
6555       AddRange(CharacterRange::Range(c, c));
6556       break;
6557     }
6558     case TextElement::CHAR_CLASS: {
6559       RegExpCharacterClass* tree = elm.char_class();
6560       ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6561       if (tree->is_negated()) {
6562         AddInverse(ranges);
6563       } else {
6564         for (int i = 0; i < ranges->length(); i++)
6565           AddRange(ranges->at(i));
6566       }
6567       break;
6568     }
6569     default: {
6570       UNIMPLEMENTED();
6571     }
6572   }
6573 }
6574 
6575 
VisitAction(ActionNode * that)6576 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6577   RegExpNode* target = that->on_success();
6578   target->Accept(this);
6579 }
6580 
6581 
OptionallyStepBackToLeadSurrogate(RegExpCompiler * compiler,RegExpNode * on_success)6582 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6583                                               RegExpNode* on_success) {
6584   // If the regexp matching starts within a surrogate pair, step back
6585   // to the lead surrogate and start matching from there.
6586   DCHECK(!compiler->read_backward());
6587   Zone* zone = compiler->zone();
6588   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6589       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6590   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6591       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6592 
6593   ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6594 
6595   int stack_register = compiler->UnicodeLookaroundStackRegister();
6596   int position_register = compiler->UnicodeLookaroundPositionRegister();
6597   RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6598       zone, lead_surrogates, true, on_success);
6599   RegExpLookaround::Builder builder(true, step_back, stack_register,
6600                                     position_register);
6601   RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6602       zone, trail_surrogates, false, builder.on_match_success());
6603 
6604   optional_step_back->AddAlternative(
6605       GuardedAlternative(builder.ForMatch(match_trail)));
6606   optional_step_back->AddAlternative(GuardedAlternative(on_success));
6607 
6608   return optional_step_back;
6609 }
6610 
6611 
Compile(Isolate * isolate,Zone * zone,RegExpCompileData * data,JSRegExp::Flags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte)6612 RegExpEngine::CompilationResult RegExpEngine::Compile(
6613     Isolate* isolate, Zone* zone, RegExpCompileData* data,
6614     JSRegExp::Flags flags, Handle<String> pattern,
6615     Handle<String> sample_subject, bool is_one_byte) {
6616   if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6617     return IrregexpRegExpTooBig(isolate);
6618   }
6619   bool ignore_case = flags & JSRegExp::kIgnoreCase;
6620   bool is_sticky = flags & JSRegExp::kSticky;
6621   bool is_global = flags & JSRegExp::kGlobal;
6622   bool is_unicode = flags & JSRegExp::kUnicode;
6623   RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
6624                           is_one_byte);
6625 
6626   if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6627 
6628   // Sample some characters from the middle of the string.
6629   static const int kSampleSize = 128;
6630 
6631   sample_subject = String::Flatten(sample_subject);
6632   int chars_sampled = 0;
6633   int half_way = (sample_subject->length() - kSampleSize) / 2;
6634   for (int i = Max(0, half_way);
6635        i < sample_subject->length() && chars_sampled < kSampleSize;
6636        i++, chars_sampled++) {
6637     compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6638   }
6639 
6640   // Wrap the body of the regexp in capture #0.
6641   RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6642                                                     0,
6643                                                     &compiler,
6644                                                     compiler.accept());
6645   RegExpNode* node = captured_body;
6646   bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6647   bool is_start_anchored = data->tree->IsAnchoredAtStart();
6648   int max_length = data->tree->max_match();
6649   if (!is_start_anchored && !is_sticky) {
6650     // Add a .*? at the beginning, outside the body capture, unless
6651     // this expression is anchored at the beginning or sticky.
6652     RegExpNode* loop_node = RegExpQuantifier::ToNode(
6653         0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
6654         &compiler, captured_body, data->contains_anchor);
6655 
6656     if (data->contains_anchor) {
6657       // Unroll loop once, to take care of the case that might start
6658       // at the start of input.
6659       ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6660       first_step_node->AddAlternative(GuardedAlternative(captured_body));
6661       first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6662           new (zone) RegExpCharacterClass('*'), false, loop_node)));
6663       node = first_step_node;
6664     } else {
6665       node = loop_node;
6666     }
6667   }
6668   if (is_one_byte) {
6669     node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6670     // Do it again to propagate the new nodes to places where they were not
6671     // put because they had not been calculated yet.
6672     if (node != NULL) {
6673       node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6674     }
6675   } else if (compiler.unicode() && (is_global || is_sticky)) {
6676     node = OptionallyStepBackToLeadSurrogate(&compiler, node);
6677   }
6678 
6679   if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6680   data->node = node;
6681   Analysis analysis(isolate, flags, is_one_byte);
6682   analysis.EnsureAnalyzed(node);
6683   if (analysis.has_failed()) {
6684     const char* error_message = analysis.error_message();
6685     return CompilationResult(isolate, error_message);
6686   }
6687 
6688   // Create the correct assembler for the architecture.
6689 #ifndef V8_INTERPRETED_REGEXP
6690   // Native regexp implementation.
6691 
6692   NativeRegExpMacroAssembler::Mode mode =
6693       is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6694                   : NativeRegExpMacroAssembler::UC16;
6695 
6696 #if V8_TARGET_ARCH_IA32
6697   RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6698                                            (data->capture_count + 1) * 2);
6699 #elif V8_TARGET_ARCH_X64
6700   RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6701                                           (data->capture_count + 1) * 2);
6702 #elif V8_TARGET_ARCH_ARM
6703   RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6704                                           (data->capture_count + 1) * 2);
6705 #elif V8_TARGET_ARCH_ARM64
6706   RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6707                                             (data->capture_count + 1) * 2);
6708 #elif V8_TARGET_ARCH_S390
6709   RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6710                                            (data->capture_count + 1) * 2);
6711 #elif V8_TARGET_ARCH_PPC
6712   RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6713                                           (data->capture_count + 1) * 2);
6714 #elif V8_TARGET_ARCH_MIPS
6715   RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6716                                            (data->capture_count + 1) * 2);
6717 #elif V8_TARGET_ARCH_MIPS64
6718   RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6719                                            (data->capture_count + 1) * 2);
6720 #elif V8_TARGET_ARCH_X87
6721   RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
6722                                           (data->capture_count + 1) * 2);
6723 #else
6724 #error "Unsupported architecture"
6725 #endif
6726 
6727 #else  // V8_INTERPRETED_REGEXP
6728   // Interpreted regexp implementation.
6729   EmbeddedVector<byte, 1024> codes;
6730   RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6731 #endif  // V8_INTERPRETED_REGEXP
6732 
6733   macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
6734 
6735   // Inserted here, instead of in Assembler, because it depends on information
6736   // in the AST that isn't replicated in the Node structure.
6737   static const int kMaxBacksearchLimit = 1024;
6738   if (is_end_anchored && !is_start_anchored && !is_sticky &&
6739       max_length < kMaxBacksearchLimit) {
6740     macro_assembler.SetCurrentPositionFromEnd(max_length);
6741   }
6742 
6743   if (is_global) {
6744     RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6745     if (data->tree->min_match() > 0) {
6746       mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6747     } else if (is_unicode) {
6748       mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6749     }
6750     macro_assembler.set_global_mode(mode);
6751   }
6752 
6753   return compiler.Assemble(&macro_assembler,
6754                            node,
6755                            data->capture_count,
6756                            pattern);
6757 }
6758 
6759 
TooMuchRegExpCode(Handle<String> pattern)6760 bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
6761   Heap* heap = pattern->GetHeap();
6762   bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6763   if (heap->isolate()->total_regexp_code_generated() >
6764           RegExpImpl::kRegExpCompiledLimit &&
6765       heap->CommittedMemoryExecutable() >
6766           RegExpImpl::kRegExpExecutableMemoryLimit) {
6767     too_much = true;
6768   }
6769   return too_much;
6770 }
6771 
6772 
Lookup(Heap * heap,String * key_string,Object * key_pattern,FixedArray ** last_match_cache,ResultsCacheType type)6773 Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6774                                    Object* key_pattern,
6775                                    FixedArray** last_match_cache,
6776                                    ResultsCacheType type) {
6777   FixedArray* cache;
6778   if (!key_string->IsInternalizedString()) return Smi::kZero;
6779   if (type == STRING_SPLIT_SUBSTRINGS) {
6780     DCHECK(key_pattern->IsString());
6781     if (!key_pattern->IsInternalizedString()) return Smi::kZero;
6782     cache = heap->string_split_cache();
6783   } else {
6784     DCHECK(type == REGEXP_MULTIPLE_INDICES);
6785     DCHECK(key_pattern->IsFixedArray());
6786     cache = heap->regexp_multiple_cache();
6787   }
6788 
6789   uint32_t hash = key_string->Hash();
6790   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6791                     ~(kArrayEntriesPerCacheEntry - 1));
6792   if (cache->get(index + kStringOffset) != key_string ||
6793       cache->get(index + kPatternOffset) != key_pattern) {
6794     index =
6795         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6796     if (cache->get(index + kStringOffset) != key_string ||
6797         cache->get(index + kPatternOffset) != key_pattern) {
6798       return Smi::kZero;
6799     }
6800   }
6801 
6802   *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6803   return cache->get(index + kArrayOffset);
6804 }
6805 
6806 
Enter(Isolate * isolate,Handle<String> key_string,Handle<Object> key_pattern,Handle<FixedArray> value_array,Handle<FixedArray> last_match_cache,ResultsCacheType type)6807 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6808                                Handle<Object> key_pattern,
6809                                Handle<FixedArray> value_array,
6810                                Handle<FixedArray> last_match_cache,
6811                                ResultsCacheType type) {
6812   Factory* factory = isolate->factory();
6813   Handle<FixedArray> cache;
6814   if (!key_string->IsInternalizedString()) return;
6815   if (type == STRING_SPLIT_SUBSTRINGS) {
6816     DCHECK(key_pattern->IsString());
6817     if (!key_pattern->IsInternalizedString()) return;
6818     cache = factory->string_split_cache();
6819   } else {
6820     DCHECK(type == REGEXP_MULTIPLE_INDICES);
6821     DCHECK(key_pattern->IsFixedArray());
6822     cache = factory->regexp_multiple_cache();
6823   }
6824 
6825   uint32_t hash = key_string->Hash();
6826   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6827                     ~(kArrayEntriesPerCacheEntry - 1));
6828   if (cache->get(index + kStringOffset) == Smi::kZero) {
6829     cache->set(index + kStringOffset, *key_string);
6830     cache->set(index + kPatternOffset, *key_pattern);
6831     cache->set(index + kArrayOffset, *value_array);
6832     cache->set(index + kLastMatchOffset, *last_match_cache);
6833   } else {
6834     uint32_t index2 =
6835         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6836     if (cache->get(index2 + kStringOffset) == Smi::kZero) {
6837       cache->set(index2 + kStringOffset, *key_string);
6838       cache->set(index2 + kPatternOffset, *key_pattern);
6839       cache->set(index2 + kArrayOffset, *value_array);
6840       cache->set(index2 + kLastMatchOffset, *last_match_cache);
6841     } else {
6842       cache->set(index2 + kStringOffset, Smi::kZero);
6843       cache->set(index2 + kPatternOffset, Smi::kZero);
6844       cache->set(index2 + kArrayOffset, Smi::kZero);
6845       cache->set(index2 + kLastMatchOffset, Smi::kZero);
6846       cache->set(index + kStringOffset, *key_string);
6847       cache->set(index + kPatternOffset, *key_pattern);
6848       cache->set(index + kArrayOffset, *value_array);
6849       cache->set(index + kLastMatchOffset, *last_match_cache);
6850     }
6851   }
6852   // If the array is a reasonably short list of substrings, convert it into a
6853   // list of internalized strings.
6854   if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6855     for (int i = 0; i < value_array->length(); i++) {
6856       Handle<String> str(String::cast(value_array->get(i)), isolate);
6857       Handle<String> internalized_str = factory->InternalizeString(str);
6858       value_array->set(i, *internalized_str);
6859     }
6860   }
6861   // Convert backing store to a copy-on-write array.
6862   value_array->set_map_no_write_barrier(isolate->heap()->fixed_cow_array_map());
6863 }
6864 
6865 
Clear(FixedArray * cache)6866 void RegExpResultsCache::Clear(FixedArray* cache) {
6867   for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6868     cache->set(i, Smi::kZero);
6869   }
6870 }
6871 
6872 }  // namespace internal
6873 }  // namespace v8
6874