• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006-2009 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 //     * Redistributions of source code must retain the above copyright
7 //       notice, this list of conditions and the following disclaimer.
8 //     * Redistributions in binary form must reproduce the above
9 //       copyright notice, this list of conditions and the following
10 //       disclaimer in the documentation and/or other materials provided
11 //       with the distribution.
12 //     * Neither the name of Google Inc. nor the names of its
13 //       contributors may be used to endorse or promote products derived
14 //       from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #include "v8.h"
29 
30 #include "ast.h"
31 #include "compiler.h"
32 #include "execution.h"
33 #include "factory.h"
34 #include "jsregexp.h"
35 #include "platform.h"
36 #include "runtime.h"
37 #include "top.h"
38 #include "compilation-cache.h"
39 #include "string-stream.h"
40 #include "parser.h"
41 #include "regexp-macro-assembler.h"
42 #include "regexp-macro-assembler-tracer.h"
43 #include "regexp-macro-assembler-irregexp.h"
44 #include "regexp-stack.h"
45 
46 #ifdef V8_NATIVE_REGEXP
47 #if V8_TARGET_ARCH_IA32
48 #include "ia32/regexp-macro-assembler-ia32.h"
49 #elif V8_TARGET_ARCH_X64
50 #include "x64/regexp-macro-assembler-x64.h"
51 #elif V8_TARGET_ARCH_ARM
52 #include "arm/regexp-macro-assembler-arm.h"
53 #else
54 #error Unsupported target architecture.
55 #endif
56 #endif
57 
58 #include "interpreter-irregexp.h"
59 
60 
61 namespace v8 {
62 namespace internal {
63 
64 
CreateRegExpLiteral(Handle<JSFunction> constructor,Handle<String> pattern,Handle<String> flags,bool * has_pending_exception)65 Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
66                                                Handle<String> pattern,
67                                                Handle<String> flags,
68                                                bool* has_pending_exception) {
69   // Call the construct code with 2 arguments.
70   Object** argv[2] = { Handle<Object>::cast(pattern).location(),
71                        Handle<Object>::cast(flags).location() };
72   return Execution::New(constructor, 2, argv, has_pending_exception);
73 }
74 
75 
RegExpFlagsFromString(Handle<String> str)76 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
77   int flags = JSRegExp::NONE;
78   for (int i = 0; i < str->length(); i++) {
79     switch (str->Get(i)) {
80       case 'i':
81         flags |= JSRegExp::IGNORE_CASE;
82         break;
83       case 'g':
84         flags |= JSRegExp::GLOBAL;
85         break;
86       case 'm':
87         flags |= JSRegExp::MULTILINE;
88         break;
89     }
90   }
91   return JSRegExp::Flags(flags);
92 }
93 
94 
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> error_text,const char * message)95 static inline void ThrowRegExpException(Handle<JSRegExp> re,
96                                         Handle<String> pattern,
97                                         Handle<String> error_text,
98                                         const char* message) {
99   Handle<JSArray> array = Factory::NewJSArray(2);
100   SetElement(array, 0, pattern);
101   SetElement(array, 1, error_text);
102   Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
103   Top::Throw(*regexp_err);
104 }
105 
106 
107 // Generic RegExp methods. Dispatches to implementation specific methods.
108 
109 
Compile(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> flag_str)110 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
111                                    Handle<String> pattern,
112                                    Handle<String> flag_str) {
113   JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
114   Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
115   bool in_cache = !cached.is_null();
116   LOG(RegExpCompileEvent(re, in_cache));
117 
118   Handle<Object> result;
119   if (in_cache) {
120     re->set_data(*cached);
121     return re;
122   }
123   FlattenString(pattern);
124   CompilationZoneScope zone_scope(DELETE_ON_EXIT);
125   RegExpCompileData parse_result;
126   FlatStringReader reader(pattern);
127   if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
128     // Throw an exception if we fail to parse the pattern.
129     ThrowRegExpException(re,
130                          pattern,
131                          parse_result.error,
132                          "malformed_regexp");
133     return Handle<Object>::null();
134   }
135 
136   if (parse_result.simple && !flags.is_ignore_case()) {
137     // Parse-tree is a single atom that is equal to the pattern.
138     AtomCompile(re, pattern, flags, pattern);
139   } else if (parse_result.tree->IsAtom() &&
140       !flags.is_ignore_case() &&
141       parse_result.capture_count == 0) {
142     RegExpAtom* atom = parse_result.tree->AsAtom();
143     Vector<const uc16> atom_pattern = atom->data();
144     Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
145     AtomCompile(re, pattern, flags, atom_string);
146   } else {
147     IrregexpPrepare(re, pattern, flags, parse_result.capture_count);
148   }
149   ASSERT(re->data()->IsFixedArray());
150   // Compilation succeeded so the data is set on the regexp
151   // and we can store it in the cache.
152   Handle<FixedArray> data(FixedArray::cast(re->data()));
153   CompilationCache::PutRegExp(pattern, flags, data);
154 
155   return re;
156 }
157 
158 
Exec(Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<JSArray> last_match_info)159 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
160                                 Handle<String> subject,
161                                 int index,
162                                 Handle<JSArray> last_match_info) {
163   switch (regexp->TypeTag()) {
164     case JSRegExp::ATOM:
165       return AtomExec(regexp, subject, index, last_match_info);
166     case JSRegExp::IRREGEXP: {
167       Handle<Object> result =
168           IrregexpExec(regexp, subject, index, last_match_info);
169       ASSERT(!result.is_null() || Top::has_pending_exception());
170       return result;
171     }
172     default:
173       UNREACHABLE();
174       return Handle<Object>::null();
175   }
176 }
177 
178 
179 // RegExp Atom implementation: Simple string search using indexOf.
180 
181 
AtomCompile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)182 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
183                              Handle<String> pattern,
184                              JSRegExp::Flags flags,
185                              Handle<String> match_pattern) {
186   Factory::SetRegExpAtomData(re,
187                              JSRegExp::ATOM,
188                              pattern,
189                              flags,
190                              match_pattern);
191 }
192 
193 
SetAtomLastCapture(FixedArray * array,String * subject,int from,int to)194 static void SetAtomLastCapture(FixedArray* array,
195                                String* subject,
196                                int from,
197                                int to) {
198   NoHandleAllocation no_handles;
199   RegExpImpl::SetLastCaptureCount(array, 2);
200   RegExpImpl::SetLastSubject(array, subject);
201   RegExpImpl::SetLastInput(array, subject);
202   RegExpImpl::SetCapture(array, 0, from);
203   RegExpImpl::SetCapture(array, 1, to);
204 }
205 
206 
AtomExec(Handle<JSRegExp> re,Handle<String> subject,int index,Handle<JSArray> last_match_info)207 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
208                                     Handle<String> subject,
209                                     int index,
210                                     Handle<JSArray> last_match_info) {
211   Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
212 
213   uint32_t start_index = index;
214 
215   int value = Runtime::StringMatch(subject, needle, start_index);
216   if (value == -1) return Factory::null_value();
217   ASSERT(last_match_info->HasFastElements());
218 
219   {
220     NoHandleAllocation no_handles;
221     FixedArray* array = FixedArray::cast(last_match_info->elements());
222     SetAtomLastCapture(array, *subject, value, value + needle->length());
223   }
224   return last_match_info;
225 }
226 
227 
228 // Irregexp implementation.
229 
230 // Ensures that the regexp object contains a compiled version of the
231 // source for either ASCII or non-ASCII strings.
232 // If the compiled version doesn't already exist, it is compiled
233 // from the source pattern.
234 // If compilation fails, an exception is thrown and this function
235 // returns false.
EnsureCompiledIrregexp(Handle<JSRegExp> re,bool is_ascii)236 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
237   Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
238 #ifdef V8_NATIVE_REGEXP
239   if (compiled_code->IsCode()) return true;
240 #else  // ! V8_NATIVE_REGEXP (RegExp interpreter code)
241   if (compiled_code->IsByteArray()) return true;
242 #endif
243   return CompileIrregexp(re, is_ascii);
244 }
245 
246 
CompileIrregexp(Handle<JSRegExp> re,bool is_ascii)247 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
248   // Compile the RegExp.
249   CompilationZoneScope zone_scope(DELETE_ON_EXIT);
250   Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
251   if (entry->IsJSObject()) {
252     // If it's a JSObject, a previous compilation failed and threw this object.
253     // Re-throw the object without trying again.
254     Top::Throw(entry);
255     return false;
256   }
257   ASSERT(entry->IsTheHole());
258 
259   JSRegExp::Flags flags = re->GetFlags();
260 
261   Handle<String> pattern(re->Pattern());
262   if (!pattern->IsFlat()) {
263     FlattenString(pattern);
264   }
265 
266   RegExpCompileData compile_data;
267   FlatStringReader reader(pattern);
268   if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
269     // Throw an exception if we fail to parse the pattern.
270     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
271     ThrowRegExpException(re,
272                          pattern,
273                          compile_data.error,
274                          "malformed_regexp");
275     return false;
276   }
277   RegExpEngine::CompilationResult result =
278       RegExpEngine::Compile(&compile_data,
279                             flags.is_ignore_case(),
280                             flags.is_multiline(),
281                             pattern,
282                             is_ascii);
283   if (result.error_message != NULL) {
284     // Unable to compile regexp.
285     Handle<JSArray> array = Factory::NewJSArray(2);
286     SetElement(array, 0, pattern);
287     SetElement(array,
288                1,
289                Factory::NewStringFromUtf8(CStrVector(result.error_message)));
290     Handle<Object> regexp_err =
291         Factory::NewSyntaxError("malformed_regexp", array);
292     Top::Throw(*regexp_err);
293     re->SetDataAt(JSRegExp::code_index(is_ascii), *regexp_err);
294     return false;
295   }
296 
297   Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
298   data->set(JSRegExp::code_index(is_ascii), result.code);
299   int register_max = IrregexpMaxRegisterCount(*data);
300   if (result.num_registers > register_max) {
301     SetIrregexpMaxRegisterCount(*data, result.num_registers);
302   }
303 
304   return true;
305 }
306 
307 
IrregexpMaxRegisterCount(FixedArray * re)308 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
309   return Smi::cast(
310       re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
311 }
312 
313 
SetIrregexpMaxRegisterCount(FixedArray * re,int value)314 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
315   re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
316 }
317 
318 
IrregexpNumberOfCaptures(FixedArray * re)319 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
320   return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
321 }
322 
323 
IrregexpNumberOfRegisters(FixedArray * re)324 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
325   return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
326 }
327 
328 
IrregexpByteCode(FixedArray * re,bool is_ascii)329 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
330   return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
331 }
332 
333 
IrregexpNativeCode(FixedArray * re,bool is_ascii)334 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
335   return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
336 }
337 
338 
IrregexpPrepare(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count)339 void RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
340                                  Handle<String> pattern,
341                                  JSRegExp::Flags flags,
342                                  int capture_count) {
343   // Initialize compiled code entries to null.
344   Factory::SetRegExpIrregexpData(re,
345                                  JSRegExp::IRREGEXP,
346                                  pattern,
347                                  flags,
348                                  capture_count);
349 }
350 
351 
IrregexpExec(Handle<JSRegExp> jsregexp,Handle<String> subject,int previous_index,Handle<JSArray> last_match_info)352 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
353                                         Handle<String> subject,
354                                         int previous_index,
355                                         Handle<JSArray> last_match_info) {
356   ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
357 
358   // Prepare space for the return values.
359   int number_of_capture_registers =
360       (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
361 
362 #ifndef V8_NATIVE_REGEXP
363 #ifdef DEBUG
364   if (FLAG_trace_regexp_bytecodes) {
365     String* pattern = jsregexp->Pattern();
366     PrintF("\n\nRegexp match:   /%s/\n\n", *(pattern->ToCString()));
367     PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
368   }
369 #endif
370 #endif
371 
372   if (!subject->IsFlat()) {
373     FlattenString(subject);
374   }
375 
376   last_match_info->EnsureSize(number_of_capture_registers + kLastMatchOverhead);
377 
378   Handle<FixedArray> array;
379 
380   // Dispatch to the correct RegExp implementation.
381   Handle<FixedArray> regexp(FixedArray::cast(jsregexp->data()));
382 
383 #ifdef V8_NATIVE_REGEXP
384 
385   OffsetsVector captures(number_of_capture_registers);
386   int* captures_vector = captures.vector();
387   NativeRegExpMacroAssembler::Result res;
388   do {
389     bool is_ascii = subject->IsAsciiRepresentation();
390     if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
391       return Handle<Object>::null();
392     }
393     Handle<Code> code(RegExpImpl::IrregexpNativeCode(*regexp, is_ascii));
394     res = NativeRegExpMacroAssembler::Match(code,
395                                             subject,
396                                             captures_vector,
397                                             captures.length(),
398                                             previous_index);
399     // If result is RETRY, the string have changed representation, and we
400     // must restart from scratch.
401   } while (res == NativeRegExpMacroAssembler::RETRY);
402   if (res == NativeRegExpMacroAssembler::EXCEPTION) {
403     ASSERT(Top::has_pending_exception());
404     return Handle<Object>::null();
405   }
406   ASSERT(res == NativeRegExpMacroAssembler::SUCCESS
407       || res == NativeRegExpMacroAssembler::FAILURE);
408 
409   if (res != NativeRegExpMacroAssembler::SUCCESS) return Factory::null_value();
410 
411   array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
412   ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
413   // The captures come in (start, end+1) pairs.
414   for (int i = 0; i < number_of_capture_registers; i += 2) {
415     // Capture values are relative to start_offset only.
416     // Convert them to be relative to start of string.
417     if (captures_vector[i] >= 0) {
418       captures_vector[i] += previous_index;
419     }
420     if (captures_vector[i + 1] >= 0) {
421       captures_vector[i + 1] += previous_index;
422     }
423     SetCapture(*array, i, captures_vector[i]);
424     SetCapture(*array, i + 1, captures_vector[i + 1]);
425   }
426 
427 #else  // ! V8_NATIVE_REGEXP
428 
429   bool is_ascii = subject->IsAsciiRepresentation();
430   if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
431     return Handle<Object>::null();
432   }
433   // Now that we have done EnsureCompiledIrregexp we can get the number of
434   // registers.
435   int number_of_registers =
436       IrregexpNumberOfRegisters(FixedArray::cast(jsregexp->data()));
437   OffsetsVector registers(number_of_registers);
438   int* register_vector = registers.vector();
439   for (int i = number_of_capture_registers - 1; i >= 0; i--) {
440     register_vector[i] = -1;
441   }
442   Handle<ByteArray> byte_codes(IrregexpByteCode(*regexp, is_ascii));
443 
444   if (!IrregexpInterpreter::Match(byte_codes,
445                                   subject,
446                                   register_vector,
447                                   previous_index)) {
448     return Factory::null_value();
449   }
450 
451   array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
452   ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
453   // The captures come in (start, end+1) pairs.
454   for (int i = 0; i < number_of_capture_registers; i += 2) {
455     SetCapture(*array, i, register_vector[i]);
456     SetCapture(*array, i + 1, register_vector[i + 1]);
457   }
458 
459 #endif  // V8_NATIVE_REGEXP
460 
461   SetLastCaptureCount(*array, number_of_capture_registers);
462   SetLastSubject(*array, *subject);
463   SetLastInput(*array, *subject);
464 
465   return last_match_info;
466 }
467 
468 
469 // -------------------------------------------------------------------
470 // Implementation of the Irregexp regular expression engine.
471 //
472 // The Irregexp regular expression engine is intended to be a complete
473 // implementation of ECMAScript regular expressions.  It generates either
474 // bytecodes or native code.
475 
476 //   The Irregexp regexp engine is structured in three steps.
477 //   1) The parser generates an abstract syntax tree.  See ast.cc.
478 //   2) From the AST a node network is created.  The nodes are all
479 //      subclasses of RegExpNode.  The nodes represent states when
480 //      executing a regular expression.  Several optimizations are
481 //      performed on the node network.
482 //   3) From the nodes we generate either byte codes or native code
483 //      that can actually execute the regular expression (perform
484 //      the search).  The code generation step is described in more
485 //      detail below.
486 
487 // Code generation.
488 //
489 //   The nodes are divided into four main categories.
490 //   * Choice nodes
491 //        These represent places where the regular expression can
492 //        match in more than one way.  For example on entry to an
493 //        alternation (foo|bar) or a repetition (*, +, ? or {}).
494 //   * Action nodes
495 //        These represent places where some action should be
496 //        performed.  Examples include recording the current position
497 //        in the input string to a register (in order to implement
498 //        captures) or other actions on register for example in order
499 //        to implement the counters needed for {} repetitions.
500 //   * Matching nodes
501 //        These attempt to match some element part of the input string.
502 //        Examples of elements include character classes, plain strings
503 //        or back references.
504 //   * End nodes
505 //        These are used to implement the actions required on finding
506 //        a successful match or failing to find a match.
507 //
508 //   The code generated (whether as byte codes or native code) maintains
509 //   some state as it runs.  This consists of the following elements:
510 //
511 //   * The capture registers.  Used for string captures.
512 //   * Other registers.  Used for counters etc.
513 //   * The current position.
514 //   * The stack of backtracking information.  Used when a matching node
515 //     fails to find a match and needs to try an alternative.
516 //
517 // Conceptual regular expression execution model:
518 //
519 //   There is a simple conceptual model of regular expression execution
520 //   which will be presented first.  The actual code generated is a more
521 //   efficient simulation of the simple conceptual model:
522 //
523 //   * Choice nodes are implemented as follows:
524 //     For each choice except the last {
525 //       push current position
526 //       push backtrack code location
527 //       <generate code to test for choice>
528 //       backtrack code location:
529 //       pop current position
530 //     }
531 //     <generate code to test for last choice>
532 //
533 //   * Actions nodes are generated as follows
534 //     <push affected registers on backtrack stack>
535 //     <generate code to perform action>
536 //     push backtrack code location
537 //     <generate code to test for following nodes>
538 //     backtrack code location:
539 //     <pop affected registers to restore their state>
540 //     <pop backtrack location from stack and go to it>
541 //
542 //   * Matching nodes are generated as follows:
543 //     if input string matches at current position
544 //       update current position
545 //       <generate code to test for following nodes>
546 //     else
547 //       <pop backtrack location from stack and go to it>
548 //
549 //   Thus it can be seen that the current position is saved and restored
550 //   by the choice nodes, whereas the registers are saved and restored by
551 //   by the action nodes that manipulate them.
552 //
553 //   The other interesting aspect of this model is that nodes are generated
554 //   at the point where they are needed by a recursive call to Emit().  If
555 //   the node has already been code generated then the Emit() call will
556 //   generate a jump to the previously generated code instead.  In order to
557 //   limit recursion it is possible for the Emit() function to put the node
558 //   on a work list for later generation and instead generate a jump.  The
559 //   destination of the jump is resolved later when the code is generated.
560 //
561 // Actual regular expression code generation.
562 //
563 //   Code generation is actually more complicated than the above.  In order
564 //   to improve the efficiency of the generated code some optimizations are
565 //   performed
566 //
567 //   * Choice nodes have 1-character lookahead.
568 //     A choice node looks at the following character and eliminates some of
569 //     the choices immediately based on that character.  This is not yet
570 //     implemented.
571 //   * Simple greedy loops store reduced backtracking information.
572 //     A quantifier like /.*foo/m will greedily match the whole input.  It will
573 //     then need to backtrack to a point where it can match "foo".  The naive
574 //     implementation of this would push each character position onto the
575 //     backtracking stack, then pop them off one by one.  This would use space
576 //     proportional to the length of the input string.  However since the "."
577 //     can only match in one way and always has a constant length (in this case
578 //     of 1) it suffices to store the current position on the top of the stack
579 //     once.  Matching now becomes merely incrementing the current position and
580 //     backtracking becomes decrementing the current position and checking the
581 //     result against the stored current position.  This is faster and saves
582 //     space.
583 //   * The current state is virtualized.
584 //     This is used to defer expensive operations until it is clear that they
585 //     are needed and to generate code for a node more than once, allowing
586 //     specialized an efficient versions of the code to be created. This is
587 //     explained in the section below.
588 //
589 // Execution state virtualization.
590 //
591 //   Instead of emitting code, nodes that manipulate the state can record their
592 //   manipulation in an object called the Trace.  The Trace object can record a
593 //   current position offset, an optional backtrack code location on the top of
594 //   the virtualized backtrack stack and some register changes.  When a node is
595 //   to be emitted it can flush the Trace or update it.  Flushing the Trace
596 //   will emit code to bring the actual state into line with the virtual state.
597 //   Avoiding flushing the state can postpone some work (eg updates of capture
598 //   registers).  Postponing work can save time when executing the regular
599 //   expression since it may be found that the work never has to be done as a
600 //   failure to match can occur.  In addition it is much faster to jump to a
601 //   known backtrack code location than it is to pop an unknown backtrack
602 //   location from the stack and jump there.
603 //
604 //   The virtual state found in the Trace affects code generation.  For example
605 //   the virtual state contains the difference between the actual current
606 //   position and the virtual current position, and matching code needs to use
607 //   this offset to attempt a match in the correct location of the input
608 //   string.  Therefore code generated for a non-trivial trace is specialized
609 //   to that trace.  The code generator therefore has the ability to generate
610 //   code for each node several times.  In order to limit the size of the
611 //   generated code there is an arbitrary limit on how many specialized sets of
612 //   code may be generated for a given node.  If the limit is reached, the
613 //   trace is flushed and a generic version of the code for a node is emitted.
614 //   This is subsequently used for that node.  The code emitted for non-generic
615 //   trace is not recorded in the node and so it cannot currently be reused in
616 //   the event that code generation is requested for an identical trace.
617 
618 
AppendToText(RegExpText * text)619 void RegExpTree::AppendToText(RegExpText* text) {
620   UNREACHABLE();
621 }
622 
623 
AppendToText(RegExpText * text)624 void RegExpAtom::AppendToText(RegExpText* text) {
625   text->AddElement(TextElement::Atom(this));
626 }
627 
628 
AppendToText(RegExpText * text)629 void RegExpCharacterClass::AppendToText(RegExpText* text) {
630   text->AddElement(TextElement::CharClass(this));
631 }
632 
633 
AppendToText(RegExpText * text)634 void RegExpText::AppendToText(RegExpText* text) {
635   for (int i = 0; i < elements()->length(); i++)
636     text->AddElement(elements()->at(i));
637 }
638 
639 
Atom(RegExpAtom * atom)640 TextElement TextElement::Atom(RegExpAtom* atom) {
641   TextElement result = TextElement(ATOM);
642   result.data.u_atom = atom;
643   return result;
644 }
645 
646 
CharClass(RegExpCharacterClass * char_class)647 TextElement TextElement::CharClass(
648       RegExpCharacterClass* char_class) {
649   TextElement result = TextElement(CHAR_CLASS);
650   result.data.u_char_class = char_class;
651   return result;
652 }
653 
654 
length()655 int TextElement::length() {
656   if (type == ATOM) {
657     return data.u_atom->length();
658   } else {
659     ASSERT(type == CHAR_CLASS);
660     return 1;
661   }
662 }
663 
664 
GetTable(bool ignore_case)665 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
666   if (table_ == NULL) {
667     table_ = new DispatchTable();
668     DispatchTableConstructor cons(table_, ignore_case);
669     cons.BuildTable(this);
670   }
671   return table_;
672 }
673 
674 
675 class RegExpCompiler {
676  public:
677   RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
678 
AllocateRegister()679   int AllocateRegister() {
680     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
681       reg_exp_too_big_ = true;
682       return next_register_;
683     }
684     return next_register_++;
685   }
686 
687   RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
688                                            RegExpNode* start,
689                                            int capture_count,
690                                            Handle<String> pattern);
691 
AddWork(RegExpNode * node)692   inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
693 
694   static const int kImplementationOffset = 0;
695   static const int kNumberOfRegistersOffset = 0;
696   static const int kCodeOffset = 1;
697 
macro_assembler()698   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()699   EndNode* accept() { return accept_; }
700 
701   static const int kMaxRecursion = 100;
recursion_depth()702   inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()703   inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()704   inline void DecrementRecursionDepth() { recursion_depth_--; }
705 
SetRegExpTooBig()706   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
707 
ignore_case()708   inline bool ignore_case() { return ignore_case_; }
ascii()709   inline bool ascii() { return ascii_; }
710 
711   static const int kNoRegister = -1;
712  private:
713   EndNode* accept_;
714   int next_register_;
715   List<RegExpNode*>* work_list_;
716   int recursion_depth_;
717   RegExpMacroAssembler* macro_assembler_;
718   bool ignore_case_;
719   bool ascii_;
720   bool reg_exp_too_big_;
721 };
722 
723 
724 class RecursionCheck {
725  public:
RecursionCheck(RegExpCompiler * compiler)726   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
727     compiler->IncrementRecursionDepth();
728   }
~RecursionCheck()729   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
730  private:
731   RegExpCompiler* compiler_;
732 };
733 
734 
IrregexpRegExpTooBig()735 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
736   return RegExpEngine::CompilationResult("RegExp too big");
737 }
738 
739 
740 // Attempts to compile the regexp using an Irregexp code generator.  Returns
741 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(int capture_count,bool ignore_case,bool ascii)742 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
743     : next_register_(2 * (capture_count + 1)),
744       work_list_(NULL),
745       recursion_depth_(0),
746       ignore_case_(ignore_case),
747       ascii_(ascii),
748       reg_exp_too_big_(false) {
749   accept_ = new EndNode(EndNode::ACCEPT);
750   ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
751 }
752 
753 
Assemble(RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)754 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
755     RegExpMacroAssembler* macro_assembler,
756     RegExpNode* start,
757     int capture_count,
758     Handle<String> pattern) {
759 #ifdef DEBUG
760   if (FLAG_trace_regexp_assembler)
761     macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
762   else
763 #endif
764     macro_assembler_ = macro_assembler;
765   List <RegExpNode*> work_list(0);
766   work_list_ = &work_list;
767   Label fail;
768   macro_assembler_->PushBacktrack(&fail);
769   Trace new_trace;
770   start->Emit(this, &new_trace);
771   macro_assembler_->Bind(&fail);
772   macro_assembler_->Fail();
773   while (!work_list.is_empty()) {
774     work_list.RemoveLast()->Emit(this, &new_trace);
775   }
776   if (reg_exp_too_big_) return IrregexpRegExpTooBig();
777 
778   Handle<Object> code = macro_assembler_->GetCode(pattern);
779 
780   work_list_ = NULL;
781 #ifdef DEBUG
782   if (FLAG_trace_regexp_assembler) {
783     delete macro_assembler_;
784   }
785 #endif
786   return RegExpEngine::CompilationResult(*code, next_register_);
787 }
788 
789 
Mentions(int that)790 bool Trace::DeferredAction::Mentions(int that) {
791   if (type() == ActionNode::CLEAR_CAPTURES) {
792     Interval range = static_cast<DeferredClearCaptures*>(this)->range();
793     return range.Contains(that);
794   } else {
795     return reg() == that;
796   }
797 }
798 
799 
mentions_reg(int reg)800 bool Trace::mentions_reg(int reg) {
801   for (DeferredAction* action = actions_;
802        action != NULL;
803        action = action->next()) {
804     if (action->Mentions(reg))
805       return true;
806   }
807   return false;
808 }
809 
810 
GetStoredPosition(int reg,int * cp_offset)811 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
812   ASSERT_EQ(0, *cp_offset);
813   for (DeferredAction* action = actions_;
814        action != NULL;
815        action = action->next()) {
816     if (action->Mentions(reg)) {
817       if (action->type() == ActionNode::STORE_POSITION) {
818         *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
819         return true;
820       } else {
821         return false;
822       }
823     }
824   }
825   return false;
826 }
827 
828 
FindAffectedRegisters(OutSet * affected_registers)829 int Trace::FindAffectedRegisters(OutSet* affected_registers) {
830   int max_register = RegExpCompiler::kNoRegister;
831   for (DeferredAction* action = actions_;
832        action != NULL;
833        action = action->next()) {
834     if (action->type() == ActionNode::CLEAR_CAPTURES) {
835       Interval range = static_cast<DeferredClearCaptures*>(action)->range();
836       for (int i = range.from(); i <= range.to(); i++)
837         affected_registers->Set(i);
838       if (range.to() > max_register) max_register = range.to();
839     } else {
840       affected_registers->Set(action->reg());
841       if (action->reg() > max_register) max_register = action->reg();
842     }
843   }
844   return max_register;
845 }
846 
847 
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,OutSet & registers_to_pop,OutSet & registers_to_clear)848 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
849                                      int max_register,
850                                      OutSet& registers_to_pop,
851                                      OutSet& registers_to_clear) {
852   for (int reg = max_register; reg >= 0; reg--) {
853     if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
854     else if (registers_to_clear.Get(reg)) {
855       int clear_to = reg;
856       while (reg > 0 && registers_to_clear.Get(reg - 1)) {
857         reg--;
858       }
859       assembler->ClearRegisters(reg, clear_to);
860     }
861   }
862 }
863 
864 
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,OutSet & affected_registers,OutSet * registers_to_pop,OutSet * registers_to_clear)865 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
866                                    int max_register,
867                                    OutSet& affected_registers,
868                                    OutSet* registers_to_pop,
869                                    OutSet* registers_to_clear) {
870   // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
871   const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
872 
873   // Count pushes performed to force a stack limit check occasionally.
874   int pushes = 0;
875 
876   for (int reg = 0; reg <= max_register; reg++) {
877     if (!affected_registers.Get(reg)) {
878       continue;
879     }
880 
881     // The chronologically first deferred action in the trace
882     // is used to infer the action needed to restore a register
883     // to its previous state (or not, if it's safe to ignore it).
884     enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
885     DeferredActionUndoType undo_action = IGNORE;
886 
887     int value = 0;
888     bool absolute = false;
889     bool clear = false;
890     int store_position = -1;
891     // This is a little tricky because we are scanning the actions in reverse
892     // historical order (newest first).
893     for (DeferredAction* action = actions_;
894          action != NULL;
895          action = action->next()) {
896       if (action->Mentions(reg)) {
897         switch (action->type()) {
898           case ActionNode::SET_REGISTER: {
899             Trace::DeferredSetRegister* psr =
900                 static_cast<Trace::DeferredSetRegister*>(action);
901             if (!absolute) {
902               value += psr->value();
903               absolute = true;
904             }
905             // SET_REGISTER is currently only used for newly introduced loop
906             // counters. They can have a significant previous value if they
907             // occour in a loop. TODO(lrn): Propagate this information, so
908             // we can set undo_action to IGNORE if we know there is no value to
909             // restore.
910             undo_action = RESTORE;
911             ASSERT_EQ(store_position, -1);
912             ASSERT(!clear);
913             break;
914           }
915           case ActionNode::INCREMENT_REGISTER:
916             if (!absolute) {
917               value++;
918             }
919             ASSERT_EQ(store_position, -1);
920             ASSERT(!clear);
921             undo_action = RESTORE;
922             break;
923           case ActionNode::STORE_POSITION: {
924             Trace::DeferredCapture* pc =
925                 static_cast<Trace::DeferredCapture*>(action);
926             if (!clear && store_position == -1) {
927               store_position = pc->cp_offset();
928             }
929 
930             // For captures we know that stores and clears alternate.
931             // Other register, are never cleared, and if the occur
932             // inside a loop, they might be assigned more than once.
933             if (reg <= 1) {
934               // Registers zero and one, aka "capture zero", is
935               // always set correctly if we succeed. There is no
936               // need to undo a setting on backtrack, because we
937               // will set it again or fail.
938               undo_action = IGNORE;
939             } else {
940               undo_action = pc->is_capture() ? CLEAR : RESTORE;
941             }
942             ASSERT(!absolute);
943             ASSERT_EQ(value, 0);
944             break;
945           }
946           case ActionNode::CLEAR_CAPTURES: {
947             // Since we're scanning in reverse order, if we've already
948             // set the position we have to ignore historically earlier
949             // clearing operations.
950             if (store_position == -1) {
951               clear = true;
952             }
953             undo_action = RESTORE;
954             ASSERT(!absolute);
955             ASSERT_EQ(value, 0);
956             break;
957           }
958           default:
959             UNREACHABLE();
960             break;
961         }
962       }
963     }
964     // Prepare for the undo-action (e.g., push if it's going to be popped).
965     if (undo_action == RESTORE) {
966       pushes++;
967       RegExpMacroAssembler::StackCheckFlag stack_check =
968           RegExpMacroAssembler::kNoStackLimitCheck;
969       if (pushes == push_limit) {
970         stack_check = RegExpMacroAssembler::kCheckStackLimit;
971         pushes = 0;
972       }
973 
974       assembler->PushRegister(reg, stack_check);
975       registers_to_pop->Set(reg);
976     } else if (undo_action == CLEAR) {
977       registers_to_clear->Set(reg);
978     }
979     // Perform the chronologically last action (or accumulated increment)
980     // for the register.
981     if (store_position != -1) {
982       assembler->WriteCurrentPositionToRegister(reg, store_position);
983     } else if (clear) {
984       assembler->ClearRegisters(reg, reg);
985     } else if (absolute) {
986       assembler->SetRegister(reg, value);
987     } else if (value != 0) {
988       assembler->AdvanceRegister(reg, value);
989     }
990   }
991 }
992 
993 
994 // This is called as we come into a loop choice node and some other tricky
995 // nodes.  It normalizes the state of the code generator to ensure we can
996 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)997 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
998   RegExpMacroAssembler* assembler = compiler->macro_assembler();
999 
1000   ASSERT(!is_trivial());
1001 
1002   if (actions_ == NULL && backtrack() == NULL) {
1003     // Here we just have some deferred cp advances to fix and we are back to
1004     // a normal situation.  We may also have to forget some information gained
1005     // through a quick check that was already performed.
1006     if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1007     // Create a new trivial state and generate the node with that.
1008     Trace new_state;
1009     successor->Emit(compiler, &new_state);
1010     return;
1011   }
1012 
1013   // Generate deferred actions here along with code to undo them again.
1014   OutSet affected_registers;
1015 
1016   if (backtrack() != NULL) {
1017     // Here we have a concrete backtrack location.  These are set up by choice
1018     // nodes and so they indicate that we have a deferred save of the current
1019     // position which we may need to emit here.
1020     assembler->PushCurrentPosition();
1021   }
1022 
1023   int max_register = FindAffectedRegisters(&affected_registers);
1024   OutSet registers_to_pop;
1025   OutSet registers_to_clear;
1026   PerformDeferredActions(assembler,
1027                          max_register,
1028                          affected_registers,
1029                          &registers_to_pop,
1030                          &registers_to_clear);
1031   if (cp_offset_ != 0) {
1032     assembler->AdvanceCurrentPosition(cp_offset_);
1033   }
1034 
1035   // Create a new trivial state and generate the node with that.
1036   Label undo;
1037   assembler->PushBacktrack(&undo);
1038   Trace new_state;
1039   successor->Emit(compiler, &new_state);
1040 
1041   // On backtrack we need to restore state.
1042   assembler->Bind(&undo);
1043   RestoreAffectedRegisters(assembler,
1044                            max_register,
1045                            registers_to_pop,
1046                            registers_to_clear);
1047   if (backtrack() == NULL) {
1048     assembler->Backtrack();
1049   } else {
1050     assembler->PopCurrentPosition();
1051     assembler->GoTo(backtrack());
1052   }
1053 }
1054 
1055 
Emit(RegExpCompiler * compiler,Trace * trace)1056 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1057   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1058 
1059   // Omit flushing the trace. We discard the entire stack frame anyway.
1060 
1061   if (!label()->is_bound()) {
1062     // We are completely independent of the trace, since we ignore it,
1063     // so this code can be used as the generic version.
1064     assembler->Bind(label());
1065   }
1066 
1067   // Throw away everything on the backtrack stack since the start
1068   // of the negative submatch and restore the character position.
1069   assembler->ReadCurrentPositionFromRegister(current_position_register_);
1070   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1071   if (clear_capture_count_ > 0) {
1072     // Clear any captures that might have been performed during the success
1073     // of the body of the negative look-ahead.
1074     int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1075     assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1076   }
1077   // Now that we have unwound the stack we find at the top of the stack the
1078   // backtrack that the BeginSubmatch node got.
1079   assembler->Backtrack();
1080 }
1081 
1082 
Emit(RegExpCompiler * compiler,Trace * trace)1083 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1084   if (!trace->is_trivial()) {
1085     trace->Flush(compiler, this);
1086     return;
1087   }
1088   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1089   if (!label()->is_bound()) {
1090     assembler->Bind(label());
1091   }
1092   switch (action_) {
1093     case ACCEPT:
1094       assembler->Succeed();
1095       return;
1096     case BACKTRACK:
1097       assembler->GoTo(trace->backtrack());
1098       return;
1099     case NEGATIVE_SUBMATCH_SUCCESS:
1100       // This case is handled in a different virtual method.
1101       UNREACHABLE();
1102   }
1103   UNIMPLEMENTED();
1104 }
1105 
1106 
AddGuard(Guard * guard)1107 void GuardedAlternative::AddGuard(Guard* guard) {
1108   if (guards_ == NULL)
1109     guards_ = new ZoneList<Guard*>(1);
1110   guards_->Add(guard);
1111 }
1112 
1113 
SetRegister(int reg,int val,RegExpNode * on_success)1114 ActionNode* ActionNode::SetRegister(int reg,
1115                                     int val,
1116                                     RegExpNode* on_success) {
1117   ActionNode* result = new ActionNode(SET_REGISTER, on_success);
1118   result->data_.u_store_register.reg = reg;
1119   result->data_.u_store_register.value = val;
1120   return result;
1121 }
1122 
1123 
IncrementRegister(int reg,RegExpNode * on_success)1124 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1125   ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1126   result->data_.u_increment_register.reg = reg;
1127   return result;
1128 }
1129 
1130 
StorePosition(int reg,bool is_capture,RegExpNode * on_success)1131 ActionNode* ActionNode::StorePosition(int reg,
1132                                       bool is_capture,
1133                                       RegExpNode* on_success) {
1134   ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1135   result->data_.u_position_register.reg = reg;
1136   result->data_.u_position_register.is_capture = is_capture;
1137   return result;
1138 }
1139 
1140 
ClearCaptures(Interval range,RegExpNode * on_success)1141 ActionNode* ActionNode::ClearCaptures(Interval range,
1142                                       RegExpNode* on_success) {
1143   ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1144   result->data_.u_clear_captures.range_from = range.from();
1145   result->data_.u_clear_captures.range_to = range.to();
1146   return result;
1147 }
1148 
1149 
BeginSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)1150 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1151                                       int position_reg,
1152                                       RegExpNode* on_success) {
1153   ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1154   result->data_.u_submatch.stack_pointer_register = stack_reg;
1155   result->data_.u_submatch.current_position_register = position_reg;
1156   return result;
1157 }
1158 
1159 
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)1160 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1161                                                 int position_reg,
1162                                                 int clear_register_count,
1163                                                 int clear_register_from,
1164                                                 RegExpNode* on_success) {
1165   ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1166   result->data_.u_submatch.stack_pointer_register = stack_reg;
1167   result->data_.u_submatch.current_position_register = position_reg;
1168   result->data_.u_submatch.clear_register_count = clear_register_count;
1169   result->data_.u_submatch.clear_register_from = clear_register_from;
1170   return result;
1171 }
1172 
1173 
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)1174 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1175                                         int repetition_register,
1176                                         int repetition_limit,
1177                                         RegExpNode* on_success) {
1178   ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1179   result->data_.u_empty_match_check.start_register = start_register;
1180   result->data_.u_empty_match_check.repetition_register = repetition_register;
1181   result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1182   return result;
1183 }
1184 
1185 
1186 #define DEFINE_ACCEPT(Type)                                          \
1187   void Type##Node::Accept(NodeVisitor* visitor) {                    \
1188     visitor->Visit##Type(this);                                      \
1189   }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)1190 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1191 #undef DEFINE_ACCEPT
1192 
1193 
1194 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1195   visitor->VisitLoopChoice(this);
1196 }
1197 
1198 
1199 // -------------------------------------------------------------------
1200 // Emit code.
1201 
1202 
GenerateGuard(RegExpMacroAssembler * macro_assembler,Guard * guard,Trace * trace)1203 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1204                                Guard* guard,
1205                                Trace* trace) {
1206   switch (guard->op()) {
1207     case Guard::LT:
1208       ASSERT(!trace->mentions_reg(guard->reg()));
1209       macro_assembler->IfRegisterGE(guard->reg(),
1210                                     guard->value(),
1211                                     trace->backtrack());
1212       break;
1213     case Guard::GEQ:
1214       ASSERT(!trace->mentions_reg(guard->reg()));
1215       macro_assembler->IfRegisterLT(guard->reg(),
1216                                     guard->value(),
1217                                     trace->backtrack());
1218       break;
1219   }
1220 }
1221 
1222 
1223 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1224 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1225 
1226 
1227 // Returns the number of characters in the equivalence class, omitting those
1228 // that cannot occur in the source string because it is ASCII.
GetCaseIndependentLetters(uc16 character,bool ascii_subject,unibrow::uchar * letters)1229 static int GetCaseIndependentLetters(uc16 character,
1230                                      bool ascii_subject,
1231                                      unibrow::uchar* letters) {
1232   int length = uncanonicalize.get(character, '\0', letters);
1233   // Unibrow returns 0 or 1 for characters where case independependence is
1234   // trivial.
1235   if (length == 0) {
1236     letters[0] = character;
1237     length = 1;
1238   }
1239   if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
1240     return length;
1241   }
1242   // The standard requires that non-ASCII characters cannot have ASCII
1243   // character codes in their equivalence class.
1244   return 0;
1245 }
1246 
1247 
EmitSimpleCharacter(RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1248 static inline bool EmitSimpleCharacter(RegExpCompiler* compiler,
1249                                        uc16 c,
1250                                        Label* on_failure,
1251                                        int cp_offset,
1252                                        bool check,
1253                                        bool preloaded) {
1254   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1255   bool bound_checked = false;
1256   if (!preloaded) {
1257     assembler->LoadCurrentCharacter(
1258         cp_offset,
1259         on_failure,
1260         check);
1261     bound_checked = true;
1262   }
1263   assembler->CheckNotCharacter(c, on_failure);
1264   return bound_checked;
1265 }
1266 
1267 
1268 // Only emits non-letters (things that don't have case).  Only used for case
1269 // independent matches.
EmitAtomNonLetter(RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1270 static inline bool EmitAtomNonLetter(RegExpCompiler* compiler,
1271                                      uc16 c,
1272                                      Label* on_failure,
1273                                      int cp_offset,
1274                                      bool check,
1275                                      bool preloaded) {
1276   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1277   bool ascii = compiler->ascii();
1278   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1279   int length = GetCaseIndependentLetters(c, ascii, chars);
1280   if (length < 1) {
1281     // This can't match.  Must be an ASCII subject and a non-ASCII character.
1282     // We do not need to do anything since the ASCII pass already handled this.
1283     return false;  // Bounds not checked.
1284   }
1285   bool checked = false;
1286   // We handle the length > 1 case in a later pass.
1287   if (length == 1) {
1288     if (ascii && c > String::kMaxAsciiCharCodeU) {
1289       // Can't match - see above.
1290       return false;  // Bounds not checked.
1291     }
1292     if (!preloaded) {
1293       macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1294       checked = check;
1295     }
1296     macro_assembler->CheckNotCharacter(c, on_failure);
1297   }
1298   return checked;
1299 }
1300 
1301 
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool ascii,uc16 c1,uc16 c2,Label * on_failure)1302 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1303                                       bool ascii,
1304                                       uc16 c1,
1305                                       uc16 c2,
1306                                       Label* on_failure) {
1307   uc16 char_mask;
1308   if (ascii) {
1309     char_mask = String::kMaxAsciiCharCode;
1310   } else {
1311     char_mask = String::kMaxUC16CharCode;
1312   }
1313   uc16 exor = c1 ^ c2;
1314   // Check whether exor has only one bit set.
1315   if (((exor - 1) & exor) == 0) {
1316     // If c1 and c2 differ only by one bit.
1317     // Ecma262UnCanonicalize always gives the highest number last.
1318     ASSERT(c2 > c1);
1319     uc16 mask = char_mask ^ exor;
1320     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1321     return true;
1322   }
1323   ASSERT(c2 > c1);
1324   uc16 diff = c2 - c1;
1325   if (((diff - 1) & diff) == 0 && c1 >= diff) {
1326     // If the characters differ by 2^n but don't differ by one bit then
1327     // subtract the difference from the found character, then do the or
1328     // trick.  We avoid the theoretical case where negative numbers are
1329     // involved in order to simplify code generation.
1330     uc16 mask = char_mask ^ diff;
1331     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1332                                                     diff,
1333                                                     mask,
1334                                                     on_failure);
1335     return true;
1336   }
1337   return false;
1338 }
1339 
1340 
1341 typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
1342                                    uc16 c,
1343                                    Label* on_failure,
1344                                    int cp_offset,
1345                                    bool check,
1346                                    bool preloaded);
1347 
1348 // Only emits letters (things that have case).  Only used for case independent
1349 // matches.
EmitAtomLetter(RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1350 static inline bool EmitAtomLetter(RegExpCompiler* compiler,
1351                                   uc16 c,
1352                                   Label* on_failure,
1353                                   int cp_offset,
1354                                   bool check,
1355                                   bool preloaded) {
1356   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1357   bool ascii = compiler->ascii();
1358   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1359   int length = GetCaseIndependentLetters(c, ascii, chars);
1360   if (length <= 1) return false;
1361   // We may not need to check against the end of the input string
1362   // if this character lies before a character that matched.
1363   if (!preloaded) {
1364     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1365   }
1366   Label ok;
1367   ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1368   switch (length) {
1369     case 2: {
1370       if (ShortCutEmitCharacterPair(macro_assembler,
1371                                     ascii,
1372                                     chars[0],
1373                                     chars[1],
1374                                     on_failure)) {
1375       } else {
1376         macro_assembler->CheckCharacter(chars[0], &ok);
1377         macro_assembler->CheckNotCharacter(chars[1], on_failure);
1378         macro_assembler->Bind(&ok);
1379       }
1380       break;
1381     }
1382     case 4:
1383       macro_assembler->CheckCharacter(chars[3], &ok);
1384       // Fall through!
1385     case 3:
1386       macro_assembler->CheckCharacter(chars[0], &ok);
1387       macro_assembler->CheckCharacter(chars[1], &ok);
1388       macro_assembler->CheckNotCharacter(chars[2], on_failure);
1389       macro_assembler->Bind(&ok);
1390       break;
1391     default:
1392       UNREACHABLE();
1393       break;
1394   }
1395   return true;
1396 }
1397 
1398 
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool ascii,Label * on_failure,int cp_offset,bool check_offset,bool preloaded)1399 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1400                           RegExpCharacterClass* cc,
1401                           bool ascii,
1402                           Label* on_failure,
1403                           int cp_offset,
1404                           bool check_offset,
1405                           bool preloaded) {
1406   ZoneList<CharacterRange>* ranges = cc->ranges();
1407   int max_char;
1408   if (ascii) {
1409     max_char = String::kMaxAsciiCharCode;
1410   } else {
1411     max_char = String::kMaxUC16CharCode;
1412   }
1413 
1414   Label success;
1415 
1416   Label* char_is_in_class =
1417       cc->is_negated() ? on_failure : &success;
1418 
1419   int range_count = ranges->length();
1420 
1421   int last_valid_range = range_count - 1;
1422   while (last_valid_range >= 0) {
1423     CharacterRange& range = ranges->at(last_valid_range);
1424     if (range.from() <= max_char) {
1425       break;
1426     }
1427     last_valid_range--;
1428   }
1429 
1430   if (last_valid_range < 0) {
1431     if (!cc->is_negated()) {
1432       // TODO(plesner): We can remove this when the node level does our
1433       // ASCII optimizations for us.
1434       macro_assembler->GoTo(on_failure);
1435     }
1436     if (check_offset) {
1437       macro_assembler->CheckPosition(cp_offset, on_failure);
1438     }
1439     return;
1440   }
1441 
1442   if (last_valid_range == 0 &&
1443       !cc->is_negated() &&
1444       ranges->at(0).IsEverything(max_char)) {
1445     // This is a common case hit by non-anchored expressions.
1446     if (check_offset) {
1447       macro_assembler->CheckPosition(cp_offset, on_failure);
1448     }
1449     return;
1450   }
1451 
1452   if (!preloaded) {
1453     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1454   }
1455 
1456   if (cc->is_standard() &&
1457         macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1458                                                     on_failure)) {
1459       return;
1460   }
1461 
1462   for (int i = 0; i < last_valid_range; i++) {
1463     CharacterRange& range = ranges->at(i);
1464     Label next_range;
1465     uc16 from = range.from();
1466     uc16 to = range.to();
1467     if (from > max_char) {
1468       continue;
1469     }
1470     if (to > max_char) to = max_char;
1471     if (to == from) {
1472       macro_assembler->CheckCharacter(to, char_is_in_class);
1473     } else {
1474       if (from != 0) {
1475         macro_assembler->CheckCharacterLT(from, &next_range);
1476       }
1477       if (to != max_char) {
1478         macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1479       } else {
1480         macro_assembler->GoTo(char_is_in_class);
1481       }
1482     }
1483     macro_assembler->Bind(&next_range);
1484   }
1485 
1486   CharacterRange& range = ranges->at(last_valid_range);
1487   uc16 from = range.from();
1488   uc16 to = range.to();
1489 
1490   if (to > max_char) to = max_char;
1491   ASSERT(to >= from);
1492 
1493   if (to == from) {
1494     if (cc->is_negated()) {
1495       macro_assembler->CheckCharacter(to, on_failure);
1496     } else {
1497       macro_assembler->CheckNotCharacter(to, on_failure);
1498     }
1499   } else {
1500     if (from != 0) {
1501       if (cc->is_negated()) {
1502         macro_assembler->CheckCharacterLT(from, &success);
1503       } else {
1504         macro_assembler->CheckCharacterLT(from, on_failure);
1505       }
1506     }
1507     if (to != String::kMaxUC16CharCode) {
1508       if (cc->is_negated()) {
1509         macro_assembler->CheckCharacterLT(to + 1, on_failure);
1510       } else {
1511         macro_assembler->CheckCharacterGT(to, on_failure);
1512       }
1513     } else {
1514       if (cc->is_negated()) {
1515         macro_assembler->GoTo(on_failure);
1516       }
1517     }
1518   }
1519   macro_assembler->Bind(&success);
1520 }
1521 
1522 
~RegExpNode()1523 RegExpNode::~RegExpNode() {
1524 }
1525 
1526 
LimitVersions(RegExpCompiler * compiler,Trace * trace)1527 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1528                                                   Trace* trace) {
1529   // If we are generating a greedy loop then don't stop and don't reuse code.
1530   if (trace->stop_node() != NULL) {
1531     return CONTINUE;
1532   }
1533 
1534   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1535   if (trace->is_trivial()) {
1536     if (label_.is_bound()) {
1537       // We are being asked to generate a generic version, but that's already
1538       // been done so just go to it.
1539       macro_assembler->GoTo(&label_);
1540       return DONE;
1541     }
1542     if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1543       // To avoid too deep recursion we push the node to the work queue and just
1544       // generate a goto here.
1545       compiler->AddWork(this);
1546       macro_assembler->GoTo(&label_);
1547       return DONE;
1548     }
1549     // Generate generic version of the node and bind the label for later use.
1550     macro_assembler->Bind(&label_);
1551     return CONTINUE;
1552   }
1553 
1554   // We are being asked to make a non-generic version.  Keep track of how many
1555   // non-generic versions we generate so as not to overdo it.
1556   trace_count_++;
1557   if (FLAG_regexp_optimization &&
1558       trace_count_ < kMaxCopiesCodeGenerated &&
1559       compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1560     return CONTINUE;
1561   }
1562 
1563   // If we get here code has been generated for this node too many times or
1564   // recursion is too deep.  Time to switch to a generic version.  The code for
1565   // generic versions above can handle deep recursion properly.
1566   trace->Flush(compiler, this);
1567   return DONE;
1568 }
1569 
1570 
EatsAtLeast(int still_to_find,int recursion_depth)1571 int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1572   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1573   if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
1574   return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1575 }
1576 
1577 
EatsAtLeast(int still_to_find,int recursion_depth)1578 int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1579   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1580   return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1581 }
1582 
1583 
EatsAtLeast(int still_to_find,int recursion_depth)1584 int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1585   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1586   return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1587 }
1588 
1589 
EatsAtLeast(int still_to_find,int recursion_depth)1590 int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1591   int answer = Length();
1592   if (answer >= still_to_find) return answer;
1593   if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
1594   return answer + on_success()->EatsAtLeast(still_to_find - answer,
1595                                             recursion_depth + 1);
1596 }
1597 
1598 
EatsAtLeast(int still_to_find,int recursion_depth)1599 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
1600                                              int recursion_depth) {
1601   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1602   // Alternative 0 is the negative lookahead, alternative 1 is what comes
1603   // afterwards.
1604   RegExpNode* node = alternatives_->at(1).node();
1605   return node->EatsAtLeast(still_to_find, recursion_depth + 1);
1606 }
1607 
1608 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)1609 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
1610     QuickCheckDetails* details,
1611     RegExpCompiler* compiler,
1612     int filled_in,
1613     bool not_at_start) {
1614   // Alternative 0 is the negative lookahead, alternative 1 is what comes
1615   // afterwards.
1616   RegExpNode* node = alternatives_->at(1).node();
1617   return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1618 }
1619 
1620 
EatsAtLeastHelper(int still_to_find,int recursion_depth,RegExpNode * ignore_this_node)1621 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
1622                                   int recursion_depth,
1623                                   RegExpNode* ignore_this_node) {
1624   if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1625   int min = 100;
1626   int choice_count = alternatives_->length();
1627   for (int i = 0; i < choice_count; i++) {
1628     RegExpNode* node = alternatives_->at(i).node();
1629     if (node == ignore_this_node) continue;
1630     int node_eats_at_least = node->EatsAtLeast(still_to_find,
1631                                                recursion_depth + 1);
1632     if (node_eats_at_least < min) min = node_eats_at_least;
1633   }
1634   return min;
1635 }
1636 
1637 
EatsAtLeast(int still_to_find,int recursion_depth)1638 int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1639   return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
1640 }
1641 
1642 
EatsAtLeast(int still_to_find,int recursion_depth)1643 int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1644   return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
1645 }
1646 
1647 
1648 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)1649 static inline uint32_t SmearBitsRight(uint32_t v) {
1650   v |= v >> 1;
1651   v |= v >> 2;
1652   v |= v >> 4;
1653   v |= v >> 8;
1654   v |= v >> 16;
1655   return v;
1656 }
1657 
1658 
Rationalize(bool asc)1659 bool QuickCheckDetails::Rationalize(bool asc) {
1660   bool found_useful_op = false;
1661   uint32_t char_mask;
1662   if (asc) {
1663     char_mask = String::kMaxAsciiCharCode;
1664   } else {
1665     char_mask = String::kMaxUC16CharCode;
1666   }
1667   mask_ = 0;
1668   value_ = 0;
1669   int char_shift = 0;
1670   for (int i = 0; i < characters_; i++) {
1671     Position* pos = &positions_[i];
1672     if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
1673       found_useful_op = true;
1674     }
1675     mask_ |= (pos->mask & char_mask) << char_shift;
1676     value_ |= (pos->value & char_mask) << char_shift;
1677     char_shift += asc ? 8 : 16;
1678   }
1679   return found_useful_op;
1680 }
1681 
1682 
EmitQuickCheck(RegExpCompiler * compiler,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure)1683 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1684                                 Trace* trace,
1685                                 bool preload_has_checked_bounds,
1686                                 Label* on_possible_success,
1687                                 QuickCheckDetails* details,
1688                                 bool fall_through_on_failure) {
1689   if (details->characters() == 0) return false;
1690   GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
1691   if (details->cannot_match()) return false;
1692   if (!details->Rationalize(compiler->ascii())) return false;
1693   ASSERT(details->characters() == 1 ||
1694          compiler->macro_assembler()->CanReadUnaligned());
1695   uint32_t mask = details->mask();
1696   uint32_t value = details->value();
1697 
1698   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1699 
1700   if (trace->characters_preloaded() != details->characters()) {
1701     assembler->LoadCurrentCharacter(trace->cp_offset(),
1702                                     trace->backtrack(),
1703                                     !preload_has_checked_bounds,
1704                                     details->characters());
1705   }
1706 
1707 
1708   bool need_mask = true;
1709 
1710   if (details->characters() == 1) {
1711     // If number of characters preloaded is 1 then we used a byte or 16 bit
1712     // load so the value is already masked down.
1713     uint32_t char_mask;
1714     if (compiler->ascii()) {
1715       char_mask = String::kMaxAsciiCharCode;
1716     } else {
1717       char_mask = String::kMaxUC16CharCode;
1718     }
1719     if ((mask & char_mask) == char_mask) need_mask = false;
1720     mask &= char_mask;
1721   } else {
1722     // For 2-character preloads in ASCII mode we also use a 16 bit load with
1723     // zero extend.
1724     if (details->characters() == 2 && compiler->ascii()) {
1725       if ((mask & 0xffff) == 0xffff) need_mask = false;
1726     } else {
1727       if (mask == 0xffffffff) need_mask = false;
1728     }
1729   }
1730 
1731   if (fall_through_on_failure) {
1732     if (need_mask) {
1733       assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1734     } else {
1735       assembler->CheckCharacter(value, on_possible_success);
1736     }
1737   } else {
1738     if (need_mask) {
1739       assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1740     } else {
1741       assembler->CheckNotCharacter(value, trace->backtrack());
1742     }
1743   }
1744   return true;
1745 }
1746 
1747 
1748 // Here is the meat of GetQuickCheckDetails (see also the comment on the
1749 // super-class in the .h file).
1750 //
1751 // We iterate along the text object, building up for each character a
1752 // mask and value that can be used to test for a quick failure to match.
1753 // The masks and values for the positions will be combined into a single
1754 // machine word for the current character width in order to be used in
1755 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1756 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1757                                     RegExpCompiler* compiler,
1758                                     int characters_filled_in,
1759                                     bool not_at_start) {
1760   ASSERT(characters_filled_in < details->characters());
1761   int characters = details->characters();
1762   int char_mask;
1763   int char_shift;
1764   if (compiler->ascii()) {
1765     char_mask = String::kMaxAsciiCharCode;
1766     char_shift = 8;
1767   } else {
1768     char_mask = String::kMaxUC16CharCode;
1769     char_shift = 16;
1770   }
1771   for (int k = 0; k < elms_->length(); k++) {
1772     TextElement elm = elms_->at(k);
1773     if (elm.type == TextElement::ATOM) {
1774       Vector<const uc16> quarks = elm.data.u_atom->data();
1775       for (int i = 0; i < characters && i < quarks.length(); i++) {
1776         QuickCheckDetails::Position* pos =
1777             details->positions(characters_filled_in);
1778         uc16 c = quarks[i];
1779         if (c > char_mask) {
1780           // If we expect a non-ASCII character from an ASCII string,
1781           // there is no way we can match. Not even case independent
1782           // matching can turn an ASCII character into non-ASCII or
1783           // vice versa.
1784           details->set_cannot_match();
1785           pos->determines_perfectly = false;
1786           return;
1787         }
1788         if (compiler->ignore_case()) {
1789           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1790           int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
1791           ASSERT(length != 0);  // Can only happen if c > char_mask (see above).
1792           if (length == 1) {
1793             // This letter has no case equivalents, so it's nice and simple
1794             // and the mask-compare will determine definitely whether we have
1795             // a match at this character position.
1796             pos->mask = char_mask;
1797             pos->value = c;
1798             pos->determines_perfectly = true;
1799           } else {
1800             uint32_t common_bits = char_mask;
1801             uint32_t bits = chars[0];
1802             for (int j = 1; j < length; j++) {
1803               uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1804               common_bits ^= differing_bits;
1805               bits &= common_bits;
1806             }
1807             // If length is 2 and common bits has only one zero in it then
1808             // our mask and compare instruction will determine definitely
1809             // whether we have a match at this character position.  Otherwise
1810             // it can only be an approximate check.
1811             uint32_t one_zero = (common_bits | ~char_mask);
1812             if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1813               pos->determines_perfectly = true;
1814             }
1815             pos->mask = common_bits;
1816             pos->value = bits;
1817           }
1818         } else {
1819           // Don't ignore case.  Nice simple case where the mask-compare will
1820           // determine definitely whether we have a match at this character
1821           // position.
1822           pos->mask = char_mask;
1823           pos->value = c;
1824           pos->determines_perfectly = true;
1825         }
1826         characters_filled_in++;
1827         ASSERT(characters_filled_in <= details->characters());
1828         if (characters_filled_in == details->characters()) {
1829           return;
1830         }
1831       }
1832     } else {
1833       QuickCheckDetails::Position* pos =
1834           details->positions(characters_filled_in);
1835       RegExpCharacterClass* tree = elm.data.u_char_class;
1836       ZoneList<CharacterRange>* ranges = tree->ranges();
1837       if (tree->is_negated()) {
1838         // A quick check uses multi-character mask and compare.  There is no
1839         // useful way to incorporate a negative char class into this scheme
1840         // so we just conservatively create a mask and value that will always
1841         // succeed.
1842         pos->mask = 0;
1843         pos->value = 0;
1844       } else {
1845         int first_range = 0;
1846         while (ranges->at(first_range).from() > char_mask) {
1847           first_range++;
1848           if (first_range == ranges->length()) {
1849             details->set_cannot_match();
1850             pos->determines_perfectly = false;
1851             return;
1852           }
1853         }
1854         CharacterRange range = ranges->at(first_range);
1855         uc16 from = range.from();
1856         uc16 to = range.to();
1857         if (to > char_mask) {
1858           to = char_mask;
1859         }
1860         uint32_t differing_bits = (from ^ to);
1861         // A mask and compare is only perfect if the differing bits form a
1862         // number like 00011111 with one single block of trailing 1s.
1863         if ((differing_bits & (differing_bits + 1)) == 0 &&
1864              from + differing_bits == to) {
1865           pos->determines_perfectly = true;
1866         }
1867         uint32_t common_bits = ~SmearBitsRight(differing_bits);
1868         uint32_t bits = (from & common_bits);
1869         for (int i = first_range + 1; i < ranges->length(); i++) {
1870           CharacterRange range = ranges->at(i);
1871           uc16 from = range.from();
1872           uc16 to = range.to();
1873           if (from > char_mask) continue;
1874           if (to > char_mask) to = char_mask;
1875           // Here we are combining more ranges into the mask and compare
1876           // value.  With each new range the mask becomes more sparse and
1877           // so the chances of a false positive rise.  A character class
1878           // with multiple ranges is assumed never to be equivalent to a
1879           // mask and compare operation.
1880           pos->determines_perfectly = false;
1881           uint32_t new_common_bits = (from ^ to);
1882           new_common_bits = ~SmearBitsRight(new_common_bits);
1883           common_bits &= new_common_bits;
1884           bits &= new_common_bits;
1885           uint32_t differing_bits = (from & common_bits) ^ bits;
1886           common_bits ^= differing_bits;
1887           bits &= common_bits;
1888         }
1889         pos->mask = common_bits;
1890         pos->value = bits;
1891       }
1892       characters_filled_in++;
1893       ASSERT(characters_filled_in <= details->characters());
1894       if (characters_filled_in == details->characters()) {
1895         return;
1896       }
1897     }
1898   }
1899   ASSERT(characters_filled_in != details->characters());
1900   on_success()-> GetQuickCheckDetails(details,
1901                                       compiler,
1902                                       characters_filled_in,
1903                                       true);
1904 }
1905 
1906 
Clear()1907 void QuickCheckDetails::Clear() {
1908   for (int i = 0; i < characters_; i++) {
1909     positions_[i].mask = 0;
1910     positions_[i].value = 0;
1911     positions_[i].determines_perfectly = false;
1912   }
1913   characters_ = 0;
1914 }
1915 
1916 
Advance(int by,bool ascii)1917 void QuickCheckDetails::Advance(int by, bool ascii) {
1918   ASSERT(by >= 0);
1919   if (by >= characters_) {
1920     Clear();
1921     return;
1922   }
1923   for (int i = 0; i < characters_ - by; i++) {
1924     positions_[i] = positions_[by + i];
1925   }
1926   for (int i = characters_ - by; i < characters_; i++) {
1927     positions_[i].mask = 0;
1928     positions_[i].value = 0;
1929     positions_[i].determines_perfectly = false;
1930   }
1931   characters_ -= by;
1932   // We could change mask_ and value_ here but we would never advance unless
1933   // they had already been used in a check and they won't be used again because
1934   // it would gain us nothing.  So there's no point.
1935 }
1936 
1937 
Merge(QuickCheckDetails * other,int from_index)1938 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
1939   ASSERT(characters_ == other->characters_);
1940   if (other->cannot_match_) {
1941     return;
1942   }
1943   if (cannot_match_) {
1944     *this = *other;
1945     return;
1946   }
1947   for (int i = from_index; i < characters_; i++) {
1948     QuickCheckDetails::Position* pos = positions(i);
1949     QuickCheckDetails::Position* other_pos = other->positions(i);
1950     if (pos->mask != other_pos->mask ||
1951         pos->value != other_pos->value ||
1952         !other_pos->determines_perfectly) {
1953       // Our mask-compare operation will be approximate unless we have the
1954       // exact same operation on both sides of the alternation.
1955       pos->determines_perfectly = false;
1956     }
1957     pos->mask &= other_pos->mask;
1958     pos->value &= pos->mask;
1959     other_pos->value &= pos->mask;
1960     uc16 differing_bits = (pos->value ^ other_pos->value);
1961     pos->mask &= ~differing_bits;
1962     pos->value &= pos->mask;
1963   }
1964 }
1965 
1966 
1967 class VisitMarker {
1968  public:
VisitMarker(NodeInfo * info)1969   explicit VisitMarker(NodeInfo* info) : info_(info) {
1970     ASSERT(!info->visited);
1971     info->visited = true;
1972   }
~VisitMarker()1973   ~VisitMarker() {
1974     info_->visited = false;
1975   }
1976  private:
1977   NodeInfo* info_;
1978 };
1979 
1980 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1981 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
1982                                           RegExpCompiler* compiler,
1983                                           int characters_filled_in,
1984                                           bool not_at_start) {
1985   if (body_can_be_zero_length_ || info()->visited) return;
1986   VisitMarker marker(info());
1987   return ChoiceNode::GetQuickCheckDetails(details,
1988                                           compiler,
1989                                           characters_filled_in,
1990                                           not_at_start);
1991 }
1992 
1993 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1994 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
1995                                       RegExpCompiler* compiler,
1996                                       int characters_filled_in,
1997                                       bool not_at_start) {
1998   not_at_start = (not_at_start || not_at_start_);
1999   int choice_count = alternatives_->length();
2000   ASSERT(choice_count > 0);
2001   alternatives_->at(0).node()->GetQuickCheckDetails(details,
2002                                                     compiler,
2003                                                     characters_filled_in,
2004                                                     not_at_start);
2005   for (int i = 1; i < choice_count; i++) {
2006     QuickCheckDetails new_details(details->characters());
2007     RegExpNode* node = alternatives_->at(i).node();
2008     node->GetQuickCheckDetails(&new_details, compiler,
2009                                characters_filled_in,
2010                                not_at_start);
2011     // Here we merge the quick match details of the two branches.
2012     details->Merge(&new_details, characters_filled_in);
2013   }
2014 }
2015 
2016 
2017 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2018 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2019                           Label* word,
2020                           Label* non_word,
2021                           bool fall_through_on_word) {
2022   if (assembler->CheckSpecialCharacterClass(
2023           fall_through_on_word ? 'w' : 'W',
2024           fall_through_on_word ? non_word : word)) {
2025     // Optimized implementation available.
2026     return;
2027   }
2028   assembler->CheckCharacterGT('z', non_word);
2029   assembler->CheckCharacterLT('0', non_word);
2030   assembler->CheckCharacterGT('a' - 1, word);
2031   assembler->CheckCharacterLT('9' + 1, word);
2032   assembler->CheckCharacterLT('A', non_word);
2033   assembler->CheckCharacterLT('Z' + 1, word);
2034   if (fall_through_on_word) {
2035     assembler->CheckNotCharacter('_', non_word);
2036   } else {
2037     assembler->CheckCharacter('_', word);
2038   }
2039 }
2040 
2041 
2042 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2043 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2044 static void EmitHat(RegExpCompiler* compiler,
2045                     RegExpNode* on_success,
2046                     Trace* trace) {
2047   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2048   // We will be loading the previous character into the current character
2049   // register.
2050   Trace new_trace(*trace);
2051   new_trace.InvalidateCurrentCharacter();
2052 
2053   Label ok;
2054   if (new_trace.cp_offset() == 0) {
2055     // The start of input counts as a newline in this context, so skip to
2056     // ok if we are at the start.
2057     assembler->CheckAtStart(&ok);
2058   }
2059   // We already checked that we are not at the start of input so it must be
2060   // OK to load the previous character.
2061   assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2062                                   new_trace.backtrack(),
2063                                   false);
2064   if (!assembler->CheckSpecialCharacterClass('n',
2065                                              new_trace.backtrack())) {
2066     // Newline means \n, \r, 0x2028 or 0x2029.
2067     if (!compiler->ascii()) {
2068       assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2069     }
2070     assembler->CheckCharacter('\n', &ok);
2071     assembler->CheckNotCharacter('\r', new_trace.backtrack());
2072   }
2073   assembler->Bind(&ok);
2074   on_success->Emit(compiler, &new_trace);
2075 }
2076 
2077 
2078 // Emit the code to handle \b and \B (word-boundary or non-word-boundary)
2079 // when we know whether the next character must be a word character or not.
EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2080 static void EmitHalfBoundaryCheck(AssertionNode::AssertionNodeType type,
2081                                   RegExpCompiler* compiler,
2082                                   RegExpNode* on_success,
2083                                   Trace* trace) {
2084   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2085   Label done;
2086 
2087   Trace new_trace(*trace);
2088 
2089   bool expect_word_character = (type == AssertionNode::AFTER_WORD_CHARACTER);
2090   Label* on_word = expect_word_character ? &done : new_trace.backtrack();
2091   Label* on_non_word = expect_word_character ? new_trace.backtrack() : &done;
2092 
2093   // Check whether previous character was a word character.
2094   switch (trace->at_start()) {
2095     case Trace::TRUE:
2096       if (expect_word_character) {
2097         assembler->GoTo(on_non_word);
2098       }
2099       break;
2100     case Trace::UNKNOWN:
2101       ASSERT_EQ(0, trace->cp_offset());
2102       assembler->CheckAtStart(on_non_word);
2103       // Fall through.
2104     case Trace::FALSE:
2105       int prev_char_offset = trace->cp_offset() - 1;
2106       assembler->LoadCurrentCharacter(prev_char_offset, NULL, false, 1);
2107       EmitWordCheck(assembler, on_word, on_non_word, expect_word_character);
2108       // We may or may not have loaded the previous character.
2109       new_trace.InvalidateCurrentCharacter();
2110   }
2111 
2112   assembler->Bind(&done);
2113 
2114   on_success->Emit(compiler, &new_trace);
2115 }
2116 
2117 
2118 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(AssertionNode::AssertionNodeType type,RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2119 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2120                               RegExpCompiler* compiler,
2121                               RegExpNode* on_success,
2122                               Trace* trace) {
2123   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2124   Label before_non_word;
2125   Label before_word;
2126   if (trace->characters_preloaded() != 1) {
2127     assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2128   }
2129   // Fall through on non-word.
2130   EmitWordCheck(assembler, &before_word, &before_non_word, false);
2131 
2132   // We will be loading the previous character into the current character
2133   // register.
2134   Trace new_trace(*trace);
2135   new_trace.InvalidateCurrentCharacter();
2136 
2137   Label ok;
2138   Label* boundary;
2139   Label* not_boundary;
2140   if (type == AssertionNode::AT_BOUNDARY) {
2141     boundary = &ok;
2142     not_boundary = new_trace.backtrack();
2143   } else {
2144     not_boundary = &ok;
2145     boundary = new_trace.backtrack();
2146   }
2147 
2148   // Next character is not a word character.
2149   assembler->Bind(&before_non_word);
2150   if (new_trace.cp_offset() == 0) {
2151     // The start of input counts as a non-word character, so the question is
2152     // decided if we are at the start.
2153     assembler->CheckAtStart(not_boundary);
2154   }
2155   // We already checked that we are not at the start of input so it must be
2156   // OK to load the previous character.
2157   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2158                                   &ok,  // Unused dummy label in this call.
2159                                   false);
2160   // Fall through on non-word.
2161   EmitWordCheck(assembler, boundary, not_boundary, false);
2162   assembler->GoTo(not_boundary);
2163 
2164   // Next character is a word character.
2165   assembler->Bind(&before_word);
2166   if (new_trace.cp_offset() == 0) {
2167     // The start of input counts as a non-word character, so the question is
2168     // decided if we are at the start.
2169     assembler->CheckAtStart(boundary);
2170   }
2171   // We already checked that we are not at the start of input so it must be
2172   // OK to load the previous character.
2173   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2174                                   &ok,  // Unused dummy label in this call.
2175                                   false);
2176   bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2177   EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2178 
2179   assembler->Bind(&ok);
2180 
2181   on_success->Emit(compiler, &new_trace);
2182 }
2183 
2184 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2185 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2186                                          RegExpCompiler* compiler,
2187                                          int filled_in,
2188                                          bool not_at_start) {
2189   if (type_ == AT_START && not_at_start) {
2190     details->set_cannot_match();
2191     return;
2192   }
2193   return on_success()->GetQuickCheckDetails(details,
2194                                             compiler,
2195                                             filled_in,
2196                                             not_at_start);
2197 }
2198 
2199 
Emit(RegExpCompiler * compiler,Trace * trace)2200 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2201   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2202   switch (type_) {
2203     case AT_END: {
2204       Label ok;
2205       assembler->CheckPosition(trace->cp_offset(), &ok);
2206       assembler->GoTo(trace->backtrack());
2207       assembler->Bind(&ok);
2208       break;
2209     }
2210     case AT_START: {
2211       if (trace->at_start() == Trace::FALSE) {
2212         assembler->GoTo(trace->backtrack());
2213         return;
2214       }
2215       if (trace->at_start() == Trace::UNKNOWN) {
2216         assembler->CheckNotAtStart(trace->backtrack());
2217         Trace at_start_trace = *trace;
2218         at_start_trace.set_at_start(true);
2219         on_success()->Emit(compiler, &at_start_trace);
2220         return;
2221       }
2222     }
2223     break;
2224     case AFTER_NEWLINE:
2225       EmitHat(compiler, on_success(), trace);
2226       return;
2227     case AT_BOUNDARY:
2228     case AT_NON_BOUNDARY: {
2229       EmitBoundaryCheck(type_, compiler, on_success(), trace);
2230       return;
2231     }
2232     case AFTER_WORD_CHARACTER:
2233     case AFTER_NONWORD_CHARACTER: {
2234       EmitHalfBoundaryCheck(type_, compiler, on_success(), trace);
2235     }
2236   }
2237   on_success()->Emit(compiler, trace);
2238 }
2239 
2240 
DeterminedAlready(QuickCheckDetails * quick_check,int offset)2241 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2242   if (quick_check == NULL) return false;
2243   if (offset >= quick_check->characters()) return false;
2244   return quick_check->positions(offset)->determines_perfectly;
2245 }
2246 
2247 
UpdateBoundsCheck(int index,int * checked_up_to)2248 static void UpdateBoundsCheck(int index, int* checked_up_to) {
2249   if (index > *checked_up_to) {
2250     *checked_up_to = index;
2251   }
2252 }
2253 
2254 
2255 // We call this repeatedly to generate code for each pass over the text node.
2256 // The passes are in increasing order of difficulty because we hope one
2257 // of the first passes will fail in which case we are saved the work of the
2258 // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
2259 // we will check the '%' in the first pass, the case independent 'a' in the
2260 // second pass and the character class in the last pass.
2261 //
2262 // The passes are done from right to left, so for example to test for /bar/
2263 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
2264 // and then a 'b' with offset 0.  This means we can avoid the end-of-input
2265 // bounds check most of the time.  In the example we only need to check for
2266 // end-of-input when loading the putative 'r'.
2267 //
2268 // A slight complication involves the fact that the first character may already
2269 // be fetched into a register by the previous node.  In this case we want to
2270 // do the test for that character first.  We do this in separate passes.  The
2271 // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
2272 // pass has been performed then subsequent passes will have true in
2273 // first_element_checked to indicate that that character does not need to be
2274 // checked again.
2275 //
2276 // In addition to all this we are passed a Trace, which can
2277 // contain an AlternativeGeneration object.  In this AlternativeGeneration
2278 // object we can see details of any quick check that was already passed in
2279 // order to get to the code we are now generating.  The quick check can involve
2280 // loading characters, which means we do not need to recheck the bounds
2281 // up to the limit the quick check already checked.  In addition the quick
2282 // check can have involved a mask and compare operation which may simplify
2283 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)2284 void TextNode::TextEmitPass(RegExpCompiler* compiler,
2285                             TextEmitPassType pass,
2286                             bool preloaded,
2287                             Trace* trace,
2288                             bool first_element_checked,
2289                             int* checked_up_to) {
2290   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2291   bool ascii = compiler->ascii();
2292   Label* backtrack = trace->backtrack();
2293   QuickCheckDetails* quick_check = trace->quick_check_performed();
2294   int element_count = elms_->length();
2295   for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2296     TextElement elm = elms_->at(i);
2297     int cp_offset = trace->cp_offset() + elm.cp_offset;
2298     if (elm.type == TextElement::ATOM) {
2299       Vector<const uc16> quarks = elm.data.u_atom->data();
2300       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2301         if (first_element_checked && i == 0 && j == 0) continue;
2302         if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
2303         EmitCharacterFunction* emit_function = NULL;
2304         switch (pass) {
2305           case NON_ASCII_MATCH:
2306             ASSERT(ascii);
2307             if (quarks[j] > String::kMaxAsciiCharCode) {
2308               assembler->GoTo(backtrack);
2309               return;
2310             }
2311             break;
2312           case NON_LETTER_CHARACTER_MATCH:
2313             emit_function = &EmitAtomNonLetter;
2314             break;
2315           case SIMPLE_CHARACTER_MATCH:
2316             emit_function = &EmitSimpleCharacter;
2317             break;
2318           case CASE_CHARACTER_MATCH:
2319             emit_function = &EmitAtomLetter;
2320             break;
2321           default:
2322             break;
2323         }
2324         if (emit_function != NULL) {
2325           bool bound_checked = emit_function(compiler,
2326                                              quarks[j],
2327                                              backtrack,
2328                                              cp_offset + j,
2329                                              *checked_up_to < cp_offset + j,
2330                                              preloaded);
2331           if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2332         }
2333       }
2334     } else {
2335       ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
2336       if (pass == CHARACTER_CLASS_MATCH) {
2337         if (first_element_checked && i == 0) continue;
2338         if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
2339         RegExpCharacterClass* cc = elm.data.u_char_class;
2340         EmitCharClass(assembler,
2341                       cc,
2342                       ascii,
2343                       backtrack,
2344                       cp_offset,
2345                       *checked_up_to < cp_offset,
2346                       preloaded);
2347         UpdateBoundsCheck(cp_offset, checked_up_to);
2348       }
2349     }
2350   }
2351 }
2352 
2353 
Length()2354 int TextNode::Length() {
2355   TextElement elm = elms_->last();
2356   ASSERT(elm.cp_offset >= 0);
2357   if (elm.type == TextElement::ATOM) {
2358     return elm.cp_offset + elm.data.u_atom->data().length();
2359   } else {
2360     return elm.cp_offset + 1;
2361   }
2362 }
2363 
2364 
SkipPass(int int_pass,bool ignore_case)2365 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
2366   TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
2367   if (ignore_case) {
2368     return pass == SIMPLE_CHARACTER_MATCH;
2369   } else {
2370     return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2371   }
2372 }
2373 
2374 
2375 // This generates the code to match a text node.  A text node can contain
2376 // straight character sequences (possibly to be matched in a case-independent
2377 // way) and character classes.  For efficiency we do not do this in a single
2378 // pass from left to right.  Instead we pass over the text node several times,
2379 // emitting code for some character positions every time.  See the comment on
2380 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)2381 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2382   LimitResult limit_result = LimitVersions(compiler, trace);
2383   if (limit_result == DONE) return;
2384   ASSERT(limit_result == CONTINUE);
2385 
2386   if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2387     compiler->SetRegExpTooBig();
2388     return;
2389   }
2390 
2391   if (compiler->ascii()) {
2392     int dummy = 0;
2393     TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
2394   }
2395 
2396   bool first_elt_done = false;
2397   int bound_checked_to = trace->cp_offset() - 1;
2398   bound_checked_to += trace->bound_checked_up_to();
2399 
2400   // If a character is preloaded into the current character register then
2401   // check that now.
2402   if (trace->characters_preloaded() == 1) {
2403     for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2404       if (!SkipPass(pass, compiler->ignore_case())) {
2405         TextEmitPass(compiler,
2406                      static_cast<TextEmitPassType>(pass),
2407                      true,
2408                      trace,
2409                      false,
2410                      &bound_checked_to);
2411       }
2412     }
2413     first_elt_done = true;
2414   }
2415 
2416   for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2417     if (!SkipPass(pass, compiler->ignore_case())) {
2418       TextEmitPass(compiler,
2419                    static_cast<TextEmitPassType>(pass),
2420                    false,
2421                    trace,
2422                    first_elt_done,
2423                    &bound_checked_to);
2424     }
2425   }
2426 
2427   Trace successor_trace(*trace);
2428   successor_trace.set_at_start(false);
2429   successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
2430   RecursionCheck rc(compiler);
2431   on_success()->Emit(compiler, &successor_trace);
2432 }
2433 
2434 
InvalidateCurrentCharacter()2435 void Trace::InvalidateCurrentCharacter() {
2436   characters_preloaded_ = 0;
2437 }
2438 
2439 
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)2440 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
2441   ASSERT(by > 0);
2442   // We don't have an instruction for shifting the current character register
2443   // down or for using a shifted value for anything so lets just forget that
2444   // we preloaded any characters into it.
2445   characters_preloaded_ = 0;
2446   // Adjust the offsets of the quick check performed information.  This
2447   // information is used to find out what we already determined about the
2448   // characters by means of mask and compare.
2449   quick_check_performed_.Advance(by, compiler->ascii());
2450   cp_offset_ += by;
2451   if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2452     compiler->SetRegExpTooBig();
2453     cp_offset_ = 0;
2454   }
2455   bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
2456 }
2457 
2458 
MakeCaseIndependent(bool is_ascii)2459 void TextNode::MakeCaseIndependent(bool is_ascii) {
2460   int element_count = elms_->length();
2461   for (int i = 0; i < element_count; i++) {
2462     TextElement elm = elms_->at(i);
2463     if (elm.type == TextElement::CHAR_CLASS) {
2464       RegExpCharacterClass* cc = elm.data.u_char_class;
2465       // None of the standard character classses is different in the case
2466       // independent case and it slows us down if we don't know that.
2467       if (cc->is_standard()) continue;
2468       ZoneList<CharacterRange>* ranges = cc->ranges();
2469       int range_count = ranges->length();
2470       for (int j = 0; j < range_count; j++) {
2471         ranges->at(j).AddCaseEquivalents(ranges, is_ascii);
2472       }
2473     }
2474   }
2475 }
2476 
2477 
GreedyLoopTextLength()2478 int TextNode::GreedyLoopTextLength() {
2479   TextElement elm = elms_->at(elms_->length() - 1);
2480   if (elm.type == TextElement::CHAR_CLASS) {
2481     return elm.cp_offset + 1;
2482   } else {
2483     return elm.cp_offset + elm.data.u_atom->data().length();
2484   }
2485 }
2486 
2487 
2488 // Finds the fixed match length of a sequence of nodes that goes from
2489 // this alternative and back to this choice node.  If there are variable
2490 // length nodes or other complications in the way then return a sentinel
2491 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLength(GuardedAlternative * alternative)2492 int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2493   int length = 0;
2494   RegExpNode* node = alternative->node();
2495   // Later we will generate code for all these text nodes using recursion
2496   // so we have to limit the max number.
2497   int recursion_depth = 0;
2498   while (node != this) {
2499     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2500       return kNodeIsTooComplexForGreedyLoops;
2501     }
2502     int node_length = node->GreedyLoopTextLength();
2503     if (node_length == kNodeIsTooComplexForGreedyLoops) {
2504       return kNodeIsTooComplexForGreedyLoops;
2505     }
2506     length += node_length;
2507     SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2508     node = seq_node->on_success();
2509   }
2510   return length;
2511 }
2512 
2513 
AddLoopAlternative(GuardedAlternative alt)2514 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2515   ASSERT_EQ(loop_node_, NULL);
2516   AddAlternative(alt);
2517   loop_node_ = alt.node();
2518 }
2519 
2520 
AddContinueAlternative(GuardedAlternative alt)2521 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2522   ASSERT_EQ(continue_node_, NULL);
2523   AddAlternative(alt);
2524   continue_node_ = alt.node();
2525 }
2526 
2527 
Emit(RegExpCompiler * compiler,Trace * trace)2528 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2529   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2530   if (trace->stop_node() == this) {
2531     int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2532     ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2533     // Update the counter-based backtracking info on the stack.  This is an
2534     // optimization for greedy loops (see below).
2535     ASSERT(trace->cp_offset() == text_length);
2536     macro_assembler->AdvanceCurrentPosition(text_length);
2537     macro_assembler->GoTo(trace->loop_label());
2538     return;
2539   }
2540   ASSERT(trace->stop_node() == NULL);
2541   if (!trace->is_trivial()) {
2542     trace->Flush(compiler, this);
2543     return;
2544   }
2545   ChoiceNode::Emit(compiler, trace);
2546 }
2547 
2548 
CalculatePreloadCharacters(RegExpCompiler * compiler)2549 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
2550   int preload_characters = EatsAtLeast(4, 0);
2551   if (compiler->macro_assembler()->CanReadUnaligned()) {
2552     bool ascii = compiler->ascii();
2553     if (ascii) {
2554       if (preload_characters > 4) preload_characters = 4;
2555       // We can't preload 3 characters because there is no machine instruction
2556       // to do that.  We can't just load 4 because we could be reading
2557       // beyond the end of the string, which could cause a memory fault.
2558       if (preload_characters == 3) preload_characters = 2;
2559     } else {
2560       if (preload_characters > 2) preload_characters = 2;
2561     }
2562   } else {
2563     if (preload_characters > 1) preload_characters = 1;
2564   }
2565   return preload_characters;
2566 }
2567 
2568 
2569 // This class is used when generating the alternatives in a choice node.  It
2570 // records the way the alternative is being code generated.
2571 class AlternativeGeneration: public Malloced {
2572  public:
AlternativeGeneration()2573   AlternativeGeneration()
2574       : possible_success(),
2575         expects_preload(false),
2576         after(),
2577         quick_check_details() { }
2578   Label possible_success;
2579   bool expects_preload;
2580   Label after;
2581   QuickCheckDetails quick_check_details;
2582 };
2583 
2584 
2585 // Creates a list of AlternativeGenerations.  If the list has a reasonable
2586 // size then it is on the stack, otherwise the excess is on the heap.
2587 class AlternativeGenerationList {
2588  public:
AlternativeGenerationList(int count)2589   explicit AlternativeGenerationList(int count)
2590       : alt_gens_(count) {
2591     for (int i = 0; i < count && i < kAFew; i++) {
2592       alt_gens_.Add(a_few_alt_gens_ + i);
2593     }
2594     for (int i = kAFew; i < count; i++) {
2595       alt_gens_.Add(new AlternativeGeneration());
2596     }
2597   }
~AlternativeGenerationList()2598   ~AlternativeGenerationList() {
2599     for (int i = kAFew; i < alt_gens_.length(); i++) {
2600       delete alt_gens_[i];
2601       alt_gens_[i] = NULL;
2602     }
2603   }
2604 
at(int i)2605   AlternativeGeneration* at(int i) {
2606     return alt_gens_[i];
2607   }
2608  private:
2609   static const int kAFew = 10;
2610   ZoneList<AlternativeGeneration*> alt_gens_;
2611   AlternativeGeneration a_few_alt_gens_[kAFew];
2612 };
2613 
2614 
2615 /* Code generation for choice nodes.
2616  *
2617  * We generate quick checks that do a mask and compare to eliminate a
2618  * choice.  If the quick check succeeds then it jumps to the continuation to
2619  * do slow checks and check subsequent nodes.  If it fails (the common case)
2620  * it falls through to the next choice.
2621  *
2622  * Here is the desired flow graph.  Nodes directly below each other imply
2623  * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
2624  * 3 doesn't have a quick check so we have to call the slow check.
2625  * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
2626  * regexp continuation is generated directly after the Sn node, up to the
2627  * next GoTo if we decide to reuse some already generated code.  Some
2628  * nodes expect preload_characters to be preloaded into the current
2629  * character register.  R nodes do this preloading.  Vertices are marked
2630  * F for failures and S for success (possible success in the case of quick
2631  * nodes).  L, V, < and > are used as arrow heads.
2632  *
2633  * ----------> R
2634  *             |
2635  *             V
2636  *            Q1 -----> S1
2637  *             |   S   /
2638  *            F|      /
2639  *             |    F/
2640  *             |    /
2641  *             |   R
2642  *             |  /
2643  *             V L
2644  *            Q2 -----> S2
2645  *             |   S   /
2646  *            F|      /
2647  *             |    F/
2648  *             |    /
2649  *             |   R
2650  *             |  /
2651  *             V L
2652  *            S3
2653  *             |
2654  *            F|
2655  *             |
2656  *             R
2657  *             |
2658  * backtrack   V
2659  * <----------Q4
2660  *   \    F    |
2661  *    \        |S
2662  *     \   F   V
2663  *      \-----S4
2664  *
2665  * For greedy loops we reverse our expectation and expect to match rather
2666  * than fail. Therefore we want the loop code to look like this (U is the
2667  * unwind code that steps back in the greedy loop).  The following alternatives
2668  * look the same as above.
2669  *              _____
2670  *             /     \
2671  *             V     |
2672  * ----------> S1    |
2673  *            /|     |
2674  *           / |S    |
2675  *         F/  \_____/
2676  *         /
2677  *        |<-----------
2678  *        |            \
2679  *        V             \
2680  *        Q2 ---> S2     \
2681  *        |  S   /       |
2682  *       F|     /        |
2683  *        |   F/         |
2684  *        |   /          |
2685  *        |  R           |
2686  *        | /            |
2687  *   F    VL             |
2688  * <------U              |
2689  * back   |S             |
2690  *        \______________/
2691  */
2692 
2693 
Emit(RegExpCompiler * compiler,Trace * trace)2694 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2695   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2696   int choice_count = alternatives_->length();
2697 #ifdef DEBUG
2698   for (int i = 0; i < choice_count - 1; i++) {
2699     GuardedAlternative alternative = alternatives_->at(i);
2700     ZoneList<Guard*>* guards = alternative.guards();
2701     int guard_count = (guards == NULL) ? 0 : guards->length();
2702     for (int j = 0; j < guard_count; j++) {
2703       ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
2704     }
2705   }
2706 #endif
2707 
2708   LimitResult limit_result = LimitVersions(compiler, trace);
2709   if (limit_result == DONE) return;
2710   ASSERT(limit_result == CONTINUE);
2711 
2712   int new_flush_budget = trace->flush_budget() / choice_count;
2713   if (trace->flush_budget() == 0 && trace->actions() != NULL) {
2714     trace->Flush(compiler, this);
2715     return;
2716   }
2717 
2718   RecursionCheck rc(compiler);
2719 
2720   Trace* current_trace = trace;
2721 
2722   int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2723   bool greedy_loop = false;
2724   Label greedy_loop_label;
2725   Trace counter_backtrack_trace;
2726   counter_backtrack_trace.set_backtrack(&greedy_loop_label);
2727   if (not_at_start()) counter_backtrack_trace.set_at_start(false);
2728 
2729   if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
2730     // Here we have special handling for greedy loops containing only text nodes
2731     // and other simple nodes.  These are handled by pushing the current
2732     // position on the stack and then incrementing the current position each
2733     // time around the switch.  On backtrack we decrement the current position
2734     // and check it against the pushed value.  This avoids pushing backtrack
2735     // information for each iteration of the loop, which could take up a lot of
2736     // space.
2737     greedy_loop = true;
2738     ASSERT(trace->stop_node() == NULL);
2739     macro_assembler->PushCurrentPosition();
2740     current_trace = &counter_backtrack_trace;
2741     Label greedy_match_failed;
2742     Trace greedy_match_trace;
2743     if (not_at_start()) greedy_match_trace.set_at_start(false);
2744     greedy_match_trace.set_backtrack(&greedy_match_failed);
2745     Label loop_label;
2746     macro_assembler->Bind(&loop_label);
2747     greedy_match_trace.set_stop_node(this);
2748     greedy_match_trace.set_loop_label(&loop_label);
2749     alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
2750     macro_assembler->Bind(&greedy_match_failed);
2751   }
2752 
2753   Label second_choice;  // For use in greedy matches.
2754   macro_assembler->Bind(&second_choice);
2755 
2756   int first_normal_choice = greedy_loop ? 1 : 0;
2757 
2758   int preload_characters = CalculatePreloadCharacters(compiler);
2759   bool preload_is_current =
2760       (current_trace->characters_preloaded() == preload_characters);
2761   bool preload_has_checked_bounds = preload_is_current;
2762 
2763   AlternativeGenerationList alt_gens(choice_count);
2764 
2765   // For now we just call all choices one after the other.  The idea ultimately
2766   // is to use the Dispatch table to try only the relevant ones.
2767   for (int i = first_normal_choice; i < choice_count; i++) {
2768     GuardedAlternative alternative = alternatives_->at(i);
2769     AlternativeGeneration* alt_gen = alt_gens.at(i);
2770     alt_gen->quick_check_details.set_characters(preload_characters);
2771     ZoneList<Guard*>* guards = alternative.guards();
2772     int guard_count = (guards == NULL) ? 0 : guards->length();
2773     Trace new_trace(*current_trace);
2774     new_trace.set_characters_preloaded(preload_is_current ?
2775                                          preload_characters :
2776                                          0);
2777     if (preload_has_checked_bounds) {
2778       new_trace.set_bound_checked_up_to(preload_characters);
2779     }
2780     new_trace.quick_check_performed()->Clear();
2781     if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
2782     alt_gen->expects_preload = preload_is_current;
2783     bool generate_full_check_inline = false;
2784     if (FLAG_regexp_optimization &&
2785         try_to_emit_quick_check_for_alternative(i) &&
2786         alternative.node()->EmitQuickCheck(compiler,
2787                                            &new_trace,
2788                                            preload_has_checked_bounds,
2789                                            &alt_gen->possible_success,
2790                                            &alt_gen->quick_check_details,
2791                                            i < choice_count - 1)) {
2792       // Quick check was generated for this choice.
2793       preload_is_current = true;
2794       preload_has_checked_bounds = true;
2795       // On the last choice in the ChoiceNode we generated the quick
2796       // check to fall through on possible success.  So now we need to
2797       // generate the full check inline.
2798       if (i == choice_count - 1) {
2799         macro_assembler->Bind(&alt_gen->possible_success);
2800         new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2801         new_trace.set_characters_preloaded(preload_characters);
2802         new_trace.set_bound_checked_up_to(preload_characters);
2803         generate_full_check_inline = true;
2804       }
2805     } else if (alt_gen->quick_check_details.cannot_match()) {
2806       if (i == choice_count - 1 && !greedy_loop) {
2807         macro_assembler->GoTo(trace->backtrack());
2808       }
2809       continue;
2810     } else {
2811       // No quick check was generated.  Put the full code here.
2812       // If this is not the first choice then there could be slow checks from
2813       // previous cases that go here when they fail.  There's no reason to
2814       // insist that they preload characters since the slow check we are about
2815       // to generate probably can't use it.
2816       if (i != first_normal_choice) {
2817         alt_gen->expects_preload = false;
2818         new_trace.InvalidateCurrentCharacter();
2819       }
2820       if (i < choice_count - 1) {
2821         new_trace.set_backtrack(&alt_gen->after);
2822       }
2823       generate_full_check_inline = true;
2824     }
2825     if (generate_full_check_inline) {
2826       if (new_trace.actions() != NULL) {
2827         new_trace.set_flush_budget(new_flush_budget);
2828       }
2829       for (int j = 0; j < guard_count; j++) {
2830         GenerateGuard(macro_assembler, guards->at(j), &new_trace);
2831       }
2832       alternative.node()->Emit(compiler, &new_trace);
2833       preload_is_current = false;
2834     }
2835     macro_assembler->Bind(&alt_gen->after);
2836   }
2837   if (greedy_loop) {
2838     macro_assembler->Bind(&greedy_loop_label);
2839     // If we have unwound to the bottom then backtrack.
2840     macro_assembler->CheckGreedyLoop(trace->backtrack());
2841     // Otherwise try the second priority at an earlier position.
2842     macro_assembler->AdvanceCurrentPosition(-text_length);
2843     macro_assembler->GoTo(&second_choice);
2844   }
2845 
2846   // At this point we need to generate slow checks for the alternatives where
2847   // the quick check was inlined.  We can recognize these because the associated
2848   // label was bound.
2849   for (int i = first_normal_choice; i < choice_count - 1; i++) {
2850     AlternativeGeneration* alt_gen = alt_gens.at(i);
2851     Trace new_trace(*current_trace);
2852     // If there are actions to be flushed we have to limit how many times
2853     // they are flushed.  Take the budget of the parent trace and distribute
2854     // it fairly amongst the children.
2855     if (new_trace.actions() != NULL) {
2856       new_trace.set_flush_budget(new_flush_budget);
2857     }
2858     EmitOutOfLineContinuation(compiler,
2859                               &new_trace,
2860                               alternatives_->at(i),
2861                               alt_gen,
2862                               preload_characters,
2863                               alt_gens.at(i + 1)->expects_preload);
2864   }
2865 }
2866 
2867 
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)2868 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
2869                                            Trace* trace,
2870                                            GuardedAlternative alternative,
2871                                            AlternativeGeneration* alt_gen,
2872                                            int preload_characters,
2873                                            bool next_expects_preload) {
2874   if (!alt_gen->possible_success.is_linked()) return;
2875 
2876   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2877   macro_assembler->Bind(&alt_gen->possible_success);
2878   Trace out_of_line_trace(*trace);
2879   out_of_line_trace.set_characters_preloaded(preload_characters);
2880   out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2881   if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
2882   ZoneList<Guard*>* guards = alternative.guards();
2883   int guard_count = (guards == NULL) ? 0 : guards->length();
2884   if (next_expects_preload) {
2885     Label reload_current_char;
2886     out_of_line_trace.set_backtrack(&reload_current_char);
2887     for (int j = 0; j < guard_count; j++) {
2888       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
2889     }
2890     alternative.node()->Emit(compiler, &out_of_line_trace);
2891     macro_assembler->Bind(&reload_current_char);
2892     // Reload the current character, since the next quick check expects that.
2893     // We don't need to check bounds here because we only get into this
2894     // code through a quick check which already did the checked load.
2895     macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
2896                                           NULL,
2897                                           false,
2898                                           preload_characters);
2899     macro_assembler->GoTo(&(alt_gen->after));
2900   } else {
2901     out_of_line_trace.set_backtrack(&(alt_gen->after));
2902     for (int j = 0; j < guard_count; j++) {
2903       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
2904     }
2905     alternative.node()->Emit(compiler, &out_of_line_trace);
2906   }
2907 }
2908 
2909 
Emit(RegExpCompiler * compiler,Trace * trace)2910 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2911   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2912   LimitResult limit_result = LimitVersions(compiler, trace);
2913   if (limit_result == DONE) return;
2914   ASSERT(limit_result == CONTINUE);
2915 
2916   RecursionCheck rc(compiler);
2917 
2918   switch (type_) {
2919     case STORE_POSITION: {
2920       Trace::DeferredCapture
2921           new_capture(data_.u_position_register.reg,
2922                       data_.u_position_register.is_capture,
2923                       trace);
2924       Trace new_trace = *trace;
2925       new_trace.add_action(&new_capture);
2926       on_success()->Emit(compiler, &new_trace);
2927       break;
2928     }
2929     case INCREMENT_REGISTER: {
2930       Trace::DeferredIncrementRegister
2931           new_increment(data_.u_increment_register.reg);
2932       Trace new_trace = *trace;
2933       new_trace.add_action(&new_increment);
2934       on_success()->Emit(compiler, &new_trace);
2935       break;
2936     }
2937     case SET_REGISTER: {
2938       Trace::DeferredSetRegister
2939           new_set(data_.u_store_register.reg, data_.u_store_register.value);
2940       Trace new_trace = *trace;
2941       new_trace.add_action(&new_set);
2942       on_success()->Emit(compiler, &new_trace);
2943       break;
2944     }
2945     case CLEAR_CAPTURES: {
2946       Trace::DeferredClearCaptures
2947         new_capture(Interval(data_.u_clear_captures.range_from,
2948                              data_.u_clear_captures.range_to));
2949       Trace new_trace = *trace;
2950       new_trace.add_action(&new_capture);
2951       on_success()->Emit(compiler, &new_trace);
2952       break;
2953     }
2954     case BEGIN_SUBMATCH:
2955       if (!trace->is_trivial()) {
2956         trace->Flush(compiler, this);
2957       } else {
2958         assembler->WriteCurrentPositionToRegister(
2959             data_.u_submatch.current_position_register, 0);
2960         assembler->WriteStackPointerToRegister(
2961             data_.u_submatch.stack_pointer_register);
2962         on_success()->Emit(compiler, trace);
2963       }
2964       break;
2965     case EMPTY_MATCH_CHECK: {
2966       int start_pos_reg = data_.u_empty_match_check.start_register;
2967       int stored_pos = 0;
2968       int rep_reg = data_.u_empty_match_check.repetition_register;
2969       bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
2970       bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
2971       if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
2972         // If we know we haven't advanced and there is no minimum we
2973         // can just backtrack immediately.
2974         assembler->GoTo(trace->backtrack());
2975       } else if (know_dist && stored_pos < trace->cp_offset()) {
2976         // If we know we've advanced we can generate the continuation
2977         // immediately.
2978         on_success()->Emit(compiler, trace);
2979       } else if (!trace->is_trivial()) {
2980         trace->Flush(compiler, this);
2981       } else {
2982         Label skip_empty_check;
2983         // If we have a minimum number of repetitions we check the current
2984         // number first and skip the empty check if it's not enough.
2985         if (has_minimum) {
2986           int limit = data_.u_empty_match_check.repetition_limit;
2987           assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
2988         }
2989         // If the match is empty we bail out, otherwise we fall through
2990         // to the on-success continuation.
2991         assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
2992                                    trace->backtrack());
2993         assembler->Bind(&skip_empty_check);
2994         on_success()->Emit(compiler, trace);
2995       }
2996       break;
2997     }
2998     case POSITIVE_SUBMATCH_SUCCESS: {
2999       if (!trace->is_trivial()) {
3000         trace->Flush(compiler, this);
3001         return;
3002       }
3003       assembler->ReadCurrentPositionFromRegister(
3004           data_.u_submatch.current_position_register);
3005       assembler->ReadStackPointerFromRegister(
3006           data_.u_submatch.stack_pointer_register);
3007       int clear_register_count = data_.u_submatch.clear_register_count;
3008       if (clear_register_count == 0) {
3009         on_success()->Emit(compiler, trace);
3010         return;
3011       }
3012       int clear_registers_from = data_.u_submatch.clear_register_from;
3013       Label clear_registers_backtrack;
3014       Trace new_trace = *trace;
3015       new_trace.set_backtrack(&clear_registers_backtrack);
3016       on_success()->Emit(compiler, &new_trace);
3017 
3018       assembler->Bind(&clear_registers_backtrack);
3019       int clear_registers_to = clear_registers_from + clear_register_count - 1;
3020       assembler->ClearRegisters(clear_registers_from, clear_registers_to);
3021 
3022       ASSERT(trace->backtrack() == NULL);
3023       assembler->Backtrack();
3024       return;
3025     }
3026     default:
3027       UNREACHABLE();
3028   }
3029 }
3030 
3031 
Emit(RegExpCompiler * compiler,Trace * trace)3032 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3033   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3034   if (!trace->is_trivial()) {
3035     trace->Flush(compiler, this);
3036     return;
3037   }
3038 
3039   LimitResult limit_result = LimitVersions(compiler, trace);
3040   if (limit_result == DONE) return;
3041   ASSERT(limit_result == CONTINUE);
3042 
3043   RecursionCheck rc(compiler);
3044 
3045   ASSERT_EQ(start_reg_ + 1, end_reg_);
3046   if (compiler->ignore_case()) {
3047     assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3048                                                trace->backtrack());
3049   } else {
3050     assembler->CheckNotBackReference(start_reg_, trace->backtrack());
3051   }
3052   on_success()->Emit(compiler, trace);
3053 }
3054 
3055 
3056 // -------------------------------------------------------------------
3057 // Dot/dotty output
3058 
3059 
3060 #ifdef DEBUG
3061 
3062 
3063 class DotPrinter: public NodeVisitor {
3064  public:
DotPrinter(bool ignore_case)3065   explicit DotPrinter(bool ignore_case)
3066       : ignore_case_(ignore_case),
3067         stream_(&alloc_) { }
3068   void PrintNode(const char* label, RegExpNode* node);
3069   void Visit(RegExpNode* node);
3070   void PrintAttributes(RegExpNode* from);
stream()3071   StringStream* stream() { return &stream_; }
3072   void PrintOnFailure(RegExpNode* from, RegExpNode* to);
3073 #define DECLARE_VISIT(Type)                                          \
3074   virtual void Visit##Type(Type##Node* that);
3075 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3076 #undef DECLARE_VISIT
3077  private:
3078   bool ignore_case_;
3079   HeapStringAllocator alloc_;
3080   StringStream stream_;
3081 };
3082 
3083 
PrintNode(const char * label,RegExpNode * node)3084 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3085   stream()->Add("digraph G {\n  graph [label=\"");
3086   for (int i = 0; label[i]; i++) {
3087     switch (label[i]) {
3088       case '\\':
3089         stream()->Add("\\\\");
3090         break;
3091       case '"':
3092         stream()->Add("\"");
3093         break;
3094       default:
3095         stream()->Put(label[i]);
3096         break;
3097     }
3098   }
3099   stream()->Add("\"];\n");
3100   Visit(node);
3101   stream()->Add("}\n");
3102   printf("%s", *(stream()->ToCString()));
3103 }
3104 
3105 
Visit(RegExpNode * node)3106 void DotPrinter::Visit(RegExpNode* node) {
3107   if (node->info()->visited) return;
3108   node->info()->visited = true;
3109   node->Accept(this);
3110 }
3111 
3112 
PrintOnFailure(RegExpNode * from,RegExpNode * on_failure)3113 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
3114   stream()->Add("  n%p -> n%p [style=dotted];\n", from, on_failure);
3115   Visit(on_failure);
3116 }
3117 
3118 
3119 class TableEntryBodyPrinter {
3120  public:
TableEntryBodyPrinter(StringStream * stream,ChoiceNode * choice)3121   TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3122       : stream_(stream), choice_(choice) { }
Call(uc16 from,DispatchTable::Entry entry)3123   void Call(uc16 from, DispatchTable::Entry entry) {
3124     OutSet* out_set = entry.out_set();
3125     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3126       if (out_set->Get(i)) {
3127         stream()->Add("    n%p:s%io%i -> n%p;\n",
3128                       choice(),
3129                       from,
3130                       i,
3131                       choice()->alternatives()->at(i).node());
3132       }
3133     }
3134   }
3135  private:
stream()3136   StringStream* stream() { return stream_; }
choice()3137   ChoiceNode* choice() { return choice_; }
3138   StringStream* stream_;
3139   ChoiceNode* choice_;
3140 };
3141 
3142 
3143 class TableEntryHeaderPrinter {
3144  public:
TableEntryHeaderPrinter(StringStream * stream)3145   explicit TableEntryHeaderPrinter(StringStream* stream)
3146       : first_(true), stream_(stream) { }
Call(uc16 from,DispatchTable::Entry entry)3147   void Call(uc16 from, DispatchTable::Entry entry) {
3148     if (first_) {
3149       first_ = false;
3150     } else {
3151       stream()->Add("|");
3152     }
3153     stream()->Add("{\\%k-\\%k|{", from, entry.to());
3154     OutSet* out_set = entry.out_set();
3155     int priority = 0;
3156     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3157       if (out_set->Get(i)) {
3158         if (priority > 0) stream()->Add("|");
3159         stream()->Add("<s%io%i> %i", from, i, priority);
3160         priority++;
3161       }
3162     }
3163     stream()->Add("}}");
3164   }
3165  private:
3166   bool first_;
stream()3167   StringStream* stream() { return stream_; }
3168   StringStream* stream_;
3169 };
3170 
3171 
3172 class AttributePrinter {
3173  public:
AttributePrinter(DotPrinter * out)3174   explicit AttributePrinter(DotPrinter* out)
3175       : out_(out), first_(true) { }
PrintSeparator()3176   void PrintSeparator() {
3177     if (first_) {
3178       first_ = false;
3179     } else {
3180       out_->stream()->Add("|");
3181     }
3182   }
PrintBit(const char * name,bool value)3183   void PrintBit(const char* name, bool value) {
3184     if (!value) return;
3185     PrintSeparator();
3186     out_->stream()->Add("{%s}", name);
3187   }
PrintPositive(const char * name,int value)3188   void PrintPositive(const char* name, int value) {
3189     if (value < 0) return;
3190     PrintSeparator();
3191     out_->stream()->Add("{%s|%x}", name, value);
3192   }
3193  private:
3194   DotPrinter* out_;
3195   bool first_;
3196 };
3197 
3198 
PrintAttributes(RegExpNode * that)3199 void DotPrinter::PrintAttributes(RegExpNode* that) {
3200   stream()->Add("  a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3201                 "margin=0.1, fontsize=10, label=\"{",
3202                 that);
3203   AttributePrinter printer(this);
3204   NodeInfo* info = that->info();
3205   printer.PrintBit("NI", info->follows_newline_interest);
3206   printer.PrintBit("WI", info->follows_word_interest);
3207   printer.PrintBit("SI", info->follows_start_interest);
3208   Label* label = that->label();
3209   if (label->is_bound())
3210     printer.PrintPositive("@", label->pos());
3211   stream()->Add("}\"];\n");
3212   stream()->Add("  a%p -> n%p [style=dashed, color=grey, "
3213                 "arrowhead=none];\n", that, that);
3214 }
3215 
3216 
3217 static const bool kPrintDispatchTable = false;
VisitChoice(ChoiceNode * that)3218 void DotPrinter::VisitChoice(ChoiceNode* that) {
3219   if (kPrintDispatchTable) {
3220     stream()->Add("  n%p [shape=Mrecord, label=\"", that);
3221     TableEntryHeaderPrinter header_printer(stream());
3222     that->GetTable(ignore_case_)->ForEach(&header_printer);
3223     stream()->Add("\"]\n", that);
3224     PrintAttributes(that);
3225     TableEntryBodyPrinter body_printer(stream(), that);
3226     that->GetTable(ignore_case_)->ForEach(&body_printer);
3227   } else {
3228     stream()->Add("  n%p [shape=Mrecord, label=\"?\"];\n", that);
3229     for (int i = 0; i < that->alternatives()->length(); i++) {
3230       GuardedAlternative alt = that->alternatives()->at(i);
3231       stream()->Add("  n%p -> n%p;\n", that, alt.node());
3232     }
3233   }
3234   for (int i = 0; i < that->alternatives()->length(); i++) {
3235     GuardedAlternative alt = that->alternatives()->at(i);
3236     alt.node()->Accept(this);
3237   }
3238 }
3239 
3240 
VisitText(TextNode * that)3241 void DotPrinter::VisitText(TextNode* that) {
3242   stream()->Add("  n%p [label=\"", that);
3243   for (int i = 0; i < that->elements()->length(); i++) {
3244     if (i > 0) stream()->Add(" ");
3245     TextElement elm = that->elements()->at(i);
3246     switch (elm.type) {
3247       case TextElement::ATOM: {
3248         stream()->Add("'%w'", elm.data.u_atom->data());
3249         break;
3250       }
3251       case TextElement::CHAR_CLASS: {
3252         RegExpCharacterClass* node = elm.data.u_char_class;
3253         stream()->Add("[");
3254         if (node->is_negated())
3255           stream()->Add("^");
3256         for (int j = 0; j < node->ranges()->length(); j++) {
3257           CharacterRange range = node->ranges()->at(j);
3258           stream()->Add("%k-%k", range.from(), range.to());
3259         }
3260         stream()->Add("]");
3261         break;
3262       }
3263       default:
3264         UNREACHABLE();
3265     }
3266   }
3267   stream()->Add("\", shape=box, peripheries=2];\n");
3268   PrintAttributes(that);
3269   stream()->Add("  n%p -> n%p;\n", that, that->on_success());
3270   Visit(that->on_success());
3271 }
3272 
3273 
VisitBackReference(BackReferenceNode * that)3274 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3275   stream()->Add("  n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3276                 that,
3277                 that->start_register(),
3278                 that->end_register());
3279   PrintAttributes(that);
3280   stream()->Add("  n%p -> n%p;\n", that, that->on_success());
3281   Visit(that->on_success());
3282 }
3283 
3284 
VisitEnd(EndNode * that)3285 void DotPrinter::VisitEnd(EndNode* that) {
3286   stream()->Add("  n%p [style=bold, shape=point];\n", that);
3287   PrintAttributes(that);
3288 }
3289 
3290 
VisitAssertion(AssertionNode * that)3291 void DotPrinter::VisitAssertion(AssertionNode* that) {
3292   stream()->Add("  n%p [", that);
3293   switch (that->type()) {
3294     case AssertionNode::AT_END:
3295       stream()->Add("label=\"$\", shape=septagon");
3296       break;
3297     case AssertionNode::AT_START:
3298       stream()->Add("label=\"^\", shape=septagon");
3299       break;
3300     case AssertionNode::AT_BOUNDARY:
3301       stream()->Add("label=\"\\b\", shape=septagon");
3302       break;
3303     case AssertionNode::AT_NON_BOUNDARY:
3304       stream()->Add("label=\"\\B\", shape=septagon");
3305       break;
3306     case AssertionNode::AFTER_NEWLINE:
3307       stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3308       break;
3309     case AssertionNode::AFTER_WORD_CHARACTER:
3310       stream()->Add("label=\"(?<=\\w)\", shape=septagon");
3311       break;
3312     case AssertionNode::AFTER_NONWORD_CHARACTER:
3313       stream()->Add("label=\"(?<=\\W)\", shape=septagon");
3314       break;
3315   }
3316   stream()->Add("];\n");
3317   PrintAttributes(that);
3318   RegExpNode* successor = that->on_success();
3319   stream()->Add("  n%p -> n%p;\n", that, successor);
3320   Visit(successor);
3321 }
3322 
3323 
VisitAction(ActionNode * that)3324 void DotPrinter::VisitAction(ActionNode* that) {
3325   stream()->Add("  n%p [", that);
3326   switch (that->type_) {
3327     case ActionNode::SET_REGISTER:
3328       stream()->Add("label=\"$%i:=%i\", shape=octagon",
3329                     that->data_.u_store_register.reg,
3330                     that->data_.u_store_register.value);
3331       break;
3332     case ActionNode::INCREMENT_REGISTER:
3333       stream()->Add("label=\"$%i++\", shape=octagon",
3334                     that->data_.u_increment_register.reg);
3335       break;
3336     case ActionNode::STORE_POSITION:
3337       stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3338                     that->data_.u_position_register.reg);
3339       break;
3340     case ActionNode::BEGIN_SUBMATCH:
3341       stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3342                     that->data_.u_submatch.current_position_register);
3343       break;
3344     case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3345       stream()->Add("label=\"escape\", shape=septagon");
3346       break;
3347     case ActionNode::EMPTY_MATCH_CHECK:
3348       stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3349                     that->data_.u_empty_match_check.start_register,
3350                     that->data_.u_empty_match_check.repetition_register,
3351                     that->data_.u_empty_match_check.repetition_limit);
3352       break;
3353     case ActionNode::CLEAR_CAPTURES: {
3354       stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3355                     that->data_.u_clear_captures.range_from,
3356                     that->data_.u_clear_captures.range_to);
3357       break;
3358     }
3359   }
3360   stream()->Add("];\n");
3361   PrintAttributes(that);
3362   RegExpNode* successor = that->on_success();
3363   stream()->Add("  n%p -> n%p;\n", that, successor);
3364   Visit(successor);
3365 }
3366 
3367 
3368 class DispatchTableDumper {
3369  public:
DispatchTableDumper(StringStream * stream)3370   explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3371   void Call(uc16 key, DispatchTable::Entry entry);
stream()3372   StringStream* stream() { return stream_; }
3373  private:
3374   StringStream* stream_;
3375 };
3376 
3377 
Call(uc16 key,DispatchTable::Entry entry)3378 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3379   stream()->Add("[%k-%k]: {", key, entry.to());
3380   OutSet* set = entry.out_set();
3381   bool first = true;
3382   for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3383     if (set->Get(i)) {
3384       if (first) {
3385         first = false;
3386       } else {
3387         stream()->Add(", ");
3388       }
3389       stream()->Add("%i", i);
3390     }
3391   }
3392   stream()->Add("}\n");
3393 }
3394 
3395 
Dump()3396 void DispatchTable::Dump() {
3397   HeapStringAllocator alloc;
3398   StringStream stream(&alloc);
3399   DispatchTableDumper dumper(&stream);
3400   tree()->ForEach(&dumper);
3401   OS::PrintError("%s", *stream.ToCString());
3402 }
3403 
3404 
DotPrint(const char * label,RegExpNode * node,bool ignore_case)3405 void RegExpEngine::DotPrint(const char* label,
3406                             RegExpNode* node,
3407                             bool ignore_case) {
3408   DotPrinter printer(ignore_case);
3409   printer.PrintNode(label, node);
3410 }
3411 
3412 
3413 #endif  // DEBUG
3414 
3415 
3416 // -------------------------------------------------------------------
3417 // Tree to graph conversion
3418 
3419 static const int kSpaceRangeCount = 20;
3420 static const int kSpaceRangeAsciiCount = 4;
3421 static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3422     0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3423     0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3424 
3425 static const int kWordRangeCount = 8;
3426 static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3427     '_', 'a', 'z' };
3428 
3429 static const int kDigitRangeCount = 2;
3430 static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3431 
3432 static const int kLineTerminatorRangeCount = 6;
3433 static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3434     0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
3435 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3436 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
3437                                RegExpNode* on_success) {
3438   ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3439   elms->Add(TextElement::Atom(this));
3440   return new TextNode(elms, on_success);
3441 }
3442 
3443 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3444 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
3445                                RegExpNode* on_success) {
3446   return new TextNode(elements(), on_success);
3447 }
3448 
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const uc16 * special_class,int length)3449 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3450                                  const uc16* special_class,
3451                                  int length) {
3452   ASSERT(ranges->length() != 0);
3453   ASSERT(length != 0);
3454   ASSERT(special_class[0] != 0);
3455   if (ranges->length() != (length >> 1) + 1) {
3456     return false;
3457   }
3458   CharacterRange range = ranges->at(0);
3459   if (range.from() != 0) {
3460     return false;
3461   }
3462   for (int i = 0; i < length; i += 2) {
3463     if (special_class[i] != (range.to() + 1)) {
3464       return false;
3465     }
3466     range = ranges->at((i >> 1) + 1);
3467     if (special_class[i+1] != range.from() - 1) {
3468       return false;
3469     }
3470   }
3471   if (range.to() != 0xffff) {
3472     return false;
3473   }
3474   return true;
3475 }
3476 
3477 
CompareRanges(ZoneList<CharacterRange> * ranges,const uc16 * special_class,int length)3478 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3479                           const uc16* special_class,
3480                           int length) {
3481   if (ranges->length() * 2 != length) {
3482     return false;
3483   }
3484   for (int i = 0; i < length; i += 2) {
3485     CharacterRange range = ranges->at(i >> 1);
3486     if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3487       return false;
3488     }
3489   }
3490   return true;
3491 }
3492 
3493 
is_standard()3494 bool RegExpCharacterClass::is_standard() {
3495   // TODO(lrn): Remove need for this function, by not throwing away information
3496   // along the way.
3497   if (is_negated_) {
3498     return false;
3499   }
3500   if (set_.is_standard()) {
3501     return true;
3502   }
3503   if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3504     set_.set_standard_set_type('s');
3505     return true;
3506   }
3507   if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3508     set_.set_standard_set_type('S');
3509     return true;
3510   }
3511   if (CompareInverseRanges(set_.ranges(),
3512                            kLineTerminatorRanges,
3513                            kLineTerminatorRangeCount)) {
3514     set_.set_standard_set_type('.');
3515     return true;
3516   }
3517   if (CompareRanges(set_.ranges(),
3518                     kLineTerminatorRanges,
3519                     kLineTerminatorRangeCount)) {
3520     set_.set_standard_set_type('n');
3521     return true;
3522   }
3523   if (CompareRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3524     set_.set_standard_set_type('w');
3525     return true;
3526   }
3527   if (CompareInverseRanges(set_.ranges(), kWordRanges, kWordRangeCount)) {
3528     set_.set_standard_set_type('W');
3529     return true;
3530   }
3531   return false;
3532 }
3533 
3534 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3535 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
3536                                          RegExpNode* on_success) {
3537   return new TextNode(this, on_success);
3538 }
3539 
3540 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3541 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
3542                                       RegExpNode* on_success) {
3543   ZoneList<RegExpTree*>* alternatives = this->alternatives();
3544   int length = alternatives->length();
3545   ChoiceNode* result = new ChoiceNode(length);
3546   for (int i = 0; i < length; i++) {
3547     GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
3548                                                                on_success));
3549     result->AddAlternative(alternative);
3550   }
3551   return result;
3552 }
3553 
3554 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3555 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
3556                                      RegExpNode* on_success) {
3557   return ToNode(min(),
3558                 max(),
3559                 is_greedy(),
3560                 body(),
3561                 compiler,
3562                 on_success);
3563 }
3564 
3565 
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)3566 RegExpNode* RegExpQuantifier::ToNode(int min,
3567                                      int max,
3568                                      bool is_greedy,
3569                                      RegExpTree* body,
3570                                      RegExpCompiler* compiler,
3571                                      RegExpNode* on_success,
3572                                      bool not_at_start) {
3573   // x{f, t} becomes this:
3574   //
3575   //             (r++)<-.
3576   //               |     `
3577   //               |     (x)
3578   //               v     ^
3579   //      (r=0)-->(?)---/ [if r < t]
3580   //               |
3581   //   [if r >= f] \----> ...
3582   //
3583 
3584   // 15.10.2.5 RepeatMatcher algorithm.
3585   // The parser has already eliminated the case where max is 0.  In the case
3586   // where max_match is zero the parser has removed the quantifier if min was
3587   // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
3588 
3589   // If we know that we cannot match zero length then things are a little
3590   // simpler since we don't need to make the special zero length match check
3591   // from step 2.1.  If the min and max are small we can unroll a little in
3592   // this case.
3593   static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
3594   static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
3595   if (max == 0) return on_success;  // This can happen due to recursion.
3596   bool body_can_be_empty = (body->min_match() == 0);
3597   int body_start_reg = RegExpCompiler::kNoRegister;
3598   Interval capture_registers = body->CaptureRegisters();
3599   bool needs_capture_clearing = !capture_registers.is_empty();
3600   if (body_can_be_empty) {
3601     body_start_reg = compiler->AllocateRegister();
3602   } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
3603     // Only unroll if there are no captures and the body can't be
3604     // empty.
3605     if (min > 0 && min <= kMaxUnrolledMinMatches) {
3606       int new_max = (max == kInfinity) ? max : max - min;
3607       // Recurse once to get the loop or optional matches after the fixed ones.
3608       RegExpNode* answer = ToNode(
3609           0, new_max, is_greedy, body, compiler, on_success, true);
3610       // Unroll the forced matches from 0 to min.  This can cause chains of
3611       // TextNodes (which the parser does not generate).  These should be
3612       // combined if it turns out they hinder good code generation.
3613       for (int i = 0; i < min; i++) {
3614         answer = body->ToNode(compiler, answer);
3615       }
3616       return answer;
3617     }
3618     if (max <= kMaxUnrolledMaxMatches) {
3619       ASSERT(min == 0);
3620       // Unroll the optional matches up to max.
3621       RegExpNode* answer = on_success;
3622       for (int i = 0; i < max; i++) {
3623         ChoiceNode* alternation = new ChoiceNode(2);
3624         if (is_greedy) {
3625           alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3626                                                                       answer)));
3627           alternation->AddAlternative(GuardedAlternative(on_success));
3628         } else {
3629           alternation->AddAlternative(GuardedAlternative(on_success));
3630           alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3631                                                                       answer)));
3632         }
3633         answer = alternation;
3634         if (not_at_start) alternation->set_not_at_start();
3635       }
3636       return answer;
3637     }
3638   }
3639   bool has_min = min > 0;
3640   bool has_max = max < RegExpTree::kInfinity;
3641   bool needs_counter = has_min || has_max;
3642   int reg_ctr = needs_counter
3643       ? compiler->AllocateRegister()
3644       : RegExpCompiler::kNoRegister;
3645   LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
3646   if (not_at_start) center->set_not_at_start();
3647   RegExpNode* loop_return = needs_counter
3648       ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
3649       : static_cast<RegExpNode*>(center);
3650   if (body_can_be_empty) {
3651     // If the body can be empty we need to check if it was and then
3652     // backtrack.
3653     loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
3654                                               reg_ctr,
3655                                               min,
3656                                               loop_return);
3657   }
3658   RegExpNode* body_node = body->ToNode(compiler, loop_return);
3659   if (body_can_be_empty) {
3660     // If the body can be empty we need to store the start position
3661     // so we can bail out if it was empty.
3662     body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
3663   }
3664   if (needs_capture_clearing) {
3665     // Before entering the body of this loop we need to clear captures.
3666     body_node = ActionNode::ClearCaptures(capture_registers, body_node);
3667   }
3668   GuardedAlternative body_alt(body_node);
3669   if (has_max) {
3670     Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
3671     body_alt.AddGuard(body_guard);
3672   }
3673   GuardedAlternative rest_alt(on_success);
3674   if (has_min) {
3675     Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
3676     rest_alt.AddGuard(rest_guard);
3677   }
3678   if (is_greedy) {
3679     center->AddLoopAlternative(body_alt);
3680     center->AddContinueAlternative(rest_alt);
3681   } else {
3682     center->AddContinueAlternative(rest_alt);
3683     center->AddLoopAlternative(body_alt);
3684   }
3685   if (needs_counter) {
3686     return ActionNode::SetRegister(reg_ctr, 0, center);
3687   } else {
3688     return center;
3689   }
3690 }
3691 
3692 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3693 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
3694                                     RegExpNode* on_success) {
3695   NodeInfo info;
3696   switch (type()) {
3697     case START_OF_LINE:
3698       return AssertionNode::AfterNewline(on_success);
3699     case START_OF_INPUT:
3700       return AssertionNode::AtStart(on_success);
3701     case BOUNDARY:
3702       return AssertionNode::AtBoundary(on_success);
3703     case NON_BOUNDARY:
3704       return AssertionNode::AtNonBoundary(on_success);
3705     case END_OF_INPUT:
3706       return AssertionNode::AtEnd(on_success);
3707     case END_OF_LINE: {
3708       // Compile $ in multiline regexps as an alternation with a positive
3709       // lookahead in one side and an end-of-input on the other side.
3710       // We need two registers for the lookahead.
3711       int stack_pointer_register = compiler->AllocateRegister();
3712       int position_register = compiler->AllocateRegister();
3713       // The ChoiceNode to distinguish between a newline and end-of-input.
3714       ChoiceNode* result = new ChoiceNode(2);
3715       // Create a newline atom.
3716       ZoneList<CharacterRange>* newline_ranges =
3717           new ZoneList<CharacterRange>(3);
3718       CharacterRange::AddClassEscape('n', newline_ranges);
3719       RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
3720       TextNode* newline_matcher = new TextNode(
3721          newline_atom,
3722          ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3723                                              position_register,
3724                                              0,  // No captures inside.
3725                                              -1,  // Ignored if no captures.
3726                                              on_success));
3727       // Create an end-of-input matcher.
3728       RegExpNode* end_of_line = ActionNode::BeginSubmatch(
3729           stack_pointer_register,
3730           position_register,
3731           newline_matcher);
3732       // Add the two alternatives to the ChoiceNode.
3733       GuardedAlternative eol_alternative(end_of_line);
3734       result->AddAlternative(eol_alternative);
3735       GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
3736       result->AddAlternative(end_alternative);
3737       return result;
3738     }
3739     default:
3740       UNREACHABLE();
3741   }
3742   return on_success;
3743 }
3744 
3745 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3746 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
3747                                         RegExpNode* on_success) {
3748   return new BackReferenceNode(RegExpCapture::StartRegister(index()),
3749                                RegExpCapture::EndRegister(index()),
3750                                on_success);
3751 }
3752 
3753 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3754 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
3755                                 RegExpNode* on_success) {
3756   return on_success;
3757 }
3758 
3759 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3760 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
3761                                     RegExpNode* on_success) {
3762   int stack_pointer_register = compiler->AllocateRegister();
3763   int position_register = compiler->AllocateRegister();
3764 
3765   const int registers_per_capture = 2;
3766   const int register_of_first_capture = 2;
3767   int register_count = capture_count_ * registers_per_capture;
3768   int register_start =
3769     register_of_first_capture + capture_from_ * registers_per_capture;
3770 
3771   RegExpNode* success;
3772   if (is_positive()) {
3773     RegExpNode* node = ActionNode::BeginSubmatch(
3774         stack_pointer_register,
3775         position_register,
3776         body()->ToNode(
3777             compiler,
3778             ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3779                                                 position_register,
3780                                                 register_count,
3781                                                 register_start,
3782                                                 on_success)));
3783     return node;
3784   } else {
3785     // We use a ChoiceNode for a negative lookahead because it has most of
3786     // the characteristics we need.  It has the body of the lookahead as its
3787     // first alternative and the expression after the lookahead of the second
3788     // alternative.  If the first alternative succeeds then the
3789     // NegativeSubmatchSuccess will unwind the stack including everything the
3790     // choice node set up and backtrack.  If the first alternative fails then
3791     // the second alternative is tried, which is exactly the desired result
3792     // for a negative lookahead.  The NegativeLookaheadChoiceNode is a special
3793     // ChoiceNode that knows to ignore the first exit when calculating quick
3794     // checks.
3795     GuardedAlternative body_alt(
3796         body()->ToNode(
3797             compiler,
3798             success = new NegativeSubmatchSuccess(stack_pointer_register,
3799                                                   position_register,
3800                                                   register_count,
3801                                                   register_start)));
3802     ChoiceNode* choice_node =
3803         new NegativeLookaheadChoiceNode(body_alt,
3804                                         GuardedAlternative(on_success));
3805     return ActionNode::BeginSubmatch(stack_pointer_register,
3806                                      position_register,
3807                                      choice_node);
3808   }
3809 }
3810 
3811 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3812 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
3813                                   RegExpNode* on_success) {
3814   return ToNode(body(), index(), compiler, on_success);
3815 }
3816 
3817 
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)3818 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
3819                                   int index,
3820                                   RegExpCompiler* compiler,
3821                                   RegExpNode* on_success) {
3822   int start_reg = RegExpCapture::StartRegister(index);
3823   int end_reg = RegExpCapture::EndRegister(index);
3824   RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
3825   RegExpNode* body_node = body->ToNode(compiler, store_end);
3826   return ActionNode::StorePosition(start_reg, true, body_node);
3827 }
3828 
3829 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3830 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
3831                                       RegExpNode* on_success) {
3832   ZoneList<RegExpTree*>* children = nodes();
3833   RegExpNode* current = on_success;
3834   for (int i = children->length() - 1; i >= 0; i--) {
3835     current = children->at(i)->ToNode(compiler, current);
3836   }
3837   return current;
3838 }
3839 
3840 
AddClass(const uc16 * elmv,int elmc,ZoneList<CharacterRange> * ranges)3841 static void AddClass(const uc16* elmv,
3842                      int elmc,
3843                      ZoneList<CharacterRange>* ranges) {
3844   for (int i = 0; i < elmc; i += 2) {
3845     ASSERT(elmv[i] <= elmv[i + 1]);
3846     ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
3847   }
3848 }
3849 
3850 
AddClassNegated(const uc16 * elmv,int elmc,ZoneList<CharacterRange> * ranges)3851 static void AddClassNegated(const uc16 *elmv,
3852                             int elmc,
3853                             ZoneList<CharacterRange>* ranges) {
3854   ASSERT(elmv[0] != 0x0000);
3855   ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
3856   uc16 last = 0x0000;
3857   for (int i = 0; i < elmc; i += 2) {
3858     ASSERT(last <= elmv[i] - 1);
3859     ASSERT(elmv[i] <= elmv[i + 1]);
3860     ranges->Add(CharacterRange(last, elmv[i] - 1));
3861     last = elmv[i + 1] + 1;
3862   }
3863   ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
3864 }
3865 
3866 
AddClassEscape(uc16 type,ZoneList<CharacterRange> * ranges)3867 void CharacterRange::AddClassEscape(uc16 type,
3868                                     ZoneList<CharacterRange>* ranges) {
3869   switch (type) {
3870     case 's':
3871       AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
3872       break;
3873     case 'S':
3874       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
3875       break;
3876     case 'w':
3877       AddClass(kWordRanges, kWordRangeCount, ranges);
3878       break;
3879     case 'W':
3880       AddClassNegated(kWordRanges, kWordRangeCount, ranges);
3881       break;
3882     case 'd':
3883       AddClass(kDigitRanges, kDigitRangeCount, ranges);
3884       break;
3885     case 'D':
3886       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
3887       break;
3888     case '.':
3889       AddClassNegated(kLineTerminatorRanges,
3890                       kLineTerminatorRangeCount,
3891                       ranges);
3892       break;
3893     // This is not a character range as defined by the spec but a
3894     // convenient shorthand for a character class that matches any
3895     // character.
3896     case '*':
3897       ranges->Add(CharacterRange::Everything());
3898       break;
3899     // This is the set of characters matched by the $ and ^ symbols
3900     // in multiline mode.
3901     case 'n':
3902       AddClass(kLineTerminatorRanges,
3903                kLineTerminatorRangeCount,
3904                ranges);
3905       break;
3906     default:
3907       UNREACHABLE();
3908   }
3909 }
3910 
3911 
GetWordBounds()3912 Vector<const uc16> CharacterRange::GetWordBounds() {
3913   return Vector<const uc16>(kWordRanges, kWordRangeCount);
3914 }
3915 
3916 
3917 class CharacterRangeSplitter {
3918  public:
CharacterRangeSplitter(ZoneList<CharacterRange> ** included,ZoneList<CharacterRange> ** excluded)3919   CharacterRangeSplitter(ZoneList<CharacterRange>** included,
3920                           ZoneList<CharacterRange>** excluded)
3921       : included_(included),
3922         excluded_(excluded) { }
3923   void Call(uc16 from, DispatchTable::Entry entry);
3924 
3925   static const int kInBase = 0;
3926   static const int kInOverlay = 1;
3927 
3928  private:
3929   ZoneList<CharacterRange>** included_;
3930   ZoneList<CharacterRange>** excluded_;
3931 };
3932 
3933 
Call(uc16 from,DispatchTable::Entry entry)3934 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
3935   if (!entry.out_set()->Get(kInBase)) return;
3936   ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
3937     ? included_
3938     : excluded_;
3939   if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
3940   (*target)->Add(CharacterRange(entry.from(), entry.to()));
3941 }
3942 
3943 
Split(ZoneList<CharacterRange> * base,Vector<const uc16> overlay,ZoneList<CharacterRange> ** included,ZoneList<CharacterRange> ** excluded)3944 void CharacterRange::Split(ZoneList<CharacterRange>* base,
3945                            Vector<const uc16> overlay,
3946                            ZoneList<CharacterRange>** included,
3947                            ZoneList<CharacterRange>** excluded) {
3948   ASSERT_EQ(NULL, *included);
3949   ASSERT_EQ(NULL, *excluded);
3950   DispatchTable table;
3951   for (int i = 0; i < base->length(); i++)
3952     table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
3953   for (int i = 0; i < overlay.length(); i += 2) {
3954     table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
3955                    CharacterRangeSplitter::kInOverlay);
3956   }
3957   CharacterRangeSplitter callback(included, excluded);
3958   table.ForEach(&callback);
3959 }
3960 
3961 
3962 static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
3963                             int bottom,
3964                             int top);
3965 
3966 
AddCaseEquivalents(ZoneList<CharacterRange> * ranges,bool is_ascii)3967 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
3968                                         bool is_ascii) {
3969   uc16 bottom = from();
3970   uc16 top = to();
3971   if (is_ascii) {
3972     if (bottom > String::kMaxAsciiCharCode) return;
3973     if (top > String::kMaxAsciiCharCode) top = String::kMaxAsciiCharCode;
3974   }
3975   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3976   if (top == bottom) {
3977     // If this is a singleton we just expand the one character.
3978     int length = uncanonicalize.get(bottom, '\0', chars);
3979     for (int i = 0; i < length; i++) {
3980       uc32 chr = chars[i];
3981       if (chr != bottom) {
3982         ranges->Add(CharacterRange::Singleton(chars[i]));
3983       }
3984     }
3985   } else if (bottom <= kRangeCanonicalizeMax &&
3986              top <= kRangeCanonicalizeMax) {
3987     // If this is a range we expand the characters block by block,
3988     // expanding contiguous subranges (blocks) one at a time.
3989     // The approach is as follows.  For a given start character we
3990     // look up the block that contains it, for instance 'a' if the
3991     // start character is 'c'.  A block is characterized by the property
3992     // that all characters uncanonicalize in the same way as the first
3993     // element, except that each entry in the result is incremented
3994     // by the distance from the first element.  So a-z is a block
3995     // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
3996     // uncanonicalizes to ['a' + k, 'A' + k].
3997     // Once we've found the start point we look up its uncanonicalization
3998     // and produce a range for each element.  For instance for [c-f]
3999     // we look up ['a', 'A'] and produce [c-f] and [C-F].  We then only
4000     // add a range if it is not already contained in the input, so [c-f]
4001     // will be skipped but [C-F] will be added.  If this range is not
4002     // completely contained in a block we do this for all the blocks
4003     // covered by the range.
4004     unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4005     // First, look up the block that contains the 'bottom' character.
4006     int length = canonrange.get(bottom, '\0', range);
4007     if (length == 0) {
4008       range[0] = bottom;
4009     } else {
4010       ASSERT_EQ(1, length);
4011     }
4012     int pos = bottom;
4013     // The start of the current block.  Note that except for the first
4014     // iteration 'start' is always equal to 'pos'.
4015     int start;
4016     // If it is not the start point of a block the entry contains the
4017     // offset of the character from the start point.
4018     if ((range[0] & kStartMarker) == 0) {
4019       start = pos - range[0];
4020     } else {
4021       start = pos;
4022     }
4023     // Then we add the ranges one at a time, incrementing the current
4024     // position to be after the last block each time.  The position
4025     // always points to the start of a block.
4026     while (pos < top) {
4027       length = canonrange.get(start, '\0', range);
4028       if (length == 0) {
4029         range[0] = start;
4030       } else {
4031         ASSERT_EQ(1, length);
4032       }
4033       ASSERT((range[0] & kStartMarker) != 0);
4034       // The start point of a block contains the distance to the end
4035       // of the range.
4036       int block_end = start + (range[0] & kPayloadMask) - 1;
4037       int end = (block_end > top) ? top : block_end;
4038       length = uncanonicalize.get(start, '\0', range);
4039       for (int i = 0; i < length; i++) {
4040         uc32 c = range[i];
4041         uc16 range_from = c + (pos - start);
4042         uc16 range_to = c + (end - start);
4043         if (!(bottom <= range_from && range_to <= top)) {
4044           ranges->Add(CharacterRange(range_from, range_to));
4045         }
4046       }
4047       start = pos = block_end + 1;
4048     }
4049   } else {
4050     // Unibrow ranges don't work for high characters due to the "2^11 bug".
4051     // Therefore we do something dumber for these ranges.
4052     AddUncanonicals(ranges, bottom, top);
4053   }
4054 }
4055 
4056 
IsCanonical(ZoneList<CharacterRange> * ranges)4057 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
4058   ASSERT_NOT_NULL(ranges);
4059   int n = ranges->length();
4060   if (n <= 1) return true;
4061   int max = ranges->at(0).to();
4062   for (int i = 1; i < n; i++) {
4063     CharacterRange next_range = ranges->at(i);
4064     if (next_range.from() <= max + 1) return false;
4065     max = next_range.to();
4066   }
4067   return true;
4068 }
4069 
WordCharacterRelation(ZoneList<CharacterRange> * range)4070 SetRelation CharacterRange::WordCharacterRelation(
4071     ZoneList<CharacterRange>* range) {
4072   ASSERT(IsCanonical(range));
4073   int i = 0;  // Word character range index.
4074   int j = 0;  // Argument range index.
4075   ASSERT_NE(0, kWordRangeCount);
4076   SetRelation result;
4077   if (range->length() == 0) {
4078     result.SetElementsInSecondSet();
4079     return result;
4080   }
4081   CharacterRange argument_range = range->at(0);
4082   CharacterRange word_range = CharacterRange(kWordRanges[0], kWordRanges[1]);
4083   while (i < kWordRangeCount && j < range->length()) {
4084     // Check the two ranges for the five cases:
4085     // - no overlap.
4086     // - partial overlap (there are elements in both ranges that isn't
4087     //   in the other, and there are also elements that are in both).
4088     // - argument range entirely inside word range.
4089     // - word range entirely inside argument range.
4090     // - ranges are completely equal.
4091 
4092     // First check for no overlap. The earlier range is not in the other set.
4093     if (argument_range.from() > word_range.to()) {
4094       // Ranges are disjoint. The earlier word range contains elements that
4095       // cannot be in the argument set.
4096       result.SetElementsInSecondSet();
4097     } else if (word_range.from() > argument_range.to()) {
4098       // Ranges are disjoint. The earlier argument range contains elements that
4099       // cannot be in the word set.
4100       result.SetElementsInFirstSet();
4101     } else if (word_range.from() <= argument_range.from() &&
4102                word_range.to() >= argument_range.from()) {
4103       result.SetElementsInBothSets();
4104       // argument range completely inside word range.
4105       if (word_range.from() < argument_range.from() ||
4106           word_range.to() > argument_range.from()) {
4107         result.SetElementsInSecondSet();
4108       }
4109     } else if (word_range.from() >= argument_range.from() &&
4110                word_range.to() <= argument_range.from()) {
4111       result.SetElementsInBothSets();
4112       result.SetElementsInFirstSet();
4113     } else {
4114       // There is overlap, and neither is a subrange of the other
4115       result.SetElementsInFirstSet();
4116       result.SetElementsInSecondSet();
4117       result.SetElementsInBothSets();
4118     }
4119     if (result.NonTrivialIntersection()) {
4120       // The result is as (im)precise as we can possibly make it.
4121       return result;
4122     }
4123     // Progress the range(s) with minimal to-character.
4124     uc16 word_to = word_range.to();
4125     uc16 argument_to = argument_range.to();
4126     if (argument_to <= word_to) {
4127       j++;
4128       if (j < range->length()) {
4129         argument_range = range->at(j);
4130       }
4131     }
4132     if (word_to <= argument_to) {
4133       i += 2;
4134       if (i < kWordRangeCount) {
4135         word_range = CharacterRange(kWordRanges[i], kWordRanges[i + 1]);
4136       }
4137     }
4138   }
4139   // Check if anything wasn't compared in the loop.
4140   if (i < kWordRangeCount) {
4141     // word range contains something not in argument range.
4142     result.SetElementsInSecondSet();
4143   } else if (j < range->length()) {
4144     // Argument range contains something not in word range.
4145     result.SetElementsInFirstSet();
4146   }
4147 
4148   return result;
4149 }
4150 
4151 
AddUncanonicals(ZoneList<CharacterRange> * ranges,int bottom,int top)4152 static void AddUncanonicals(ZoneList<CharacterRange>* ranges,
4153                             int bottom,
4154                             int top) {
4155   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
4156   // Zones with no case mappings.  There is a DEBUG-mode loop to assert that
4157   // this table is correct.
4158   // 0x0600 - 0x0fff
4159   // 0x1100 - 0x1cff
4160   // 0x2000 - 0x20ff
4161   // 0x2200 - 0x23ff
4162   // 0x2500 - 0x2bff
4163   // 0x2e00 - 0xa5ff
4164   // 0xa800 - 0xfaff
4165   // 0xfc00 - 0xfeff
4166   const int boundary_count = 18;
4167   // The ASCII boundary and the kRangeCanonicalizeMax boundary are also in this
4168   // array.  This is to split up big ranges and not because they actually denote
4169   // a case-mapping-free-zone.
4170   ASSERT(CharacterRange::kRangeCanonicalizeMax < 0x600);
4171   const int kFirstRealCaselessZoneIndex = 2;
4172   int boundaries[] = {0x80, CharacterRange::kRangeCanonicalizeMax,
4173       0x600, 0x1000, 0x1100, 0x1d00, 0x2000, 0x2100, 0x2200, 0x2400, 0x2500,
4174       0x2c00, 0x2e00, 0xa600, 0xa800, 0xfb00, 0xfc00, 0xff00};
4175 
4176   // Special ASCII rule from spec can save us some work here.
4177   if (bottom == 0x80 && top == 0xffff) return;
4178 
4179   // We have optimized support for this range.
4180   if (top <= CharacterRange::kRangeCanonicalizeMax) {
4181     CharacterRange range(bottom, top);
4182     range.AddCaseEquivalents(ranges, false);
4183     return;
4184   }
4185 
4186   // Split up very large ranges.  This helps remove ranges where there are no
4187   // case mappings.
4188   for (int i = 0; i < boundary_count; i++) {
4189     if (bottom < boundaries[i] && top >= boundaries[i]) {
4190       AddUncanonicals(ranges, bottom, boundaries[i] - 1);
4191       AddUncanonicals(ranges, boundaries[i], top);
4192       return;
4193     }
4194   }
4195 
4196   // If we are completely in a zone with no case mappings then we are done.
4197   // We start at 2 so as not to except the ASCII range from mappings.
4198   for (int i = kFirstRealCaselessZoneIndex; i < boundary_count; i += 2) {
4199     if (bottom >= boundaries[i] && top < boundaries[i + 1]) {
4200 #ifdef DEBUG
4201       for (int j = bottom; j <= top; j++) {
4202         unsigned current_char = j;
4203         int length = uncanonicalize.get(current_char, '\0', chars);
4204         for (int k = 0; k < length; k++) {
4205           ASSERT(chars[k] == current_char);
4206         }
4207       }
4208 #endif
4209       return;
4210     }
4211   }
4212 
4213   // Step through the range finding equivalent characters.
4214   ZoneList<unibrow::uchar> *characters = new ZoneList<unibrow::uchar>(100);
4215   for (int i = bottom; i <= top; i++) {
4216     int length = uncanonicalize.get(i, '\0', chars);
4217     for (int j = 0; j < length; j++) {
4218       uc32 chr = chars[j];
4219       if (chr != i && (chr < bottom || chr > top)) {
4220         characters->Add(chr);
4221       }
4222     }
4223   }
4224 
4225   // Step through the equivalent characters finding simple ranges and
4226   // adding ranges to the character class.
4227   if (characters->length() > 0) {
4228     int new_from = characters->at(0);
4229     int new_to = new_from;
4230     for (int i = 1; i < characters->length(); i++) {
4231       int chr = characters->at(i);
4232       if (chr == new_to + 1) {
4233         new_to++;
4234       } else {
4235         if (new_to == new_from) {
4236           ranges->Add(CharacterRange::Singleton(new_from));
4237         } else {
4238           ranges->Add(CharacterRange(new_from, new_to));
4239         }
4240         new_from = new_to = chr;
4241       }
4242     }
4243     if (new_to == new_from) {
4244       ranges->Add(CharacterRange::Singleton(new_from));
4245     } else {
4246       ranges->Add(CharacterRange(new_from, new_to));
4247     }
4248   }
4249 }
4250 
4251 
ranges()4252 ZoneList<CharacterRange>* CharacterSet::ranges() {
4253   if (ranges_ == NULL) {
4254     ranges_ = new ZoneList<CharacterRange>(2);
4255     CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4256   }
4257   return ranges_;
4258 }
4259 
4260 
4261 // Move a number of elements in a zonelist to another position
4262 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)4263 static void MoveRanges(ZoneList<CharacterRange>* list,
4264                        int from,
4265                        int to,
4266                        int count) {
4267   // Ranges are potentially overlapping.
4268   if (from < to) {
4269     for (int i = count - 1; i >= 0; i--) {
4270       list->at(to + i) = list->at(from + i);
4271     }
4272   } else {
4273     for (int i = 0; i < count; i++) {
4274       list->at(to + i) = list->at(from + i);
4275     }
4276   }
4277 }
4278 
4279 
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)4280 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
4281                                       int count,
4282                                       CharacterRange insert) {
4283   // Inserts a range into list[0..count[, which must be sorted
4284   // by from value and non-overlapping and non-adjacent, using at most
4285   // list[0..count] for the result. Returns the number of resulting
4286   // canonicalized ranges. Inserting a range may collapse existing ranges into
4287   // fewer ranges, so the return value can be anything in the range 1..count+1.
4288   uc16 from = insert.from();
4289   uc16 to = insert.to();
4290   int start_pos = 0;
4291   int end_pos = count;
4292   for (int i = count - 1; i >= 0; i--) {
4293     CharacterRange current = list->at(i);
4294     if (current.from() > to + 1) {
4295       end_pos = i;
4296     } else if (current.to() + 1 < from) {
4297       start_pos = i + 1;
4298       break;
4299     }
4300   }
4301 
4302   // Inserted range overlaps, or is adjacent to, ranges at positions
4303   // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
4304   // not affected by the insertion.
4305   // If start_pos == end_pos, the range must be inserted before start_pos.
4306   // if start_pos < end_pos, the entire range from start_pos to end_pos
4307   // must be merged with the insert range.
4308 
4309   if (start_pos == end_pos) {
4310     // Insert between existing ranges at position start_pos.
4311     if (start_pos < count) {
4312       MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
4313     }
4314     list->at(start_pos) = insert;
4315     return count + 1;
4316   }
4317   if (start_pos + 1 == end_pos) {
4318     // Replace single existing range at position start_pos.
4319     CharacterRange to_replace = list->at(start_pos);
4320     int new_from = Min(to_replace.from(), from);
4321     int new_to = Max(to_replace.to(), to);
4322     list->at(start_pos) = CharacterRange(new_from, new_to);
4323     return count;
4324   }
4325   // Replace a number of existing ranges from start_pos to end_pos - 1.
4326   // Move the remaining ranges down.
4327 
4328   int new_from = Min(list->at(start_pos).from(), from);
4329   int new_to = Max(list->at(end_pos - 1).to(), to);
4330   if (end_pos < count) {
4331     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
4332   }
4333   list->at(start_pos) = CharacterRange(new_from, new_to);
4334   return count - (end_pos - start_pos) + 1;
4335 }
4336 
4337 
Canonicalize()4338 void CharacterSet::Canonicalize() {
4339   // Special/default classes are always considered canonical. The result
4340   // of calling ranges() will be sorted.
4341   if (ranges_ == NULL) return;
4342   CharacterRange::Canonicalize(ranges_);
4343 }
4344 
4345 
Canonicalize(ZoneList<CharacterRange> * character_ranges)4346 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
4347   if (character_ranges->length() <= 1) return;
4348   // Check whether ranges are already canonical (increasing, non-overlapping,
4349   // non-adjacent).
4350   int n = character_ranges->length();
4351   int max = character_ranges->at(0).to();
4352   int i = 1;
4353   while (i < n) {
4354     CharacterRange current = character_ranges->at(i);
4355     if (current.from() <= max + 1) {
4356       break;
4357     }
4358     max = current.to();
4359     i++;
4360   }
4361   // Canonical until the i'th range. If that's all of them, we are done.
4362   if (i == n) return;
4363 
4364   // The ranges at index i and forward are not canonicalized. Make them so by
4365   // doing the equivalent of insertion sort (inserting each into the previous
4366   // list, in order).
4367   // Notice that inserting a range can reduce the number of ranges in the
4368   // result due to combining of adjacent and overlapping ranges.
4369   int read = i;  // Range to insert.
4370   int num_canonical = i;  // Length of canonicalized part of list.
4371   do {
4372     num_canonical = InsertRangeInCanonicalList(character_ranges,
4373                                                num_canonical,
4374                                                character_ranges->at(read));
4375     read++;
4376   } while (read < n);
4377   character_ranges->Rewind(num_canonical);
4378 
4379   ASSERT(CharacterRange::IsCanonical(character_ranges));
4380 }
4381 
4382 
4383 // Utility function for CharacterRange::Merge. Adds a range at the end of
4384 // a canonicalized range list, if necessary merging the range with the last
4385 // range of the list.
AddRangeToSet(ZoneList<CharacterRange> * set,CharacterRange range)4386 static void AddRangeToSet(ZoneList<CharacterRange>* set, CharacterRange range) {
4387   if (set == NULL) return;
4388   ASSERT(set->length() == 0 || set->at(set->length() - 1).to() < range.from());
4389   int n = set->length();
4390   if (n > 0) {
4391     CharacterRange lastRange = set->at(n - 1);
4392     if (lastRange.to() == range.from() - 1) {
4393       set->at(n - 1) = CharacterRange(lastRange.from(), range.to());
4394       return;
4395     }
4396   }
4397   set->Add(range);
4398 }
4399 
4400 
AddRangeToSelectedSet(int selector,ZoneList<CharacterRange> * first_set,ZoneList<CharacterRange> * second_set,ZoneList<CharacterRange> * intersection_set,CharacterRange range)4401 static void AddRangeToSelectedSet(int selector,
4402                                   ZoneList<CharacterRange>* first_set,
4403                                   ZoneList<CharacterRange>* second_set,
4404                                   ZoneList<CharacterRange>* intersection_set,
4405                                   CharacterRange range) {
4406   switch (selector) {
4407     case kInsideFirst:
4408       AddRangeToSet(first_set, range);
4409       break;
4410     case kInsideSecond:
4411       AddRangeToSet(second_set, range);
4412       break;
4413     case kInsideBoth:
4414       AddRangeToSet(intersection_set, range);
4415       break;
4416   }
4417 }
4418 
4419 
4420 
Merge(ZoneList<CharacterRange> * first_set,ZoneList<CharacterRange> * second_set,ZoneList<CharacterRange> * first_set_only_out,ZoneList<CharacterRange> * second_set_only_out,ZoneList<CharacterRange> * both_sets_out)4421 void CharacterRange::Merge(ZoneList<CharacterRange>* first_set,
4422                            ZoneList<CharacterRange>* second_set,
4423                            ZoneList<CharacterRange>* first_set_only_out,
4424                            ZoneList<CharacterRange>* second_set_only_out,
4425                            ZoneList<CharacterRange>* both_sets_out) {
4426   // Inputs are canonicalized.
4427   ASSERT(CharacterRange::IsCanonical(first_set));
4428   ASSERT(CharacterRange::IsCanonical(second_set));
4429   // Outputs are empty, if applicable.
4430   ASSERT(first_set_only_out == NULL || first_set_only_out->length() == 0);
4431   ASSERT(second_set_only_out == NULL || second_set_only_out->length() == 0);
4432   ASSERT(both_sets_out == NULL || both_sets_out->length() == 0);
4433 
4434   // Merge sets by iterating through the lists in order of lowest "from" value,
4435   // and putting intervals into one of three sets.
4436 
4437   if (first_set->length() == 0) {
4438     second_set_only_out->AddAll(*second_set);
4439     return;
4440   }
4441   if (second_set->length() == 0) {
4442     first_set_only_out->AddAll(*first_set);
4443     return;
4444   }
4445   // Indices into input lists.
4446   int i1 = 0;
4447   int i2 = 0;
4448   // Cache length of input lists.
4449   int n1 = first_set->length();
4450   int n2 = second_set->length();
4451   // Current range. May be invalid if state is kInsideNone.
4452   int from = 0;
4453   int to = -1;
4454   // Where current range comes from.
4455   int state = kInsideNone;
4456 
4457   while (i1 < n1 || i2 < n2) {
4458     CharacterRange next_range;
4459     int range_source;
4460     if (i2 == n2 ||
4461         (i1 < n1 && first_set->at(i1).from() < second_set->at(i2).from())) {
4462       // Next smallest element is in first set.
4463       next_range = first_set->at(i1++);
4464       range_source = kInsideFirst;
4465     } else {
4466       // Next smallest element is in second set.
4467       next_range = second_set->at(i2++);
4468       range_source = kInsideSecond;
4469     }
4470     if (to < next_range.from()) {
4471       // Ranges disjoint: |current|  |next|
4472       AddRangeToSelectedSet(state,
4473                             first_set_only_out,
4474                             second_set_only_out,
4475                             both_sets_out,
4476                             CharacterRange(from, to));
4477       from = next_range.from();
4478       to = next_range.to();
4479       state = range_source;
4480     } else {
4481       if (from < next_range.from()) {
4482         AddRangeToSelectedSet(state,
4483                               first_set_only_out,
4484                               second_set_only_out,
4485                               both_sets_out,
4486                               CharacterRange(from, next_range.from()-1));
4487       }
4488       if (to < next_range.to()) {
4489         // Ranges overlap:  |current|
4490         //                       |next|
4491         AddRangeToSelectedSet(state | range_source,
4492                               first_set_only_out,
4493                               second_set_only_out,
4494                               both_sets_out,
4495                               CharacterRange(next_range.from(), to));
4496         from = to + 1;
4497         to = next_range.to();
4498         state = range_source;
4499       } else {
4500         // Range included:    |current| , possibly ending at same character.
4501         //                      |next|
4502         AddRangeToSelectedSet(
4503             state | range_source,
4504             first_set_only_out,
4505             second_set_only_out,
4506             both_sets_out,
4507             CharacterRange(next_range.from(), next_range.to()));
4508         from = next_range.to() + 1;
4509         // If ranges end at same character, both ranges are consumed completely.
4510         if (next_range.to() == to) state = kInsideNone;
4511       }
4512     }
4513   }
4514   AddRangeToSelectedSet(state,
4515                         first_set_only_out,
4516                         second_set_only_out,
4517                         both_sets_out,
4518                         CharacterRange(from, to));
4519 }
4520 
4521 
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges)4522 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
4523                             ZoneList<CharacterRange>* negated_ranges) {
4524   ASSERT(CharacterRange::IsCanonical(ranges));
4525   ASSERT_EQ(0, negated_ranges->length());
4526   int range_count = ranges->length();
4527   uc16 from = 0;
4528   int i = 0;
4529   if (range_count > 0 && ranges->at(0).from() == 0) {
4530     from = ranges->at(0).to();
4531     i = 1;
4532   }
4533   while (i < range_count) {
4534     CharacterRange range = ranges->at(i);
4535     negated_ranges->Add(CharacterRange(from + 1, range.from() - 1));
4536     from = range.to();
4537     i++;
4538   }
4539   if (from < String::kMaxUC16CharCode) {
4540     negated_ranges->Add(CharacterRange(from + 1, String::kMaxUC16CharCode));
4541   }
4542 }
4543 
4544 
4545 
4546 // -------------------------------------------------------------------
4547 // Interest propagation
4548 
4549 
TryGetSibling(NodeInfo * info)4550 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4551   for (int i = 0; i < siblings_.length(); i++) {
4552     RegExpNode* sibling = siblings_.Get(i);
4553     if (sibling->info()->Matches(info))
4554       return sibling;
4555   }
4556   return NULL;
4557 }
4558 
4559 
EnsureSibling(NodeInfo * info,bool * cloned)4560 RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4561   ASSERT_EQ(false, *cloned);
4562   siblings_.Ensure(this);
4563   RegExpNode* result = TryGetSibling(info);
4564   if (result != NULL) return result;
4565   result = this->Clone();
4566   NodeInfo* new_info = result->info();
4567   new_info->ResetCompilationState();
4568   new_info->AddFromPreceding(info);
4569   AddSibling(result);
4570   *cloned = true;
4571   return result;
4572 }
4573 
4574 
4575 template <class C>
PropagateToEndpoint(C * node,NodeInfo * info)4576 static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4577   NodeInfo full_info(*node->info());
4578   full_info.AddFromPreceding(info);
4579   bool cloned = false;
4580   return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4581 }
4582 
4583 
4584 // -------------------------------------------------------------------
4585 // Splay tree
4586 
4587 
Extend(unsigned value)4588 OutSet* OutSet::Extend(unsigned value) {
4589   if (Get(value))
4590     return this;
4591   if (successors() != NULL) {
4592     for (int i = 0; i < successors()->length(); i++) {
4593       OutSet* successor = successors()->at(i);
4594       if (successor->Get(value))
4595         return successor;
4596     }
4597   } else {
4598     successors_ = new ZoneList<OutSet*>(2);
4599   }
4600   OutSet* result = new OutSet(first_, remaining_);
4601   result->Set(value);
4602   successors()->Add(result);
4603   return result;
4604 }
4605 
4606 
Set(unsigned value)4607 void OutSet::Set(unsigned value) {
4608   if (value < kFirstLimit) {
4609     first_ |= (1 << value);
4610   } else {
4611     if (remaining_ == NULL)
4612       remaining_ = new ZoneList<unsigned>(1);
4613     if (remaining_->is_empty() || !remaining_->Contains(value))
4614       remaining_->Add(value);
4615   }
4616 }
4617 
4618 
Get(unsigned value)4619 bool OutSet::Get(unsigned value) {
4620   if (value < kFirstLimit) {
4621     return (first_ & (1 << value)) != 0;
4622   } else if (remaining_ == NULL) {
4623     return false;
4624   } else {
4625     return remaining_->Contains(value);
4626   }
4627 }
4628 
4629 
4630 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4631 const DispatchTable::Entry DispatchTable::Config::kNoValue;
4632 
4633 
AddRange(CharacterRange full_range,int value)4634 void DispatchTable::AddRange(CharacterRange full_range, int value) {
4635   CharacterRange current = full_range;
4636   if (tree()->is_empty()) {
4637     // If this is the first range we just insert into the table.
4638     ZoneSplayTree<Config>::Locator loc;
4639     ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4640     loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4641     return;
4642   }
4643   // First see if there is a range to the left of this one that
4644   // overlaps.
4645   ZoneSplayTree<Config>::Locator loc;
4646   if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4647     Entry* entry = &loc.value();
4648     // If we've found a range that overlaps with this one, and it
4649     // starts strictly to the left of this one, we have to fix it
4650     // because the following code only handles ranges that start on
4651     // or after the start point of the range we're adding.
4652     if (entry->from() < current.from() && entry->to() >= current.from()) {
4653       // Snap the overlapping range in half around the start point of
4654       // the range we're adding.
4655       CharacterRange left(entry->from(), current.from() - 1);
4656       CharacterRange right(current.from(), entry->to());
4657       // The left part of the overlapping range doesn't overlap.
4658       // Truncate the whole entry to be just the left part.
4659       entry->set_to(left.to());
4660       // The right part is the one that overlaps.  We add this part
4661       // to the map and let the next step deal with merging it with
4662       // the range we're adding.
4663       ZoneSplayTree<Config>::Locator loc;
4664       ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4665       loc.set_value(Entry(right.from(),
4666                           right.to(),
4667                           entry->out_set()));
4668     }
4669   }
4670   while (current.is_valid()) {
4671     if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4672         (loc.value().from() <= current.to()) &&
4673         (loc.value().to() >= current.from())) {
4674       Entry* entry = &loc.value();
4675       // We have overlap.  If there is space between the start point of
4676       // the range we're adding and where the overlapping range starts
4677       // then we have to add a range covering just that space.
4678       if (current.from() < entry->from()) {
4679         ZoneSplayTree<Config>::Locator ins;
4680         ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4681         ins.set_value(Entry(current.from(),
4682                             entry->from() - 1,
4683                             empty()->Extend(value)));
4684         current.set_from(entry->from());
4685       }
4686       ASSERT_EQ(current.from(), entry->from());
4687       // If the overlapping range extends beyond the one we want to add
4688       // we have to snap the right part off and add it separately.
4689       if (entry->to() > current.to()) {
4690         ZoneSplayTree<Config>::Locator ins;
4691         ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4692         ins.set_value(Entry(current.to() + 1,
4693                             entry->to(),
4694                             entry->out_set()));
4695         entry->set_to(current.to());
4696       }
4697       ASSERT(entry->to() <= current.to());
4698       // The overlapping range is now completely contained by the range
4699       // we're adding so we can just update it and move the start point
4700       // of the range we're adding just past it.
4701       entry->AddValue(value);
4702       // Bail out if the last interval ended at 0xFFFF since otherwise
4703       // adding 1 will wrap around to 0.
4704       if (entry->to() == String::kMaxUC16CharCode)
4705         break;
4706       ASSERT(entry->to() + 1 > current.from());
4707       current.set_from(entry->to() + 1);
4708     } else {
4709       // There is no overlap so we can just add the range
4710       ZoneSplayTree<Config>::Locator ins;
4711       ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4712       ins.set_value(Entry(current.from(),
4713                           current.to(),
4714                           empty()->Extend(value)));
4715       break;
4716     }
4717   }
4718 }
4719 
4720 
Get(uc16 value)4721 OutSet* DispatchTable::Get(uc16 value) {
4722   ZoneSplayTree<Config>::Locator loc;
4723   if (!tree()->FindGreatestLessThan(value, &loc))
4724     return empty();
4725   Entry* entry = &loc.value();
4726   if (value <= entry->to())
4727     return entry->out_set();
4728   else
4729     return empty();
4730 }
4731 
4732 
4733 // -------------------------------------------------------------------
4734 // Analysis
4735 
4736 
EnsureAnalyzed(RegExpNode * that)4737 void Analysis::EnsureAnalyzed(RegExpNode* that) {
4738   StackLimitCheck check;
4739   if (check.HasOverflowed()) {
4740     fail("Stack overflow");
4741     return;
4742   }
4743   if (that->info()->been_analyzed || that->info()->being_analyzed)
4744     return;
4745   that->info()->being_analyzed = true;
4746   that->Accept(this);
4747   that->info()->being_analyzed = false;
4748   that->info()->been_analyzed = true;
4749 }
4750 
4751 
VisitEnd(EndNode * that)4752 void Analysis::VisitEnd(EndNode* that) {
4753   // nothing to do
4754 }
4755 
4756 
CalculateOffsets()4757 void TextNode::CalculateOffsets() {
4758   int element_count = elements()->length();
4759   // Set up the offsets of the elements relative to the start.  This is a fixed
4760   // quantity since a TextNode can only contain fixed-width things.
4761   int cp_offset = 0;
4762   for (int i = 0; i < element_count; i++) {
4763     TextElement& elm = elements()->at(i);
4764     elm.cp_offset = cp_offset;
4765     if (elm.type == TextElement::ATOM) {
4766       cp_offset += elm.data.u_atom->data().length();
4767     } else {
4768       cp_offset++;
4769       Vector<const uc16> quarks = elm.data.u_atom->data();
4770     }
4771   }
4772 }
4773 
4774 
VisitText(TextNode * that)4775 void Analysis::VisitText(TextNode* that) {
4776   if (ignore_case_) {
4777     that->MakeCaseIndependent(is_ascii_);
4778   }
4779   EnsureAnalyzed(that->on_success());
4780   if (!has_failed()) {
4781     that->CalculateOffsets();
4782   }
4783 }
4784 
4785 
VisitAction(ActionNode * that)4786 void Analysis::VisitAction(ActionNode* that) {
4787   RegExpNode* target = that->on_success();
4788   EnsureAnalyzed(target);
4789   if (!has_failed()) {
4790     // If the next node is interested in what it follows then this node
4791     // has to be interested too so it can pass the information on.
4792     that->info()->AddFromFollowing(target->info());
4793   }
4794 }
4795 
4796 
VisitChoice(ChoiceNode * that)4797 void Analysis::VisitChoice(ChoiceNode* that) {
4798   NodeInfo* info = that->info();
4799   for (int i = 0; i < that->alternatives()->length(); i++) {
4800     RegExpNode* node = that->alternatives()->at(i).node();
4801     EnsureAnalyzed(node);
4802     if (has_failed()) return;
4803     // Anything the following nodes need to know has to be known by
4804     // this node also, so it can pass it on.
4805     info->AddFromFollowing(node->info());
4806   }
4807 }
4808 
4809 
VisitLoopChoice(LoopChoiceNode * that)4810 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4811   NodeInfo* info = that->info();
4812   for (int i = 0; i < that->alternatives()->length(); i++) {
4813     RegExpNode* node = that->alternatives()->at(i).node();
4814     if (node != that->loop_node()) {
4815       EnsureAnalyzed(node);
4816       if (has_failed()) return;
4817       info->AddFromFollowing(node->info());
4818     }
4819   }
4820   // Check the loop last since it may need the value of this node
4821   // to get a correct result.
4822   EnsureAnalyzed(that->loop_node());
4823   if (!has_failed()) {
4824     info->AddFromFollowing(that->loop_node()->info());
4825   }
4826 }
4827 
4828 
VisitBackReference(BackReferenceNode * that)4829 void Analysis::VisitBackReference(BackReferenceNode* that) {
4830   EnsureAnalyzed(that->on_success());
4831 }
4832 
4833 
VisitAssertion(AssertionNode * that)4834 void Analysis::VisitAssertion(AssertionNode* that) {
4835   EnsureAnalyzed(that->on_success());
4836   AssertionNode::AssertionNodeType type = that->type();
4837   if (type == AssertionNode::AT_BOUNDARY ||
4838       type == AssertionNode::AT_NON_BOUNDARY) {
4839     // Check if the following character is known to be a word character
4840     // or known to not be a word character.
4841     ZoneList<CharacterRange>* following_chars = that->FirstCharacterSet();
4842 
4843     CharacterRange::Canonicalize(following_chars);
4844 
4845     SetRelation word_relation =
4846         CharacterRange::WordCharacterRelation(following_chars);
4847     if (word_relation.Disjoint()) {
4848       // Includes the case where following_chars is empty (e.g., end-of-input).
4849       // Following character is definitely *not* a word character.
4850       type = (type == AssertionNode::AT_BOUNDARY) ?
4851                  AssertionNode::AFTER_WORD_CHARACTER :
4852                  AssertionNode::AFTER_NONWORD_CHARACTER;
4853       that->set_type(type);
4854     } else if (word_relation.ContainedIn()) {
4855       // Following character is definitely a word character.
4856       type = (type == AssertionNode::AT_BOUNDARY) ?
4857                  AssertionNode::AFTER_NONWORD_CHARACTER :
4858                  AssertionNode::AFTER_WORD_CHARACTER;
4859       that->set_type(type);
4860     }
4861   }
4862 }
4863 
4864 
FirstCharacterSet()4865 ZoneList<CharacterRange>* RegExpNode::FirstCharacterSet() {
4866   if (first_character_set_ == NULL) {
4867     if (ComputeFirstCharacterSet(kFirstCharBudget) < 0) {
4868       // If we can't find an exact solution within the budget, we
4869       // set the value to the set of every character, i.e., all characters
4870       // are possible.
4871       ZoneList<CharacterRange>* all_set = new ZoneList<CharacterRange>(1);
4872       all_set->Add(CharacterRange::Everything());
4873       first_character_set_ = all_set;
4874     }
4875   }
4876   return first_character_set_;
4877 }
4878 
4879 
ComputeFirstCharacterSet(int budget)4880 int RegExpNode::ComputeFirstCharacterSet(int budget) {
4881   // Default behavior is to not be able to determine the first character.
4882   return kComputeFirstCharacterSetFail;
4883 }
4884 
4885 
ComputeFirstCharacterSet(int budget)4886 int LoopChoiceNode::ComputeFirstCharacterSet(int budget) {
4887   budget--;
4888   if (budget >= 0) {
4889     // Find loop min-iteration. It's the value of the guarded choice node
4890     // with a GEQ guard, if any.
4891     int min_repetition = 0;
4892 
4893     for (int i = 0; i <= 1; i++) {
4894       GuardedAlternative alternative = alternatives()->at(i);
4895       ZoneList<Guard*>* guards = alternative.guards();
4896       if (guards != NULL && guards->length() > 0) {
4897         Guard* guard = guards->at(0);
4898         if (guard->op() == Guard::GEQ) {
4899           min_repetition = guard->value();
4900           break;
4901         }
4902       }
4903     }
4904 
4905     budget = loop_node()->ComputeFirstCharacterSet(budget);
4906     if (budget >= 0) {
4907       ZoneList<CharacterRange>* character_set =
4908           loop_node()->first_character_set();
4909       if (body_can_be_zero_length() || min_repetition == 0) {
4910         budget = continue_node()->ComputeFirstCharacterSet(budget);
4911         if (budget < 0) return budget;
4912         ZoneList<CharacterRange>* body_set =
4913             continue_node()->first_character_set();
4914         ZoneList<CharacterRange>* union_set =
4915           new ZoneList<CharacterRange>(Max(character_set->length(),
4916                                            body_set->length()));
4917         CharacterRange::Merge(character_set,
4918                               body_set,
4919                               union_set,
4920                               union_set,
4921                               union_set);
4922         character_set = union_set;
4923       }
4924       set_first_character_set(character_set);
4925     }
4926   }
4927   return budget;
4928 }
4929 
4930 
ComputeFirstCharacterSet(int budget)4931 int NegativeLookaheadChoiceNode::ComputeFirstCharacterSet(int budget) {
4932   budget--;
4933   if (budget >= 0) {
4934     GuardedAlternative successor = this->alternatives()->at(1);
4935     RegExpNode* successor_node = successor.node();
4936     budget = successor_node->ComputeFirstCharacterSet(budget);
4937     if (budget >= 0) {
4938       set_first_character_set(successor_node->first_character_set());
4939     }
4940   }
4941   return budget;
4942 }
4943 
4944 
4945 // The first character set of an EndNode is unknowable. Just use the
4946 // default implementation that fails and returns all characters as possible.
4947 
4948 
ComputeFirstCharacterSet(int budget)4949 int AssertionNode::ComputeFirstCharacterSet(int budget) {
4950   budget -= 1;
4951   if (budget >= 0) {
4952     switch (type_) {
4953       case AT_END: {
4954         set_first_character_set(new ZoneList<CharacterRange>(0));
4955         break;
4956       }
4957       case AT_START:
4958       case AT_BOUNDARY:
4959       case AT_NON_BOUNDARY:
4960       case AFTER_NEWLINE:
4961       case AFTER_NONWORD_CHARACTER:
4962       case AFTER_WORD_CHARACTER: {
4963         ASSERT_NOT_NULL(on_success());
4964         budget = on_success()->ComputeFirstCharacterSet(budget);
4965         set_first_character_set(on_success()->first_character_set());
4966         break;
4967       }
4968     }
4969   }
4970   return budget;
4971 }
4972 
4973 
ComputeFirstCharacterSet(int budget)4974 int ActionNode::ComputeFirstCharacterSet(int budget) {
4975   if (type_ == POSITIVE_SUBMATCH_SUCCESS) return kComputeFirstCharacterSetFail;
4976   budget--;
4977   if (budget >= 0) {
4978     ASSERT_NOT_NULL(on_success());
4979     budget = on_success()->ComputeFirstCharacterSet(budget);
4980     if (budget >= 0) {
4981       set_first_character_set(on_success()->first_character_set());
4982     }
4983   }
4984   return budget;
4985 }
4986 
4987 
ComputeFirstCharacterSet(int budget)4988 int BackReferenceNode::ComputeFirstCharacterSet(int budget) {
4989   // We don't know anything about the first character of a backreference
4990   // at this point.
4991   return kComputeFirstCharacterSetFail;
4992 }
4993 
4994 
ComputeFirstCharacterSet(int budget)4995 int TextNode::ComputeFirstCharacterSet(int budget) {
4996   budget--;
4997   if (budget >= 0) {
4998     ASSERT_NE(0, elements()->length());
4999     TextElement text = elements()->at(0);
5000     if (text.type == TextElement::ATOM) {
5001       RegExpAtom* atom = text.data.u_atom;
5002       ASSERT_NE(0, atom->length());
5003       uc16 first_char = atom->data()[0];
5004       ZoneList<CharacterRange>* range = new ZoneList<CharacterRange>(1);
5005       range->Add(CharacterRange(first_char, first_char));
5006       set_first_character_set(range);
5007     } else {
5008       ASSERT(text.type == TextElement::CHAR_CLASS);
5009       RegExpCharacterClass* char_class = text.data.u_char_class;
5010       if (char_class->is_negated()) {
5011         ZoneList<CharacterRange>* ranges = char_class->ranges();
5012         int length = ranges->length();
5013         int new_length = length + 1;
5014         if (length > 0) {
5015           if (ranges->at(0).from() == 0) new_length--;
5016           if (ranges->at(length - 1).to() == String::kMaxUC16CharCode) {
5017             new_length--;
5018           }
5019         }
5020         ZoneList<CharacterRange>* negated_ranges =
5021             new ZoneList<CharacterRange>(new_length);
5022         CharacterRange::Negate(ranges, negated_ranges);
5023         set_first_character_set(negated_ranges);
5024       } else {
5025         set_first_character_set(char_class->ranges());
5026       }
5027     }
5028   }
5029   return budget;
5030 }
5031 
5032 
5033 
5034 // -------------------------------------------------------------------
5035 // Dispatch table construction
5036 
5037 
VisitEnd(EndNode * that)5038 void DispatchTableConstructor::VisitEnd(EndNode* that) {
5039   AddRange(CharacterRange::Everything());
5040 }
5041 
5042 
BuildTable(ChoiceNode * node)5043 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
5044   node->set_being_calculated(true);
5045   ZoneList<GuardedAlternative>* alternatives = node->alternatives();
5046   for (int i = 0; i < alternatives->length(); i++) {
5047     set_choice_index(i);
5048     alternatives->at(i).node()->Accept(this);
5049   }
5050   node->set_being_calculated(false);
5051 }
5052 
5053 
5054 class AddDispatchRange {
5055  public:
AddDispatchRange(DispatchTableConstructor * constructor)5056   explicit AddDispatchRange(DispatchTableConstructor* constructor)
5057     : constructor_(constructor) { }
5058   void Call(uc32 from, DispatchTable::Entry entry);
5059  private:
5060   DispatchTableConstructor* constructor_;
5061 };
5062 
5063 
Call(uc32 from,DispatchTable::Entry entry)5064 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
5065   CharacterRange range(from, entry.to());
5066   constructor_->AddRange(range);
5067 }
5068 
5069 
VisitChoice(ChoiceNode * node)5070 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
5071   if (node->being_calculated())
5072     return;
5073   DispatchTable* table = node->GetTable(ignore_case_);
5074   AddDispatchRange adder(this);
5075   table->ForEach(&adder);
5076 }
5077 
5078 
VisitBackReference(BackReferenceNode * that)5079 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
5080   // TODO(160): Find the node that we refer back to and propagate its start
5081   // set back to here.  For now we just accept anything.
5082   AddRange(CharacterRange::Everything());
5083 }
5084 
5085 
VisitAssertion(AssertionNode * that)5086 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
5087   RegExpNode* target = that->on_success();
5088   target->Accept(this);
5089 }
5090 
5091 
CompareRangeByFrom(const CharacterRange * a,const CharacterRange * b)5092 static int CompareRangeByFrom(const CharacterRange* a,
5093                               const CharacterRange* b) {
5094   return Compare<uc16>(a->from(), b->from());
5095 }
5096 
5097 
AddInverse(ZoneList<CharacterRange> * ranges)5098 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
5099   ranges->Sort(CompareRangeByFrom);
5100   uc16 last = 0;
5101   for (int i = 0; i < ranges->length(); i++) {
5102     CharacterRange range = ranges->at(i);
5103     if (last < range.from())
5104       AddRange(CharacterRange(last, range.from() - 1));
5105     if (range.to() >= last) {
5106       if (range.to() == String::kMaxUC16CharCode) {
5107         return;
5108       } else {
5109         last = range.to() + 1;
5110       }
5111     }
5112   }
5113   AddRange(CharacterRange(last, String::kMaxUC16CharCode));
5114 }
5115 
5116 
VisitText(TextNode * that)5117 void DispatchTableConstructor::VisitText(TextNode* that) {
5118   TextElement elm = that->elements()->at(0);
5119   switch (elm.type) {
5120     case TextElement::ATOM: {
5121       uc16 c = elm.data.u_atom->data()[0];
5122       AddRange(CharacterRange(c, c));
5123       break;
5124     }
5125     case TextElement::CHAR_CLASS: {
5126       RegExpCharacterClass* tree = elm.data.u_char_class;
5127       ZoneList<CharacterRange>* ranges = tree->ranges();
5128       if (tree->is_negated()) {
5129         AddInverse(ranges);
5130       } else {
5131         for (int i = 0; i < ranges->length(); i++)
5132           AddRange(ranges->at(i));
5133       }
5134       break;
5135     }
5136     default: {
5137       UNIMPLEMENTED();
5138     }
5139   }
5140 }
5141 
5142 
VisitAction(ActionNode * that)5143 void DispatchTableConstructor::VisitAction(ActionNode* that) {
5144   RegExpNode* target = that->on_success();
5145   target->Accept(this);
5146 }
5147 
5148 
Compile(RegExpCompileData * data,bool ignore_case,bool is_multiline,Handle<String> pattern,bool is_ascii)5149 RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
5150                                                       bool ignore_case,
5151                                                       bool is_multiline,
5152                                                       Handle<String> pattern,
5153                                                       bool is_ascii) {
5154   if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
5155     return IrregexpRegExpTooBig();
5156   }
5157   RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
5158   // Wrap the body of the regexp in capture #0.
5159   RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
5160                                                     0,
5161                                                     &compiler,
5162                                                     compiler.accept());
5163   RegExpNode* node = captured_body;
5164   if (!data->tree->IsAnchored()) {
5165     // Add a .*? at the beginning, outside the body capture, unless
5166     // this expression is anchored at the beginning.
5167     RegExpNode* loop_node =
5168         RegExpQuantifier::ToNode(0,
5169                                  RegExpTree::kInfinity,
5170                                  false,
5171                                  new RegExpCharacterClass('*'),
5172                                  &compiler,
5173                                  captured_body,
5174                                  data->contains_anchor);
5175 
5176     if (data->contains_anchor) {
5177       // Unroll loop once, to take care of the case that might start
5178       // at the start of input.
5179       ChoiceNode* first_step_node = new ChoiceNode(2);
5180       first_step_node->AddAlternative(GuardedAlternative(captured_body));
5181       first_step_node->AddAlternative(GuardedAlternative(
5182           new TextNode(new RegExpCharacterClass('*'), loop_node)));
5183       node = first_step_node;
5184     } else {
5185       node = loop_node;
5186     }
5187   }
5188   data->node = node;
5189   Analysis analysis(ignore_case, is_ascii);
5190   analysis.EnsureAnalyzed(node);
5191   if (analysis.has_failed()) {
5192     const char* error_message = analysis.error_message();
5193     return CompilationResult(error_message);
5194   }
5195 
5196   NodeInfo info = *node->info();
5197 
5198   // Create the correct assembler for the architecture.
5199 #ifdef V8_NATIVE_REGEXP
5200   // Native regexp implementation.
5201 
5202   NativeRegExpMacroAssembler::Mode mode =
5203       is_ascii ? NativeRegExpMacroAssembler::ASCII
5204                : NativeRegExpMacroAssembler::UC16;
5205 
5206 #if V8_TARGET_ARCH_IA32
5207   RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2);
5208 #elif V8_TARGET_ARCH_X64
5209   RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2);
5210 #elif V8_TARGET_ARCH_ARM
5211   RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2);
5212 #endif
5213 
5214 #else  // ! V8_NATIVE_REGEXP
5215   // Interpreted regexp implementation.
5216   EmbeddedVector<byte, 1024> codes;
5217   RegExpMacroAssemblerIrregexp macro_assembler(codes);
5218 #endif
5219 
5220   return compiler.Assemble(&macro_assembler,
5221                            node,
5222                            data->capture_count,
5223                            pattern);
5224 }
5225 
5226 
5227 int OffsetsVector::static_offsets_vector_[
5228     OffsetsVector::kStaticOffsetsVectorSize];
5229 
5230 }}  // namespace v8::internal
5231