1 // Copyright 2006-2009 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28 #include "v8.h"
29
30 #include "ast.h"
31 #include "compiler.h"
32 #include "execution.h"
33 #include "factory.h"
34 #include "jsregexp.h"
35 #include "platform.h"
36 #include "runtime.h"
37 #include "top.h"
38 #include "compilation-cache.h"
39 #include "string-stream.h"
40 #include "parser.h"
41 #include "regexp-macro-assembler.h"
42 #include "regexp-macro-assembler-tracer.h"
43 #include "regexp-macro-assembler-irregexp.h"
44 #include "regexp-stack.h"
45
46 #ifdef V8_NATIVE_REGEXP
47 #if V8_TARGET_ARCH_IA32
48 #include "ia32/macro-assembler-ia32.h"
49 #include "ia32/regexp-macro-assembler-ia32.h"
50 #elif V8_TARGET_ARCH_X64
51 #include "x64/macro-assembler-x64.h"
52 #include "x64/regexp-macro-assembler-x64.h"
53 #elif V8_TARGET_ARCH_ARM
54 #include "arm/macro-assembler-arm.h"
55 #include "arm/regexp-macro-assembler-arm.h"
56 #else
57 #error Unsupported target architecture.
58 #endif
59 #endif
60
61 #include "interpreter-irregexp.h"
62
63
64 namespace v8 {
65 namespace internal {
66
67
CreateRegExpLiteral(Handle<JSFunction> constructor,Handle<String> pattern,Handle<String> flags,bool * has_pending_exception)68 Handle<Object> RegExpImpl::CreateRegExpLiteral(Handle<JSFunction> constructor,
69 Handle<String> pattern,
70 Handle<String> flags,
71 bool* has_pending_exception) {
72 // Ensure that the constructor function has been loaded.
73 if (!constructor->IsLoaded()) {
74 LoadLazy(constructor, has_pending_exception);
75 if (*has_pending_exception) return Handle<Object>();
76 }
77 // Call the construct code with 2 arguments.
78 Object** argv[2] = { Handle<Object>::cast(pattern).location(),
79 Handle<Object>::cast(flags).location() };
80 return Execution::New(constructor, 2, argv, has_pending_exception);
81 }
82
83
RegExpFlagsFromString(Handle<String> str)84 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
85 int flags = JSRegExp::NONE;
86 for (int i = 0; i < str->length(); i++) {
87 switch (str->Get(i)) {
88 case 'i':
89 flags |= JSRegExp::IGNORE_CASE;
90 break;
91 case 'g':
92 flags |= JSRegExp::GLOBAL;
93 break;
94 case 'm':
95 flags |= JSRegExp::MULTILINE;
96 break;
97 }
98 }
99 return JSRegExp::Flags(flags);
100 }
101
102
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> error_text,const char * message)103 static inline void ThrowRegExpException(Handle<JSRegExp> re,
104 Handle<String> pattern,
105 Handle<String> error_text,
106 const char* message) {
107 Handle<JSArray> array = Factory::NewJSArray(2);
108 SetElement(array, 0, pattern);
109 SetElement(array, 1, error_text);
110 Handle<Object> regexp_err = Factory::NewSyntaxError(message, array);
111 Top::Throw(*regexp_err);
112 }
113
114
115 // Generic RegExp methods. Dispatches to implementation specific methods.
116
117
118 class OffsetsVector {
119 public:
OffsetsVector(int num_registers)120 inline OffsetsVector(int num_registers)
121 : offsets_vector_length_(num_registers) {
122 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
123 vector_ = NewArray<int>(offsets_vector_length_);
124 } else {
125 vector_ = static_offsets_vector_;
126 }
127 }
~OffsetsVector()128 inline ~OffsetsVector() {
129 if (offsets_vector_length_ > kStaticOffsetsVectorSize) {
130 DeleteArray(vector_);
131 vector_ = NULL;
132 }
133 }
vector()134 inline int* vector() { return vector_; }
length()135 inline int length() { return offsets_vector_length_; }
136
137 private:
138 int* vector_;
139 int offsets_vector_length_;
140 static const int kStaticOffsetsVectorSize = 50;
141 static int static_offsets_vector_[kStaticOffsetsVectorSize];
142 };
143
144
145 int OffsetsVector::static_offsets_vector_[
146 OffsetsVector::kStaticOffsetsVectorSize];
147
148
Compile(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> flag_str)149 Handle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
150 Handle<String> pattern,
151 Handle<String> flag_str) {
152 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
153 Handle<FixedArray> cached = CompilationCache::LookupRegExp(pattern, flags);
154 bool in_cache = !cached.is_null();
155 LOG(RegExpCompileEvent(re, in_cache));
156
157 Handle<Object> result;
158 if (in_cache) {
159 re->set_data(*cached);
160 return re;
161 }
162 FlattenString(pattern);
163 CompilationZoneScope zone_scope(DELETE_ON_EXIT);
164 RegExpCompileData parse_result;
165 FlatStringReader reader(pattern);
166 if (!ParseRegExp(&reader, flags.is_multiline(), &parse_result)) {
167 // Throw an exception if we fail to parse the pattern.
168 ThrowRegExpException(re,
169 pattern,
170 parse_result.error,
171 "malformed_regexp");
172 return Handle<Object>::null();
173 }
174
175 if (parse_result.simple && !flags.is_ignore_case()) {
176 // Parse-tree is a single atom that is equal to the pattern.
177 AtomCompile(re, pattern, flags, pattern);
178 } else if (parse_result.tree->IsAtom() &&
179 !flags.is_ignore_case() &&
180 parse_result.capture_count == 0) {
181 RegExpAtom* atom = parse_result.tree->AsAtom();
182 Vector<const uc16> atom_pattern = atom->data();
183 Handle<String> atom_string = Factory::NewStringFromTwoByte(atom_pattern);
184 AtomCompile(re, pattern, flags, atom_string);
185 } else {
186 IrregexpPrepare(re, pattern, flags, parse_result.capture_count);
187 }
188 ASSERT(re->data()->IsFixedArray());
189 // Compilation succeeded so the data is set on the regexp
190 // and we can store it in the cache.
191 Handle<FixedArray> data(FixedArray::cast(re->data()));
192 CompilationCache::PutRegExp(pattern, flags, data);
193
194 return re;
195 }
196
197
Exec(Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<JSArray> last_match_info)198 Handle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
199 Handle<String> subject,
200 int index,
201 Handle<JSArray> last_match_info) {
202 switch (regexp->TypeTag()) {
203 case JSRegExp::ATOM:
204 return AtomExec(regexp, subject, index, last_match_info);
205 case JSRegExp::IRREGEXP: {
206 Handle<Object> result =
207 IrregexpExec(regexp, subject, index, last_match_info);
208 ASSERT(!result.is_null() || Top::has_pending_exception());
209 return result;
210 }
211 default:
212 UNREACHABLE();
213 return Handle<Object>::null();
214 }
215 }
216
217
218 // RegExp Atom implementation: Simple string search using indexOf.
219
220
AtomCompile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)221 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
222 Handle<String> pattern,
223 JSRegExp::Flags flags,
224 Handle<String> match_pattern) {
225 Factory::SetRegExpAtomData(re,
226 JSRegExp::ATOM,
227 pattern,
228 flags,
229 match_pattern);
230 }
231
232
SetAtomLastCapture(FixedArray * array,String * subject,int from,int to)233 static void SetAtomLastCapture(FixedArray* array,
234 String* subject,
235 int from,
236 int to) {
237 NoHandleAllocation no_handles;
238 RegExpImpl::SetLastCaptureCount(array, 2);
239 RegExpImpl::SetLastSubject(array, subject);
240 RegExpImpl::SetLastInput(array, subject);
241 RegExpImpl::SetCapture(array, 0, from);
242 RegExpImpl::SetCapture(array, 1, to);
243 }
244
245
AtomExec(Handle<JSRegExp> re,Handle<String> subject,int index,Handle<JSArray> last_match_info)246 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
247 Handle<String> subject,
248 int index,
249 Handle<JSArray> last_match_info) {
250 Handle<String> needle(String::cast(re->DataAt(JSRegExp::kAtomPatternIndex)));
251
252 uint32_t start_index = index;
253
254 int value = Runtime::StringMatch(subject, needle, start_index);
255 if (value == -1) return Factory::null_value();
256 ASSERT(last_match_info->HasFastElements());
257
258 {
259 NoHandleAllocation no_handles;
260 FixedArray* array = FixedArray::cast(last_match_info->elements());
261 SetAtomLastCapture(array, *subject, value, value + needle->length());
262 }
263 return last_match_info;
264 }
265
266
267 // Irregexp implementation.
268
269 // Ensures that the regexp object contains a compiled version of the
270 // source for either ASCII or non-ASCII strings.
271 // If the compiled version doesn't already exist, it is compiled
272 // from the source pattern.
273 // If compilation fails, an exception is thrown and this function
274 // returns false.
EnsureCompiledIrregexp(Handle<JSRegExp> re,bool is_ascii)275 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re, bool is_ascii) {
276 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
277 #ifdef V8_NATIVE_REGEXP
278 if (compiled_code->IsCode()) return true;
279 #else // ! V8_NATIVE_REGEXP (RegExp interpreter code)
280 if (compiled_code->IsByteArray()) return true;
281 #endif
282 return CompileIrregexp(re, is_ascii);
283 }
284
285
CompileIrregexp(Handle<JSRegExp> re,bool is_ascii)286 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re, bool is_ascii) {
287 // Compile the RegExp.
288 CompilationZoneScope zone_scope(DELETE_ON_EXIT);
289 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
290 if (entry->IsJSObject()) {
291 // If it's a JSObject, a previous compilation failed and threw this object.
292 // Re-throw the object without trying again.
293 Top::Throw(entry);
294 return false;
295 }
296 ASSERT(entry->IsTheHole());
297
298 JSRegExp::Flags flags = re->GetFlags();
299
300 Handle<String> pattern(re->Pattern());
301 if (!pattern->IsFlat()) {
302 FlattenString(pattern);
303 }
304
305 RegExpCompileData compile_data;
306 FlatStringReader reader(pattern);
307 if (!ParseRegExp(&reader, flags.is_multiline(), &compile_data)) {
308 // Throw an exception if we fail to parse the pattern.
309 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
310 ThrowRegExpException(re,
311 pattern,
312 compile_data.error,
313 "malformed_regexp");
314 return false;
315 }
316 RegExpEngine::CompilationResult result =
317 RegExpEngine::Compile(&compile_data,
318 flags.is_ignore_case(),
319 flags.is_multiline(),
320 pattern,
321 is_ascii);
322 if (result.error_message != NULL) {
323 // Unable to compile regexp.
324 Handle<JSArray> array = Factory::NewJSArray(2);
325 SetElement(array, 0, pattern);
326 SetElement(array,
327 1,
328 Factory::NewStringFromUtf8(CStrVector(result.error_message)));
329 Handle<Object> regexp_err =
330 Factory::NewSyntaxError("malformed_regexp", array);
331 Top::Throw(*regexp_err);
332 re->SetDataAt(JSRegExp::code_index(is_ascii), *regexp_err);
333 return false;
334 }
335
336 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
337 data->set(JSRegExp::code_index(is_ascii), result.code);
338 int register_max = IrregexpMaxRegisterCount(*data);
339 if (result.num_registers > register_max) {
340 SetIrregexpMaxRegisterCount(*data, result.num_registers);
341 }
342
343 return true;
344 }
345
346
IrregexpMaxRegisterCount(FixedArray * re)347 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
348 return Smi::cast(
349 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
350 }
351
352
SetIrregexpMaxRegisterCount(FixedArray * re,int value)353 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
354 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
355 }
356
357
IrregexpNumberOfCaptures(FixedArray * re)358 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
359 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
360 }
361
362
IrregexpNumberOfRegisters(FixedArray * re)363 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
364 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
365 }
366
367
IrregexpByteCode(FixedArray * re,bool is_ascii)368 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
369 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
370 }
371
372
IrregexpNativeCode(FixedArray * re,bool is_ascii)373 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
374 return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
375 }
376
377
IrregexpPrepare(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count)378 void RegExpImpl::IrregexpPrepare(Handle<JSRegExp> re,
379 Handle<String> pattern,
380 JSRegExp::Flags flags,
381 int capture_count) {
382 // Initialize compiled code entries to null.
383 Factory::SetRegExpIrregexpData(re,
384 JSRegExp::IRREGEXP,
385 pattern,
386 flags,
387 capture_count);
388 }
389
390
IrregexpExec(Handle<JSRegExp> jsregexp,Handle<String> subject,int previous_index,Handle<JSArray> last_match_info)391 Handle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> jsregexp,
392 Handle<String> subject,
393 int previous_index,
394 Handle<JSArray> last_match_info) {
395 ASSERT_EQ(jsregexp->TypeTag(), JSRegExp::IRREGEXP);
396
397 // Prepare space for the return values.
398 int number_of_capture_registers =
399 (IrregexpNumberOfCaptures(FixedArray::cast(jsregexp->data())) + 1) * 2;
400
401 #ifndef V8_NATIVE_REGEXP
402 #ifdef DEBUG
403 if (FLAG_trace_regexp_bytecodes) {
404 String* pattern = jsregexp->Pattern();
405 PrintF("\n\nRegexp match: /%s/\n\n", *(pattern->ToCString()));
406 PrintF("\n\nSubject string: '%s'\n\n", *(subject->ToCString()));
407 }
408 #endif
409 #endif
410
411 if (!subject->IsFlat()) {
412 FlattenString(subject);
413 }
414
415 last_match_info->EnsureSize(number_of_capture_registers + kLastMatchOverhead);
416
417 Handle<FixedArray> array;
418
419 // Dispatch to the correct RegExp implementation.
420 Handle<FixedArray> regexp(FixedArray::cast(jsregexp->data()));
421
422 #ifdef V8_NATIVE_REGEXP
423
424 OffsetsVector captures(number_of_capture_registers);
425 int* captures_vector = captures.vector();
426 NativeRegExpMacroAssembler::Result res;
427 do {
428 bool is_ascii = subject->IsAsciiRepresentation();
429 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
430 return Handle<Object>::null();
431 }
432 Handle<Code> code(RegExpImpl::IrregexpNativeCode(*regexp, is_ascii));
433 res = NativeRegExpMacroAssembler::Match(code,
434 subject,
435 captures_vector,
436 captures.length(),
437 previous_index);
438 // If result is RETRY, the string have changed representation, and we
439 // must restart from scratch.
440 } while (res == NativeRegExpMacroAssembler::RETRY);
441 if (res == NativeRegExpMacroAssembler::EXCEPTION) {
442 ASSERT(Top::has_pending_exception());
443 return Handle<Object>::null();
444 }
445 ASSERT(res == NativeRegExpMacroAssembler::SUCCESS
446 || res == NativeRegExpMacroAssembler::FAILURE);
447
448 if (res != NativeRegExpMacroAssembler::SUCCESS) return Factory::null_value();
449
450 array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
451 ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
452 // The captures come in (start, end+1) pairs.
453 for (int i = 0; i < number_of_capture_registers; i += 2) {
454 SetCapture(*array, i, captures_vector[i]);
455 SetCapture(*array, i + 1, captures_vector[i + 1]);
456 }
457
458 #else // ! V8_NATIVE_REGEXP
459
460 bool is_ascii = subject->IsAsciiRepresentation();
461 if (!EnsureCompiledIrregexp(jsregexp, is_ascii)) {
462 return Handle<Object>::null();
463 }
464 // Now that we have done EnsureCompiledIrregexp we can get the number of
465 // registers.
466 int number_of_registers =
467 IrregexpNumberOfRegisters(FixedArray::cast(jsregexp->data()));
468 OffsetsVector registers(number_of_registers);
469 int* register_vector = registers.vector();
470 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
471 register_vector[i] = -1;
472 }
473 Handle<ByteArray> byte_codes(IrregexpByteCode(*regexp, is_ascii));
474
475 if (!IrregexpInterpreter::Match(byte_codes,
476 subject,
477 register_vector,
478 previous_index)) {
479 return Factory::null_value();
480 }
481
482 array = Handle<FixedArray>(FixedArray::cast(last_match_info->elements()));
483 ASSERT(array->length() >= number_of_capture_registers + kLastMatchOverhead);
484 // The captures come in (start, end+1) pairs.
485 for (int i = 0; i < number_of_capture_registers; i += 2) {
486 SetCapture(*array, i, register_vector[i]);
487 SetCapture(*array, i + 1, register_vector[i + 1]);
488 }
489
490 #endif // V8_NATIVE_REGEXP
491
492 SetLastCaptureCount(*array, number_of_capture_registers);
493 SetLastSubject(*array, *subject);
494 SetLastInput(*array, *subject);
495
496 return last_match_info;
497 }
498
499
500 // -------------------------------------------------------------------
501 // Implementation of the Irregexp regular expression engine.
502 //
503 // The Irregexp regular expression engine is intended to be a complete
504 // implementation of ECMAScript regular expressions. It generates either
505 // bytecodes or native code.
506
507 // The Irregexp regexp engine is structured in three steps.
508 // 1) The parser generates an abstract syntax tree. See ast.cc.
509 // 2) From the AST a node network is created. The nodes are all
510 // subclasses of RegExpNode. The nodes represent states when
511 // executing a regular expression. Several optimizations are
512 // performed on the node network.
513 // 3) From the nodes we generate either byte codes or native code
514 // that can actually execute the regular expression (perform
515 // the search). The code generation step is described in more
516 // detail below.
517
518 // Code generation.
519 //
520 // The nodes are divided into four main categories.
521 // * Choice nodes
522 // These represent places where the regular expression can
523 // match in more than one way. For example on entry to an
524 // alternation (foo|bar) or a repetition (*, +, ? or {}).
525 // * Action nodes
526 // These represent places where some action should be
527 // performed. Examples include recording the current position
528 // in the input string to a register (in order to implement
529 // captures) or other actions on register for example in order
530 // to implement the counters needed for {} repetitions.
531 // * Matching nodes
532 // These attempt to match some element part of the input string.
533 // Examples of elements include character classes, plain strings
534 // or back references.
535 // * End nodes
536 // These are used to implement the actions required on finding
537 // a successful match or failing to find a match.
538 //
539 // The code generated (whether as byte codes or native code) maintains
540 // some state as it runs. This consists of the following elements:
541 //
542 // * The capture registers. Used for string captures.
543 // * Other registers. Used for counters etc.
544 // * The current position.
545 // * The stack of backtracking information. Used when a matching node
546 // fails to find a match and needs to try an alternative.
547 //
548 // Conceptual regular expression execution model:
549 //
550 // There is a simple conceptual model of regular expression execution
551 // which will be presented first. The actual code generated is a more
552 // efficient simulation of the simple conceptual model:
553 //
554 // * Choice nodes are implemented as follows:
555 // For each choice except the last {
556 // push current position
557 // push backtrack code location
558 // <generate code to test for choice>
559 // backtrack code location:
560 // pop current position
561 // }
562 // <generate code to test for last choice>
563 //
564 // * Actions nodes are generated as follows
565 // <push affected registers on backtrack stack>
566 // <generate code to perform action>
567 // push backtrack code location
568 // <generate code to test for following nodes>
569 // backtrack code location:
570 // <pop affected registers to restore their state>
571 // <pop backtrack location from stack and go to it>
572 //
573 // * Matching nodes are generated as follows:
574 // if input string matches at current position
575 // update current position
576 // <generate code to test for following nodes>
577 // else
578 // <pop backtrack location from stack and go to it>
579 //
580 // Thus it can be seen that the current position is saved and restored
581 // by the choice nodes, whereas the registers are saved and restored by
582 // by the action nodes that manipulate them.
583 //
584 // The other interesting aspect of this model is that nodes are generated
585 // at the point where they are needed by a recursive call to Emit(). If
586 // the node has already been code generated then the Emit() call will
587 // generate a jump to the previously generated code instead. In order to
588 // limit recursion it is possible for the Emit() function to put the node
589 // on a work list for later generation and instead generate a jump. The
590 // destination of the jump is resolved later when the code is generated.
591 //
592 // Actual regular expression code generation.
593 //
594 // Code generation is actually more complicated than the above. In order
595 // to improve the efficiency of the generated code some optimizations are
596 // performed
597 //
598 // * Choice nodes have 1-character lookahead.
599 // A choice node looks at the following character and eliminates some of
600 // the choices immediately based on that character. This is not yet
601 // implemented.
602 // * Simple greedy loops store reduced backtracking information.
603 // A quantifier like /.*foo/m will greedily match the whole input. It will
604 // then need to backtrack to a point where it can match "foo". The naive
605 // implementation of this would push each character position onto the
606 // backtracking stack, then pop them off one by one. This would use space
607 // proportional to the length of the input string. However since the "."
608 // can only match in one way and always has a constant length (in this case
609 // of 1) it suffices to store the current position on the top of the stack
610 // once. Matching now becomes merely incrementing the current position and
611 // backtracking becomes decrementing the current position and checking the
612 // result against the stored current position. This is faster and saves
613 // space.
614 // * The current state is virtualized.
615 // This is used to defer expensive operations until it is clear that they
616 // are needed and to generate code for a node more than once, allowing
617 // specialized an efficient versions of the code to be created. This is
618 // explained in the section below.
619 //
620 // Execution state virtualization.
621 //
622 // Instead of emitting code, nodes that manipulate the state can record their
623 // manipulation in an object called the Trace. The Trace object can record a
624 // current position offset, an optional backtrack code location on the top of
625 // the virtualized backtrack stack and some register changes. When a node is
626 // to be emitted it can flush the Trace or update it. Flushing the Trace
627 // will emit code to bring the actual state into line with the virtual state.
628 // Avoiding flushing the state can postpone some work (eg updates of capture
629 // registers). Postponing work can save time when executing the regular
630 // expression since it may be found that the work never has to be done as a
631 // failure to match can occur. In addition it is much faster to jump to a
632 // known backtrack code location than it is to pop an unknown backtrack
633 // location from the stack and jump there.
634 //
635 // The virtual state found in the Trace affects code generation. For example
636 // the virtual state contains the difference between the actual current
637 // position and the virtual current position, and matching code needs to use
638 // this offset to attempt a match in the correct location of the input
639 // string. Therefore code generated for a non-trivial trace is specialized
640 // to that trace. The code generator therefore has the ability to generate
641 // code for each node several times. In order to limit the size of the
642 // generated code there is an arbitrary limit on how many specialized sets of
643 // code may be generated for a given node. If the limit is reached, the
644 // trace is flushed and a generic version of the code for a node is emitted.
645 // This is subsequently used for that node. The code emitted for non-generic
646 // trace is not recorded in the node and so it cannot currently be reused in
647 // the event that code generation is requested for an identical trace.
648
649
AppendToText(RegExpText * text)650 void RegExpTree::AppendToText(RegExpText* text) {
651 UNREACHABLE();
652 }
653
654
AppendToText(RegExpText * text)655 void RegExpAtom::AppendToText(RegExpText* text) {
656 text->AddElement(TextElement::Atom(this));
657 }
658
659
AppendToText(RegExpText * text)660 void RegExpCharacterClass::AppendToText(RegExpText* text) {
661 text->AddElement(TextElement::CharClass(this));
662 }
663
664
AppendToText(RegExpText * text)665 void RegExpText::AppendToText(RegExpText* text) {
666 for (int i = 0; i < elements()->length(); i++)
667 text->AddElement(elements()->at(i));
668 }
669
670
Atom(RegExpAtom * atom)671 TextElement TextElement::Atom(RegExpAtom* atom) {
672 TextElement result = TextElement(ATOM);
673 result.data.u_atom = atom;
674 return result;
675 }
676
677
CharClass(RegExpCharacterClass * char_class)678 TextElement TextElement::CharClass(
679 RegExpCharacterClass* char_class) {
680 TextElement result = TextElement(CHAR_CLASS);
681 result.data.u_char_class = char_class;
682 return result;
683 }
684
685
length()686 int TextElement::length() {
687 if (type == ATOM) {
688 return data.u_atom->length();
689 } else {
690 ASSERT(type == CHAR_CLASS);
691 return 1;
692 }
693 }
694
695
GetTable(bool ignore_case)696 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
697 if (table_ == NULL) {
698 table_ = new DispatchTable();
699 DispatchTableConstructor cons(table_, ignore_case);
700 cons.BuildTable(this);
701 }
702 return table_;
703 }
704
705
706 class RegExpCompiler {
707 public:
708 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii);
709
AllocateRegister()710 int AllocateRegister() {
711 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
712 reg_exp_too_big_ = true;
713 return next_register_;
714 }
715 return next_register_++;
716 }
717
718 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
719 RegExpNode* start,
720 int capture_count,
721 Handle<String> pattern);
722
AddWork(RegExpNode * node)723 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
724
725 static const int kImplementationOffset = 0;
726 static const int kNumberOfRegistersOffset = 0;
727 static const int kCodeOffset = 1;
728
macro_assembler()729 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()730 EndNode* accept() { return accept_; }
731
732 static const int kMaxRecursion = 100;
recursion_depth()733 inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()734 inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()735 inline void DecrementRecursionDepth() { recursion_depth_--; }
736
SetRegExpTooBig()737 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
738
ignore_case()739 inline bool ignore_case() { return ignore_case_; }
ascii()740 inline bool ascii() { return ascii_; }
741
742 static const int kNoRegister = -1;
743 private:
744 EndNode* accept_;
745 int next_register_;
746 List<RegExpNode*>* work_list_;
747 int recursion_depth_;
748 RegExpMacroAssembler* macro_assembler_;
749 bool ignore_case_;
750 bool ascii_;
751 bool reg_exp_too_big_;
752 };
753
754
755 class RecursionCheck {
756 public:
RecursionCheck(RegExpCompiler * compiler)757 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
758 compiler->IncrementRecursionDepth();
759 }
~RecursionCheck()760 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
761 private:
762 RegExpCompiler* compiler_;
763 };
764
765
IrregexpRegExpTooBig()766 static RegExpEngine::CompilationResult IrregexpRegExpTooBig() {
767 return RegExpEngine::CompilationResult("RegExp too big");
768 }
769
770
771 // Attempts to compile the regexp using an Irregexp code generator. Returns
772 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(int capture_count,bool ignore_case,bool ascii)773 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii)
774 : next_register_(2 * (capture_count + 1)),
775 work_list_(NULL),
776 recursion_depth_(0),
777 ignore_case_(ignore_case),
778 ascii_(ascii),
779 reg_exp_too_big_(false) {
780 accept_ = new EndNode(EndNode::ACCEPT);
781 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
782 }
783
784
Assemble(RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)785 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
786 RegExpMacroAssembler* macro_assembler,
787 RegExpNode* start,
788 int capture_count,
789 Handle<String> pattern) {
790 #ifdef DEBUG
791 if (FLAG_trace_regexp_assembler)
792 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
793 else
794 #endif
795 macro_assembler_ = macro_assembler;
796 List <RegExpNode*> work_list(0);
797 work_list_ = &work_list;
798 Label fail;
799 macro_assembler_->PushBacktrack(&fail);
800 Trace new_trace;
801 start->Emit(this, &new_trace);
802 macro_assembler_->Bind(&fail);
803 macro_assembler_->Fail();
804 while (!work_list.is_empty()) {
805 work_list.RemoveLast()->Emit(this, &new_trace);
806 }
807 if (reg_exp_too_big_) return IrregexpRegExpTooBig();
808
809 Handle<Object> code = macro_assembler_->GetCode(pattern);
810
811 work_list_ = NULL;
812 #ifdef DEBUG
813 if (FLAG_trace_regexp_assembler) {
814 delete macro_assembler_;
815 }
816 #endif
817 return RegExpEngine::CompilationResult(*code, next_register_);
818 }
819
820
Mentions(int that)821 bool Trace::DeferredAction::Mentions(int that) {
822 if (type() == ActionNode::CLEAR_CAPTURES) {
823 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
824 return range.Contains(that);
825 } else {
826 return reg() == that;
827 }
828 }
829
830
mentions_reg(int reg)831 bool Trace::mentions_reg(int reg) {
832 for (DeferredAction* action = actions_;
833 action != NULL;
834 action = action->next()) {
835 if (action->Mentions(reg))
836 return true;
837 }
838 return false;
839 }
840
841
GetStoredPosition(int reg,int * cp_offset)842 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
843 ASSERT_EQ(0, *cp_offset);
844 for (DeferredAction* action = actions_;
845 action != NULL;
846 action = action->next()) {
847 if (action->Mentions(reg)) {
848 if (action->type() == ActionNode::STORE_POSITION) {
849 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
850 return true;
851 } else {
852 return false;
853 }
854 }
855 }
856 return false;
857 }
858
859
FindAffectedRegisters(OutSet * affected_registers)860 int Trace::FindAffectedRegisters(OutSet* affected_registers) {
861 int max_register = RegExpCompiler::kNoRegister;
862 for (DeferredAction* action = actions_;
863 action != NULL;
864 action = action->next()) {
865 if (action->type() == ActionNode::CLEAR_CAPTURES) {
866 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
867 for (int i = range.from(); i <= range.to(); i++)
868 affected_registers->Set(i);
869 if (range.to() > max_register) max_register = range.to();
870 } else {
871 affected_registers->Set(action->reg());
872 if (action->reg() > max_register) max_register = action->reg();
873 }
874 }
875 return max_register;
876 }
877
878
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,OutSet & registers_to_pop,OutSet & registers_to_clear)879 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
880 int max_register,
881 OutSet& registers_to_pop,
882 OutSet& registers_to_clear) {
883 for (int reg = max_register; reg >= 0; reg--) {
884 if (registers_to_pop.Get(reg)) assembler->PopRegister(reg);
885 else if (registers_to_clear.Get(reg)) {
886 int clear_to = reg;
887 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
888 reg--;
889 }
890 assembler->ClearRegisters(reg, clear_to);
891 }
892 }
893 }
894
895
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,OutSet & affected_registers,OutSet * registers_to_pop,OutSet * registers_to_clear)896 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
897 int max_register,
898 OutSet& affected_registers,
899 OutSet* registers_to_pop,
900 OutSet* registers_to_clear) {
901 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
902 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
903
904 // Count pushes performed to force a stack limit check occasionally.
905 int pushes = 0;
906
907 for (int reg = 0; reg <= max_register; reg++) {
908 if (!affected_registers.Get(reg)) {
909 continue;
910 }
911
912 // The chronologically first deferred action in the trace
913 // is used to infer the action needed to restore a register
914 // to its previous state (or not, if it's safe to ignore it).
915 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
916 DeferredActionUndoType undo_action = IGNORE;
917
918 int value = 0;
919 bool absolute = false;
920 bool clear = false;
921 int store_position = -1;
922 // This is a little tricky because we are scanning the actions in reverse
923 // historical order (newest first).
924 for (DeferredAction* action = actions_;
925 action != NULL;
926 action = action->next()) {
927 if (action->Mentions(reg)) {
928 switch (action->type()) {
929 case ActionNode::SET_REGISTER: {
930 Trace::DeferredSetRegister* psr =
931 static_cast<Trace::DeferredSetRegister*>(action);
932 if (!absolute) {
933 value += psr->value();
934 absolute = true;
935 }
936 // SET_REGISTER is currently only used for newly introduced loop
937 // counters. They can have a significant previous value if they
938 // occour in a loop. TODO(lrn): Propagate this information, so
939 // we can set undo_action to IGNORE if we know there is no value to
940 // restore.
941 undo_action = RESTORE;
942 ASSERT_EQ(store_position, -1);
943 ASSERT(!clear);
944 break;
945 }
946 case ActionNode::INCREMENT_REGISTER:
947 if (!absolute) {
948 value++;
949 }
950 ASSERT_EQ(store_position, -1);
951 ASSERT(!clear);
952 undo_action = RESTORE;
953 break;
954 case ActionNode::STORE_POSITION: {
955 Trace::DeferredCapture* pc =
956 static_cast<Trace::DeferredCapture*>(action);
957 if (!clear && store_position == -1) {
958 store_position = pc->cp_offset();
959 }
960
961 // For captures we know that stores and clears alternate.
962 // Other register, are never cleared, and if the occur
963 // inside a loop, they might be assigned more than once.
964 if (reg <= 1) {
965 // Registers zero and one, aka "capture zero", is
966 // always set correctly if we succeed. There is no
967 // need to undo a setting on backtrack, because we
968 // will set it again or fail.
969 undo_action = IGNORE;
970 } else {
971 undo_action = pc->is_capture() ? CLEAR : RESTORE;
972 }
973 ASSERT(!absolute);
974 ASSERT_EQ(value, 0);
975 break;
976 }
977 case ActionNode::CLEAR_CAPTURES: {
978 // Since we're scanning in reverse order, if we've already
979 // set the position we have to ignore historically earlier
980 // clearing operations.
981 if (store_position == -1) {
982 clear = true;
983 }
984 undo_action = RESTORE;
985 ASSERT(!absolute);
986 ASSERT_EQ(value, 0);
987 break;
988 }
989 default:
990 UNREACHABLE();
991 break;
992 }
993 }
994 }
995 // Prepare for the undo-action (e.g., push if it's going to be popped).
996 if (undo_action == RESTORE) {
997 pushes++;
998 RegExpMacroAssembler::StackCheckFlag stack_check =
999 RegExpMacroAssembler::kNoStackLimitCheck;
1000 if (pushes == push_limit) {
1001 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1002 pushes = 0;
1003 }
1004
1005 assembler->PushRegister(reg, stack_check);
1006 registers_to_pop->Set(reg);
1007 } else if (undo_action == CLEAR) {
1008 registers_to_clear->Set(reg);
1009 }
1010 // Perform the chronologically last action (or accumulated increment)
1011 // for the register.
1012 if (store_position != -1) {
1013 assembler->WriteCurrentPositionToRegister(reg, store_position);
1014 } else if (clear) {
1015 assembler->ClearRegisters(reg, reg);
1016 } else if (absolute) {
1017 assembler->SetRegister(reg, value);
1018 } else if (value != 0) {
1019 assembler->AdvanceRegister(reg, value);
1020 }
1021 }
1022 }
1023
1024
1025 // This is called as we come into a loop choice node and some other tricky
1026 // nodes. It normalizes the state of the code generator to ensure we can
1027 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)1028 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1029 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1030
1031 ASSERT(!is_trivial());
1032
1033 if (actions_ == NULL && backtrack() == NULL) {
1034 // Here we just have some deferred cp advances to fix and we are back to
1035 // a normal situation. We may also have to forget some information gained
1036 // through a quick check that was already performed.
1037 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1038 // Create a new trivial state and generate the node with that.
1039 Trace new_state;
1040 successor->Emit(compiler, &new_state);
1041 return;
1042 }
1043
1044 // Generate deferred actions here along with code to undo them again.
1045 OutSet affected_registers;
1046
1047 if (backtrack() != NULL) {
1048 // Here we have a concrete backtrack location. These are set up by choice
1049 // nodes and so they indicate that we have a deferred save of the current
1050 // position which we may need to emit here.
1051 assembler->PushCurrentPosition();
1052 }
1053
1054 int max_register = FindAffectedRegisters(&affected_registers);
1055 OutSet registers_to_pop;
1056 OutSet registers_to_clear;
1057 PerformDeferredActions(assembler,
1058 max_register,
1059 affected_registers,
1060 ®isters_to_pop,
1061 ®isters_to_clear);
1062 if (cp_offset_ != 0) {
1063 assembler->AdvanceCurrentPosition(cp_offset_);
1064 }
1065
1066 // Create a new trivial state and generate the node with that.
1067 Label undo;
1068 assembler->PushBacktrack(&undo);
1069 Trace new_state;
1070 successor->Emit(compiler, &new_state);
1071
1072 // On backtrack we need to restore state.
1073 assembler->Bind(&undo);
1074 RestoreAffectedRegisters(assembler,
1075 max_register,
1076 registers_to_pop,
1077 registers_to_clear);
1078 if (backtrack() == NULL) {
1079 assembler->Backtrack();
1080 } else {
1081 assembler->PopCurrentPosition();
1082 assembler->GoTo(backtrack());
1083 }
1084 }
1085
1086
Emit(RegExpCompiler * compiler,Trace * trace)1087 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1088 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1089
1090 // Omit flushing the trace. We discard the entire stack frame anyway.
1091
1092 if (!label()->is_bound()) {
1093 // We are completely independent of the trace, since we ignore it,
1094 // so this code can be used as the generic version.
1095 assembler->Bind(label());
1096 }
1097
1098 // Throw away everything on the backtrack stack since the start
1099 // of the negative submatch and restore the character position.
1100 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1101 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1102 if (clear_capture_count_ > 0) {
1103 // Clear any captures that might have been performed during the success
1104 // of the body of the negative look-ahead.
1105 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1106 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1107 }
1108 // Now that we have unwound the stack we find at the top of the stack the
1109 // backtrack that the BeginSubmatch node got.
1110 assembler->Backtrack();
1111 }
1112
1113
Emit(RegExpCompiler * compiler,Trace * trace)1114 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1115 if (!trace->is_trivial()) {
1116 trace->Flush(compiler, this);
1117 return;
1118 }
1119 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1120 if (!label()->is_bound()) {
1121 assembler->Bind(label());
1122 }
1123 switch (action_) {
1124 case ACCEPT:
1125 assembler->Succeed();
1126 return;
1127 case BACKTRACK:
1128 assembler->GoTo(trace->backtrack());
1129 return;
1130 case NEGATIVE_SUBMATCH_SUCCESS:
1131 // This case is handled in a different virtual method.
1132 UNREACHABLE();
1133 }
1134 UNIMPLEMENTED();
1135 }
1136
1137
AddGuard(Guard * guard)1138 void GuardedAlternative::AddGuard(Guard* guard) {
1139 if (guards_ == NULL)
1140 guards_ = new ZoneList<Guard*>(1);
1141 guards_->Add(guard);
1142 }
1143
1144
SetRegister(int reg,int val,RegExpNode * on_success)1145 ActionNode* ActionNode::SetRegister(int reg,
1146 int val,
1147 RegExpNode* on_success) {
1148 ActionNode* result = new ActionNode(SET_REGISTER, on_success);
1149 result->data_.u_store_register.reg = reg;
1150 result->data_.u_store_register.value = val;
1151 return result;
1152 }
1153
1154
IncrementRegister(int reg,RegExpNode * on_success)1155 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1156 ActionNode* result = new ActionNode(INCREMENT_REGISTER, on_success);
1157 result->data_.u_increment_register.reg = reg;
1158 return result;
1159 }
1160
1161
StorePosition(int reg,bool is_capture,RegExpNode * on_success)1162 ActionNode* ActionNode::StorePosition(int reg,
1163 bool is_capture,
1164 RegExpNode* on_success) {
1165 ActionNode* result = new ActionNode(STORE_POSITION, on_success);
1166 result->data_.u_position_register.reg = reg;
1167 result->data_.u_position_register.is_capture = is_capture;
1168 return result;
1169 }
1170
1171
ClearCaptures(Interval range,RegExpNode * on_success)1172 ActionNode* ActionNode::ClearCaptures(Interval range,
1173 RegExpNode* on_success) {
1174 ActionNode* result = new ActionNode(CLEAR_CAPTURES, on_success);
1175 result->data_.u_clear_captures.range_from = range.from();
1176 result->data_.u_clear_captures.range_to = range.to();
1177 return result;
1178 }
1179
1180
BeginSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)1181 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1182 int position_reg,
1183 RegExpNode* on_success) {
1184 ActionNode* result = new ActionNode(BEGIN_SUBMATCH, on_success);
1185 result->data_.u_submatch.stack_pointer_register = stack_reg;
1186 result->data_.u_submatch.current_position_register = position_reg;
1187 return result;
1188 }
1189
1190
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)1191 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1192 int position_reg,
1193 int clear_register_count,
1194 int clear_register_from,
1195 RegExpNode* on_success) {
1196 ActionNode* result = new ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1197 result->data_.u_submatch.stack_pointer_register = stack_reg;
1198 result->data_.u_submatch.current_position_register = position_reg;
1199 result->data_.u_submatch.clear_register_count = clear_register_count;
1200 result->data_.u_submatch.clear_register_from = clear_register_from;
1201 return result;
1202 }
1203
1204
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)1205 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1206 int repetition_register,
1207 int repetition_limit,
1208 RegExpNode* on_success) {
1209 ActionNode* result = new ActionNode(EMPTY_MATCH_CHECK, on_success);
1210 result->data_.u_empty_match_check.start_register = start_register;
1211 result->data_.u_empty_match_check.repetition_register = repetition_register;
1212 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1213 return result;
1214 }
1215
1216
1217 #define DEFINE_ACCEPT(Type) \
1218 void Type##Node::Accept(NodeVisitor* visitor) { \
1219 visitor->Visit##Type(this); \
1220 }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)1221 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1222 #undef DEFINE_ACCEPT
1223
1224
1225 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1226 visitor->VisitLoopChoice(this);
1227 }
1228
1229
1230 // -------------------------------------------------------------------
1231 // Emit code.
1232
1233
GenerateGuard(RegExpMacroAssembler * macro_assembler,Guard * guard,Trace * trace)1234 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1235 Guard* guard,
1236 Trace* trace) {
1237 switch (guard->op()) {
1238 case Guard::LT:
1239 ASSERT(!trace->mentions_reg(guard->reg()));
1240 macro_assembler->IfRegisterGE(guard->reg(),
1241 guard->value(),
1242 trace->backtrack());
1243 break;
1244 case Guard::GEQ:
1245 ASSERT(!trace->mentions_reg(guard->reg()));
1246 macro_assembler->IfRegisterLT(guard->reg(),
1247 guard->value(),
1248 trace->backtrack());
1249 break;
1250 }
1251 }
1252
1253
1254 static unibrow::Mapping<unibrow::Ecma262UnCanonicalize> uncanonicalize;
1255 static unibrow::Mapping<unibrow::CanonicalizationRange> canonrange;
1256
1257
1258 // Returns the number of characters in the equivalence class, omitting those
1259 // that cannot occur in the source string because it is ASCII.
GetCaseIndependentLetters(uc16 character,bool ascii_subject,unibrow::uchar * letters)1260 static int GetCaseIndependentLetters(uc16 character,
1261 bool ascii_subject,
1262 unibrow::uchar* letters) {
1263 int length = uncanonicalize.get(character, '\0', letters);
1264 // Unibrow returns 0 or 1 for characters where case independependence is
1265 // trivial.
1266 if (length == 0) {
1267 letters[0] = character;
1268 length = 1;
1269 }
1270 if (!ascii_subject || character <= String::kMaxAsciiCharCode) {
1271 return length;
1272 }
1273 // The standard requires that non-ASCII characters cannot have ASCII
1274 // character codes in their equivalence class.
1275 return 0;
1276 }
1277
1278
EmitSimpleCharacter(RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1279 static inline bool EmitSimpleCharacter(RegExpCompiler* compiler,
1280 uc16 c,
1281 Label* on_failure,
1282 int cp_offset,
1283 bool check,
1284 bool preloaded) {
1285 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1286 bool bound_checked = false;
1287 if (!preloaded) {
1288 assembler->LoadCurrentCharacter(
1289 cp_offset,
1290 on_failure,
1291 check);
1292 bound_checked = true;
1293 }
1294 assembler->CheckNotCharacter(c, on_failure);
1295 return bound_checked;
1296 }
1297
1298
1299 // Only emits non-letters (things that don't have case). Only used for case
1300 // independent matches.
EmitAtomNonLetter(RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1301 static inline bool EmitAtomNonLetter(RegExpCompiler* compiler,
1302 uc16 c,
1303 Label* on_failure,
1304 int cp_offset,
1305 bool check,
1306 bool preloaded) {
1307 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1308 bool ascii = compiler->ascii();
1309 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1310 int length = GetCaseIndependentLetters(c, ascii, chars);
1311 if (length < 1) {
1312 // This can't match. Must be an ASCII subject and a non-ASCII character.
1313 // We do not need to do anything since the ASCII pass already handled this.
1314 return false; // Bounds not checked.
1315 }
1316 bool checked = false;
1317 // We handle the length > 1 case in a later pass.
1318 if (length == 1) {
1319 if (ascii && c > String::kMaxAsciiCharCodeU) {
1320 // Can't match - see above.
1321 return false; // Bounds not checked.
1322 }
1323 if (!preloaded) {
1324 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1325 checked = check;
1326 }
1327 macro_assembler->CheckNotCharacter(c, on_failure);
1328 }
1329 return checked;
1330 }
1331
1332
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool ascii,uc16 c1,uc16 c2,Label * on_failure)1333 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1334 bool ascii,
1335 uc16 c1,
1336 uc16 c2,
1337 Label* on_failure) {
1338 uc16 char_mask;
1339 if (ascii) {
1340 char_mask = String::kMaxAsciiCharCode;
1341 } else {
1342 char_mask = String::kMaxUC16CharCode;
1343 }
1344 uc16 exor = c1 ^ c2;
1345 // Check whether exor has only one bit set.
1346 if (((exor - 1) & exor) == 0) {
1347 // If c1 and c2 differ only by one bit.
1348 // Ecma262UnCanonicalize always gives the highest number last.
1349 ASSERT(c2 > c1);
1350 uc16 mask = char_mask ^ exor;
1351 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1352 return true;
1353 }
1354 ASSERT(c2 > c1);
1355 uc16 diff = c2 - c1;
1356 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1357 // If the characters differ by 2^n but don't differ by one bit then
1358 // subtract the difference from the found character, then do the or
1359 // trick. We avoid the theoretical case where negative numbers are
1360 // involved in order to simplify code generation.
1361 uc16 mask = char_mask ^ diff;
1362 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1363 diff,
1364 mask,
1365 on_failure);
1366 return true;
1367 }
1368 return false;
1369 }
1370
1371
1372 typedef bool EmitCharacterFunction(RegExpCompiler* compiler,
1373 uc16 c,
1374 Label* on_failure,
1375 int cp_offset,
1376 bool check,
1377 bool preloaded);
1378
1379 // Only emits letters (things that have case). Only used for case independent
1380 // matches.
EmitAtomLetter(RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1381 static inline bool EmitAtomLetter(RegExpCompiler* compiler,
1382 uc16 c,
1383 Label* on_failure,
1384 int cp_offset,
1385 bool check,
1386 bool preloaded) {
1387 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1388 bool ascii = compiler->ascii();
1389 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1390 int length = GetCaseIndependentLetters(c, ascii, chars);
1391 if (length <= 1) return false;
1392 // We may not need to check against the end of the input string
1393 // if this character lies before a character that matched.
1394 if (!preloaded) {
1395 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1396 }
1397 Label ok;
1398 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1399 switch (length) {
1400 case 2: {
1401 if (ShortCutEmitCharacterPair(macro_assembler,
1402 ascii,
1403 chars[0],
1404 chars[1],
1405 on_failure)) {
1406 } else {
1407 macro_assembler->CheckCharacter(chars[0], &ok);
1408 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1409 macro_assembler->Bind(&ok);
1410 }
1411 break;
1412 }
1413 case 4:
1414 macro_assembler->CheckCharacter(chars[3], &ok);
1415 // Fall through!
1416 case 3:
1417 macro_assembler->CheckCharacter(chars[0], &ok);
1418 macro_assembler->CheckCharacter(chars[1], &ok);
1419 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1420 macro_assembler->Bind(&ok);
1421 break;
1422 default:
1423 UNREACHABLE();
1424 break;
1425 }
1426 return true;
1427 }
1428
1429
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool ascii,Label * on_failure,int cp_offset,bool check_offset,bool preloaded)1430 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
1431 RegExpCharacterClass* cc,
1432 bool ascii,
1433 Label* on_failure,
1434 int cp_offset,
1435 bool check_offset,
1436 bool preloaded) {
1437 if (cc->is_standard() &&
1438 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
1439 cp_offset,
1440 check_offset,
1441 on_failure)) {
1442 return;
1443 }
1444
1445 ZoneList<CharacterRange>* ranges = cc->ranges();
1446 int max_char;
1447 if (ascii) {
1448 max_char = String::kMaxAsciiCharCode;
1449 } else {
1450 max_char = String::kMaxUC16CharCode;
1451 }
1452
1453 Label success;
1454
1455 Label* char_is_in_class =
1456 cc->is_negated() ? on_failure : &success;
1457
1458 int range_count = ranges->length();
1459
1460 int last_valid_range = range_count - 1;
1461 while (last_valid_range >= 0) {
1462 CharacterRange& range = ranges->at(last_valid_range);
1463 if (range.from() <= max_char) {
1464 break;
1465 }
1466 last_valid_range--;
1467 }
1468
1469 if (last_valid_range < 0) {
1470 if (!cc->is_negated()) {
1471 // TODO(plesner): We can remove this when the node level does our
1472 // ASCII optimizations for us.
1473 macro_assembler->GoTo(on_failure);
1474 }
1475 if (check_offset) {
1476 macro_assembler->CheckPosition(cp_offset, on_failure);
1477 }
1478 return;
1479 }
1480
1481 if (last_valid_range == 0 &&
1482 !cc->is_negated() &&
1483 ranges->at(0).IsEverything(max_char)) {
1484 // This is a common case hit by non-anchored expressions.
1485 if (check_offset) {
1486 macro_assembler->CheckPosition(cp_offset, on_failure);
1487 }
1488 return;
1489 }
1490
1491 if (!preloaded) {
1492 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
1493 }
1494
1495 for (int i = 0; i < last_valid_range; i++) {
1496 CharacterRange& range = ranges->at(i);
1497 Label next_range;
1498 uc16 from = range.from();
1499 uc16 to = range.to();
1500 if (from > max_char) {
1501 continue;
1502 }
1503 if (to > max_char) to = max_char;
1504 if (to == from) {
1505 macro_assembler->CheckCharacter(to, char_is_in_class);
1506 } else {
1507 if (from != 0) {
1508 macro_assembler->CheckCharacterLT(from, &next_range);
1509 }
1510 if (to != max_char) {
1511 macro_assembler->CheckCharacterLT(to + 1, char_is_in_class);
1512 } else {
1513 macro_assembler->GoTo(char_is_in_class);
1514 }
1515 }
1516 macro_assembler->Bind(&next_range);
1517 }
1518
1519 CharacterRange& range = ranges->at(last_valid_range);
1520 uc16 from = range.from();
1521 uc16 to = range.to();
1522
1523 if (to > max_char) to = max_char;
1524 ASSERT(to >= from);
1525
1526 if (to == from) {
1527 if (cc->is_negated()) {
1528 macro_assembler->CheckCharacter(to, on_failure);
1529 } else {
1530 macro_assembler->CheckNotCharacter(to, on_failure);
1531 }
1532 } else {
1533 if (from != 0) {
1534 if (cc->is_negated()) {
1535 macro_assembler->CheckCharacterLT(from, &success);
1536 } else {
1537 macro_assembler->CheckCharacterLT(from, on_failure);
1538 }
1539 }
1540 if (to != String::kMaxUC16CharCode) {
1541 if (cc->is_negated()) {
1542 macro_assembler->CheckCharacterLT(to + 1, on_failure);
1543 } else {
1544 macro_assembler->CheckCharacterGT(to, on_failure);
1545 }
1546 } else {
1547 if (cc->is_negated()) {
1548 macro_assembler->GoTo(on_failure);
1549 }
1550 }
1551 }
1552 macro_assembler->Bind(&success);
1553 }
1554
1555
~RegExpNode()1556 RegExpNode::~RegExpNode() {
1557 }
1558
1559
LimitVersions(RegExpCompiler * compiler,Trace * trace)1560 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
1561 Trace* trace) {
1562 // If we are generating a greedy loop then don't stop and don't reuse code.
1563 if (trace->stop_node() != NULL) {
1564 return CONTINUE;
1565 }
1566
1567 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1568 if (trace->is_trivial()) {
1569 if (label_.is_bound()) {
1570 // We are being asked to generate a generic version, but that's already
1571 // been done so just go to it.
1572 macro_assembler->GoTo(&label_);
1573 return DONE;
1574 }
1575 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
1576 // To avoid too deep recursion we push the node to the work queue and just
1577 // generate a goto here.
1578 compiler->AddWork(this);
1579 macro_assembler->GoTo(&label_);
1580 return DONE;
1581 }
1582 // Generate generic version of the node and bind the label for later use.
1583 macro_assembler->Bind(&label_);
1584 return CONTINUE;
1585 }
1586
1587 // We are being asked to make a non-generic version. Keep track of how many
1588 // non-generic versions we generate so as not to overdo it.
1589 trace_count_++;
1590 if (FLAG_regexp_optimization &&
1591 trace_count_ < kMaxCopiesCodeGenerated &&
1592 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
1593 return CONTINUE;
1594 }
1595
1596 // If we get here code has been generated for this node too many times or
1597 // recursion is too deep. Time to switch to a generic version. The code for
1598 // generic versions above can handle deep recursion properly.
1599 trace->Flush(compiler, this);
1600 return DONE;
1601 }
1602
1603
EatsAtLeast(int still_to_find,int recursion_depth)1604 int ActionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1605 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1606 if (type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
1607 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1608 }
1609
1610
EatsAtLeast(int still_to_find,int recursion_depth)1611 int AssertionNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1612 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1613 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1614 }
1615
1616
EatsAtLeast(int still_to_find,int recursion_depth)1617 int BackReferenceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1618 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1619 return on_success()->EatsAtLeast(still_to_find, recursion_depth + 1);
1620 }
1621
1622
EatsAtLeast(int still_to_find,int recursion_depth)1623 int TextNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1624 int answer = Length();
1625 if (answer >= still_to_find) return answer;
1626 if (recursion_depth > RegExpCompiler::kMaxRecursion) return answer;
1627 return answer + on_success()->EatsAtLeast(still_to_find - answer,
1628 recursion_depth + 1);
1629 }
1630
1631
EatsAtLeast(int still_to_find,int recursion_depth)1632 int NegativeLookaheadChoiceNode:: EatsAtLeast(int still_to_find,
1633 int recursion_depth) {
1634 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1635 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1636 // afterwards.
1637 RegExpNode* node = alternatives_->at(1).node();
1638 return node->EatsAtLeast(still_to_find, recursion_depth + 1);
1639 }
1640
1641
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)1642 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
1643 QuickCheckDetails* details,
1644 RegExpCompiler* compiler,
1645 int filled_in,
1646 bool not_at_start) {
1647 // Alternative 0 is the negative lookahead, alternative 1 is what comes
1648 // afterwards.
1649 RegExpNode* node = alternatives_->at(1).node();
1650 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
1651 }
1652
1653
EatsAtLeastHelper(int still_to_find,int recursion_depth,RegExpNode * ignore_this_node)1654 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
1655 int recursion_depth,
1656 RegExpNode* ignore_this_node) {
1657 if (recursion_depth > RegExpCompiler::kMaxRecursion) return 0;
1658 int min = 100;
1659 int choice_count = alternatives_->length();
1660 for (int i = 0; i < choice_count; i++) {
1661 RegExpNode* node = alternatives_->at(i).node();
1662 if (node == ignore_this_node) continue;
1663 int node_eats_at_least = node->EatsAtLeast(still_to_find,
1664 recursion_depth + 1);
1665 if (node_eats_at_least < min) min = node_eats_at_least;
1666 }
1667 return min;
1668 }
1669
1670
EatsAtLeast(int still_to_find,int recursion_depth)1671 int LoopChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1672 return EatsAtLeastHelper(still_to_find, recursion_depth, loop_node_);
1673 }
1674
1675
EatsAtLeast(int still_to_find,int recursion_depth)1676 int ChoiceNode::EatsAtLeast(int still_to_find, int recursion_depth) {
1677 return EatsAtLeastHelper(still_to_find, recursion_depth, NULL);
1678 }
1679
1680
1681 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)1682 static inline uint32_t SmearBitsRight(uint32_t v) {
1683 v |= v >> 1;
1684 v |= v >> 2;
1685 v |= v >> 4;
1686 v |= v >> 8;
1687 v |= v >> 16;
1688 return v;
1689 }
1690
1691
Rationalize(bool asc)1692 bool QuickCheckDetails::Rationalize(bool asc) {
1693 bool found_useful_op = false;
1694 uint32_t char_mask;
1695 if (asc) {
1696 char_mask = String::kMaxAsciiCharCode;
1697 } else {
1698 char_mask = String::kMaxUC16CharCode;
1699 }
1700 mask_ = 0;
1701 value_ = 0;
1702 int char_shift = 0;
1703 for (int i = 0; i < characters_; i++) {
1704 Position* pos = &positions_[i];
1705 if ((pos->mask & String::kMaxAsciiCharCode) != 0) {
1706 found_useful_op = true;
1707 }
1708 mask_ |= (pos->mask & char_mask) << char_shift;
1709 value_ |= (pos->value & char_mask) << char_shift;
1710 char_shift += asc ? 8 : 16;
1711 }
1712 return found_useful_op;
1713 }
1714
1715
EmitQuickCheck(RegExpCompiler * compiler,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure)1716 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
1717 Trace* trace,
1718 bool preload_has_checked_bounds,
1719 Label* on_possible_success,
1720 QuickCheckDetails* details,
1721 bool fall_through_on_failure) {
1722 if (details->characters() == 0) return false;
1723 GetQuickCheckDetails(details, compiler, 0, trace->at_start() == Trace::FALSE);
1724 if (details->cannot_match()) return false;
1725 if (!details->Rationalize(compiler->ascii())) return false;
1726 ASSERT(details->characters() == 1 ||
1727 compiler->macro_assembler()->CanReadUnaligned());
1728 uint32_t mask = details->mask();
1729 uint32_t value = details->value();
1730
1731 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1732
1733 if (trace->characters_preloaded() != details->characters()) {
1734 assembler->LoadCurrentCharacter(trace->cp_offset(),
1735 trace->backtrack(),
1736 !preload_has_checked_bounds,
1737 details->characters());
1738 }
1739
1740
1741 bool need_mask = true;
1742
1743 if (details->characters() == 1) {
1744 // If number of characters preloaded is 1 then we used a byte or 16 bit
1745 // load so the value is already masked down.
1746 uint32_t char_mask;
1747 if (compiler->ascii()) {
1748 char_mask = String::kMaxAsciiCharCode;
1749 } else {
1750 char_mask = String::kMaxUC16CharCode;
1751 }
1752 if ((mask & char_mask) == char_mask) need_mask = false;
1753 mask &= char_mask;
1754 } else {
1755 // For 2-character preloads in ASCII mode we also use a 16 bit load with
1756 // zero extend.
1757 if (details->characters() == 2 && compiler->ascii()) {
1758 if ((mask & 0xffff) == 0xffff) need_mask = false;
1759 } else {
1760 if (mask == 0xffffffff) need_mask = false;
1761 }
1762 }
1763
1764 if (fall_through_on_failure) {
1765 if (need_mask) {
1766 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
1767 } else {
1768 assembler->CheckCharacter(value, on_possible_success);
1769 }
1770 } else {
1771 if (need_mask) {
1772 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
1773 } else {
1774 assembler->CheckNotCharacter(value, trace->backtrack());
1775 }
1776 }
1777 return true;
1778 }
1779
1780
1781 // Here is the meat of GetQuickCheckDetails (see also the comment on the
1782 // super-class in the .h file).
1783 //
1784 // We iterate along the text object, building up for each character a
1785 // mask and value that can be used to test for a quick failure to match.
1786 // The masks and values for the positions will be combined into a single
1787 // machine word for the current character width in order to be used in
1788 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)1789 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
1790 RegExpCompiler* compiler,
1791 int characters_filled_in,
1792 bool not_at_start) {
1793 ASSERT(characters_filled_in < details->characters());
1794 int characters = details->characters();
1795 int char_mask;
1796 int char_shift;
1797 if (compiler->ascii()) {
1798 char_mask = String::kMaxAsciiCharCode;
1799 char_shift = 8;
1800 } else {
1801 char_mask = String::kMaxUC16CharCode;
1802 char_shift = 16;
1803 }
1804 for (int k = 0; k < elms_->length(); k++) {
1805 TextElement elm = elms_->at(k);
1806 if (elm.type == TextElement::ATOM) {
1807 Vector<const uc16> quarks = elm.data.u_atom->data();
1808 for (int i = 0; i < characters && i < quarks.length(); i++) {
1809 QuickCheckDetails::Position* pos =
1810 details->positions(characters_filled_in);
1811 uc16 c = quarks[i];
1812 if (c > char_mask) {
1813 // If we expect a non-ASCII character from an ASCII string,
1814 // there is no way we can match. Not even case independent
1815 // matching can turn an ASCII character into non-ASCII or
1816 // vice versa.
1817 details->set_cannot_match();
1818 pos->determines_perfectly = false;
1819 return;
1820 }
1821 if (compiler->ignore_case()) {
1822 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1823 int length = GetCaseIndependentLetters(c, compiler->ascii(), chars);
1824 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
1825 if (length == 1) {
1826 // This letter has no case equivalents, so it's nice and simple
1827 // and the mask-compare will determine definitely whether we have
1828 // a match at this character position.
1829 pos->mask = char_mask;
1830 pos->value = c;
1831 pos->determines_perfectly = true;
1832 } else {
1833 uint32_t common_bits = char_mask;
1834 uint32_t bits = chars[0];
1835 for (int j = 1; j < length; j++) {
1836 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
1837 common_bits ^= differing_bits;
1838 bits &= common_bits;
1839 }
1840 // If length is 2 and common bits has only one zero in it then
1841 // our mask and compare instruction will determine definitely
1842 // whether we have a match at this character position. Otherwise
1843 // it can only be an approximate check.
1844 uint32_t one_zero = (common_bits | ~char_mask);
1845 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
1846 pos->determines_perfectly = true;
1847 }
1848 pos->mask = common_bits;
1849 pos->value = bits;
1850 }
1851 } else {
1852 // Don't ignore case. Nice simple case where the mask-compare will
1853 // determine definitely whether we have a match at this character
1854 // position.
1855 pos->mask = char_mask;
1856 pos->value = c;
1857 pos->determines_perfectly = true;
1858 }
1859 characters_filled_in++;
1860 ASSERT(characters_filled_in <= details->characters());
1861 if (characters_filled_in == details->characters()) {
1862 return;
1863 }
1864 }
1865 } else {
1866 QuickCheckDetails::Position* pos =
1867 details->positions(characters_filled_in);
1868 RegExpCharacterClass* tree = elm.data.u_char_class;
1869 ZoneList<CharacterRange>* ranges = tree->ranges();
1870 if (tree->is_negated()) {
1871 // A quick check uses multi-character mask and compare. There is no
1872 // useful way to incorporate a negative char class into this scheme
1873 // so we just conservatively create a mask and value that will always
1874 // succeed.
1875 pos->mask = 0;
1876 pos->value = 0;
1877 } else {
1878 int first_range = 0;
1879 while (ranges->at(first_range).from() > char_mask) {
1880 first_range++;
1881 if (first_range == ranges->length()) {
1882 details->set_cannot_match();
1883 pos->determines_perfectly = false;
1884 return;
1885 }
1886 }
1887 CharacterRange range = ranges->at(first_range);
1888 uc16 from = range.from();
1889 uc16 to = range.to();
1890 if (to > char_mask) {
1891 to = char_mask;
1892 }
1893 uint32_t differing_bits = (from ^ to);
1894 // A mask and compare is only perfect if the differing bits form a
1895 // number like 00011111 with one single block of trailing 1s.
1896 if ((differing_bits & (differing_bits + 1)) == 0 &&
1897 from + differing_bits == to) {
1898 pos->determines_perfectly = true;
1899 }
1900 uint32_t common_bits = ~SmearBitsRight(differing_bits);
1901 uint32_t bits = (from & common_bits);
1902 for (int i = first_range + 1; i < ranges->length(); i++) {
1903 CharacterRange range = ranges->at(i);
1904 uc16 from = range.from();
1905 uc16 to = range.to();
1906 if (from > char_mask) continue;
1907 if (to > char_mask) to = char_mask;
1908 // Here we are combining more ranges into the mask and compare
1909 // value. With each new range the mask becomes more sparse and
1910 // so the chances of a false positive rise. A character class
1911 // with multiple ranges is assumed never to be equivalent to a
1912 // mask and compare operation.
1913 pos->determines_perfectly = false;
1914 uint32_t new_common_bits = (from ^ to);
1915 new_common_bits = ~SmearBitsRight(new_common_bits);
1916 common_bits &= new_common_bits;
1917 bits &= new_common_bits;
1918 uint32_t differing_bits = (from & common_bits) ^ bits;
1919 common_bits ^= differing_bits;
1920 bits &= common_bits;
1921 }
1922 pos->mask = common_bits;
1923 pos->value = bits;
1924 }
1925 characters_filled_in++;
1926 ASSERT(characters_filled_in <= details->characters());
1927 if (characters_filled_in == details->characters()) {
1928 return;
1929 }
1930 }
1931 }
1932 ASSERT(characters_filled_in != details->characters());
1933 on_success()-> GetQuickCheckDetails(details,
1934 compiler,
1935 characters_filled_in,
1936 true);
1937 }
1938
1939
Clear()1940 void QuickCheckDetails::Clear() {
1941 for (int i = 0; i < characters_; i++) {
1942 positions_[i].mask = 0;
1943 positions_[i].value = 0;
1944 positions_[i].determines_perfectly = false;
1945 }
1946 characters_ = 0;
1947 }
1948
1949
Advance(int by,bool ascii)1950 void QuickCheckDetails::Advance(int by, bool ascii) {
1951 ASSERT(by >= 0);
1952 if (by >= characters_) {
1953 Clear();
1954 return;
1955 }
1956 for (int i = 0; i < characters_ - by; i++) {
1957 positions_[i] = positions_[by + i];
1958 }
1959 for (int i = characters_ - by; i < characters_; i++) {
1960 positions_[i].mask = 0;
1961 positions_[i].value = 0;
1962 positions_[i].determines_perfectly = false;
1963 }
1964 characters_ -= by;
1965 // We could change mask_ and value_ here but we would never advance unless
1966 // they had already been used in a check and they won't be used again because
1967 // it would gain us nothing. So there's no point.
1968 }
1969
1970
Merge(QuickCheckDetails * other,int from_index)1971 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
1972 ASSERT(characters_ == other->characters_);
1973 if (other->cannot_match_) {
1974 return;
1975 }
1976 if (cannot_match_) {
1977 *this = *other;
1978 return;
1979 }
1980 for (int i = from_index; i < characters_; i++) {
1981 QuickCheckDetails::Position* pos = positions(i);
1982 QuickCheckDetails::Position* other_pos = other->positions(i);
1983 if (pos->mask != other_pos->mask ||
1984 pos->value != other_pos->value ||
1985 !other_pos->determines_perfectly) {
1986 // Our mask-compare operation will be approximate unless we have the
1987 // exact same operation on both sides of the alternation.
1988 pos->determines_perfectly = false;
1989 }
1990 pos->mask &= other_pos->mask;
1991 pos->value &= pos->mask;
1992 other_pos->value &= pos->mask;
1993 uc16 differing_bits = (pos->value ^ other_pos->value);
1994 pos->mask &= ~differing_bits;
1995 pos->value &= pos->mask;
1996 }
1997 }
1998
1999
2000 class VisitMarker {
2001 public:
VisitMarker(NodeInfo * info)2002 explicit VisitMarker(NodeInfo* info) : info_(info) {
2003 ASSERT(!info->visited);
2004 info->visited = true;
2005 }
~VisitMarker()2006 ~VisitMarker() {
2007 info_->visited = false;
2008 }
2009 private:
2010 NodeInfo* info_;
2011 };
2012
2013
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2014 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2015 RegExpCompiler* compiler,
2016 int characters_filled_in,
2017 bool not_at_start) {
2018 if (body_can_be_zero_length_ || info()->visited) return;
2019 VisitMarker marker(info());
2020 return ChoiceNode::GetQuickCheckDetails(details,
2021 compiler,
2022 characters_filled_in,
2023 not_at_start);
2024 }
2025
2026
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2027 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2028 RegExpCompiler* compiler,
2029 int characters_filled_in,
2030 bool not_at_start) {
2031 not_at_start = (not_at_start || not_at_start_);
2032 int choice_count = alternatives_->length();
2033 ASSERT(choice_count > 0);
2034 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2035 compiler,
2036 characters_filled_in,
2037 not_at_start);
2038 for (int i = 1; i < choice_count; i++) {
2039 QuickCheckDetails new_details(details->characters());
2040 RegExpNode* node = alternatives_->at(i).node();
2041 node->GetQuickCheckDetails(&new_details, compiler,
2042 characters_filled_in,
2043 not_at_start);
2044 // Here we merge the quick match details of the two branches.
2045 details->Merge(&new_details, characters_filled_in);
2046 }
2047 }
2048
2049
2050 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2051 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2052 Label* word,
2053 Label* non_word,
2054 bool fall_through_on_word) {
2055 assembler->CheckCharacterGT('z', non_word);
2056 assembler->CheckCharacterLT('0', non_word);
2057 assembler->CheckCharacterGT('a' - 1, word);
2058 assembler->CheckCharacterLT('9' + 1, word);
2059 assembler->CheckCharacterLT('A', non_word);
2060 assembler->CheckCharacterLT('Z' + 1, word);
2061 if (fall_through_on_word) {
2062 assembler->CheckNotCharacter('_', non_word);
2063 } else {
2064 assembler->CheckCharacter('_', word);
2065 }
2066 }
2067
2068
2069 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2070 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2071 static void EmitHat(RegExpCompiler* compiler,
2072 RegExpNode* on_success,
2073 Trace* trace) {
2074 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2075 // We will be loading the previous character into the current character
2076 // register.
2077 Trace new_trace(*trace);
2078 new_trace.InvalidateCurrentCharacter();
2079
2080 Label ok;
2081 if (new_trace.cp_offset() == 0) {
2082 // The start of input counts as a newline in this context, so skip to
2083 // ok if we are at the start.
2084 assembler->CheckAtStart(&ok);
2085 }
2086 // We already checked that we are not at the start of input so it must be
2087 // OK to load the previous character.
2088 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2089 new_trace.backtrack(),
2090 false);
2091 // Newline means \n, \r, 0x2028 or 0x2029.
2092 if (!compiler->ascii()) {
2093 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2094 }
2095 assembler->CheckCharacter('\n', &ok);
2096 assembler->CheckNotCharacter('\r', new_trace.backtrack());
2097 assembler->Bind(&ok);
2098 on_success->Emit(compiler, &new_trace);
2099 }
2100
2101
2102 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(AssertionNode::AssertionNodeType type,RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2103 static void EmitBoundaryCheck(AssertionNode::AssertionNodeType type,
2104 RegExpCompiler* compiler,
2105 RegExpNode* on_success,
2106 Trace* trace) {
2107 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2108 Label before_non_word;
2109 Label before_word;
2110 if (trace->characters_preloaded() != 1) {
2111 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2112 }
2113 // Fall through on non-word.
2114 EmitWordCheck(assembler, &before_word, &before_non_word, false);
2115
2116 // We will be loading the previous character into the current character
2117 // register.
2118 Trace new_trace(*trace);
2119 new_trace.InvalidateCurrentCharacter();
2120
2121 Label ok;
2122 Label* boundary;
2123 Label* not_boundary;
2124 if (type == AssertionNode::AT_BOUNDARY) {
2125 boundary = &ok;
2126 not_boundary = new_trace.backtrack();
2127 } else {
2128 not_boundary = &ok;
2129 boundary = new_trace.backtrack();
2130 }
2131
2132 // Next character is not a word character.
2133 assembler->Bind(&before_non_word);
2134 if (new_trace.cp_offset() == 0) {
2135 // The start of input counts as a non-word character, so the question is
2136 // decided if we are at the start.
2137 assembler->CheckAtStart(not_boundary);
2138 }
2139 // We already checked that we are not at the start of input so it must be
2140 // OK to load the previous character.
2141 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2142 &ok, // Unused dummy label in this call.
2143 false);
2144 // Fall through on non-word.
2145 EmitWordCheck(assembler, boundary, not_boundary, false);
2146 assembler->GoTo(not_boundary);
2147
2148 // Next character is a word character.
2149 assembler->Bind(&before_word);
2150 if (new_trace.cp_offset() == 0) {
2151 // The start of input counts as a non-word character, so the question is
2152 // decided if we are at the start.
2153 assembler->CheckAtStart(boundary);
2154 }
2155 // We already checked that we are not at the start of input so it must be
2156 // OK to load the previous character.
2157 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1,
2158 &ok, // Unused dummy label in this call.
2159 false);
2160 bool fall_through_on_word = (type == AssertionNode::AT_NON_BOUNDARY);
2161 EmitWordCheck(assembler, not_boundary, boundary, fall_through_on_word);
2162
2163 assembler->Bind(&ok);
2164
2165 on_success->Emit(compiler, &new_trace);
2166 }
2167
2168
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2169 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
2170 RegExpCompiler* compiler,
2171 int filled_in,
2172 bool not_at_start) {
2173 if (type_ == AT_START && not_at_start) {
2174 details->set_cannot_match();
2175 return;
2176 }
2177 return on_success()->GetQuickCheckDetails(details,
2178 compiler,
2179 filled_in,
2180 not_at_start);
2181 }
2182
2183
Emit(RegExpCompiler * compiler,Trace * trace)2184 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2185 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2186 switch (type_) {
2187 case AT_END: {
2188 Label ok;
2189 assembler->CheckPosition(trace->cp_offset(), &ok);
2190 assembler->GoTo(trace->backtrack());
2191 assembler->Bind(&ok);
2192 break;
2193 }
2194 case AT_START: {
2195 if (trace->at_start() == Trace::FALSE) {
2196 assembler->GoTo(trace->backtrack());
2197 return;
2198 }
2199 if (trace->at_start() == Trace::UNKNOWN) {
2200 assembler->CheckNotAtStart(trace->backtrack());
2201 Trace at_start_trace = *trace;
2202 at_start_trace.set_at_start(true);
2203 on_success()->Emit(compiler, &at_start_trace);
2204 return;
2205 }
2206 }
2207 break;
2208 case AFTER_NEWLINE:
2209 EmitHat(compiler, on_success(), trace);
2210 return;
2211 case AT_NON_BOUNDARY:
2212 case AT_BOUNDARY:
2213 EmitBoundaryCheck(type_, compiler, on_success(), trace);
2214 return;
2215 }
2216 on_success()->Emit(compiler, trace);
2217 }
2218
2219
DeterminedAlready(QuickCheckDetails * quick_check,int offset)2220 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
2221 if (quick_check == NULL) return false;
2222 if (offset >= quick_check->characters()) return false;
2223 return quick_check->positions(offset)->determines_perfectly;
2224 }
2225
2226
UpdateBoundsCheck(int index,int * checked_up_to)2227 static void UpdateBoundsCheck(int index, int* checked_up_to) {
2228 if (index > *checked_up_to) {
2229 *checked_up_to = index;
2230 }
2231 }
2232
2233
2234 // We call this repeatedly to generate code for each pass over the text node.
2235 // The passes are in increasing order of difficulty because we hope one
2236 // of the first passes will fail in which case we are saved the work of the
2237 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
2238 // we will check the '%' in the first pass, the case independent 'a' in the
2239 // second pass and the character class in the last pass.
2240 //
2241 // The passes are done from right to left, so for example to test for /bar/
2242 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
2243 // and then a 'b' with offset 0. This means we can avoid the end-of-input
2244 // bounds check most of the time. In the example we only need to check for
2245 // end-of-input when loading the putative 'r'.
2246 //
2247 // A slight complication involves the fact that the first character may already
2248 // be fetched into a register by the previous node. In this case we want to
2249 // do the test for that character first. We do this in separate passes. The
2250 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
2251 // pass has been performed then subsequent passes will have true in
2252 // first_element_checked to indicate that that character does not need to be
2253 // checked again.
2254 //
2255 // In addition to all this we are passed a Trace, which can
2256 // contain an AlternativeGeneration object. In this AlternativeGeneration
2257 // object we can see details of any quick check that was already passed in
2258 // order to get to the code we are now generating. The quick check can involve
2259 // loading characters, which means we do not need to recheck the bounds
2260 // up to the limit the quick check already checked. In addition the quick
2261 // check can have involved a mask and compare operation which may simplify
2262 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)2263 void TextNode::TextEmitPass(RegExpCompiler* compiler,
2264 TextEmitPassType pass,
2265 bool preloaded,
2266 Trace* trace,
2267 bool first_element_checked,
2268 int* checked_up_to) {
2269 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2270 bool ascii = compiler->ascii();
2271 Label* backtrack = trace->backtrack();
2272 QuickCheckDetails* quick_check = trace->quick_check_performed();
2273 int element_count = elms_->length();
2274 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
2275 TextElement elm = elms_->at(i);
2276 int cp_offset = trace->cp_offset() + elm.cp_offset;
2277 if (elm.type == TextElement::ATOM) {
2278 Vector<const uc16> quarks = elm.data.u_atom->data();
2279 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
2280 if (first_element_checked && i == 0 && j == 0) continue;
2281 if (DeterminedAlready(quick_check, elm.cp_offset + j)) continue;
2282 EmitCharacterFunction* emit_function = NULL;
2283 switch (pass) {
2284 case NON_ASCII_MATCH:
2285 ASSERT(ascii);
2286 if (quarks[j] > String::kMaxAsciiCharCode) {
2287 assembler->GoTo(backtrack);
2288 return;
2289 }
2290 break;
2291 case NON_LETTER_CHARACTER_MATCH:
2292 emit_function = &EmitAtomNonLetter;
2293 break;
2294 case SIMPLE_CHARACTER_MATCH:
2295 emit_function = &EmitSimpleCharacter;
2296 break;
2297 case CASE_CHARACTER_MATCH:
2298 emit_function = &EmitAtomLetter;
2299 break;
2300 default:
2301 break;
2302 }
2303 if (emit_function != NULL) {
2304 bool bound_checked = emit_function(compiler,
2305 quarks[j],
2306 backtrack,
2307 cp_offset + j,
2308 *checked_up_to < cp_offset + j,
2309 preloaded);
2310 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
2311 }
2312 }
2313 } else {
2314 ASSERT_EQ(elm.type, TextElement::CHAR_CLASS);
2315 if (pass == CHARACTER_CLASS_MATCH) {
2316 if (first_element_checked && i == 0) continue;
2317 if (DeterminedAlready(quick_check, elm.cp_offset)) continue;
2318 RegExpCharacterClass* cc = elm.data.u_char_class;
2319 EmitCharClass(assembler,
2320 cc,
2321 ascii,
2322 backtrack,
2323 cp_offset,
2324 *checked_up_to < cp_offset,
2325 preloaded);
2326 UpdateBoundsCheck(cp_offset, checked_up_to);
2327 }
2328 }
2329 }
2330 }
2331
2332
Length()2333 int TextNode::Length() {
2334 TextElement elm = elms_->last();
2335 ASSERT(elm.cp_offset >= 0);
2336 if (elm.type == TextElement::ATOM) {
2337 return elm.cp_offset + elm.data.u_atom->data().length();
2338 } else {
2339 return elm.cp_offset + 1;
2340 }
2341 }
2342
2343
SkipPass(int int_pass,bool ignore_case)2344 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
2345 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
2346 if (ignore_case) {
2347 return pass == SIMPLE_CHARACTER_MATCH;
2348 } else {
2349 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
2350 }
2351 }
2352
2353
2354 // This generates the code to match a text node. A text node can contain
2355 // straight character sequences (possibly to be matched in a case-independent
2356 // way) and character classes. For efficiency we do not do this in a single
2357 // pass from left to right. Instead we pass over the text node several times,
2358 // emitting code for some character positions every time. See the comment on
2359 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)2360 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2361 LimitResult limit_result = LimitVersions(compiler, trace);
2362 if (limit_result == DONE) return;
2363 ASSERT(limit_result == CONTINUE);
2364
2365 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
2366 compiler->SetRegExpTooBig();
2367 return;
2368 }
2369
2370 if (compiler->ascii()) {
2371 int dummy = 0;
2372 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
2373 }
2374
2375 bool first_elt_done = false;
2376 int bound_checked_to = trace->cp_offset() - 1;
2377 bound_checked_to += trace->bound_checked_up_to();
2378
2379 // If a character is preloaded into the current character register then
2380 // check that now.
2381 if (trace->characters_preloaded() == 1) {
2382 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2383 if (!SkipPass(pass, compiler->ignore_case())) {
2384 TextEmitPass(compiler,
2385 static_cast<TextEmitPassType>(pass),
2386 true,
2387 trace,
2388 false,
2389 &bound_checked_to);
2390 }
2391 }
2392 first_elt_done = true;
2393 }
2394
2395 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
2396 if (!SkipPass(pass, compiler->ignore_case())) {
2397 TextEmitPass(compiler,
2398 static_cast<TextEmitPassType>(pass),
2399 false,
2400 trace,
2401 first_elt_done,
2402 &bound_checked_to);
2403 }
2404 }
2405
2406 Trace successor_trace(*trace);
2407 successor_trace.set_at_start(false);
2408 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
2409 RecursionCheck rc(compiler);
2410 on_success()->Emit(compiler, &successor_trace);
2411 }
2412
2413
InvalidateCurrentCharacter()2414 void Trace::InvalidateCurrentCharacter() {
2415 characters_preloaded_ = 0;
2416 }
2417
2418
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)2419 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
2420 ASSERT(by > 0);
2421 // We don't have an instruction for shifting the current character register
2422 // down or for using a shifted value for anything so lets just forget that
2423 // we preloaded any characters into it.
2424 characters_preloaded_ = 0;
2425 // Adjust the offsets of the quick check performed information. This
2426 // information is used to find out what we already determined about the
2427 // characters by means of mask and compare.
2428 quick_check_performed_.Advance(by, compiler->ascii());
2429 cp_offset_ += by;
2430 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
2431 compiler->SetRegExpTooBig();
2432 cp_offset_ = 0;
2433 }
2434 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
2435 }
2436
2437
MakeCaseIndependent()2438 void TextNode::MakeCaseIndependent() {
2439 int element_count = elms_->length();
2440 for (int i = 0; i < element_count; i++) {
2441 TextElement elm = elms_->at(i);
2442 if (elm.type == TextElement::CHAR_CLASS) {
2443 RegExpCharacterClass* cc = elm.data.u_char_class;
2444 ZoneList<CharacterRange>* ranges = cc->ranges();
2445 int range_count = ranges->length();
2446 for (int i = 0; i < range_count; i++) {
2447 ranges->at(i).AddCaseEquivalents(ranges);
2448 }
2449 }
2450 }
2451 }
2452
2453
GreedyLoopTextLength()2454 int TextNode::GreedyLoopTextLength() {
2455 TextElement elm = elms_->at(elms_->length() - 1);
2456 if (elm.type == TextElement::CHAR_CLASS) {
2457 return elm.cp_offset + 1;
2458 } else {
2459 return elm.cp_offset + elm.data.u_atom->data().length();
2460 }
2461 }
2462
2463
2464 // Finds the fixed match length of a sequence of nodes that goes from
2465 // this alternative and back to this choice node. If there are variable
2466 // length nodes or other complications in the way then return a sentinel
2467 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLength(GuardedAlternative * alternative)2468 int ChoiceNode::GreedyLoopTextLength(GuardedAlternative* alternative) {
2469 int length = 0;
2470 RegExpNode* node = alternative->node();
2471 // Later we will generate code for all these text nodes using recursion
2472 // so we have to limit the max number.
2473 int recursion_depth = 0;
2474 while (node != this) {
2475 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
2476 return kNodeIsTooComplexForGreedyLoops;
2477 }
2478 int node_length = node->GreedyLoopTextLength();
2479 if (node_length == kNodeIsTooComplexForGreedyLoops) {
2480 return kNodeIsTooComplexForGreedyLoops;
2481 }
2482 length += node_length;
2483 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
2484 node = seq_node->on_success();
2485 }
2486 return length;
2487 }
2488
2489
AddLoopAlternative(GuardedAlternative alt)2490 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
2491 ASSERT_EQ(loop_node_, NULL);
2492 AddAlternative(alt);
2493 loop_node_ = alt.node();
2494 }
2495
2496
AddContinueAlternative(GuardedAlternative alt)2497 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
2498 ASSERT_EQ(continue_node_, NULL);
2499 AddAlternative(alt);
2500 continue_node_ = alt.node();
2501 }
2502
2503
Emit(RegExpCompiler * compiler,Trace * trace)2504 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2505 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2506 if (trace->stop_node() == this) {
2507 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2508 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
2509 // Update the counter-based backtracking info on the stack. This is an
2510 // optimization for greedy loops (see below).
2511 ASSERT(trace->cp_offset() == text_length);
2512 macro_assembler->AdvanceCurrentPosition(text_length);
2513 macro_assembler->GoTo(trace->loop_label());
2514 return;
2515 }
2516 ASSERT(trace->stop_node() == NULL);
2517 if (!trace->is_trivial()) {
2518 trace->Flush(compiler, this);
2519 return;
2520 }
2521 ChoiceNode::Emit(compiler, trace);
2522 }
2523
2524
CalculatePreloadCharacters(RegExpCompiler * compiler)2525 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler) {
2526 int preload_characters = EatsAtLeast(4, 0);
2527 if (compiler->macro_assembler()->CanReadUnaligned()) {
2528 bool ascii = compiler->ascii();
2529 if (ascii) {
2530 if (preload_characters > 4) preload_characters = 4;
2531 // We can't preload 3 characters because there is no machine instruction
2532 // to do that. We can't just load 4 because we could be reading
2533 // beyond the end of the string, which could cause a memory fault.
2534 if (preload_characters == 3) preload_characters = 2;
2535 } else {
2536 if (preload_characters > 2) preload_characters = 2;
2537 }
2538 } else {
2539 if (preload_characters > 1) preload_characters = 1;
2540 }
2541 return preload_characters;
2542 }
2543
2544
2545 // This class is used when generating the alternatives in a choice node. It
2546 // records the way the alternative is being code generated.
2547 class AlternativeGeneration: public Malloced {
2548 public:
AlternativeGeneration()2549 AlternativeGeneration()
2550 : possible_success(),
2551 expects_preload(false),
2552 after(),
2553 quick_check_details() { }
2554 Label possible_success;
2555 bool expects_preload;
2556 Label after;
2557 QuickCheckDetails quick_check_details;
2558 };
2559
2560
2561 // Creates a list of AlternativeGenerations. If the list has a reasonable
2562 // size then it is on the stack, otherwise the excess is on the heap.
2563 class AlternativeGenerationList {
2564 public:
AlternativeGenerationList(int count)2565 explicit AlternativeGenerationList(int count)
2566 : alt_gens_(count) {
2567 for (int i = 0; i < count && i < kAFew; i++) {
2568 alt_gens_.Add(a_few_alt_gens_ + i);
2569 }
2570 for (int i = kAFew; i < count; i++) {
2571 alt_gens_.Add(new AlternativeGeneration());
2572 }
2573 }
~AlternativeGenerationList()2574 ~AlternativeGenerationList() {
2575 for (int i = kAFew; i < alt_gens_.length(); i++) {
2576 delete alt_gens_[i];
2577 alt_gens_[i] = NULL;
2578 }
2579 }
2580
at(int i)2581 AlternativeGeneration* at(int i) {
2582 return alt_gens_[i];
2583 }
2584 private:
2585 static const int kAFew = 10;
2586 ZoneList<AlternativeGeneration*> alt_gens_;
2587 AlternativeGeneration a_few_alt_gens_[kAFew];
2588 };
2589
2590
2591 /* Code generation for choice nodes.
2592 *
2593 * We generate quick checks that do a mask and compare to eliminate a
2594 * choice. If the quick check succeeds then it jumps to the continuation to
2595 * do slow checks and check subsequent nodes. If it fails (the common case)
2596 * it falls through to the next choice.
2597 *
2598 * Here is the desired flow graph. Nodes directly below each other imply
2599 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
2600 * 3 doesn't have a quick check so we have to call the slow check.
2601 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
2602 * regexp continuation is generated directly after the Sn node, up to the
2603 * next GoTo if we decide to reuse some already generated code. Some
2604 * nodes expect preload_characters to be preloaded into the current
2605 * character register. R nodes do this preloading. Vertices are marked
2606 * F for failures and S for success (possible success in the case of quick
2607 * nodes). L, V, < and > are used as arrow heads.
2608 *
2609 * ----------> R
2610 * |
2611 * V
2612 * Q1 -----> S1
2613 * | S /
2614 * F| /
2615 * | F/
2616 * | /
2617 * | R
2618 * | /
2619 * V L
2620 * Q2 -----> S2
2621 * | S /
2622 * F| /
2623 * | F/
2624 * | /
2625 * | R
2626 * | /
2627 * V L
2628 * S3
2629 * |
2630 * F|
2631 * |
2632 * R
2633 * |
2634 * backtrack V
2635 * <----------Q4
2636 * \ F |
2637 * \ |S
2638 * \ F V
2639 * \-----S4
2640 *
2641 * For greedy loops we reverse our expectation and expect to match rather
2642 * than fail. Therefore we want the loop code to look like this (U is the
2643 * unwind code that steps back in the greedy loop). The following alternatives
2644 * look the same as above.
2645 * _____
2646 * / \
2647 * V |
2648 * ----------> S1 |
2649 * /| |
2650 * / |S |
2651 * F/ \_____/
2652 * /
2653 * |<-----------
2654 * | \
2655 * V \
2656 * Q2 ---> S2 \
2657 * | S / |
2658 * F| / |
2659 * | F/ |
2660 * | / |
2661 * | R |
2662 * | / |
2663 * F VL |
2664 * <------U |
2665 * back |S |
2666 * \______________/
2667 */
2668
2669
Emit(RegExpCompiler * compiler,Trace * trace)2670 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2671 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2672 int choice_count = alternatives_->length();
2673 #ifdef DEBUG
2674 for (int i = 0; i < choice_count - 1; i++) {
2675 GuardedAlternative alternative = alternatives_->at(i);
2676 ZoneList<Guard*>* guards = alternative.guards();
2677 int guard_count = (guards == NULL) ? 0 : guards->length();
2678 for (int j = 0; j < guard_count; j++) {
2679 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
2680 }
2681 }
2682 #endif
2683
2684 LimitResult limit_result = LimitVersions(compiler, trace);
2685 if (limit_result == DONE) return;
2686 ASSERT(limit_result == CONTINUE);
2687
2688 int new_flush_budget = trace->flush_budget() / choice_count;
2689 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
2690 trace->Flush(compiler, this);
2691 return;
2692 }
2693
2694 RecursionCheck rc(compiler);
2695
2696 Trace* current_trace = trace;
2697
2698 int text_length = GreedyLoopTextLength(&(alternatives_->at(0)));
2699 bool greedy_loop = false;
2700 Label greedy_loop_label;
2701 Trace counter_backtrack_trace;
2702 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
2703 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
2704
2705 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
2706 // Here we have special handling for greedy loops containing only text nodes
2707 // and other simple nodes. These are handled by pushing the current
2708 // position on the stack and then incrementing the current position each
2709 // time around the switch. On backtrack we decrement the current position
2710 // and check it against the pushed value. This avoids pushing backtrack
2711 // information for each iteration of the loop, which could take up a lot of
2712 // space.
2713 greedy_loop = true;
2714 ASSERT(trace->stop_node() == NULL);
2715 macro_assembler->PushCurrentPosition();
2716 current_trace = &counter_backtrack_trace;
2717 Label greedy_match_failed;
2718 Trace greedy_match_trace;
2719 if (not_at_start()) greedy_match_trace.set_at_start(false);
2720 greedy_match_trace.set_backtrack(&greedy_match_failed);
2721 Label loop_label;
2722 macro_assembler->Bind(&loop_label);
2723 greedy_match_trace.set_stop_node(this);
2724 greedy_match_trace.set_loop_label(&loop_label);
2725 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
2726 macro_assembler->Bind(&greedy_match_failed);
2727 }
2728
2729 Label second_choice; // For use in greedy matches.
2730 macro_assembler->Bind(&second_choice);
2731
2732 int first_normal_choice = greedy_loop ? 1 : 0;
2733
2734 int preload_characters = CalculatePreloadCharacters(compiler);
2735 bool preload_is_current =
2736 (current_trace->characters_preloaded() == preload_characters);
2737 bool preload_has_checked_bounds = preload_is_current;
2738
2739 AlternativeGenerationList alt_gens(choice_count);
2740
2741 // For now we just call all choices one after the other. The idea ultimately
2742 // is to use the Dispatch table to try only the relevant ones.
2743 for (int i = first_normal_choice; i < choice_count; i++) {
2744 GuardedAlternative alternative = alternatives_->at(i);
2745 AlternativeGeneration* alt_gen = alt_gens.at(i);
2746 alt_gen->quick_check_details.set_characters(preload_characters);
2747 ZoneList<Guard*>* guards = alternative.guards();
2748 int guard_count = (guards == NULL) ? 0 : guards->length();
2749 Trace new_trace(*current_trace);
2750 new_trace.set_characters_preloaded(preload_is_current ?
2751 preload_characters :
2752 0);
2753 if (preload_has_checked_bounds) {
2754 new_trace.set_bound_checked_up_to(preload_characters);
2755 }
2756 new_trace.quick_check_performed()->Clear();
2757 if (not_at_start_) new_trace.set_at_start(Trace::FALSE);
2758 alt_gen->expects_preload = preload_is_current;
2759 bool generate_full_check_inline = false;
2760 if (FLAG_regexp_optimization &&
2761 try_to_emit_quick_check_for_alternative(i) &&
2762 alternative.node()->EmitQuickCheck(compiler,
2763 &new_trace,
2764 preload_has_checked_bounds,
2765 &alt_gen->possible_success,
2766 &alt_gen->quick_check_details,
2767 i < choice_count - 1)) {
2768 // Quick check was generated for this choice.
2769 preload_is_current = true;
2770 preload_has_checked_bounds = true;
2771 // On the last choice in the ChoiceNode we generated the quick
2772 // check to fall through on possible success. So now we need to
2773 // generate the full check inline.
2774 if (i == choice_count - 1) {
2775 macro_assembler->Bind(&alt_gen->possible_success);
2776 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2777 new_trace.set_characters_preloaded(preload_characters);
2778 new_trace.set_bound_checked_up_to(preload_characters);
2779 generate_full_check_inline = true;
2780 }
2781 } else if (alt_gen->quick_check_details.cannot_match()) {
2782 if (i == choice_count - 1 && !greedy_loop) {
2783 macro_assembler->GoTo(trace->backtrack());
2784 }
2785 continue;
2786 } else {
2787 // No quick check was generated. Put the full code here.
2788 // If this is not the first choice then there could be slow checks from
2789 // previous cases that go here when they fail. There's no reason to
2790 // insist that they preload characters since the slow check we are about
2791 // to generate probably can't use it.
2792 if (i != first_normal_choice) {
2793 alt_gen->expects_preload = false;
2794 new_trace.set_characters_preloaded(0);
2795 }
2796 if (i < choice_count - 1) {
2797 new_trace.set_backtrack(&alt_gen->after);
2798 }
2799 generate_full_check_inline = true;
2800 }
2801 if (generate_full_check_inline) {
2802 if (new_trace.actions() != NULL) {
2803 new_trace.set_flush_budget(new_flush_budget);
2804 }
2805 for (int j = 0; j < guard_count; j++) {
2806 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
2807 }
2808 alternative.node()->Emit(compiler, &new_trace);
2809 preload_is_current = false;
2810 }
2811 macro_assembler->Bind(&alt_gen->after);
2812 }
2813 if (greedy_loop) {
2814 macro_assembler->Bind(&greedy_loop_label);
2815 // If we have unwound to the bottom then backtrack.
2816 macro_assembler->CheckGreedyLoop(trace->backtrack());
2817 // Otherwise try the second priority at an earlier position.
2818 macro_assembler->AdvanceCurrentPosition(-text_length);
2819 macro_assembler->GoTo(&second_choice);
2820 }
2821
2822 // At this point we need to generate slow checks for the alternatives where
2823 // the quick check was inlined. We can recognize these because the associated
2824 // label was bound.
2825 for (int i = first_normal_choice; i < choice_count - 1; i++) {
2826 AlternativeGeneration* alt_gen = alt_gens.at(i);
2827 Trace new_trace(*current_trace);
2828 // If there are actions to be flushed we have to limit how many times
2829 // they are flushed. Take the budget of the parent trace and distribute
2830 // it fairly amongst the children.
2831 if (new_trace.actions() != NULL) {
2832 new_trace.set_flush_budget(new_flush_budget);
2833 }
2834 EmitOutOfLineContinuation(compiler,
2835 &new_trace,
2836 alternatives_->at(i),
2837 alt_gen,
2838 preload_characters,
2839 alt_gens.at(i + 1)->expects_preload);
2840 }
2841 }
2842
2843
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)2844 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
2845 Trace* trace,
2846 GuardedAlternative alternative,
2847 AlternativeGeneration* alt_gen,
2848 int preload_characters,
2849 bool next_expects_preload) {
2850 if (!alt_gen->possible_success.is_linked()) return;
2851
2852 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2853 macro_assembler->Bind(&alt_gen->possible_success);
2854 Trace out_of_line_trace(*trace);
2855 out_of_line_trace.set_characters_preloaded(preload_characters);
2856 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
2857 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE);
2858 ZoneList<Guard*>* guards = alternative.guards();
2859 int guard_count = (guards == NULL) ? 0 : guards->length();
2860 if (next_expects_preload) {
2861 Label reload_current_char;
2862 out_of_line_trace.set_backtrack(&reload_current_char);
2863 for (int j = 0; j < guard_count; j++) {
2864 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
2865 }
2866 alternative.node()->Emit(compiler, &out_of_line_trace);
2867 macro_assembler->Bind(&reload_current_char);
2868 // Reload the current character, since the next quick check expects that.
2869 // We don't need to check bounds here because we only get into this
2870 // code through a quick check which already did the checked load.
2871 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
2872 NULL,
2873 false,
2874 preload_characters);
2875 macro_assembler->GoTo(&(alt_gen->after));
2876 } else {
2877 out_of_line_trace.set_backtrack(&(alt_gen->after));
2878 for (int j = 0; j < guard_count; j++) {
2879 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
2880 }
2881 alternative.node()->Emit(compiler, &out_of_line_trace);
2882 }
2883 }
2884
2885
Emit(RegExpCompiler * compiler,Trace * trace)2886 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
2887 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2888 LimitResult limit_result = LimitVersions(compiler, trace);
2889 if (limit_result == DONE) return;
2890 ASSERT(limit_result == CONTINUE);
2891
2892 RecursionCheck rc(compiler);
2893
2894 switch (type_) {
2895 case STORE_POSITION: {
2896 Trace::DeferredCapture
2897 new_capture(data_.u_position_register.reg,
2898 data_.u_position_register.is_capture,
2899 trace);
2900 Trace new_trace = *trace;
2901 new_trace.add_action(&new_capture);
2902 on_success()->Emit(compiler, &new_trace);
2903 break;
2904 }
2905 case INCREMENT_REGISTER: {
2906 Trace::DeferredIncrementRegister
2907 new_increment(data_.u_increment_register.reg);
2908 Trace new_trace = *trace;
2909 new_trace.add_action(&new_increment);
2910 on_success()->Emit(compiler, &new_trace);
2911 break;
2912 }
2913 case SET_REGISTER: {
2914 Trace::DeferredSetRegister
2915 new_set(data_.u_store_register.reg, data_.u_store_register.value);
2916 Trace new_trace = *trace;
2917 new_trace.add_action(&new_set);
2918 on_success()->Emit(compiler, &new_trace);
2919 break;
2920 }
2921 case CLEAR_CAPTURES: {
2922 Trace::DeferredClearCaptures
2923 new_capture(Interval(data_.u_clear_captures.range_from,
2924 data_.u_clear_captures.range_to));
2925 Trace new_trace = *trace;
2926 new_trace.add_action(&new_capture);
2927 on_success()->Emit(compiler, &new_trace);
2928 break;
2929 }
2930 case BEGIN_SUBMATCH:
2931 if (!trace->is_trivial()) {
2932 trace->Flush(compiler, this);
2933 } else {
2934 assembler->WriteCurrentPositionToRegister(
2935 data_.u_submatch.current_position_register, 0);
2936 assembler->WriteStackPointerToRegister(
2937 data_.u_submatch.stack_pointer_register);
2938 on_success()->Emit(compiler, trace);
2939 }
2940 break;
2941 case EMPTY_MATCH_CHECK: {
2942 int start_pos_reg = data_.u_empty_match_check.start_register;
2943 int stored_pos = 0;
2944 int rep_reg = data_.u_empty_match_check.repetition_register;
2945 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
2946 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
2947 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
2948 // If we know we haven't advanced and there is no minimum we
2949 // can just backtrack immediately.
2950 assembler->GoTo(trace->backtrack());
2951 } else if (know_dist && stored_pos < trace->cp_offset()) {
2952 // If we know we've advanced we can generate the continuation
2953 // immediately.
2954 on_success()->Emit(compiler, trace);
2955 } else if (!trace->is_trivial()) {
2956 trace->Flush(compiler, this);
2957 } else {
2958 Label skip_empty_check;
2959 // If we have a minimum number of repetitions we check the current
2960 // number first and skip the empty check if it's not enough.
2961 if (has_minimum) {
2962 int limit = data_.u_empty_match_check.repetition_limit;
2963 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
2964 }
2965 // If the match is empty we bail out, otherwise we fall through
2966 // to the on-success continuation.
2967 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
2968 trace->backtrack());
2969 assembler->Bind(&skip_empty_check);
2970 on_success()->Emit(compiler, trace);
2971 }
2972 break;
2973 }
2974 case POSITIVE_SUBMATCH_SUCCESS: {
2975 if (!trace->is_trivial()) {
2976 trace->Flush(compiler, this);
2977 return;
2978 }
2979 assembler->ReadCurrentPositionFromRegister(
2980 data_.u_submatch.current_position_register);
2981 assembler->ReadStackPointerFromRegister(
2982 data_.u_submatch.stack_pointer_register);
2983 int clear_register_count = data_.u_submatch.clear_register_count;
2984 if (clear_register_count == 0) {
2985 on_success()->Emit(compiler, trace);
2986 return;
2987 }
2988 int clear_registers_from = data_.u_submatch.clear_register_from;
2989 Label clear_registers_backtrack;
2990 Trace new_trace = *trace;
2991 new_trace.set_backtrack(&clear_registers_backtrack);
2992 on_success()->Emit(compiler, &new_trace);
2993
2994 assembler->Bind(&clear_registers_backtrack);
2995 int clear_registers_to = clear_registers_from + clear_register_count - 1;
2996 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
2997
2998 ASSERT(trace->backtrack() == NULL);
2999 assembler->Backtrack();
3000 return;
3001 }
3002 default:
3003 UNREACHABLE();
3004 }
3005 }
3006
3007
Emit(RegExpCompiler * compiler,Trace * trace)3008 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3009 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3010 if (!trace->is_trivial()) {
3011 trace->Flush(compiler, this);
3012 return;
3013 }
3014
3015 LimitResult limit_result = LimitVersions(compiler, trace);
3016 if (limit_result == DONE) return;
3017 ASSERT(limit_result == CONTINUE);
3018
3019 RecursionCheck rc(compiler);
3020
3021 ASSERT_EQ(start_reg_ + 1, end_reg_);
3022 if (compiler->ignore_case()) {
3023 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
3024 trace->backtrack());
3025 } else {
3026 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
3027 }
3028 on_success()->Emit(compiler, trace);
3029 }
3030
3031
3032 // -------------------------------------------------------------------
3033 // Dot/dotty output
3034
3035
3036 #ifdef DEBUG
3037
3038
3039 class DotPrinter: public NodeVisitor {
3040 public:
DotPrinter(bool ignore_case)3041 explicit DotPrinter(bool ignore_case)
3042 : ignore_case_(ignore_case),
3043 stream_(&alloc_) { }
3044 void PrintNode(const char* label, RegExpNode* node);
3045 void Visit(RegExpNode* node);
3046 void PrintAttributes(RegExpNode* from);
stream()3047 StringStream* stream() { return &stream_; }
3048 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
3049 #define DECLARE_VISIT(Type) \
3050 virtual void Visit##Type(Type##Node* that);
3051 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
3052 #undef DECLARE_VISIT
3053 private:
3054 bool ignore_case_;
3055 HeapStringAllocator alloc_;
3056 StringStream stream_;
3057 };
3058
3059
PrintNode(const char * label,RegExpNode * node)3060 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
3061 stream()->Add("digraph G {\n graph [label=\"");
3062 for (int i = 0; label[i]; i++) {
3063 switch (label[i]) {
3064 case '\\':
3065 stream()->Add("\\\\");
3066 break;
3067 case '"':
3068 stream()->Add("\"");
3069 break;
3070 default:
3071 stream()->Put(label[i]);
3072 break;
3073 }
3074 }
3075 stream()->Add("\"];\n");
3076 Visit(node);
3077 stream()->Add("}\n");
3078 printf("%s", *(stream()->ToCString()));
3079 }
3080
3081
Visit(RegExpNode * node)3082 void DotPrinter::Visit(RegExpNode* node) {
3083 if (node->info()->visited) return;
3084 node->info()->visited = true;
3085 node->Accept(this);
3086 }
3087
3088
PrintOnFailure(RegExpNode * from,RegExpNode * on_failure)3089 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
3090 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
3091 Visit(on_failure);
3092 }
3093
3094
3095 class TableEntryBodyPrinter {
3096 public:
TableEntryBodyPrinter(StringStream * stream,ChoiceNode * choice)3097 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
3098 : stream_(stream), choice_(choice) { }
Call(uc16 from,DispatchTable::Entry entry)3099 void Call(uc16 from, DispatchTable::Entry entry) {
3100 OutSet* out_set = entry.out_set();
3101 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3102 if (out_set->Get(i)) {
3103 stream()->Add(" n%p:s%io%i -> n%p;\n",
3104 choice(),
3105 from,
3106 i,
3107 choice()->alternatives()->at(i).node());
3108 }
3109 }
3110 }
3111 private:
stream()3112 StringStream* stream() { return stream_; }
choice()3113 ChoiceNode* choice() { return choice_; }
3114 StringStream* stream_;
3115 ChoiceNode* choice_;
3116 };
3117
3118
3119 class TableEntryHeaderPrinter {
3120 public:
TableEntryHeaderPrinter(StringStream * stream)3121 explicit TableEntryHeaderPrinter(StringStream* stream)
3122 : first_(true), stream_(stream) { }
Call(uc16 from,DispatchTable::Entry entry)3123 void Call(uc16 from, DispatchTable::Entry entry) {
3124 if (first_) {
3125 first_ = false;
3126 } else {
3127 stream()->Add("|");
3128 }
3129 stream()->Add("{\\%k-\\%k|{", from, entry.to());
3130 OutSet* out_set = entry.out_set();
3131 int priority = 0;
3132 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3133 if (out_set->Get(i)) {
3134 if (priority > 0) stream()->Add("|");
3135 stream()->Add("<s%io%i> %i", from, i, priority);
3136 priority++;
3137 }
3138 }
3139 stream()->Add("}}");
3140 }
3141 private:
3142 bool first_;
stream()3143 StringStream* stream() { return stream_; }
3144 StringStream* stream_;
3145 };
3146
3147
3148 class AttributePrinter {
3149 public:
AttributePrinter(DotPrinter * out)3150 explicit AttributePrinter(DotPrinter* out)
3151 : out_(out), first_(true) { }
PrintSeparator()3152 void PrintSeparator() {
3153 if (first_) {
3154 first_ = false;
3155 } else {
3156 out_->stream()->Add("|");
3157 }
3158 }
PrintBit(const char * name,bool value)3159 void PrintBit(const char* name, bool value) {
3160 if (!value) return;
3161 PrintSeparator();
3162 out_->stream()->Add("{%s}", name);
3163 }
PrintPositive(const char * name,int value)3164 void PrintPositive(const char* name, int value) {
3165 if (value < 0) return;
3166 PrintSeparator();
3167 out_->stream()->Add("{%s|%x}", name, value);
3168 }
3169 private:
3170 DotPrinter* out_;
3171 bool first_;
3172 };
3173
3174
PrintAttributes(RegExpNode * that)3175 void DotPrinter::PrintAttributes(RegExpNode* that) {
3176 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
3177 "margin=0.1, fontsize=10, label=\"{",
3178 that);
3179 AttributePrinter printer(this);
3180 NodeInfo* info = that->info();
3181 printer.PrintBit("NI", info->follows_newline_interest);
3182 printer.PrintBit("WI", info->follows_word_interest);
3183 printer.PrintBit("SI", info->follows_start_interest);
3184 Label* label = that->label();
3185 if (label->is_bound())
3186 printer.PrintPositive("@", label->pos());
3187 stream()->Add("}\"];\n");
3188 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
3189 "arrowhead=none];\n", that, that);
3190 }
3191
3192
3193 static const bool kPrintDispatchTable = false;
VisitChoice(ChoiceNode * that)3194 void DotPrinter::VisitChoice(ChoiceNode* that) {
3195 if (kPrintDispatchTable) {
3196 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
3197 TableEntryHeaderPrinter header_printer(stream());
3198 that->GetTable(ignore_case_)->ForEach(&header_printer);
3199 stream()->Add("\"]\n", that);
3200 PrintAttributes(that);
3201 TableEntryBodyPrinter body_printer(stream(), that);
3202 that->GetTable(ignore_case_)->ForEach(&body_printer);
3203 } else {
3204 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
3205 for (int i = 0; i < that->alternatives()->length(); i++) {
3206 GuardedAlternative alt = that->alternatives()->at(i);
3207 stream()->Add(" n%p -> n%p;\n", that, alt.node());
3208 }
3209 }
3210 for (int i = 0; i < that->alternatives()->length(); i++) {
3211 GuardedAlternative alt = that->alternatives()->at(i);
3212 alt.node()->Accept(this);
3213 }
3214 }
3215
3216
VisitText(TextNode * that)3217 void DotPrinter::VisitText(TextNode* that) {
3218 stream()->Add(" n%p [label=\"", that);
3219 for (int i = 0; i < that->elements()->length(); i++) {
3220 if (i > 0) stream()->Add(" ");
3221 TextElement elm = that->elements()->at(i);
3222 switch (elm.type) {
3223 case TextElement::ATOM: {
3224 stream()->Add("'%w'", elm.data.u_atom->data());
3225 break;
3226 }
3227 case TextElement::CHAR_CLASS: {
3228 RegExpCharacterClass* node = elm.data.u_char_class;
3229 stream()->Add("[");
3230 if (node->is_negated())
3231 stream()->Add("^");
3232 for (int j = 0; j < node->ranges()->length(); j++) {
3233 CharacterRange range = node->ranges()->at(j);
3234 stream()->Add("%k-%k", range.from(), range.to());
3235 }
3236 stream()->Add("]");
3237 break;
3238 }
3239 default:
3240 UNREACHABLE();
3241 }
3242 }
3243 stream()->Add("\", shape=box, peripheries=2];\n");
3244 PrintAttributes(that);
3245 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3246 Visit(that->on_success());
3247 }
3248
3249
VisitBackReference(BackReferenceNode * that)3250 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
3251 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
3252 that,
3253 that->start_register(),
3254 that->end_register());
3255 PrintAttributes(that);
3256 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
3257 Visit(that->on_success());
3258 }
3259
3260
VisitEnd(EndNode * that)3261 void DotPrinter::VisitEnd(EndNode* that) {
3262 stream()->Add(" n%p [style=bold, shape=point];\n", that);
3263 PrintAttributes(that);
3264 }
3265
3266
VisitAssertion(AssertionNode * that)3267 void DotPrinter::VisitAssertion(AssertionNode* that) {
3268 stream()->Add(" n%p [", that);
3269 switch (that->type()) {
3270 case AssertionNode::AT_END:
3271 stream()->Add("label=\"$\", shape=septagon");
3272 break;
3273 case AssertionNode::AT_START:
3274 stream()->Add("label=\"^\", shape=septagon");
3275 break;
3276 case AssertionNode::AT_BOUNDARY:
3277 stream()->Add("label=\"\\b\", shape=septagon");
3278 break;
3279 case AssertionNode::AT_NON_BOUNDARY:
3280 stream()->Add("label=\"\\B\", shape=septagon");
3281 break;
3282 case AssertionNode::AFTER_NEWLINE:
3283 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
3284 break;
3285 }
3286 stream()->Add("];\n");
3287 PrintAttributes(that);
3288 RegExpNode* successor = that->on_success();
3289 stream()->Add(" n%p -> n%p;\n", that, successor);
3290 Visit(successor);
3291 }
3292
3293
VisitAction(ActionNode * that)3294 void DotPrinter::VisitAction(ActionNode* that) {
3295 stream()->Add(" n%p [", that);
3296 switch (that->type_) {
3297 case ActionNode::SET_REGISTER:
3298 stream()->Add("label=\"$%i:=%i\", shape=octagon",
3299 that->data_.u_store_register.reg,
3300 that->data_.u_store_register.value);
3301 break;
3302 case ActionNode::INCREMENT_REGISTER:
3303 stream()->Add("label=\"$%i++\", shape=octagon",
3304 that->data_.u_increment_register.reg);
3305 break;
3306 case ActionNode::STORE_POSITION:
3307 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
3308 that->data_.u_position_register.reg);
3309 break;
3310 case ActionNode::BEGIN_SUBMATCH:
3311 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
3312 that->data_.u_submatch.current_position_register);
3313 break;
3314 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
3315 stream()->Add("label=\"escape\", shape=septagon");
3316 break;
3317 case ActionNode::EMPTY_MATCH_CHECK:
3318 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
3319 that->data_.u_empty_match_check.start_register,
3320 that->data_.u_empty_match_check.repetition_register,
3321 that->data_.u_empty_match_check.repetition_limit);
3322 break;
3323 case ActionNode::CLEAR_CAPTURES: {
3324 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
3325 that->data_.u_clear_captures.range_from,
3326 that->data_.u_clear_captures.range_to);
3327 break;
3328 }
3329 }
3330 stream()->Add("];\n");
3331 PrintAttributes(that);
3332 RegExpNode* successor = that->on_success();
3333 stream()->Add(" n%p -> n%p;\n", that, successor);
3334 Visit(successor);
3335 }
3336
3337
3338 class DispatchTableDumper {
3339 public:
DispatchTableDumper(StringStream * stream)3340 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
3341 void Call(uc16 key, DispatchTable::Entry entry);
stream()3342 StringStream* stream() { return stream_; }
3343 private:
3344 StringStream* stream_;
3345 };
3346
3347
Call(uc16 key,DispatchTable::Entry entry)3348 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
3349 stream()->Add("[%k-%k]: {", key, entry.to());
3350 OutSet* set = entry.out_set();
3351 bool first = true;
3352 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
3353 if (set->Get(i)) {
3354 if (first) {
3355 first = false;
3356 } else {
3357 stream()->Add(", ");
3358 }
3359 stream()->Add("%i", i);
3360 }
3361 }
3362 stream()->Add("}\n");
3363 }
3364
3365
Dump()3366 void DispatchTable::Dump() {
3367 HeapStringAllocator alloc;
3368 StringStream stream(&alloc);
3369 DispatchTableDumper dumper(&stream);
3370 tree()->ForEach(&dumper);
3371 OS::PrintError("%s", *stream.ToCString());
3372 }
3373
3374
DotPrint(const char * label,RegExpNode * node,bool ignore_case)3375 void RegExpEngine::DotPrint(const char* label,
3376 RegExpNode* node,
3377 bool ignore_case) {
3378 DotPrinter printer(ignore_case);
3379 printer.PrintNode(label, node);
3380 }
3381
3382
3383 #endif // DEBUG
3384
3385
3386 // -------------------------------------------------------------------
3387 // Tree to graph conversion
3388
3389 static const int kSpaceRangeCount = 20;
3390 static const int kSpaceRangeAsciiCount = 4;
3391 static const uc16 kSpaceRanges[kSpaceRangeCount] = { 0x0009, 0x000D, 0x0020,
3392 0x0020, 0x00A0, 0x00A0, 0x1680, 0x1680, 0x180E, 0x180E, 0x2000, 0x200A,
3393 0x2028, 0x2029, 0x202F, 0x202F, 0x205F, 0x205F, 0x3000, 0x3000 };
3394
3395 static const int kWordRangeCount = 8;
3396 static const uc16 kWordRanges[kWordRangeCount] = { '0', '9', 'A', 'Z', '_',
3397 '_', 'a', 'z' };
3398
3399 static const int kDigitRangeCount = 2;
3400 static const uc16 kDigitRanges[kDigitRangeCount] = { '0', '9' };
3401
3402 static const int kLineTerminatorRangeCount = 6;
3403 static const uc16 kLineTerminatorRanges[kLineTerminatorRangeCount] = { 0x000A,
3404 0x000A, 0x000D, 0x000D, 0x2028, 0x2029 };
3405
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3406 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
3407 RegExpNode* on_success) {
3408 ZoneList<TextElement>* elms = new ZoneList<TextElement>(1);
3409 elms->Add(TextElement::Atom(this));
3410 return new TextNode(elms, on_success);
3411 }
3412
3413
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3414 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
3415 RegExpNode* on_success) {
3416 return new TextNode(elements(), on_success);
3417 }
3418
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const uc16 * special_class,int length)3419 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
3420 const uc16* special_class,
3421 int length) {
3422 ASSERT(ranges->length() != 0);
3423 ASSERT(length != 0);
3424 ASSERT(special_class[0] != 0);
3425 if (ranges->length() != (length >> 1) + 1) {
3426 return false;
3427 }
3428 CharacterRange range = ranges->at(0);
3429 if (range.from() != 0) {
3430 return false;
3431 }
3432 for (int i = 0; i < length; i += 2) {
3433 if (special_class[i] != (range.to() + 1)) {
3434 return false;
3435 }
3436 range = ranges->at((i >> 1) + 1);
3437 if (special_class[i+1] != range.from() - 1) {
3438 return false;
3439 }
3440 }
3441 if (range.to() != 0xffff) {
3442 return false;
3443 }
3444 return true;
3445 }
3446
3447
CompareRanges(ZoneList<CharacterRange> * ranges,const uc16 * special_class,int length)3448 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
3449 const uc16* special_class,
3450 int length) {
3451 if (ranges->length() * 2 != length) {
3452 return false;
3453 }
3454 for (int i = 0; i < length; i += 2) {
3455 CharacterRange range = ranges->at(i >> 1);
3456 if (range.from() != special_class[i] || range.to() != special_class[i+1]) {
3457 return false;
3458 }
3459 }
3460 return true;
3461 }
3462
3463
is_standard()3464 bool RegExpCharacterClass::is_standard() {
3465 // TODO(lrn): Remove need for this function, by not throwing away information
3466 // along the way.
3467 if (is_negated_) {
3468 return false;
3469 }
3470 if (set_.is_standard()) {
3471 return true;
3472 }
3473 if (CompareRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3474 set_.set_standard_set_type('s');
3475 return true;
3476 }
3477 if (CompareInverseRanges(set_.ranges(), kSpaceRanges, kSpaceRangeCount)) {
3478 set_.set_standard_set_type('S');
3479 return true;
3480 }
3481 if (CompareInverseRanges(set_.ranges(),
3482 kLineTerminatorRanges,
3483 kLineTerminatorRangeCount)) {
3484 set_.set_standard_set_type('.');
3485 return true;
3486 }
3487 return false;
3488 }
3489
3490
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3491 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
3492 RegExpNode* on_success) {
3493 return new TextNode(this, on_success);
3494 }
3495
3496
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3497 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
3498 RegExpNode* on_success) {
3499 ZoneList<RegExpTree*>* alternatives = this->alternatives();
3500 int length = alternatives->length();
3501 ChoiceNode* result = new ChoiceNode(length);
3502 for (int i = 0; i < length; i++) {
3503 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
3504 on_success));
3505 result->AddAlternative(alternative);
3506 }
3507 return result;
3508 }
3509
3510
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3511 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
3512 RegExpNode* on_success) {
3513 return ToNode(min(),
3514 max(),
3515 is_greedy(),
3516 body(),
3517 compiler,
3518 on_success);
3519 }
3520
3521
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)3522 RegExpNode* RegExpQuantifier::ToNode(int min,
3523 int max,
3524 bool is_greedy,
3525 RegExpTree* body,
3526 RegExpCompiler* compiler,
3527 RegExpNode* on_success,
3528 bool not_at_start) {
3529 // x{f, t} becomes this:
3530 //
3531 // (r++)<-.
3532 // | `
3533 // | (x)
3534 // v ^
3535 // (r=0)-->(?)---/ [if r < t]
3536 // |
3537 // [if r >= f] \----> ...
3538 //
3539
3540 // 15.10.2.5 RepeatMatcher algorithm.
3541 // The parser has already eliminated the case where max is 0. In the case
3542 // where max_match is zero the parser has removed the quantifier if min was
3543 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
3544
3545 // If we know that we cannot match zero length then things are a little
3546 // simpler since we don't need to make the special zero length match check
3547 // from step 2.1. If the min and max are small we can unroll a little in
3548 // this case.
3549 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
3550 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
3551 if (max == 0) return on_success; // This can happen due to recursion.
3552 bool body_can_be_empty = (body->min_match() == 0);
3553 int body_start_reg = RegExpCompiler::kNoRegister;
3554 Interval capture_registers = body->CaptureRegisters();
3555 bool needs_capture_clearing = !capture_registers.is_empty();
3556 if (body_can_be_empty) {
3557 body_start_reg = compiler->AllocateRegister();
3558 } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
3559 // Only unroll if there are no captures and the body can't be
3560 // empty.
3561 if (min > 0 && min <= kMaxUnrolledMinMatches) {
3562 int new_max = (max == kInfinity) ? max : max - min;
3563 // Recurse once to get the loop or optional matches after the fixed ones.
3564 RegExpNode* answer = ToNode(
3565 0, new_max, is_greedy, body, compiler, on_success, true);
3566 // Unroll the forced matches from 0 to min. This can cause chains of
3567 // TextNodes (which the parser does not generate). These should be
3568 // combined if it turns out they hinder good code generation.
3569 for (int i = 0; i < min; i++) {
3570 answer = body->ToNode(compiler, answer);
3571 }
3572 return answer;
3573 }
3574 if (max <= kMaxUnrolledMaxMatches) {
3575 ASSERT(min == 0);
3576 // Unroll the optional matches up to max.
3577 RegExpNode* answer = on_success;
3578 for (int i = 0; i < max; i++) {
3579 ChoiceNode* alternation = new ChoiceNode(2);
3580 if (is_greedy) {
3581 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3582 answer)));
3583 alternation->AddAlternative(GuardedAlternative(on_success));
3584 } else {
3585 alternation->AddAlternative(GuardedAlternative(on_success));
3586 alternation->AddAlternative(GuardedAlternative(body->ToNode(compiler,
3587 answer)));
3588 }
3589 answer = alternation;
3590 if (not_at_start) alternation->set_not_at_start();
3591 }
3592 return answer;
3593 }
3594 }
3595 bool has_min = min > 0;
3596 bool has_max = max < RegExpTree::kInfinity;
3597 bool needs_counter = has_min || has_max;
3598 int reg_ctr = needs_counter
3599 ? compiler->AllocateRegister()
3600 : RegExpCompiler::kNoRegister;
3601 LoopChoiceNode* center = new LoopChoiceNode(body->min_match() == 0);
3602 if (not_at_start) center->set_not_at_start();
3603 RegExpNode* loop_return = needs_counter
3604 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
3605 : static_cast<RegExpNode*>(center);
3606 if (body_can_be_empty) {
3607 // If the body can be empty we need to check if it was and then
3608 // backtrack.
3609 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
3610 reg_ctr,
3611 min,
3612 loop_return);
3613 }
3614 RegExpNode* body_node = body->ToNode(compiler, loop_return);
3615 if (body_can_be_empty) {
3616 // If the body can be empty we need to store the start position
3617 // so we can bail out if it was empty.
3618 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
3619 }
3620 if (needs_capture_clearing) {
3621 // Before entering the body of this loop we need to clear captures.
3622 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
3623 }
3624 GuardedAlternative body_alt(body_node);
3625 if (has_max) {
3626 Guard* body_guard = new Guard(reg_ctr, Guard::LT, max);
3627 body_alt.AddGuard(body_guard);
3628 }
3629 GuardedAlternative rest_alt(on_success);
3630 if (has_min) {
3631 Guard* rest_guard = new Guard(reg_ctr, Guard::GEQ, min);
3632 rest_alt.AddGuard(rest_guard);
3633 }
3634 if (is_greedy) {
3635 center->AddLoopAlternative(body_alt);
3636 center->AddContinueAlternative(rest_alt);
3637 } else {
3638 center->AddContinueAlternative(rest_alt);
3639 center->AddLoopAlternative(body_alt);
3640 }
3641 if (needs_counter) {
3642 return ActionNode::SetRegister(reg_ctr, 0, center);
3643 } else {
3644 return center;
3645 }
3646 }
3647
3648
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3649 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
3650 RegExpNode* on_success) {
3651 NodeInfo info;
3652 switch (type()) {
3653 case START_OF_LINE:
3654 return AssertionNode::AfterNewline(on_success);
3655 case START_OF_INPUT:
3656 return AssertionNode::AtStart(on_success);
3657 case BOUNDARY:
3658 return AssertionNode::AtBoundary(on_success);
3659 case NON_BOUNDARY:
3660 return AssertionNode::AtNonBoundary(on_success);
3661 case END_OF_INPUT:
3662 return AssertionNode::AtEnd(on_success);
3663 case END_OF_LINE: {
3664 // Compile $ in multiline regexps as an alternation with a positive
3665 // lookahead in one side and an end-of-input on the other side.
3666 // We need two registers for the lookahead.
3667 int stack_pointer_register = compiler->AllocateRegister();
3668 int position_register = compiler->AllocateRegister();
3669 // The ChoiceNode to distinguish between a newline and end-of-input.
3670 ChoiceNode* result = new ChoiceNode(2);
3671 // Create a newline atom.
3672 ZoneList<CharacterRange>* newline_ranges =
3673 new ZoneList<CharacterRange>(3);
3674 CharacterRange::AddClassEscape('n', newline_ranges);
3675 RegExpCharacterClass* newline_atom = new RegExpCharacterClass('n');
3676 TextNode* newline_matcher = new TextNode(
3677 newline_atom,
3678 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3679 position_register,
3680 0, // No captures inside.
3681 -1, // Ignored if no captures.
3682 on_success));
3683 // Create an end-of-input matcher.
3684 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
3685 stack_pointer_register,
3686 position_register,
3687 newline_matcher);
3688 // Add the two alternatives to the ChoiceNode.
3689 GuardedAlternative eol_alternative(end_of_line);
3690 result->AddAlternative(eol_alternative);
3691 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
3692 result->AddAlternative(end_alternative);
3693 return result;
3694 }
3695 default:
3696 UNREACHABLE();
3697 }
3698 return on_success;
3699 }
3700
3701
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3702 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
3703 RegExpNode* on_success) {
3704 return new BackReferenceNode(RegExpCapture::StartRegister(index()),
3705 RegExpCapture::EndRegister(index()),
3706 on_success);
3707 }
3708
3709
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3710 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
3711 RegExpNode* on_success) {
3712 return on_success;
3713 }
3714
3715
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3716 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
3717 RegExpNode* on_success) {
3718 int stack_pointer_register = compiler->AllocateRegister();
3719 int position_register = compiler->AllocateRegister();
3720
3721 const int registers_per_capture = 2;
3722 const int register_of_first_capture = 2;
3723 int register_count = capture_count_ * registers_per_capture;
3724 int register_start =
3725 register_of_first_capture + capture_from_ * registers_per_capture;
3726
3727 RegExpNode* success;
3728 if (is_positive()) {
3729 RegExpNode* node = ActionNode::BeginSubmatch(
3730 stack_pointer_register,
3731 position_register,
3732 body()->ToNode(
3733 compiler,
3734 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
3735 position_register,
3736 register_count,
3737 register_start,
3738 on_success)));
3739 return node;
3740 } else {
3741 // We use a ChoiceNode for a negative lookahead because it has most of
3742 // the characteristics we need. It has the body of the lookahead as its
3743 // first alternative and the expression after the lookahead of the second
3744 // alternative. If the first alternative succeeds then the
3745 // NegativeSubmatchSuccess will unwind the stack including everything the
3746 // choice node set up and backtrack. If the first alternative fails then
3747 // the second alternative is tried, which is exactly the desired result
3748 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
3749 // ChoiceNode that knows to ignore the first exit when calculating quick
3750 // checks.
3751 GuardedAlternative body_alt(
3752 body()->ToNode(
3753 compiler,
3754 success = new NegativeSubmatchSuccess(stack_pointer_register,
3755 position_register,
3756 register_count,
3757 register_start)));
3758 ChoiceNode* choice_node =
3759 new NegativeLookaheadChoiceNode(body_alt,
3760 GuardedAlternative(on_success));
3761 return ActionNode::BeginSubmatch(stack_pointer_register,
3762 position_register,
3763 choice_node);
3764 }
3765 }
3766
3767
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3768 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
3769 RegExpNode* on_success) {
3770 return ToNode(body(), index(), compiler, on_success);
3771 }
3772
3773
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)3774 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
3775 int index,
3776 RegExpCompiler* compiler,
3777 RegExpNode* on_success) {
3778 int start_reg = RegExpCapture::StartRegister(index);
3779 int end_reg = RegExpCapture::EndRegister(index);
3780 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
3781 RegExpNode* body_node = body->ToNode(compiler, store_end);
3782 return ActionNode::StorePosition(start_reg, true, body_node);
3783 }
3784
3785
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)3786 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
3787 RegExpNode* on_success) {
3788 ZoneList<RegExpTree*>* children = nodes();
3789 RegExpNode* current = on_success;
3790 for (int i = children->length() - 1; i >= 0; i--) {
3791 current = children->at(i)->ToNode(compiler, current);
3792 }
3793 return current;
3794 }
3795
3796
AddClass(const uc16 * elmv,int elmc,ZoneList<CharacterRange> * ranges)3797 static void AddClass(const uc16* elmv,
3798 int elmc,
3799 ZoneList<CharacterRange>* ranges) {
3800 for (int i = 0; i < elmc; i += 2) {
3801 ASSERT(elmv[i] <= elmv[i + 1]);
3802 ranges->Add(CharacterRange(elmv[i], elmv[i + 1]));
3803 }
3804 }
3805
3806
AddClassNegated(const uc16 * elmv,int elmc,ZoneList<CharacterRange> * ranges)3807 static void AddClassNegated(const uc16 *elmv,
3808 int elmc,
3809 ZoneList<CharacterRange>* ranges) {
3810 ASSERT(elmv[0] != 0x0000);
3811 ASSERT(elmv[elmc-1] != String::kMaxUC16CharCode);
3812 uc16 last = 0x0000;
3813 for (int i = 0; i < elmc; i += 2) {
3814 ASSERT(last <= elmv[i] - 1);
3815 ASSERT(elmv[i] <= elmv[i + 1]);
3816 ranges->Add(CharacterRange(last, elmv[i] - 1));
3817 last = elmv[i + 1] + 1;
3818 }
3819 ranges->Add(CharacterRange(last, String::kMaxUC16CharCode));
3820 }
3821
3822
AddClassEscape(uc16 type,ZoneList<CharacterRange> * ranges)3823 void CharacterRange::AddClassEscape(uc16 type,
3824 ZoneList<CharacterRange>* ranges) {
3825 switch (type) {
3826 case 's':
3827 AddClass(kSpaceRanges, kSpaceRangeCount, ranges);
3828 break;
3829 case 'S':
3830 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges);
3831 break;
3832 case 'w':
3833 AddClass(kWordRanges, kWordRangeCount, ranges);
3834 break;
3835 case 'W':
3836 AddClassNegated(kWordRanges, kWordRangeCount, ranges);
3837 break;
3838 case 'd':
3839 AddClass(kDigitRanges, kDigitRangeCount, ranges);
3840 break;
3841 case 'D':
3842 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges);
3843 break;
3844 case '.':
3845 AddClassNegated(kLineTerminatorRanges,
3846 kLineTerminatorRangeCount,
3847 ranges);
3848 break;
3849 // This is not a character range as defined by the spec but a
3850 // convenient shorthand for a character class that matches any
3851 // character.
3852 case '*':
3853 ranges->Add(CharacterRange::Everything());
3854 break;
3855 // This is the set of characters matched by the $ and ^ symbols
3856 // in multiline mode.
3857 case 'n':
3858 AddClass(kLineTerminatorRanges,
3859 kLineTerminatorRangeCount,
3860 ranges);
3861 break;
3862 default:
3863 UNREACHABLE();
3864 }
3865 }
3866
3867
GetWordBounds()3868 Vector<const uc16> CharacterRange::GetWordBounds() {
3869 return Vector<const uc16>(kWordRanges, kWordRangeCount);
3870 }
3871
3872
3873 class CharacterRangeSplitter {
3874 public:
CharacterRangeSplitter(ZoneList<CharacterRange> ** included,ZoneList<CharacterRange> ** excluded)3875 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
3876 ZoneList<CharacterRange>** excluded)
3877 : included_(included),
3878 excluded_(excluded) { }
3879 void Call(uc16 from, DispatchTable::Entry entry);
3880
3881 static const int kInBase = 0;
3882 static const int kInOverlay = 1;
3883
3884 private:
3885 ZoneList<CharacterRange>** included_;
3886 ZoneList<CharacterRange>** excluded_;
3887 };
3888
3889
Call(uc16 from,DispatchTable::Entry entry)3890 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
3891 if (!entry.out_set()->Get(kInBase)) return;
3892 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
3893 ? included_
3894 : excluded_;
3895 if (*target == NULL) *target = new ZoneList<CharacterRange>(2);
3896 (*target)->Add(CharacterRange(entry.from(), entry.to()));
3897 }
3898
3899
Split(ZoneList<CharacterRange> * base,Vector<const uc16> overlay,ZoneList<CharacterRange> ** included,ZoneList<CharacterRange> ** excluded)3900 void CharacterRange::Split(ZoneList<CharacterRange>* base,
3901 Vector<const uc16> overlay,
3902 ZoneList<CharacterRange>** included,
3903 ZoneList<CharacterRange>** excluded) {
3904 ASSERT_EQ(NULL, *included);
3905 ASSERT_EQ(NULL, *excluded);
3906 DispatchTable table;
3907 for (int i = 0; i < base->length(); i++)
3908 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase);
3909 for (int i = 0; i < overlay.length(); i += 2) {
3910 table.AddRange(CharacterRange(overlay[i], overlay[i+1]),
3911 CharacterRangeSplitter::kInOverlay);
3912 }
3913 CharacterRangeSplitter callback(included, excluded);
3914 table.ForEach(&callback);
3915 }
3916
3917
AddCaseEquivalents(ZoneList<CharacterRange> * ranges)3918 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges) {
3919 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3920 if (IsSingleton()) {
3921 // If this is a singleton we just expand the one character.
3922 int length = uncanonicalize.get(from(), '\0', chars);
3923 for (int i = 0; i < length; i++) {
3924 uc32 chr = chars[i];
3925 if (chr != from()) {
3926 ranges->Add(CharacterRange::Singleton(chars[i]));
3927 }
3928 }
3929 } else if (from() <= kRangeCanonicalizeMax &&
3930 to() <= kRangeCanonicalizeMax) {
3931 // If this is a range we expand the characters block by block,
3932 // expanding contiguous subranges (blocks) one at a time.
3933 // The approach is as follows. For a given start character we
3934 // look up the block that contains it, for instance 'a' if the
3935 // start character is 'c'. A block is characterized by the property
3936 // that all characters uncanonicalize in the same way as the first
3937 // element, except that each entry in the result is incremented
3938 // by the distance from the first element. So a-z is a block
3939 // because 'a' uncanonicalizes to ['a', 'A'] and the k'th letter
3940 // uncanonicalizes to ['a' + k, 'A' + k].
3941 // Once we've found the start point we look up its uncanonicalization
3942 // and produce a range for each element. For instance for [c-f]
3943 // we look up ['a', 'A'] and produce [c-f] and [C-F]. We then only
3944 // add a range if it is not already contained in the input, so [c-f]
3945 // will be skipped but [C-F] will be added. If this range is not
3946 // completely contained in a block we do this for all the blocks
3947 // covered by the range.
3948 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
3949 // First, look up the block that contains the 'from' character.
3950 int length = canonrange.get(from(), '\0', range);
3951 if (length == 0) {
3952 range[0] = from();
3953 } else {
3954 ASSERT_EQ(1, length);
3955 }
3956 int pos = from();
3957 // The start of the current block. Note that except for the first
3958 // iteration 'start' is always equal to 'pos'.
3959 int start;
3960 // If it is not the start point of a block the entry contains the
3961 // offset of the character from the start point.
3962 if ((range[0] & kStartMarker) == 0) {
3963 start = pos - range[0];
3964 } else {
3965 start = pos;
3966 }
3967 // Then we add the ranges on at a time, incrementing the current
3968 // position to be after the last block each time. The position
3969 // always points to the start of a block.
3970 while (pos < to()) {
3971 length = canonrange.get(start, '\0', range);
3972 if (length == 0) {
3973 range[0] = start;
3974 } else {
3975 ASSERT_EQ(1, length);
3976 }
3977 ASSERT((range[0] & kStartMarker) != 0);
3978 // The start point of a block contains the distance to the end
3979 // of the range.
3980 int block_end = start + (range[0] & kPayloadMask) - 1;
3981 int end = (block_end > to()) ? to() : block_end;
3982 length = uncanonicalize.get(start, '\0', range);
3983 for (int i = 0; i < length; i++) {
3984 uc32 c = range[i];
3985 uc16 range_from = c + (pos - start);
3986 uc16 range_to = c + (end - start);
3987 if (!(from() <= range_from && range_to <= to())) {
3988 ranges->Add(CharacterRange(range_from, range_to));
3989 }
3990 }
3991 start = pos = block_end + 1;
3992 }
3993 } else {
3994 // TODO(plesner) when we've fixed the 2^11 bug in unibrow.
3995 }
3996 }
3997
3998
ranges()3999 ZoneList<CharacterRange>* CharacterSet::ranges() {
4000 if (ranges_ == NULL) {
4001 ranges_ = new ZoneList<CharacterRange>(2);
4002 CharacterRange::AddClassEscape(standard_set_type_, ranges_);
4003 }
4004 return ranges_;
4005 }
4006
4007
4008
4009 // -------------------------------------------------------------------
4010 // Interest propagation
4011
4012
TryGetSibling(NodeInfo * info)4013 RegExpNode* RegExpNode::TryGetSibling(NodeInfo* info) {
4014 for (int i = 0; i < siblings_.length(); i++) {
4015 RegExpNode* sibling = siblings_.Get(i);
4016 if (sibling->info()->Matches(info))
4017 return sibling;
4018 }
4019 return NULL;
4020 }
4021
4022
EnsureSibling(NodeInfo * info,bool * cloned)4023 RegExpNode* RegExpNode::EnsureSibling(NodeInfo* info, bool* cloned) {
4024 ASSERT_EQ(false, *cloned);
4025 siblings_.Ensure(this);
4026 RegExpNode* result = TryGetSibling(info);
4027 if (result != NULL) return result;
4028 result = this->Clone();
4029 NodeInfo* new_info = result->info();
4030 new_info->ResetCompilationState();
4031 new_info->AddFromPreceding(info);
4032 AddSibling(result);
4033 *cloned = true;
4034 return result;
4035 }
4036
4037
4038 template <class C>
PropagateToEndpoint(C * node,NodeInfo * info)4039 static RegExpNode* PropagateToEndpoint(C* node, NodeInfo* info) {
4040 NodeInfo full_info(*node->info());
4041 full_info.AddFromPreceding(info);
4042 bool cloned = false;
4043 return RegExpNode::EnsureSibling(node, &full_info, &cloned);
4044 }
4045
4046
4047 // -------------------------------------------------------------------
4048 // Splay tree
4049
4050
Extend(unsigned value)4051 OutSet* OutSet::Extend(unsigned value) {
4052 if (Get(value))
4053 return this;
4054 if (successors() != NULL) {
4055 for (int i = 0; i < successors()->length(); i++) {
4056 OutSet* successor = successors()->at(i);
4057 if (successor->Get(value))
4058 return successor;
4059 }
4060 } else {
4061 successors_ = new ZoneList<OutSet*>(2);
4062 }
4063 OutSet* result = new OutSet(first_, remaining_);
4064 result->Set(value);
4065 successors()->Add(result);
4066 return result;
4067 }
4068
4069
Set(unsigned value)4070 void OutSet::Set(unsigned value) {
4071 if (value < kFirstLimit) {
4072 first_ |= (1 << value);
4073 } else {
4074 if (remaining_ == NULL)
4075 remaining_ = new ZoneList<unsigned>(1);
4076 if (remaining_->is_empty() || !remaining_->Contains(value))
4077 remaining_->Add(value);
4078 }
4079 }
4080
4081
Get(unsigned value)4082 bool OutSet::Get(unsigned value) {
4083 if (value < kFirstLimit) {
4084 return (first_ & (1 << value)) != 0;
4085 } else if (remaining_ == NULL) {
4086 return false;
4087 } else {
4088 return remaining_->Contains(value);
4089 }
4090 }
4091
4092
4093 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
4094 const DispatchTable::Entry DispatchTable::Config::kNoValue;
4095
4096
AddRange(CharacterRange full_range,int value)4097 void DispatchTable::AddRange(CharacterRange full_range, int value) {
4098 CharacterRange current = full_range;
4099 if (tree()->is_empty()) {
4100 // If this is the first range we just insert into the table.
4101 ZoneSplayTree<Config>::Locator loc;
4102 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
4103 loc.set_value(Entry(current.from(), current.to(), empty()->Extend(value)));
4104 return;
4105 }
4106 // First see if there is a range to the left of this one that
4107 // overlaps.
4108 ZoneSplayTree<Config>::Locator loc;
4109 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
4110 Entry* entry = &loc.value();
4111 // If we've found a range that overlaps with this one, and it
4112 // starts strictly to the left of this one, we have to fix it
4113 // because the following code only handles ranges that start on
4114 // or after the start point of the range we're adding.
4115 if (entry->from() < current.from() && entry->to() >= current.from()) {
4116 // Snap the overlapping range in half around the start point of
4117 // the range we're adding.
4118 CharacterRange left(entry->from(), current.from() - 1);
4119 CharacterRange right(current.from(), entry->to());
4120 // The left part of the overlapping range doesn't overlap.
4121 // Truncate the whole entry to be just the left part.
4122 entry->set_to(left.to());
4123 // The right part is the one that overlaps. We add this part
4124 // to the map and let the next step deal with merging it with
4125 // the range we're adding.
4126 ZoneSplayTree<Config>::Locator loc;
4127 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
4128 loc.set_value(Entry(right.from(),
4129 right.to(),
4130 entry->out_set()));
4131 }
4132 }
4133 while (current.is_valid()) {
4134 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
4135 (loc.value().from() <= current.to()) &&
4136 (loc.value().to() >= current.from())) {
4137 Entry* entry = &loc.value();
4138 // We have overlap. If there is space between the start point of
4139 // the range we're adding and where the overlapping range starts
4140 // then we have to add a range covering just that space.
4141 if (current.from() < entry->from()) {
4142 ZoneSplayTree<Config>::Locator ins;
4143 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4144 ins.set_value(Entry(current.from(),
4145 entry->from() - 1,
4146 empty()->Extend(value)));
4147 current.set_from(entry->from());
4148 }
4149 ASSERT_EQ(current.from(), entry->from());
4150 // If the overlapping range extends beyond the one we want to add
4151 // we have to snap the right part off and add it separately.
4152 if (entry->to() > current.to()) {
4153 ZoneSplayTree<Config>::Locator ins;
4154 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
4155 ins.set_value(Entry(current.to() + 1,
4156 entry->to(),
4157 entry->out_set()));
4158 entry->set_to(current.to());
4159 }
4160 ASSERT(entry->to() <= current.to());
4161 // The overlapping range is now completely contained by the range
4162 // we're adding so we can just update it and move the start point
4163 // of the range we're adding just past it.
4164 entry->AddValue(value);
4165 // Bail out if the last interval ended at 0xFFFF since otherwise
4166 // adding 1 will wrap around to 0.
4167 if (entry->to() == String::kMaxUC16CharCode)
4168 break;
4169 ASSERT(entry->to() + 1 > current.from());
4170 current.set_from(entry->to() + 1);
4171 } else {
4172 // There is no overlap so we can just add the range
4173 ZoneSplayTree<Config>::Locator ins;
4174 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
4175 ins.set_value(Entry(current.from(),
4176 current.to(),
4177 empty()->Extend(value)));
4178 break;
4179 }
4180 }
4181 }
4182
4183
Get(uc16 value)4184 OutSet* DispatchTable::Get(uc16 value) {
4185 ZoneSplayTree<Config>::Locator loc;
4186 if (!tree()->FindGreatestLessThan(value, &loc))
4187 return empty();
4188 Entry* entry = &loc.value();
4189 if (value <= entry->to())
4190 return entry->out_set();
4191 else
4192 return empty();
4193 }
4194
4195
4196 // -------------------------------------------------------------------
4197 // Analysis
4198
4199
EnsureAnalyzed(RegExpNode * that)4200 void Analysis::EnsureAnalyzed(RegExpNode* that) {
4201 StackLimitCheck check;
4202 if (check.HasOverflowed()) {
4203 fail("Stack overflow");
4204 return;
4205 }
4206 if (that->info()->been_analyzed || that->info()->being_analyzed)
4207 return;
4208 that->info()->being_analyzed = true;
4209 that->Accept(this);
4210 that->info()->being_analyzed = false;
4211 that->info()->been_analyzed = true;
4212 }
4213
4214
VisitEnd(EndNode * that)4215 void Analysis::VisitEnd(EndNode* that) {
4216 // nothing to do
4217 }
4218
4219
CalculateOffsets()4220 void TextNode::CalculateOffsets() {
4221 int element_count = elements()->length();
4222 // Set up the offsets of the elements relative to the start. This is a fixed
4223 // quantity since a TextNode can only contain fixed-width things.
4224 int cp_offset = 0;
4225 for (int i = 0; i < element_count; i++) {
4226 TextElement& elm = elements()->at(i);
4227 elm.cp_offset = cp_offset;
4228 if (elm.type == TextElement::ATOM) {
4229 cp_offset += elm.data.u_atom->data().length();
4230 } else {
4231 cp_offset++;
4232 Vector<const uc16> quarks = elm.data.u_atom->data();
4233 }
4234 }
4235 }
4236
4237
VisitText(TextNode * that)4238 void Analysis::VisitText(TextNode* that) {
4239 if (ignore_case_) {
4240 that->MakeCaseIndependent();
4241 }
4242 EnsureAnalyzed(that->on_success());
4243 if (!has_failed()) {
4244 that->CalculateOffsets();
4245 }
4246 }
4247
4248
VisitAction(ActionNode * that)4249 void Analysis::VisitAction(ActionNode* that) {
4250 RegExpNode* target = that->on_success();
4251 EnsureAnalyzed(target);
4252 if (!has_failed()) {
4253 // If the next node is interested in what it follows then this node
4254 // has to be interested too so it can pass the information on.
4255 that->info()->AddFromFollowing(target->info());
4256 }
4257 }
4258
4259
VisitChoice(ChoiceNode * that)4260 void Analysis::VisitChoice(ChoiceNode* that) {
4261 NodeInfo* info = that->info();
4262 for (int i = 0; i < that->alternatives()->length(); i++) {
4263 RegExpNode* node = that->alternatives()->at(i).node();
4264 EnsureAnalyzed(node);
4265 if (has_failed()) return;
4266 // Anything the following nodes need to know has to be known by
4267 // this node also, so it can pass it on.
4268 info->AddFromFollowing(node->info());
4269 }
4270 }
4271
4272
VisitLoopChoice(LoopChoiceNode * that)4273 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
4274 NodeInfo* info = that->info();
4275 for (int i = 0; i < that->alternatives()->length(); i++) {
4276 RegExpNode* node = that->alternatives()->at(i).node();
4277 if (node != that->loop_node()) {
4278 EnsureAnalyzed(node);
4279 if (has_failed()) return;
4280 info->AddFromFollowing(node->info());
4281 }
4282 }
4283 // Check the loop last since it may need the value of this node
4284 // to get a correct result.
4285 EnsureAnalyzed(that->loop_node());
4286 if (!has_failed()) {
4287 info->AddFromFollowing(that->loop_node()->info());
4288 }
4289 }
4290
4291
VisitBackReference(BackReferenceNode * that)4292 void Analysis::VisitBackReference(BackReferenceNode* that) {
4293 EnsureAnalyzed(that->on_success());
4294 }
4295
4296
VisitAssertion(AssertionNode * that)4297 void Analysis::VisitAssertion(AssertionNode* that) {
4298 EnsureAnalyzed(that->on_success());
4299 }
4300
4301
4302 // -------------------------------------------------------------------
4303 // Dispatch table construction
4304
4305
VisitEnd(EndNode * that)4306 void DispatchTableConstructor::VisitEnd(EndNode* that) {
4307 AddRange(CharacterRange::Everything());
4308 }
4309
4310
BuildTable(ChoiceNode * node)4311 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
4312 node->set_being_calculated(true);
4313 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
4314 for (int i = 0; i < alternatives->length(); i++) {
4315 set_choice_index(i);
4316 alternatives->at(i).node()->Accept(this);
4317 }
4318 node->set_being_calculated(false);
4319 }
4320
4321
4322 class AddDispatchRange {
4323 public:
AddDispatchRange(DispatchTableConstructor * constructor)4324 explicit AddDispatchRange(DispatchTableConstructor* constructor)
4325 : constructor_(constructor) { }
4326 void Call(uc32 from, DispatchTable::Entry entry);
4327 private:
4328 DispatchTableConstructor* constructor_;
4329 };
4330
4331
Call(uc32 from,DispatchTable::Entry entry)4332 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
4333 CharacterRange range(from, entry.to());
4334 constructor_->AddRange(range);
4335 }
4336
4337
VisitChoice(ChoiceNode * node)4338 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
4339 if (node->being_calculated())
4340 return;
4341 DispatchTable* table = node->GetTable(ignore_case_);
4342 AddDispatchRange adder(this);
4343 table->ForEach(&adder);
4344 }
4345
4346
VisitBackReference(BackReferenceNode * that)4347 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
4348 // TODO(160): Find the node that we refer back to and propagate its start
4349 // set back to here. For now we just accept anything.
4350 AddRange(CharacterRange::Everything());
4351 }
4352
4353
VisitAssertion(AssertionNode * that)4354 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
4355 RegExpNode* target = that->on_success();
4356 target->Accept(this);
4357 }
4358
4359
4360
CompareRangeByFrom(const CharacterRange * a,const CharacterRange * b)4361 static int CompareRangeByFrom(const CharacterRange* a,
4362 const CharacterRange* b) {
4363 return Compare<uc16>(a->from(), b->from());
4364 }
4365
4366
AddInverse(ZoneList<CharacterRange> * ranges)4367 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
4368 ranges->Sort(CompareRangeByFrom);
4369 uc16 last = 0;
4370 for (int i = 0; i < ranges->length(); i++) {
4371 CharacterRange range = ranges->at(i);
4372 if (last < range.from())
4373 AddRange(CharacterRange(last, range.from() - 1));
4374 if (range.to() >= last) {
4375 if (range.to() == String::kMaxUC16CharCode) {
4376 return;
4377 } else {
4378 last = range.to() + 1;
4379 }
4380 }
4381 }
4382 AddRange(CharacterRange(last, String::kMaxUC16CharCode));
4383 }
4384
4385
VisitText(TextNode * that)4386 void DispatchTableConstructor::VisitText(TextNode* that) {
4387 TextElement elm = that->elements()->at(0);
4388 switch (elm.type) {
4389 case TextElement::ATOM: {
4390 uc16 c = elm.data.u_atom->data()[0];
4391 AddRange(CharacterRange(c, c));
4392 break;
4393 }
4394 case TextElement::CHAR_CLASS: {
4395 RegExpCharacterClass* tree = elm.data.u_char_class;
4396 ZoneList<CharacterRange>* ranges = tree->ranges();
4397 if (tree->is_negated()) {
4398 AddInverse(ranges);
4399 } else {
4400 for (int i = 0; i < ranges->length(); i++)
4401 AddRange(ranges->at(i));
4402 }
4403 break;
4404 }
4405 default: {
4406 UNIMPLEMENTED();
4407 }
4408 }
4409 }
4410
4411
VisitAction(ActionNode * that)4412 void DispatchTableConstructor::VisitAction(ActionNode* that) {
4413 RegExpNode* target = that->on_success();
4414 target->Accept(this);
4415 }
4416
4417
Compile(RegExpCompileData * data,bool ignore_case,bool is_multiline,Handle<String> pattern,bool is_ascii)4418 RegExpEngine::CompilationResult RegExpEngine::Compile(RegExpCompileData* data,
4419 bool ignore_case,
4420 bool is_multiline,
4421 Handle<String> pattern,
4422 bool is_ascii) {
4423 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
4424 return IrregexpRegExpTooBig();
4425 }
4426 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii);
4427 // Wrap the body of the regexp in capture #0.
4428 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
4429 0,
4430 &compiler,
4431 compiler.accept());
4432 RegExpNode* node = captured_body;
4433 if (!data->tree->IsAnchored()) {
4434 // Add a .*? at the beginning, outside the body capture, unless
4435 // this expression is anchored at the beginning.
4436 RegExpNode* loop_node =
4437 RegExpQuantifier::ToNode(0,
4438 RegExpTree::kInfinity,
4439 false,
4440 new RegExpCharacterClass('*'),
4441 &compiler,
4442 captured_body,
4443 data->contains_anchor);
4444
4445 if (data->contains_anchor) {
4446 // Unroll loop once, to take care of the case that might start
4447 // at the start of input.
4448 ChoiceNode* first_step_node = new ChoiceNode(2);
4449 first_step_node->AddAlternative(GuardedAlternative(captured_body));
4450 first_step_node->AddAlternative(GuardedAlternative(
4451 new TextNode(new RegExpCharacterClass('*'), loop_node)));
4452 node = first_step_node;
4453 } else {
4454 node = loop_node;
4455 }
4456 }
4457 data->node = node;
4458 Analysis analysis(ignore_case);
4459 analysis.EnsureAnalyzed(node);
4460 if (analysis.has_failed()) {
4461 const char* error_message = analysis.error_message();
4462 return CompilationResult(error_message);
4463 }
4464
4465 NodeInfo info = *node->info();
4466
4467 // Create the correct assembler for the architecture.
4468 #ifdef V8_NATIVE_REGEXP
4469 // Native regexp implementation.
4470
4471 NativeRegExpMacroAssembler::Mode mode =
4472 is_ascii ? NativeRegExpMacroAssembler::ASCII
4473 : NativeRegExpMacroAssembler::UC16;
4474
4475 #if V8_TARGET_ARCH_IA32
4476 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2);
4477 #elif V8_TARGET_ARCH_X64
4478 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2);
4479 #elif V8_TARGET_ARCH_ARM
4480 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2);
4481 #endif
4482
4483 #else // ! V8_NATIVE_REGEXP
4484 // Interpreted regexp implementation.
4485 EmbeddedVector<byte, 1024> codes;
4486 RegExpMacroAssemblerIrregexp macro_assembler(codes);
4487 #endif
4488
4489 return compiler.Assemble(¯o_assembler,
4490 node,
4491 data->capture_count,
4492 pattern);
4493 }
4494
4495 }} // namespace v8::internal
4496