1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/v8.h"
6
7 #include "src/ast.h"
8 #include "src/compiler.h"
9 #include "src/execution.h"
10 #include "src/factory.h"
11 #include "src/jsregexp.h"
12 #include "src/jsregexp-inl.h"
13 #include "src/platform.h"
14 #include "src/string-search.h"
15 #include "src/runtime.h"
16 #include "src/compilation-cache.h"
17 #include "src/string-stream.h"
18 #include "src/parser.h"
19 #include "src/regexp-macro-assembler.h"
20 #include "src/regexp-macro-assembler-tracer.h"
21 #include "src/regexp-macro-assembler-irregexp.h"
22 #include "src/regexp-stack.h"
23
24 #ifndef V8_INTERPRETED_REGEXP
25 #if V8_TARGET_ARCH_IA32
26 #include "src/ia32/regexp-macro-assembler-ia32.h"
27 #elif V8_TARGET_ARCH_X64
28 #include "src/x64/regexp-macro-assembler-x64.h"
29 #elif V8_TARGET_ARCH_ARM64
30 #include "src/arm64/regexp-macro-assembler-arm64.h"
31 #elif V8_TARGET_ARCH_ARM
32 #include "src/arm/regexp-macro-assembler-arm.h"
33 #elif V8_TARGET_ARCH_MIPS
34 #include "src/mips/regexp-macro-assembler-mips.h"
35 #elif V8_TARGET_ARCH_X87
36 #include "src/x87/regexp-macro-assembler-x87.h"
37 #else
38 #error Unsupported target architecture.
39 #endif
40 #endif
41
42 #include "src/interpreter-irregexp.h"
43
44
45 namespace v8 {
46 namespace internal {
47
CreateRegExpLiteral(Handle<JSFunction> constructor,Handle<String> pattern,Handle<String> flags)48 MaybeHandle<Object> RegExpImpl::CreateRegExpLiteral(
49 Handle<JSFunction> constructor,
50 Handle<String> pattern,
51 Handle<String> flags) {
52 // Call the construct code with 2 arguments.
53 Handle<Object> argv[] = { pattern, flags };
54 return Execution::New(constructor, ARRAY_SIZE(argv), argv);
55 }
56
57
RegExpFlagsFromString(Handle<String> str)58 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
59 int flags = JSRegExp::NONE;
60 for (int i = 0; i < str->length(); i++) {
61 switch (str->Get(i)) {
62 case 'i':
63 flags |= JSRegExp::IGNORE_CASE;
64 break;
65 case 'g':
66 flags |= JSRegExp::GLOBAL;
67 break;
68 case 'm':
69 flags |= JSRegExp::MULTILINE;
70 break;
71 }
72 }
73 return JSRegExp::Flags(flags);
74 }
75
76
77 MUST_USE_RESULT
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> error_text,const char * message)78 static inline MaybeHandle<Object> ThrowRegExpException(
79 Handle<JSRegExp> re,
80 Handle<String> pattern,
81 Handle<String> error_text,
82 const char* message) {
83 Isolate* isolate = re->GetIsolate();
84 Factory* factory = isolate->factory();
85 Handle<FixedArray> elements = factory->NewFixedArray(2);
86 elements->set(0, *pattern);
87 elements->set(1, *error_text);
88 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
89 Handle<Object> regexp_err = factory->NewSyntaxError(message, array);
90 return isolate->Throw<Object>(regexp_err);
91 }
92
93
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)94 ContainedInLattice AddRange(ContainedInLattice containment,
95 const int* ranges,
96 int ranges_length,
97 Interval new_range) {
98 ASSERT((ranges_length & 1) == 1);
99 ASSERT(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
100 if (containment == kLatticeUnknown) return containment;
101 bool inside = false;
102 int last = 0;
103 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
104 // Consider the range from last to ranges[i].
105 // We haven't got to the new range yet.
106 if (ranges[i] <= new_range.from()) continue;
107 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
108 // inclusive, but the values in ranges are not.
109 if (last <= new_range.from() && new_range.to() < ranges[i]) {
110 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
111 }
112 return kLatticeUnknown;
113 }
114 return containment;
115 }
116
117
118 // More makes code generation slower, less makes V8 benchmark score lower.
119 const int kMaxLookaheadForBoyerMoore = 8;
120 // In a 3-character pattern you can maximally step forwards 3 characters
121 // at a time, which is not always enough to pay for the extra logic.
122 const int kPatternTooShortForBoyerMoore = 2;
123
124
125 // Identifies the sort of regexps where the regexp engine is faster
126 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)127 static bool HasFewDifferentCharacters(Handle<String> pattern) {
128 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
129 if (length <= kPatternTooShortForBoyerMoore) return false;
130 const int kMod = 128;
131 bool character_found[kMod];
132 int different = 0;
133 memset(&character_found[0], 0, sizeof(character_found));
134 for (int i = 0; i < length; i++) {
135 int ch = (pattern->Get(i) & (kMod - 1));
136 if (!character_found[ch]) {
137 character_found[ch] = true;
138 different++;
139 // We declare a regexp low-alphabet if it has at least 3 times as many
140 // characters as it has different characters.
141 if (different * 3 > length) return false;
142 }
143 }
144 return true;
145 }
146
147
148 // Generic RegExp methods. Dispatches to implementation specific methods.
149
150
Compile(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> flag_str)151 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
152 Handle<String> pattern,
153 Handle<String> flag_str) {
154 Isolate* isolate = re->GetIsolate();
155 Zone zone(isolate);
156 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
157 CompilationCache* compilation_cache = isolate->compilation_cache();
158 MaybeHandle<FixedArray> maybe_cached =
159 compilation_cache->LookupRegExp(pattern, flags);
160 Handle<FixedArray> cached;
161 bool in_cache = maybe_cached.ToHandle(&cached);
162 LOG(isolate, RegExpCompileEvent(re, in_cache));
163
164 Handle<Object> result;
165 if (in_cache) {
166 re->set_data(*cached);
167 return re;
168 }
169 pattern = String::Flatten(pattern);
170 PostponeInterruptsScope postpone(isolate);
171 RegExpCompileData parse_result;
172 FlatStringReader reader(isolate, pattern);
173 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
174 &parse_result, &zone)) {
175 // Throw an exception if we fail to parse the pattern.
176 return ThrowRegExpException(re,
177 pattern,
178 parse_result.error,
179 "malformed_regexp");
180 }
181
182 bool has_been_compiled = false;
183
184 if (parse_result.simple &&
185 !flags.is_ignore_case() &&
186 !HasFewDifferentCharacters(pattern)) {
187 // Parse-tree is a single atom that is equal to the pattern.
188 AtomCompile(re, pattern, flags, pattern);
189 has_been_compiled = true;
190 } else if (parse_result.tree->IsAtom() &&
191 !flags.is_ignore_case() &&
192 parse_result.capture_count == 0) {
193 RegExpAtom* atom = parse_result.tree->AsAtom();
194 Vector<const uc16> atom_pattern = atom->data();
195 Handle<String> atom_string;
196 ASSIGN_RETURN_ON_EXCEPTION(
197 isolate, atom_string,
198 isolate->factory()->NewStringFromTwoByte(atom_pattern),
199 Object);
200 if (!HasFewDifferentCharacters(atom_string)) {
201 AtomCompile(re, pattern, flags, atom_string);
202 has_been_compiled = true;
203 }
204 }
205 if (!has_been_compiled) {
206 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
207 }
208 ASSERT(re->data()->IsFixedArray());
209 // Compilation succeeded so the data is set on the regexp
210 // and we can store it in the cache.
211 Handle<FixedArray> data(FixedArray::cast(re->data()));
212 compilation_cache->PutRegExp(pattern, flags, data);
213
214 return re;
215 }
216
217
Exec(Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<JSArray> last_match_info)218 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
219 Handle<String> subject,
220 int index,
221 Handle<JSArray> last_match_info) {
222 switch (regexp->TypeTag()) {
223 case JSRegExp::ATOM:
224 return AtomExec(regexp, subject, index, last_match_info);
225 case JSRegExp::IRREGEXP: {
226 return IrregexpExec(regexp, subject, index, last_match_info);
227 }
228 default:
229 UNREACHABLE();
230 return MaybeHandle<Object>();
231 }
232 }
233
234
235 // RegExp Atom implementation: Simple string search using indexOf.
236
237
AtomCompile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)238 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
239 Handle<String> pattern,
240 JSRegExp::Flags flags,
241 Handle<String> match_pattern) {
242 re->GetIsolate()->factory()->SetRegExpAtomData(re,
243 JSRegExp::ATOM,
244 pattern,
245 flags,
246 match_pattern);
247 }
248
249
SetAtomLastCapture(FixedArray * array,String * subject,int from,int to)250 static void SetAtomLastCapture(FixedArray* array,
251 String* subject,
252 int from,
253 int to) {
254 SealHandleScope shs(array->GetIsolate());
255 RegExpImpl::SetLastCaptureCount(array, 2);
256 RegExpImpl::SetLastSubject(array, subject);
257 RegExpImpl::SetLastInput(array, subject);
258 RegExpImpl::SetCapture(array, 0, from);
259 RegExpImpl::SetCapture(array, 1, to);
260 }
261
262
AtomExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)263 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
264 Handle<String> subject,
265 int index,
266 int32_t* output,
267 int output_size) {
268 Isolate* isolate = regexp->GetIsolate();
269
270 ASSERT(0 <= index);
271 ASSERT(index <= subject->length());
272
273 subject = String::Flatten(subject);
274 DisallowHeapAllocation no_gc; // ensure vectors stay valid
275
276 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
277 int needle_len = needle->length();
278 ASSERT(needle->IsFlat());
279 ASSERT_LT(0, needle_len);
280
281 if (index + needle_len > subject->length()) {
282 return RegExpImpl::RE_FAILURE;
283 }
284
285 for (int i = 0; i < output_size; i += 2) {
286 String::FlatContent needle_content = needle->GetFlatContent();
287 String::FlatContent subject_content = subject->GetFlatContent();
288 ASSERT(needle_content.IsFlat());
289 ASSERT(subject_content.IsFlat());
290 // dispatch on type of strings
291 index = (needle_content.IsAscii()
292 ? (subject_content.IsAscii()
293 ? SearchString(isolate,
294 subject_content.ToOneByteVector(),
295 needle_content.ToOneByteVector(),
296 index)
297 : SearchString(isolate,
298 subject_content.ToUC16Vector(),
299 needle_content.ToOneByteVector(),
300 index))
301 : (subject_content.IsAscii()
302 ? SearchString(isolate,
303 subject_content.ToOneByteVector(),
304 needle_content.ToUC16Vector(),
305 index)
306 : SearchString(isolate,
307 subject_content.ToUC16Vector(),
308 needle_content.ToUC16Vector(),
309 index)));
310 if (index == -1) {
311 return i / 2; // Return number of matches.
312 } else {
313 output[i] = index;
314 output[i+1] = index + needle_len;
315 index += needle_len;
316 }
317 }
318 return output_size / 2;
319 }
320
321
AtomExec(Handle<JSRegExp> re,Handle<String> subject,int index,Handle<JSArray> last_match_info)322 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
323 Handle<String> subject,
324 int index,
325 Handle<JSArray> last_match_info) {
326 Isolate* isolate = re->GetIsolate();
327
328 static const int kNumRegisters = 2;
329 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
330 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
331
332 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
333
334 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
335
336 ASSERT_EQ(res, RegExpImpl::RE_SUCCESS);
337 SealHandleScope shs(isolate);
338 FixedArray* array = FixedArray::cast(last_match_info->elements());
339 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
340 return last_match_info;
341 }
342
343
344 // Irregexp implementation.
345
346 // Ensures that the regexp object contains a compiled version of the
347 // source for either ASCII or non-ASCII strings.
348 // If the compiled version doesn't already exist, it is compiled
349 // from the source pattern.
350 // If compilation fails, an exception is thrown and this function
351 // returns false.
EnsureCompiledIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_ascii)352 bool RegExpImpl::EnsureCompiledIrregexp(
353 Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii) {
354 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_ascii));
355 #ifdef V8_INTERPRETED_REGEXP
356 if (compiled_code->IsByteArray()) return true;
357 #else // V8_INTERPRETED_REGEXP (RegExp native code)
358 if (compiled_code->IsCode()) return true;
359 #endif
360 // We could potentially have marked this as flushable, but have kept
361 // a saved version if we did not flush it yet.
362 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_ascii));
363 if (saved_code->IsCode()) {
364 // Reinstate the code in the original place.
365 re->SetDataAt(JSRegExp::code_index(is_ascii), saved_code);
366 ASSERT(compiled_code->IsSmi());
367 return true;
368 }
369 return CompileIrregexp(re, sample_subject, is_ascii);
370 }
371
372
CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,bool is_ascii,Handle<String> error_message,Isolate * isolate)373 static bool CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
374 bool is_ascii,
375 Handle<String> error_message,
376 Isolate* isolate) {
377 Factory* factory = isolate->factory();
378 Handle<FixedArray> elements = factory->NewFixedArray(2);
379 elements->set(0, re->Pattern());
380 elements->set(1, *error_message);
381 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
382 Handle<Object> regexp_err =
383 factory->NewSyntaxError("malformed_regexp", array);
384 isolate->Throw(*regexp_err);
385 return false;
386 }
387
388
CompileIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_ascii)389 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
390 Handle<String> sample_subject,
391 bool is_ascii) {
392 // Compile the RegExp.
393 Isolate* isolate = re->GetIsolate();
394 Zone zone(isolate);
395 PostponeInterruptsScope postpone(isolate);
396 // If we had a compilation error the last time this is saved at the
397 // saved code index.
398 Object* entry = re->DataAt(JSRegExp::code_index(is_ascii));
399 // When arriving here entry can only be a smi, either representing an
400 // uncompiled regexp, a previous compilation error, or code that has
401 // been flushed.
402 ASSERT(entry->IsSmi());
403 int entry_value = Smi::cast(entry)->value();
404 ASSERT(entry_value == JSRegExp::kUninitializedValue ||
405 entry_value == JSRegExp::kCompilationErrorValue ||
406 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
407
408 if (entry_value == JSRegExp::kCompilationErrorValue) {
409 // A previous compilation failed and threw an error which we store in
410 // the saved code index (we store the error message, not the actual
411 // error). Recreate the error object and throw it.
412 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_ascii));
413 ASSERT(error_string->IsString());
414 Handle<String> error_message(String::cast(error_string));
415 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
416 return false;
417 }
418
419 JSRegExp::Flags flags = re->GetFlags();
420
421 Handle<String> pattern(re->Pattern());
422 pattern = String::Flatten(pattern);
423 RegExpCompileData compile_data;
424 FlatStringReader reader(isolate, pattern);
425 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
426 &compile_data,
427 &zone)) {
428 // Throw an exception if we fail to parse the pattern.
429 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
430 USE(ThrowRegExpException(re,
431 pattern,
432 compile_data.error,
433 "malformed_regexp"));
434 return false;
435 }
436 RegExpEngine::CompilationResult result =
437 RegExpEngine::Compile(&compile_data,
438 flags.is_ignore_case(),
439 flags.is_global(),
440 flags.is_multiline(),
441 pattern,
442 sample_subject,
443 is_ascii,
444 &zone);
445 if (result.error_message != NULL) {
446 // Unable to compile regexp.
447 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
448 CStrVector(result.error_message)).ToHandleChecked();
449 CreateRegExpErrorObjectAndThrow(re, is_ascii, error_message, isolate);
450 return false;
451 }
452
453 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
454 data->set(JSRegExp::code_index(is_ascii), result.code);
455 int register_max = IrregexpMaxRegisterCount(*data);
456 if (result.num_registers > register_max) {
457 SetIrregexpMaxRegisterCount(*data, result.num_registers);
458 }
459
460 return true;
461 }
462
463
IrregexpMaxRegisterCount(FixedArray * re)464 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
465 return Smi::cast(
466 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
467 }
468
469
SetIrregexpMaxRegisterCount(FixedArray * re,int value)470 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
471 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
472 }
473
474
IrregexpNumberOfCaptures(FixedArray * re)475 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
476 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
477 }
478
479
IrregexpNumberOfRegisters(FixedArray * re)480 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
481 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
482 }
483
484
IrregexpByteCode(FixedArray * re,bool is_ascii)485 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_ascii) {
486 return ByteArray::cast(re->get(JSRegExp::code_index(is_ascii)));
487 }
488
489
IrregexpNativeCode(FixedArray * re,bool is_ascii)490 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_ascii) {
491 return Code::cast(re->get(JSRegExp::code_index(is_ascii)));
492 }
493
494
IrregexpInitialize(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count)495 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
496 Handle<String> pattern,
497 JSRegExp::Flags flags,
498 int capture_count) {
499 // Initialize compiled code entries to null.
500 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
501 JSRegExp::IRREGEXP,
502 pattern,
503 flags,
504 capture_count);
505 }
506
507
IrregexpPrepare(Handle<JSRegExp> regexp,Handle<String> subject)508 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
509 Handle<String> subject) {
510 subject = String::Flatten(subject);
511
512 // Check the asciiness of the underlying storage.
513 bool is_ascii = subject->IsOneByteRepresentationUnderneath();
514 if (!EnsureCompiledIrregexp(regexp, subject, is_ascii)) return -1;
515
516 #ifdef V8_INTERPRETED_REGEXP
517 // Byte-code regexp needs space allocated for all its registers.
518 // The result captures are copied to the start of the registers array
519 // if the match succeeds. This way those registers are not clobbered
520 // when we set the last match info from last successful match.
521 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
522 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
523 #else // V8_INTERPRETED_REGEXP
524 // Native regexp only needs room to output captures. Registers are handled
525 // internally.
526 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
527 #endif // V8_INTERPRETED_REGEXP
528 }
529
530
IrregexpExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)531 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
532 Handle<String> subject,
533 int index,
534 int32_t* output,
535 int output_size) {
536 Isolate* isolate = regexp->GetIsolate();
537
538 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
539
540 ASSERT(index >= 0);
541 ASSERT(index <= subject->length());
542 ASSERT(subject->IsFlat());
543
544 bool is_ascii = subject->IsOneByteRepresentationUnderneath();
545
546 #ifndef V8_INTERPRETED_REGEXP
547 ASSERT(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
548 do {
549 EnsureCompiledIrregexp(regexp, subject, is_ascii);
550 Handle<Code> code(IrregexpNativeCode(*irregexp, is_ascii), isolate);
551 // The stack is used to allocate registers for the compiled regexp code.
552 // This means that in case of failure, the output registers array is left
553 // untouched and contains the capture results from the previous successful
554 // match. We can use that to set the last match info lazily.
555 NativeRegExpMacroAssembler::Result res =
556 NativeRegExpMacroAssembler::Match(code,
557 subject,
558 output,
559 output_size,
560 index,
561 isolate);
562 if (res != NativeRegExpMacroAssembler::RETRY) {
563 ASSERT(res != NativeRegExpMacroAssembler::EXCEPTION ||
564 isolate->has_pending_exception());
565 STATIC_ASSERT(
566 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
567 STATIC_ASSERT(
568 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
569 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
570 == RE_EXCEPTION);
571 return static_cast<IrregexpResult>(res);
572 }
573 // If result is RETRY, the string has changed representation, and we
574 // must restart from scratch.
575 // In this case, it means we must make sure we are prepared to handle
576 // the, potentially, different subject (the string can switch between
577 // being internal and external, and even between being ASCII and UC16,
578 // but the characters are always the same).
579 IrregexpPrepare(regexp, subject);
580 is_ascii = subject->IsOneByteRepresentationUnderneath();
581 } while (true);
582 UNREACHABLE();
583 return RE_EXCEPTION;
584 #else // V8_INTERPRETED_REGEXP
585
586 ASSERT(output_size >= IrregexpNumberOfRegisters(*irregexp));
587 // We must have done EnsureCompiledIrregexp, so we can get the number of
588 // registers.
589 int number_of_capture_registers =
590 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
591 int32_t* raw_output = &output[number_of_capture_registers];
592 // We do not touch the actual capture result registers until we know there
593 // has been a match so that we can use those capture results to set the
594 // last match info.
595 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
596 raw_output[i] = -1;
597 }
598 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_ascii), isolate);
599
600 IrregexpResult result = IrregexpInterpreter::Match(isolate,
601 byte_codes,
602 subject,
603 raw_output,
604 index);
605 if (result == RE_SUCCESS) {
606 // Copy capture results to the start of the registers array.
607 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
608 }
609 if (result == RE_EXCEPTION) {
610 ASSERT(!isolate->has_pending_exception());
611 isolate->StackOverflow();
612 }
613 return result;
614 #endif // V8_INTERPRETED_REGEXP
615 }
616
617
IrregexpExec(Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<JSArray> last_match_info)618 MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
619 Handle<String> subject,
620 int previous_index,
621 Handle<JSArray> last_match_info) {
622 Isolate* isolate = regexp->GetIsolate();
623 ASSERT_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
624
625 // Prepare space for the return values.
626 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
627 if (FLAG_trace_regexp_bytecodes) {
628 String* pattern = regexp->Pattern();
629 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
630 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
631 }
632 #endif
633 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
634 if (required_registers < 0) {
635 // Compiling failed with an exception.
636 ASSERT(isolate->has_pending_exception());
637 return MaybeHandle<Object>();
638 }
639
640 int32_t* output_registers = NULL;
641 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
642 output_registers = NewArray<int32_t>(required_registers);
643 }
644 SmartArrayPointer<int32_t> auto_release(output_registers);
645 if (output_registers == NULL) {
646 output_registers = isolate->jsregexp_static_offsets_vector();
647 }
648
649 int res = RegExpImpl::IrregexpExecRaw(
650 regexp, subject, previous_index, output_registers, required_registers);
651 if (res == RE_SUCCESS) {
652 int capture_count =
653 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
654 return SetLastMatchInfo(
655 last_match_info, subject, capture_count, output_registers);
656 }
657 if (res == RE_EXCEPTION) {
658 ASSERT(isolate->has_pending_exception());
659 return MaybeHandle<Object>();
660 }
661 ASSERT(res == RE_FAILURE);
662 return isolate->factory()->null_value();
663 }
664
665
SetLastMatchInfo(Handle<JSArray> last_match_info,Handle<String> subject,int capture_count,int32_t * match)666 Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
667 Handle<String> subject,
668 int capture_count,
669 int32_t* match) {
670 ASSERT(last_match_info->HasFastObjectElements());
671 int capture_register_count = (capture_count + 1) * 2;
672 JSArray::EnsureSize(last_match_info,
673 capture_register_count + kLastMatchOverhead);
674 DisallowHeapAllocation no_allocation;
675 FixedArray* array = FixedArray::cast(last_match_info->elements());
676 if (match != NULL) {
677 for (int i = 0; i < capture_register_count; i += 2) {
678 SetCapture(array, i, match[i]);
679 SetCapture(array, i + 1, match[i + 1]);
680 }
681 }
682 SetLastCaptureCount(array, capture_register_count);
683 SetLastSubject(array, *subject);
684 SetLastInput(array, *subject);
685 return last_match_info;
686 }
687
688
GlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,bool is_global,Isolate * isolate)689 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
690 Handle<String> subject,
691 bool is_global,
692 Isolate* isolate)
693 : register_array_(NULL),
694 register_array_size_(0),
695 regexp_(regexp),
696 subject_(subject) {
697 #ifdef V8_INTERPRETED_REGEXP
698 bool interpreted = true;
699 #else
700 bool interpreted = false;
701 #endif // V8_INTERPRETED_REGEXP
702
703 if (regexp_->TypeTag() == JSRegExp::ATOM) {
704 static const int kAtomRegistersPerMatch = 2;
705 registers_per_match_ = kAtomRegistersPerMatch;
706 // There is no distinction between interpreted and native for atom regexps.
707 interpreted = false;
708 } else {
709 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
710 if (registers_per_match_ < 0) {
711 num_matches_ = -1; // Signal exception.
712 return;
713 }
714 }
715
716 if (is_global && !interpreted) {
717 register_array_size_ =
718 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
719 max_matches_ = register_array_size_ / registers_per_match_;
720 } else {
721 // Global loop in interpreted regexp is not implemented. We choose
722 // the size of the offsets vector so that it can only store one match.
723 register_array_size_ = registers_per_match_;
724 max_matches_ = 1;
725 }
726
727 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
728 register_array_ = NewArray<int32_t>(register_array_size_);
729 } else {
730 register_array_ = isolate->jsregexp_static_offsets_vector();
731 }
732
733 // Set state so that fetching the results the first time triggers a call
734 // to the compiled regexp.
735 current_match_index_ = max_matches_ - 1;
736 num_matches_ = max_matches_;
737 ASSERT(registers_per_match_ >= 2); // Each match has at least one capture.
738 ASSERT_GE(register_array_size_, registers_per_match_);
739 int32_t* last_match =
740 ®ister_array_[current_match_index_ * registers_per_match_];
741 last_match[0] = -1;
742 last_match[1] = 0;
743 }
744
745
746 // -------------------------------------------------------------------
747 // Implementation of the Irregexp regular expression engine.
748 //
749 // The Irregexp regular expression engine is intended to be a complete
750 // implementation of ECMAScript regular expressions. It generates either
751 // bytecodes or native code.
752
753 // The Irregexp regexp engine is structured in three steps.
754 // 1) The parser generates an abstract syntax tree. See ast.cc.
755 // 2) From the AST a node network is created. The nodes are all
756 // subclasses of RegExpNode. The nodes represent states when
757 // executing a regular expression. Several optimizations are
758 // performed on the node network.
759 // 3) From the nodes we generate either byte codes or native code
760 // that can actually execute the regular expression (perform
761 // the search). The code generation step is described in more
762 // detail below.
763
764 // Code generation.
765 //
766 // The nodes are divided into four main categories.
767 // * Choice nodes
768 // These represent places where the regular expression can
769 // match in more than one way. For example on entry to an
770 // alternation (foo|bar) or a repetition (*, +, ? or {}).
771 // * Action nodes
772 // These represent places where some action should be
773 // performed. Examples include recording the current position
774 // in the input string to a register (in order to implement
775 // captures) or other actions on register for example in order
776 // to implement the counters needed for {} repetitions.
777 // * Matching nodes
778 // These attempt to match some element part of the input string.
779 // Examples of elements include character classes, plain strings
780 // or back references.
781 // * End nodes
782 // These are used to implement the actions required on finding
783 // a successful match or failing to find a match.
784 //
785 // The code generated (whether as byte codes or native code) maintains
786 // some state as it runs. This consists of the following elements:
787 //
788 // * The capture registers. Used for string captures.
789 // * Other registers. Used for counters etc.
790 // * The current position.
791 // * The stack of backtracking information. Used when a matching node
792 // fails to find a match and needs to try an alternative.
793 //
794 // Conceptual regular expression execution model:
795 //
796 // There is a simple conceptual model of regular expression execution
797 // which will be presented first. The actual code generated is a more
798 // efficient simulation of the simple conceptual model:
799 //
800 // * Choice nodes are implemented as follows:
801 // For each choice except the last {
802 // push current position
803 // push backtrack code location
804 // <generate code to test for choice>
805 // backtrack code location:
806 // pop current position
807 // }
808 // <generate code to test for last choice>
809 //
810 // * Actions nodes are generated as follows
811 // <push affected registers on backtrack stack>
812 // <generate code to perform action>
813 // push backtrack code location
814 // <generate code to test for following nodes>
815 // backtrack code location:
816 // <pop affected registers to restore their state>
817 // <pop backtrack location from stack and go to it>
818 //
819 // * Matching nodes are generated as follows:
820 // if input string matches at current position
821 // update current position
822 // <generate code to test for following nodes>
823 // else
824 // <pop backtrack location from stack and go to it>
825 //
826 // Thus it can be seen that the current position is saved and restored
827 // by the choice nodes, whereas the registers are saved and restored by
828 // by the action nodes that manipulate them.
829 //
830 // The other interesting aspect of this model is that nodes are generated
831 // at the point where they are needed by a recursive call to Emit(). If
832 // the node has already been code generated then the Emit() call will
833 // generate a jump to the previously generated code instead. In order to
834 // limit recursion it is possible for the Emit() function to put the node
835 // on a work list for later generation and instead generate a jump. The
836 // destination of the jump is resolved later when the code is generated.
837 //
838 // Actual regular expression code generation.
839 //
840 // Code generation is actually more complicated than the above. In order
841 // to improve the efficiency of the generated code some optimizations are
842 // performed
843 //
844 // * Choice nodes have 1-character lookahead.
845 // A choice node looks at the following character and eliminates some of
846 // the choices immediately based on that character. This is not yet
847 // implemented.
848 // * Simple greedy loops store reduced backtracking information.
849 // A quantifier like /.*foo/m will greedily match the whole input. It will
850 // then need to backtrack to a point where it can match "foo". The naive
851 // implementation of this would push each character position onto the
852 // backtracking stack, then pop them off one by one. This would use space
853 // proportional to the length of the input string. However since the "."
854 // can only match in one way and always has a constant length (in this case
855 // of 1) it suffices to store the current position on the top of the stack
856 // once. Matching now becomes merely incrementing the current position and
857 // backtracking becomes decrementing the current position and checking the
858 // result against the stored current position. This is faster and saves
859 // space.
860 // * The current state is virtualized.
861 // This is used to defer expensive operations until it is clear that they
862 // are needed and to generate code for a node more than once, allowing
863 // specialized an efficient versions of the code to be created. This is
864 // explained in the section below.
865 //
866 // Execution state virtualization.
867 //
868 // Instead of emitting code, nodes that manipulate the state can record their
869 // manipulation in an object called the Trace. The Trace object can record a
870 // current position offset, an optional backtrack code location on the top of
871 // the virtualized backtrack stack and some register changes. When a node is
872 // to be emitted it can flush the Trace or update it. Flushing the Trace
873 // will emit code to bring the actual state into line with the virtual state.
874 // Avoiding flushing the state can postpone some work (e.g. updates of capture
875 // registers). Postponing work can save time when executing the regular
876 // expression since it may be found that the work never has to be done as a
877 // failure to match can occur. In addition it is much faster to jump to a
878 // known backtrack code location than it is to pop an unknown backtrack
879 // location from the stack and jump there.
880 //
881 // The virtual state found in the Trace affects code generation. For example
882 // the virtual state contains the difference between the actual current
883 // position and the virtual current position, and matching code needs to use
884 // this offset to attempt a match in the correct location of the input
885 // string. Therefore code generated for a non-trivial trace is specialized
886 // to that trace. The code generator therefore has the ability to generate
887 // code for each node several times. In order to limit the size of the
888 // generated code there is an arbitrary limit on how many specialized sets of
889 // code may be generated for a given node. If the limit is reached, the
890 // trace is flushed and a generic version of the code for a node is emitted.
891 // This is subsequently used for that node. The code emitted for non-generic
892 // trace is not recorded in the node and so it cannot currently be reused in
893 // the event that code generation is requested for an identical trace.
894
895
AppendToText(RegExpText * text,Zone * zone)896 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
897 UNREACHABLE();
898 }
899
900
AppendToText(RegExpText * text,Zone * zone)901 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
902 text->AddElement(TextElement::Atom(this), zone);
903 }
904
905
AppendToText(RegExpText * text,Zone * zone)906 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
907 text->AddElement(TextElement::CharClass(this), zone);
908 }
909
910
AppendToText(RegExpText * text,Zone * zone)911 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
912 for (int i = 0; i < elements()->length(); i++)
913 text->AddElement(elements()->at(i), zone);
914 }
915
916
Atom(RegExpAtom * atom)917 TextElement TextElement::Atom(RegExpAtom* atom) {
918 return TextElement(ATOM, atom);
919 }
920
921
CharClass(RegExpCharacterClass * char_class)922 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
923 return TextElement(CHAR_CLASS, char_class);
924 }
925
926
length() const927 int TextElement::length() const {
928 switch (text_type()) {
929 case ATOM:
930 return atom()->length();
931
932 case CHAR_CLASS:
933 return 1;
934 }
935 UNREACHABLE();
936 return 0;
937 }
938
939
GetTable(bool ignore_case)940 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
941 if (table_ == NULL) {
942 table_ = new(zone()) DispatchTable(zone());
943 DispatchTableConstructor cons(table_, ignore_case, zone());
944 cons.BuildTable(this);
945 }
946 return table_;
947 }
948
949
950 class FrequencyCollator {
951 public:
FrequencyCollator()952 FrequencyCollator() : total_samples_(0) {
953 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
954 frequencies_[i] = CharacterFrequency(i);
955 }
956 }
957
CountCharacter(int character)958 void CountCharacter(int character) {
959 int index = (character & RegExpMacroAssembler::kTableMask);
960 frequencies_[index].Increment();
961 total_samples_++;
962 }
963
964 // Does not measure in percent, but rather per-128 (the table size from the
965 // regexp macro assembler).
Frequency(int in_character)966 int Frequency(int in_character) {
967 ASSERT((in_character & RegExpMacroAssembler::kTableMask) == in_character);
968 if (total_samples_ < 1) return 1; // Division by zero.
969 int freq_in_per128 =
970 (frequencies_[in_character].counter() * 128) / total_samples_;
971 return freq_in_per128;
972 }
973
974 private:
975 class CharacterFrequency {
976 public:
CharacterFrequency()977 CharacterFrequency() : counter_(0), character_(-1) { }
CharacterFrequency(int character)978 explicit CharacterFrequency(int character)
979 : counter_(0), character_(character) { }
980
Increment()981 void Increment() { counter_++; }
counter()982 int counter() { return counter_; }
character()983 int character() { return character_; }
984
985 private:
986 int counter_;
987 int character_;
988 };
989
990
991 private:
992 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
993 int total_samples_;
994 };
995
996
997 class RegExpCompiler {
998 public:
999 RegExpCompiler(int capture_count, bool ignore_case, bool is_ascii,
1000 Zone* zone);
1001
AllocateRegister()1002 int AllocateRegister() {
1003 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1004 reg_exp_too_big_ = true;
1005 return next_register_;
1006 }
1007 return next_register_++;
1008 }
1009
1010 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
1011 RegExpNode* start,
1012 int capture_count,
1013 Handle<String> pattern);
1014
AddWork(RegExpNode * node)1015 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1016
1017 static const int kImplementationOffset = 0;
1018 static const int kNumberOfRegistersOffset = 0;
1019 static const int kCodeOffset = 1;
1020
macro_assembler()1021 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()1022 EndNode* accept() { return accept_; }
1023
1024 static const int kMaxRecursion = 100;
recursion_depth()1025 inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()1026 inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()1027 inline void DecrementRecursionDepth() { recursion_depth_--; }
1028
SetRegExpTooBig()1029 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1030
ignore_case()1031 inline bool ignore_case() { return ignore_case_; }
ascii()1032 inline bool ascii() { return ascii_; }
frequency_collator()1033 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1034
current_expansion_factor()1035 int current_expansion_factor() { return current_expansion_factor_; }
set_current_expansion_factor(int value)1036 void set_current_expansion_factor(int value) {
1037 current_expansion_factor_ = value;
1038 }
1039
zone() const1040 Zone* zone() const { return zone_; }
1041
1042 static const int kNoRegister = -1;
1043
1044 private:
1045 EndNode* accept_;
1046 int next_register_;
1047 List<RegExpNode*>* work_list_;
1048 int recursion_depth_;
1049 RegExpMacroAssembler* macro_assembler_;
1050 bool ignore_case_;
1051 bool ascii_;
1052 bool reg_exp_too_big_;
1053 int current_expansion_factor_;
1054 FrequencyCollator frequency_collator_;
1055 Zone* zone_;
1056 };
1057
1058
1059 class RecursionCheck {
1060 public:
RecursionCheck(RegExpCompiler * compiler)1061 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1062 compiler->IncrementRecursionDepth();
1063 }
~RecursionCheck()1064 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1065 private:
1066 RegExpCompiler* compiler_;
1067 };
1068
1069
IrregexpRegExpTooBig(Isolate * isolate)1070 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1071 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1072 }
1073
1074
1075 // Attempts to compile the regexp using an Irregexp code generator. Returns
1076 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(int capture_count,bool ignore_case,bool ascii,Zone * zone)1077 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case, bool ascii,
1078 Zone* zone)
1079 : next_register_(2 * (capture_count + 1)),
1080 work_list_(NULL),
1081 recursion_depth_(0),
1082 ignore_case_(ignore_case),
1083 ascii_(ascii),
1084 reg_exp_too_big_(false),
1085 current_expansion_factor_(1),
1086 frequency_collator_(),
1087 zone_(zone) {
1088 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1089 ASSERT(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1090 }
1091
1092
Assemble(RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)1093 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1094 RegExpMacroAssembler* macro_assembler,
1095 RegExpNode* start,
1096 int capture_count,
1097 Handle<String> pattern) {
1098 Heap* heap = pattern->GetHeap();
1099
1100 bool use_slow_safe_regexp_compiler = false;
1101 if (heap->total_regexp_code_generated() >
1102 RegExpImpl::kRegWxpCompiledLimit &&
1103 heap->isolate()->memory_allocator()->SizeExecutable() >
1104 RegExpImpl::kRegExpExecutableMemoryLimit) {
1105 use_slow_safe_regexp_compiler = true;
1106 }
1107
1108 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
1109
1110 #ifdef DEBUG
1111 if (FLAG_trace_regexp_assembler)
1112 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1113 else
1114 #endif
1115 macro_assembler_ = macro_assembler;
1116
1117 List <RegExpNode*> work_list(0);
1118 work_list_ = &work_list;
1119 Label fail;
1120 macro_assembler_->PushBacktrack(&fail);
1121 Trace new_trace;
1122 start->Emit(this, &new_trace);
1123 macro_assembler_->Bind(&fail);
1124 macro_assembler_->Fail();
1125 while (!work_list.is_empty()) {
1126 work_list.RemoveLast()->Emit(this, &new_trace);
1127 }
1128 if (reg_exp_too_big_) return IrregexpRegExpTooBig(zone_->isolate());
1129
1130 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1131 heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1132 work_list_ = NULL;
1133 #ifdef DEBUG
1134 if (FLAG_print_code) {
1135 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1136 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(),
1137 trace_scope.file());
1138 }
1139 if (FLAG_trace_regexp_assembler) {
1140 delete macro_assembler_;
1141 }
1142 #endif
1143 return RegExpEngine::CompilationResult(*code, next_register_);
1144 }
1145
1146
Mentions(int that)1147 bool Trace::DeferredAction::Mentions(int that) {
1148 if (action_type() == ActionNode::CLEAR_CAPTURES) {
1149 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1150 return range.Contains(that);
1151 } else {
1152 return reg() == that;
1153 }
1154 }
1155
1156
mentions_reg(int reg)1157 bool Trace::mentions_reg(int reg) {
1158 for (DeferredAction* action = actions_;
1159 action != NULL;
1160 action = action->next()) {
1161 if (action->Mentions(reg))
1162 return true;
1163 }
1164 return false;
1165 }
1166
1167
GetStoredPosition(int reg,int * cp_offset)1168 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1169 ASSERT_EQ(0, *cp_offset);
1170 for (DeferredAction* action = actions_;
1171 action != NULL;
1172 action = action->next()) {
1173 if (action->Mentions(reg)) {
1174 if (action->action_type() == ActionNode::STORE_POSITION) {
1175 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1176 return true;
1177 } else {
1178 return false;
1179 }
1180 }
1181 }
1182 return false;
1183 }
1184
1185
FindAffectedRegisters(OutSet * affected_registers,Zone * zone)1186 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1187 Zone* zone) {
1188 int max_register = RegExpCompiler::kNoRegister;
1189 for (DeferredAction* action = actions_;
1190 action != NULL;
1191 action = action->next()) {
1192 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1193 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1194 for (int i = range.from(); i <= range.to(); i++)
1195 affected_registers->Set(i, zone);
1196 if (range.to() > max_register) max_register = range.to();
1197 } else {
1198 affected_registers->Set(action->reg(), zone);
1199 if (action->reg() > max_register) max_register = action->reg();
1200 }
1201 }
1202 return max_register;
1203 }
1204
1205
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const OutSet & registers_to_pop,const OutSet & registers_to_clear)1206 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1207 int max_register,
1208 const OutSet& registers_to_pop,
1209 const OutSet& registers_to_clear) {
1210 for (int reg = max_register; reg >= 0; reg--) {
1211 if (registers_to_pop.Get(reg)) {
1212 assembler->PopRegister(reg);
1213 } else if (registers_to_clear.Get(reg)) {
1214 int clear_to = reg;
1215 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1216 reg--;
1217 }
1218 assembler->ClearRegisters(reg, clear_to);
1219 }
1220 }
1221 }
1222
1223
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const OutSet & affected_registers,OutSet * registers_to_pop,OutSet * registers_to_clear,Zone * zone)1224 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1225 int max_register,
1226 const OutSet& affected_registers,
1227 OutSet* registers_to_pop,
1228 OutSet* registers_to_clear,
1229 Zone* zone) {
1230 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1231 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1232
1233 // Count pushes performed to force a stack limit check occasionally.
1234 int pushes = 0;
1235
1236 for (int reg = 0; reg <= max_register; reg++) {
1237 if (!affected_registers.Get(reg)) {
1238 continue;
1239 }
1240
1241 // The chronologically first deferred action in the trace
1242 // is used to infer the action needed to restore a register
1243 // to its previous state (or not, if it's safe to ignore it).
1244 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1245 DeferredActionUndoType undo_action = IGNORE;
1246
1247 int value = 0;
1248 bool absolute = false;
1249 bool clear = false;
1250 int store_position = -1;
1251 // This is a little tricky because we are scanning the actions in reverse
1252 // historical order (newest first).
1253 for (DeferredAction* action = actions_;
1254 action != NULL;
1255 action = action->next()) {
1256 if (action->Mentions(reg)) {
1257 switch (action->action_type()) {
1258 case ActionNode::SET_REGISTER: {
1259 Trace::DeferredSetRegister* psr =
1260 static_cast<Trace::DeferredSetRegister*>(action);
1261 if (!absolute) {
1262 value += psr->value();
1263 absolute = true;
1264 }
1265 // SET_REGISTER is currently only used for newly introduced loop
1266 // counters. They can have a significant previous value if they
1267 // occour in a loop. TODO(lrn): Propagate this information, so
1268 // we can set undo_action to IGNORE if we know there is no value to
1269 // restore.
1270 undo_action = RESTORE;
1271 ASSERT_EQ(store_position, -1);
1272 ASSERT(!clear);
1273 break;
1274 }
1275 case ActionNode::INCREMENT_REGISTER:
1276 if (!absolute) {
1277 value++;
1278 }
1279 ASSERT_EQ(store_position, -1);
1280 ASSERT(!clear);
1281 undo_action = RESTORE;
1282 break;
1283 case ActionNode::STORE_POSITION: {
1284 Trace::DeferredCapture* pc =
1285 static_cast<Trace::DeferredCapture*>(action);
1286 if (!clear && store_position == -1) {
1287 store_position = pc->cp_offset();
1288 }
1289
1290 // For captures we know that stores and clears alternate.
1291 // Other register, are never cleared, and if the occur
1292 // inside a loop, they might be assigned more than once.
1293 if (reg <= 1) {
1294 // Registers zero and one, aka "capture zero", is
1295 // always set correctly if we succeed. There is no
1296 // need to undo a setting on backtrack, because we
1297 // will set it again or fail.
1298 undo_action = IGNORE;
1299 } else {
1300 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1301 }
1302 ASSERT(!absolute);
1303 ASSERT_EQ(value, 0);
1304 break;
1305 }
1306 case ActionNode::CLEAR_CAPTURES: {
1307 // Since we're scanning in reverse order, if we've already
1308 // set the position we have to ignore historically earlier
1309 // clearing operations.
1310 if (store_position == -1) {
1311 clear = true;
1312 }
1313 undo_action = RESTORE;
1314 ASSERT(!absolute);
1315 ASSERT_EQ(value, 0);
1316 break;
1317 }
1318 default:
1319 UNREACHABLE();
1320 break;
1321 }
1322 }
1323 }
1324 // Prepare for the undo-action (e.g., push if it's going to be popped).
1325 if (undo_action == RESTORE) {
1326 pushes++;
1327 RegExpMacroAssembler::StackCheckFlag stack_check =
1328 RegExpMacroAssembler::kNoStackLimitCheck;
1329 if (pushes == push_limit) {
1330 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1331 pushes = 0;
1332 }
1333
1334 assembler->PushRegister(reg, stack_check);
1335 registers_to_pop->Set(reg, zone);
1336 } else if (undo_action == CLEAR) {
1337 registers_to_clear->Set(reg, zone);
1338 }
1339 // Perform the chronologically last action (or accumulated increment)
1340 // for the register.
1341 if (store_position != -1) {
1342 assembler->WriteCurrentPositionToRegister(reg, store_position);
1343 } else if (clear) {
1344 assembler->ClearRegisters(reg, reg);
1345 } else if (absolute) {
1346 assembler->SetRegister(reg, value);
1347 } else if (value != 0) {
1348 assembler->AdvanceRegister(reg, value);
1349 }
1350 }
1351 }
1352
1353
1354 // This is called as we come into a loop choice node and some other tricky
1355 // nodes. It normalizes the state of the code generator to ensure we can
1356 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)1357 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1358 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1359
1360 ASSERT(!is_trivial());
1361
1362 if (actions_ == NULL && backtrack() == NULL) {
1363 // Here we just have some deferred cp advances to fix and we are back to
1364 // a normal situation. We may also have to forget some information gained
1365 // through a quick check that was already performed.
1366 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1367 // Create a new trivial state and generate the node with that.
1368 Trace new_state;
1369 successor->Emit(compiler, &new_state);
1370 return;
1371 }
1372
1373 // Generate deferred actions here along with code to undo them again.
1374 OutSet affected_registers;
1375
1376 if (backtrack() != NULL) {
1377 // Here we have a concrete backtrack location. These are set up by choice
1378 // nodes and so they indicate that we have a deferred save of the current
1379 // position which we may need to emit here.
1380 assembler->PushCurrentPosition();
1381 }
1382
1383 int max_register = FindAffectedRegisters(&affected_registers,
1384 compiler->zone());
1385 OutSet registers_to_pop;
1386 OutSet registers_to_clear;
1387 PerformDeferredActions(assembler,
1388 max_register,
1389 affected_registers,
1390 ®isters_to_pop,
1391 ®isters_to_clear,
1392 compiler->zone());
1393 if (cp_offset_ != 0) {
1394 assembler->AdvanceCurrentPosition(cp_offset_);
1395 }
1396
1397 // Create a new trivial state and generate the node with that.
1398 Label undo;
1399 assembler->PushBacktrack(&undo);
1400 Trace new_state;
1401 successor->Emit(compiler, &new_state);
1402
1403 // On backtrack we need to restore state.
1404 assembler->Bind(&undo);
1405 RestoreAffectedRegisters(assembler,
1406 max_register,
1407 registers_to_pop,
1408 registers_to_clear);
1409 if (backtrack() == NULL) {
1410 assembler->Backtrack();
1411 } else {
1412 assembler->PopCurrentPosition();
1413 assembler->GoTo(backtrack());
1414 }
1415 }
1416
1417
Emit(RegExpCompiler * compiler,Trace * trace)1418 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1419 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1420
1421 // Omit flushing the trace. We discard the entire stack frame anyway.
1422
1423 if (!label()->is_bound()) {
1424 // We are completely independent of the trace, since we ignore it,
1425 // so this code can be used as the generic version.
1426 assembler->Bind(label());
1427 }
1428
1429 // Throw away everything on the backtrack stack since the start
1430 // of the negative submatch and restore the character position.
1431 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1432 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1433 if (clear_capture_count_ > 0) {
1434 // Clear any captures that might have been performed during the success
1435 // of the body of the negative look-ahead.
1436 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1437 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1438 }
1439 // Now that we have unwound the stack we find at the top of the stack the
1440 // backtrack that the BeginSubmatch node got.
1441 assembler->Backtrack();
1442 }
1443
1444
Emit(RegExpCompiler * compiler,Trace * trace)1445 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1446 if (!trace->is_trivial()) {
1447 trace->Flush(compiler, this);
1448 return;
1449 }
1450 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1451 if (!label()->is_bound()) {
1452 assembler->Bind(label());
1453 }
1454 switch (action_) {
1455 case ACCEPT:
1456 assembler->Succeed();
1457 return;
1458 case BACKTRACK:
1459 assembler->GoTo(trace->backtrack());
1460 return;
1461 case NEGATIVE_SUBMATCH_SUCCESS:
1462 // This case is handled in a different virtual method.
1463 UNREACHABLE();
1464 }
1465 UNIMPLEMENTED();
1466 }
1467
1468
AddGuard(Guard * guard,Zone * zone)1469 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1470 if (guards_ == NULL)
1471 guards_ = new(zone) ZoneList<Guard*>(1, zone);
1472 guards_->Add(guard, zone);
1473 }
1474
1475
SetRegister(int reg,int val,RegExpNode * on_success)1476 ActionNode* ActionNode::SetRegister(int reg,
1477 int val,
1478 RegExpNode* on_success) {
1479 ActionNode* result =
1480 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1481 result->data_.u_store_register.reg = reg;
1482 result->data_.u_store_register.value = val;
1483 return result;
1484 }
1485
1486
IncrementRegister(int reg,RegExpNode * on_success)1487 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1488 ActionNode* result =
1489 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1490 result->data_.u_increment_register.reg = reg;
1491 return result;
1492 }
1493
1494
StorePosition(int reg,bool is_capture,RegExpNode * on_success)1495 ActionNode* ActionNode::StorePosition(int reg,
1496 bool is_capture,
1497 RegExpNode* on_success) {
1498 ActionNode* result =
1499 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1500 result->data_.u_position_register.reg = reg;
1501 result->data_.u_position_register.is_capture = is_capture;
1502 return result;
1503 }
1504
1505
ClearCaptures(Interval range,RegExpNode * on_success)1506 ActionNode* ActionNode::ClearCaptures(Interval range,
1507 RegExpNode* on_success) {
1508 ActionNode* result =
1509 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1510 result->data_.u_clear_captures.range_from = range.from();
1511 result->data_.u_clear_captures.range_to = range.to();
1512 return result;
1513 }
1514
1515
BeginSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)1516 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1517 int position_reg,
1518 RegExpNode* on_success) {
1519 ActionNode* result =
1520 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1521 result->data_.u_submatch.stack_pointer_register = stack_reg;
1522 result->data_.u_submatch.current_position_register = position_reg;
1523 return result;
1524 }
1525
1526
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)1527 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1528 int position_reg,
1529 int clear_register_count,
1530 int clear_register_from,
1531 RegExpNode* on_success) {
1532 ActionNode* result =
1533 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1534 result->data_.u_submatch.stack_pointer_register = stack_reg;
1535 result->data_.u_submatch.current_position_register = position_reg;
1536 result->data_.u_submatch.clear_register_count = clear_register_count;
1537 result->data_.u_submatch.clear_register_from = clear_register_from;
1538 return result;
1539 }
1540
1541
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)1542 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1543 int repetition_register,
1544 int repetition_limit,
1545 RegExpNode* on_success) {
1546 ActionNode* result =
1547 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1548 result->data_.u_empty_match_check.start_register = start_register;
1549 result->data_.u_empty_match_check.repetition_register = repetition_register;
1550 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1551 return result;
1552 }
1553
1554
1555 #define DEFINE_ACCEPT(Type) \
1556 void Type##Node::Accept(NodeVisitor* visitor) { \
1557 visitor->Visit##Type(this); \
1558 }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)1559 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1560 #undef DEFINE_ACCEPT
1561
1562
1563 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1564 visitor->VisitLoopChoice(this);
1565 }
1566
1567
1568 // -------------------------------------------------------------------
1569 // Emit code.
1570
1571
GenerateGuard(RegExpMacroAssembler * macro_assembler,Guard * guard,Trace * trace)1572 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1573 Guard* guard,
1574 Trace* trace) {
1575 switch (guard->op()) {
1576 case Guard::LT:
1577 ASSERT(!trace->mentions_reg(guard->reg()));
1578 macro_assembler->IfRegisterGE(guard->reg(),
1579 guard->value(),
1580 trace->backtrack());
1581 break;
1582 case Guard::GEQ:
1583 ASSERT(!trace->mentions_reg(guard->reg()));
1584 macro_assembler->IfRegisterLT(guard->reg(),
1585 guard->value(),
1586 trace->backtrack());
1587 break;
1588 }
1589 }
1590
1591
1592 // Returns the number of characters in the equivalence class, omitting those
1593 // that cannot occur in the source string because it is ASCII.
GetCaseIndependentLetters(Isolate * isolate,uc16 character,bool ascii_subject,unibrow::uchar * letters)1594 static int GetCaseIndependentLetters(Isolate* isolate,
1595 uc16 character,
1596 bool ascii_subject,
1597 unibrow::uchar* letters) {
1598 int length =
1599 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1600 // Unibrow returns 0 or 1 for characters where case independence is
1601 // trivial.
1602 if (length == 0) {
1603 letters[0] = character;
1604 length = 1;
1605 }
1606 if (!ascii_subject || character <= String::kMaxOneByteCharCode) {
1607 return length;
1608 }
1609 // The standard requires that non-ASCII characters cannot have ASCII
1610 // character codes in their equivalence class.
1611 return 0;
1612 }
1613
1614
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1615 static inline bool EmitSimpleCharacter(Isolate* isolate,
1616 RegExpCompiler* compiler,
1617 uc16 c,
1618 Label* on_failure,
1619 int cp_offset,
1620 bool check,
1621 bool preloaded) {
1622 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1623 bool bound_checked = false;
1624 if (!preloaded) {
1625 assembler->LoadCurrentCharacter(
1626 cp_offset,
1627 on_failure,
1628 check);
1629 bound_checked = true;
1630 }
1631 assembler->CheckNotCharacter(c, on_failure);
1632 return bound_checked;
1633 }
1634
1635
1636 // Only emits non-letters (things that don't have case). Only used for case
1637 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1638 static inline bool EmitAtomNonLetter(Isolate* isolate,
1639 RegExpCompiler* compiler,
1640 uc16 c,
1641 Label* on_failure,
1642 int cp_offset,
1643 bool check,
1644 bool preloaded) {
1645 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1646 bool ascii = compiler->ascii();
1647 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1648 int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
1649 if (length < 1) {
1650 // This can't match. Must be an ASCII subject and a non-ASCII character.
1651 // We do not need to do anything since the ASCII pass already handled this.
1652 return false; // Bounds not checked.
1653 }
1654 bool checked = false;
1655 // We handle the length > 1 case in a later pass.
1656 if (length == 1) {
1657 if (ascii && c > String::kMaxOneByteCharCodeU) {
1658 // Can't match - see above.
1659 return false; // Bounds not checked.
1660 }
1661 if (!preloaded) {
1662 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1663 checked = check;
1664 }
1665 macro_assembler->CheckNotCharacter(c, on_failure);
1666 }
1667 return checked;
1668 }
1669
1670
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool ascii,uc16 c1,uc16 c2,Label * on_failure)1671 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1672 bool ascii,
1673 uc16 c1,
1674 uc16 c2,
1675 Label* on_failure) {
1676 uc16 char_mask;
1677 if (ascii) {
1678 char_mask = String::kMaxOneByteCharCode;
1679 } else {
1680 char_mask = String::kMaxUtf16CodeUnit;
1681 }
1682 uc16 exor = c1 ^ c2;
1683 // Check whether exor has only one bit set.
1684 if (((exor - 1) & exor) == 0) {
1685 // If c1 and c2 differ only by one bit.
1686 // Ecma262UnCanonicalize always gives the highest number last.
1687 ASSERT(c2 > c1);
1688 uc16 mask = char_mask ^ exor;
1689 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1690 return true;
1691 }
1692 ASSERT(c2 > c1);
1693 uc16 diff = c2 - c1;
1694 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1695 // If the characters differ by 2^n but don't differ by one bit then
1696 // subtract the difference from the found character, then do the or
1697 // trick. We avoid the theoretical case where negative numbers are
1698 // involved in order to simplify code generation.
1699 uc16 mask = char_mask ^ diff;
1700 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1701 diff,
1702 mask,
1703 on_failure);
1704 return true;
1705 }
1706 return false;
1707 }
1708
1709
1710 typedef bool EmitCharacterFunction(Isolate* isolate,
1711 RegExpCompiler* compiler,
1712 uc16 c,
1713 Label* on_failure,
1714 int cp_offset,
1715 bool check,
1716 bool preloaded);
1717
1718 // Only emits letters (things that have case). Only used for case independent
1719 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1720 static inline bool EmitAtomLetter(Isolate* isolate,
1721 RegExpCompiler* compiler,
1722 uc16 c,
1723 Label* on_failure,
1724 int cp_offset,
1725 bool check,
1726 bool preloaded) {
1727 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1728 bool ascii = compiler->ascii();
1729 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1730 int length = GetCaseIndependentLetters(isolate, c, ascii, chars);
1731 if (length <= 1) return false;
1732 // We may not need to check against the end of the input string
1733 // if this character lies before a character that matched.
1734 if (!preloaded) {
1735 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1736 }
1737 Label ok;
1738 ASSERT(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1739 switch (length) {
1740 case 2: {
1741 if (ShortCutEmitCharacterPair(macro_assembler,
1742 ascii,
1743 chars[0],
1744 chars[1],
1745 on_failure)) {
1746 } else {
1747 macro_assembler->CheckCharacter(chars[0], &ok);
1748 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1749 macro_assembler->Bind(&ok);
1750 }
1751 break;
1752 }
1753 case 4:
1754 macro_assembler->CheckCharacter(chars[3], &ok);
1755 // Fall through!
1756 case 3:
1757 macro_assembler->CheckCharacter(chars[0], &ok);
1758 macro_assembler->CheckCharacter(chars[1], &ok);
1759 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1760 macro_assembler->Bind(&ok);
1761 break;
1762 default:
1763 UNREACHABLE();
1764 break;
1765 }
1766 return true;
1767 }
1768
1769
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)1770 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1771 int border,
1772 Label* fall_through,
1773 Label* above_or_equal,
1774 Label* below) {
1775 if (below != fall_through) {
1776 masm->CheckCharacterLT(border, below);
1777 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1778 } else {
1779 masm->CheckCharacterGT(border - 1, above_or_equal);
1780 }
1781 }
1782
1783
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)1784 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1785 int first,
1786 int last,
1787 Label* fall_through,
1788 Label* in_range,
1789 Label* out_of_range) {
1790 if (in_range == fall_through) {
1791 if (first == last) {
1792 masm->CheckNotCharacter(first, out_of_range);
1793 } else {
1794 masm->CheckCharacterNotInRange(first, last, out_of_range);
1795 }
1796 } else {
1797 if (first == last) {
1798 masm->CheckCharacter(first, in_range);
1799 } else {
1800 masm->CheckCharacterInRange(first, last, in_range);
1801 }
1802 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1803 }
1804 }
1805
1806
1807 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1808 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int min_char,Label * fall_through,Label * even_label,Label * odd_label)1809 static void EmitUseLookupTable(
1810 RegExpMacroAssembler* masm,
1811 ZoneList<int>* ranges,
1812 int start_index,
1813 int end_index,
1814 int min_char,
1815 Label* fall_through,
1816 Label* even_label,
1817 Label* odd_label) {
1818 static const int kSize = RegExpMacroAssembler::kTableSize;
1819 static const int kMask = RegExpMacroAssembler::kTableMask;
1820
1821 int base = (min_char & ~kMask);
1822 USE(base);
1823
1824 // Assert that everything is on one kTableSize page.
1825 for (int i = start_index; i <= end_index; i++) {
1826 ASSERT_EQ(ranges->at(i) & ~kMask, base);
1827 }
1828 ASSERT(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1829
1830 char templ[kSize];
1831 Label* on_bit_set;
1832 Label* on_bit_clear;
1833 int bit;
1834 if (even_label == fall_through) {
1835 on_bit_set = odd_label;
1836 on_bit_clear = even_label;
1837 bit = 1;
1838 } else {
1839 on_bit_set = even_label;
1840 on_bit_clear = odd_label;
1841 bit = 0;
1842 }
1843 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1844 templ[i] = bit;
1845 }
1846 int j = 0;
1847 bit ^= 1;
1848 for (int i = start_index; i < end_index; i++) {
1849 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1850 templ[j] = bit;
1851 }
1852 bit ^= 1;
1853 }
1854 for (int i = j; i < kSize; i++) {
1855 templ[i] = bit;
1856 }
1857 Factory* factory = masm->zone()->isolate()->factory();
1858 // TODO(erikcorry): Cache these.
1859 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1860 for (int i = 0; i < kSize; i++) {
1861 ba->set(i, templ[i]);
1862 }
1863 masm->CheckBitInTable(ba, on_bit_set);
1864 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1865 }
1866
1867
CutOutRange(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int cut_index,Label * even_label,Label * odd_label)1868 static void CutOutRange(RegExpMacroAssembler* masm,
1869 ZoneList<int>* ranges,
1870 int start_index,
1871 int end_index,
1872 int cut_index,
1873 Label* even_label,
1874 Label* odd_label) {
1875 bool odd = (((cut_index - start_index) & 1) == 1);
1876 Label* in_range_label = odd ? odd_label : even_label;
1877 Label dummy;
1878 EmitDoubleBoundaryTest(masm,
1879 ranges->at(cut_index),
1880 ranges->at(cut_index + 1) - 1,
1881 &dummy,
1882 in_range_label,
1883 &dummy);
1884 ASSERT(!dummy.is_linked());
1885 // Cut out the single range by rewriting the array. This creates a new
1886 // range that is a merger of the two ranges on either side of the one we
1887 // are cutting out. The oddity of the labels is preserved.
1888 for (int j = cut_index; j > start_index; j--) {
1889 ranges->at(j) = ranges->at(j - 1);
1890 }
1891 for (int j = cut_index + 1; j < end_index; j++) {
1892 ranges->at(j) = ranges->at(j + 1);
1893 }
1894 }
1895
1896
1897 // Unicode case. Split the search space into kSize spaces that are handled
1898 // with recursion.
SplitSearchSpace(ZoneList<int> * ranges,int start_index,int end_index,int * new_start_index,int * new_end_index,int * border)1899 static void SplitSearchSpace(ZoneList<int>* ranges,
1900 int start_index,
1901 int end_index,
1902 int* new_start_index,
1903 int* new_end_index,
1904 int* border) {
1905 static const int kSize = RegExpMacroAssembler::kTableSize;
1906 static const int kMask = RegExpMacroAssembler::kTableMask;
1907
1908 int first = ranges->at(start_index);
1909 int last = ranges->at(end_index) - 1;
1910
1911 *new_start_index = start_index;
1912 *border = (ranges->at(start_index) & ~kMask) + kSize;
1913 while (*new_start_index < end_index) {
1914 if (ranges->at(*new_start_index) > *border) break;
1915 (*new_start_index)++;
1916 }
1917 // new_start_index is the index of the first edge that is beyond the
1918 // current kSize space.
1919
1920 // For very large search spaces we do a binary chop search of the non-ASCII
1921 // space instead of just going to the end of the current kSize space. The
1922 // heuristics are complicated a little by the fact that any 128-character
1923 // encoding space can be quickly tested with a table lookup, so we don't
1924 // wish to do binary chop search at a smaller granularity than that. A
1925 // 128-character space can take up a lot of space in the ranges array if,
1926 // for example, we only want to match every second character (eg. the lower
1927 // case characters on some Unicode pages).
1928 int binary_chop_index = (end_index + start_index) / 2;
1929 // The first test ensures that we get to the code that handles the ASCII
1930 // range with a single not-taken branch, speeding up this important
1931 // character range (even non-ASCII charset-based text has spaces and
1932 // punctuation).
1933 if (*border - 1 > String::kMaxOneByteCharCode && // ASCII case.
1934 end_index - start_index > (*new_start_index - start_index) * 2 &&
1935 last - first > kSize * 2 &&
1936 binary_chop_index > *new_start_index &&
1937 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1938 int scan_forward_for_section_border = binary_chop_index;;
1939 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1940
1941 while (scan_forward_for_section_border < end_index) {
1942 if (ranges->at(scan_forward_for_section_border) > new_border) {
1943 *new_start_index = scan_forward_for_section_border;
1944 *border = new_border;
1945 break;
1946 }
1947 scan_forward_for_section_border++;
1948 }
1949 }
1950
1951 ASSERT(*new_start_index > start_index);
1952 *new_end_index = *new_start_index - 1;
1953 if (ranges->at(*new_end_index) == *border) {
1954 (*new_end_index)--;
1955 }
1956 if (*border >= ranges->at(end_index)) {
1957 *border = ranges->at(end_index);
1958 *new_start_index = end_index; // Won't be used.
1959 *new_end_index = end_index - 1;
1960 }
1961 }
1962
1963
1964 // Gets a series of segment boundaries representing a character class. If the
1965 // character is in the range between an even and an odd boundary (counting from
1966 // start_index) then go to even_label, otherwise go to odd_label. We already
1967 // know that the character is in the range of min_char to max_char inclusive.
1968 // Either label can be NULL indicating backtracking. Either label can also be
1969 // equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,uc16 min_char,uc16 max_char,Label * fall_through,Label * even_label,Label * odd_label)1970 static void GenerateBranches(RegExpMacroAssembler* masm,
1971 ZoneList<int>* ranges,
1972 int start_index,
1973 int end_index,
1974 uc16 min_char,
1975 uc16 max_char,
1976 Label* fall_through,
1977 Label* even_label,
1978 Label* odd_label) {
1979 int first = ranges->at(start_index);
1980 int last = ranges->at(end_index) - 1;
1981
1982 ASSERT_LT(min_char, first);
1983
1984 // Just need to test if the character is before or on-or-after
1985 // a particular character.
1986 if (start_index == end_index) {
1987 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1988 return;
1989 }
1990
1991 // Another almost trivial case: There is one interval in the middle that is
1992 // different from the end intervals.
1993 if (start_index + 1 == end_index) {
1994 EmitDoubleBoundaryTest(
1995 masm, first, last, fall_through, even_label, odd_label);
1996 return;
1997 }
1998
1999 // It's not worth using table lookup if there are very few intervals in the
2000 // character class.
2001 if (end_index - start_index <= 6) {
2002 // It is faster to test for individual characters, so we look for those
2003 // first, then try arbitrary ranges in the second round.
2004 static int kNoCutIndex = -1;
2005 int cut = kNoCutIndex;
2006 for (int i = start_index; i < end_index; i++) {
2007 if (ranges->at(i) == ranges->at(i + 1) - 1) {
2008 cut = i;
2009 break;
2010 }
2011 }
2012 if (cut == kNoCutIndex) cut = start_index;
2013 CutOutRange(
2014 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2015 ASSERT_GE(end_index - start_index, 2);
2016 GenerateBranches(masm,
2017 ranges,
2018 start_index + 1,
2019 end_index - 1,
2020 min_char,
2021 max_char,
2022 fall_through,
2023 even_label,
2024 odd_label);
2025 return;
2026 }
2027
2028 // If there are a lot of intervals in the regexp, then we will use tables to
2029 // determine whether the character is inside or outside the character class.
2030 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2031
2032 if ((max_char >> kBits) == (min_char >> kBits)) {
2033 EmitUseLookupTable(masm,
2034 ranges,
2035 start_index,
2036 end_index,
2037 min_char,
2038 fall_through,
2039 even_label,
2040 odd_label);
2041 return;
2042 }
2043
2044 if ((min_char >> kBits) != (first >> kBits)) {
2045 masm->CheckCharacterLT(first, odd_label);
2046 GenerateBranches(masm,
2047 ranges,
2048 start_index + 1,
2049 end_index,
2050 first,
2051 max_char,
2052 fall_through,
2053 odd_label,
2054 even_label);
2055 return;
2056 }
2057
2058 int new_start_index = 0;
2059 int new_end_index = 0;
2060 int border = 0;
2061
2062 SplitSearchSpace(ranges,
2063 start_index,
2064 end_index,
2065 &new_start_index,
2066 &new_end_index,
2067 &border);
2068
2069 Label handle_rest;
2070 Label* above = &handle_rest;
2071 if (border == last + 1) {
2072 // We didn't find any section that started after the limit, so everything
2073 // above the border is one of the terminal labels.
2074 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2075 ASSERT(new_end_index == end_index - 1);
2076 }
2077
2078 ASSERT_LE(start_index, new_end_index);
2079 ASSERT_LE(new_start_index, end_index);
2080 ASSERT_LT(start_index, new_start_index);
2081 ASSERT_LT(new_end_index, end_index);
2082 ASSERT(new_end_index + 1 == new_start_index ||
2083 (new_end_index + 2 == new_start_index &&
2084 border == ranges->at(new_end_index + 1)));
2085 ASSERT_LT(min_char, border - 1);
2086 ASSERT_LT(border, max_char);
2087 ASSERT_LT(ranges->at(new_end_index), border);
2088 ASSERT(border < ranges->at(new_start_index) ||
2089 (border == ranges->at(new_start_index) &&
2090 new_start_index == end_index &&
2091 new_end_index == end_index - 1 &&
2092 border == last + 1));
2093 ASSERT(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2094
2095 masm->CheckCharacterGT(border - 1, above);
2096 Label dummy;
2097 GenerateBranches(masm,
2098 ranges,
2099 start_index,
2100 new_end_index,
2101 min_char,
2102 border - 1,
2103 &dummy,
2104 even_label,
2105 odd_label);
2106 if (handle_rest.is_linked()) {
2107 masm->Bind(&handle_rest);
2108 bool flip = (new_start_index & 1) != (start_index & 1);
2109 GenerateBranches(masm,
2110 ranges,
2111 new_start_index,
2112 end_index,
2113 border,
2114 max_char,
2115 &dummy,
2116 flip ? odd_label : even_label,
2117 flip ? even_label : odd_label);
2118 }
2119 }
2120
2121
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool ascii,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)2122 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2123 RegExpCharacterClass* cc,
2124 bool ascii,
2125 Label* on_failure,
2126 int cp_offset,
2127 bool check_offset,
2128 bool preloaded,
2129 Zone* zone) {
2130 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2131 if (!CharacterRange::IsCanonical(ranges)) {
2132 CharacterRange::Canonicalize(ranges);
2133 }
2134
2135 int max_char;
2136 if (ascii) {
2137 max_char = String::kMaxOneByteCharCode;
2138 } else {
2139 max_char = String::kMaxUtf16CodeUnit;
2140 }
2141
2142 int range_count = ranges->length();
2143
2144 int last_valid_range = range_count - 1;
2145 while (last_valid_range >= 0) {
2146 CharacterRange& range = ranges->at(last_valid_range);
2147 if (range.from() <= max_char) {
2148 break;
2149 }
2150 last_valid_range--;
2151 }
2152
2153 if (last_valid_range < 0) {
2154 if (!cc->is_negated()) {
2155 macro_assembler->GoTo(on_failure);
2156 }
2157 if (check_offset) {
2158 macro_assembler->CheckPosition(cp_offset, on_failure);
2159 }
2160 return;
2161 }
2162
2163 if (last_valid_range == 0 &&
2164 ranges->at(0).IsEverything(max_char)) {
2165 if (cc->is_negated()) {
2166 macro_assembler->GoTo(on_failure);
2167 } else {
2168 // This is a common case hit by non-anchored expressions.
2169 if (check_offset) {
2170 macro_assembler->CheckPosition(cp_offset, on_failure);
2171 }
2172 }
2173 return;
2174 }
2175 if (last_valid_range == 0 &&
2176 !cc->is_negated() &&
2177 ranges->at(0).IsEverything(max_char)) {
2178 // This is a common case hit by non-anchored expressions.
2179 if (check_offset) {
2180 macro_assembler->CheckPosition(cp_offset, on_failure);
2181 }
2182 return;
2183 }
2184
2185 if (!preloaded) {
2186 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2187 }
2188
2189 if (cc->is_standard(zone) &&
2190 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2191 on_failure)) {
2192 return;
2193 }
2194
2195
2196 // A new list with ascending entries. Each entry is a code unit
2197 // where there is a boundary between code units that are part of
2198 // the class and code units that are not. Normally we insert an
2199 // entry at zero which goes to the failure label, but if there
2200 // was already one there we fall through for success on that entry.
2201 // Subsequent entries have alternating meaning (success/failure).
2202 ZoneList<int>* range_boundaries =
2203 new(zone) ZoneList<int>(last_valid_range, zone);
2204
2205 bool zeroth_entry_is_failure = !cc->is_negated();
2206
2207 for (int i = 0; i <= last_valid_range; i++) {
2208 CharacterRange& range = ranges->at(i);
2209 if (range.from() == 0) {
2210 ASSERT_EQ(i, 0);
2211 zeroth_entry_is_failure = !zeroth_entry_is_failure;
2212 } else {
2213 range_boundaries->Add(range.from(), zone);
2214 }
2215 range_boundaries->Add(range.to() + 1, zone);
2216 }
2217 int end_index = range_boundaries->length() - 1;
2218 if (range_boundaries->at(end_index) > max_char) {
2219 end_index--;
2220 }
2221
2222 Label fall_through;
2223 GenerateBranches(macro_assembler,
2224 range_boundaries,
2225 0, // start_index.
2226 end_index,
2227 0, // min_char.
2228 max_char,
2229 &fall_through,
2230 zeroth_entry_is_failure ? &fall_through : on_failure,
2231 zeroth_entry_is_failure ? on_failure : &fall_through);
2232 macro_assembler->Bind(&fall_through);
2233 }
2234
2235
~RegExpNode()2236 RegExpNode::~RegExpNode() {
2237 }
2238
2239
LimitVersions(RegExpCompiler * compiler,Trace * trace)2240 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2241 Trace* trace) {
2242 // If we are generating a greedy loop then don't stop and don't reuse code.
2243 if (trace->stop_node() != NULL) {
2244 return CONTINUE;
2245 }
2246
2247 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2248 if (trace->is_trivial()) {
2249 if (label_.is_bound()) {
2250 // We are being asked to generate a generic version, but that's already
2251 // been done so just go to it.
2252 macro_assembler->GoTo(&label_);
2253 return DONE;
2254 }
2255 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
2256 // To avoid too deep recursion we push the node to the work queue and just
2257 // generate a goto here.
2258 compiler->AddWork(this);
2259 macro_assembler->GoTo(&label_);
2260 return DONE;
2261 }
2262 // Generate generic version of the node and bind the label for later use.
2263 macro_assembler->Bind(&label_);
2264 return CONTINUE;
2265 }
2266
2267 // We are being asked to make a non-generic version. Keep track of how many
2268 // non-generic versions we generate so as not to overdo it.
2269 trace_count_++;
2270 if (FLAG_regexp_optimization &&
2271 trace_count_ < kMaxCopiesCodeGenerated &&
2272 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
2273 return CONTINUE;
2274 }
2275
2276 // If we get here code has been generated for this node too many times or
2277 // recursion is too deep. Time to switch to a generic version. The code for
2278 // generic versions above can handle deep recursion properly.
2279 trace->Flush(compiler, this);
2280 return DONE;
2281 }
2282
2283
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2284 int ActionNode::EatsAtLeast(int still_to_find,
2285 int budget,
2286 bool not_at_start) {
2287 if (budget <= 0) return 0;
2288 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2289 return on_success()->EatsAtLeast(still_to_find,
2290 budget - 1,
2291 not_at_start);
2292 }
2293
2294
FillInBMInfo(int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2295 void ActionNode::FillInBMInfo(int offset,
2296 int budget,
2297 BoyerMooreLookahead* bm,
2298 bool not_at_start) {
2299 if (action_type_ == BEGIN_SUBMATCH) {
2300 bm->SetRest(offset);
2301 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2302 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
2303 }
2304 SaveBMInfo(bm, not_at_start, offset);
2305 }
2306
2307
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2308 int AssertionNode::EatsAtLeast(int still_to_find,
2309 int budget,
2310 bool not_at_start) {
2311 if (budget <= 0) return 0;
2312 // If we know we are not at the start and we are asked "how many characters
2313 // will you match if you succeed?" then we can answer anything since false
2314 // implies false. So lets just return the max answer (still_to_find) since
2315 // that won't prevent us from preloading a lot of characters for the other
2316 // branches in the node graph.
2317 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2318 return on_success()->EatsAtLeast(still_to_find,
2319 budget - 1,
2320 not_at_start);
2321 }
2322
2323
FillInBMInfo(int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2324 void AssertionNode::FillInBMInfo(int offset,
2325 int budget,
2326 BoyerMooreLookahead* bm,
2327 bool not_at_start) {
2328 // Match the behaviour of EatsAtLeast on this node.
2329 if (assertion_type() == AT_START && not_at_start) return;
2330 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
2331 SaveBMInfo(bm, not_at_start, offset);
2332 }
2333
2334
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2335 int BackReferenceNode::EatsAtLeast(int still_to_find,
2336 int budget,
2337 bool not_at_start) {
2338 if (budget <= 0) return 0;
2339 return on_success()->EatsAtLeast(still_to_find,
2340 budget - 1,
2341 not_at_start);
2342 }
2343
2344
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2345 int TextNode::EatsAtLeast(int still_to_find,
2346 int budget,
2347 bool not_at_start) {
2348 int answer = Length();
2349 if (answer >= still_to_find) return answer;
2350 if (budget <= 0) return answer;
2351 // We are not at start after this node so we set the last argument to 'true'.
2352 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2353 budget - 1,
2354 true);
2355 }
2356
2357
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2358 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
2359 int budget,
2360 bool not_at_start) {
2361 if (budget <= 0) return 0;
2362 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2363 // afterwards.
2364 RegExpNode* node = alternatives_->at(1).node();
2365 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2366 }
2367
2368
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2369 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
2370 QuickCheckDetails* details,
2371 RegExpCompiler* compiler,
2372 int filled_in,
2373 bool not_at_start) {
2374 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2375 // afterwards.
2376 RegExpNode* node = alternatives_->at(1).node();
2377 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2378 }
2379
2380
EatsAtLeastHelper(int still_to_find,int budget,RegExpNode * ignore_this_node,bool not_at_start)2381 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2382 int budget,
2383 RegExpNode* ignore_this_node,
2384 bool not_at_start) {
2385 if (budget <= 0) return 0;
2386 int min = 100;
2387 int choice_count = alternatives_->length();
2388 budget = (budget - 1) / choice_count;
2389 for (int i = 0; i < choice_count; i++) {
2390 RegExpNode* node = alternatives_->at(i).node();
2391 if (node == ignore_this_node) continue;
2392 int node_eats_at_least =
2393 node->EatsAtLeast(still_to_find, budget, not_at_start);
2394 if (node_eats_at_least < min) min = node_eats_at_least;
2395 if (min == 0) return 0;
2396 }
2397 return min;
2398 }
2399
2400
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2401 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2402 int budget,
2403 bool not_at_start) {
2404 return EatsAtLeastHelper(still_to_find,
2405 budget - 1,
2406 loop_node_,
2407 not_at_start);
2408 }
2409
2410
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2411 int ChoiceNode::EatsAtLeast(int still_to_find,
2412 int budget,
2413 bool not_at_start) {
2414 return EatsAtLeastHelper(still_to_find,
2415 budget,
2416 NULL,
2417 not_at_start);
2418 }
2419
2420
2421 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)2422 static inline uint32_t SmearBitsRight(uint32_t v) {
2423 v |= v >> 1;
2424 v |= v >> 2;
2425 v |= v >> 4;
2426 v |= v >> 8;
2427 v |= v >> 16;
2428 return v;
2429 }
2430
2431
Rationalize(bool asc)2432 bool QuickCheckDetails::Rationalize(bool asc) {
2433 bool found_useful_op = false;
2434 uint32_t char_mask;
2435 if (asc) {
2436 char_mask = String::kMaxOneByteCharCode;
2437 } else {
2438 char_mask = String::kMaxUtf16CodeUnit;
2439 }
2440 mask_ = 0;
2441 value_ = 0;
2442 int char_shift = 0;
2443 for (int i = 0; i < characters_; i++) {
2444 Position* pos = &positions_[i];
2445 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2446 found_useful_op = true;
2447 }
2448 mask_ |= (pos->mask & char_mask) << char_shift;
2449 value_ |= (pos->value & char_mask) << char_shift;
2450 char_shift += asc ? 8 : 16;
2451 }
2452 return found_useful_op;
2453 }
2454
2455
EmitQuickCheck(RegExpCompiler * compiler,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure)2456 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2457 Trace* trace,
2458 bool preload_has_checked_bounds,
2459 Label* on_possible_success,
2460 QuickCheckDetails* details,
2461 bool fall_through_on_failure) {
2462 if (details->characters() == 0) return false;
2463 GetQuickCheckDetails(
2464 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2465 if (details->cannot_match()) return false;
2466 if (!details->Rationalize(compiler->ascii())) return false;
2467 ASSERT(details->characters() == 1 ||
2468 compiler->macro_assembler()->CanReadUnaligned());
2469 uint32_t mask = details->mask();
2470 uint32_t value = details->value();
2471
2472 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2473
2474 if (trace->characters_preloaded() != details->characters()) {
2475 assembler->LoadCurrentCharacter(trace->cp_offset(),
2476 trace->backtrack(),
2477 !preload_has_checked_bounds,
2478 details->characters());
2479 }
2480
2481
2482 bool need_mask = true;
2483
2484 if (details->characters() == 1) {
2485 // If number of characters preloaded is 1 then we used a byte or 16 bit
2486 // load so the value is already masked down.
2487 uint32_t char_mask;
2488 if (compiler->ascii()) {
2489 char_mask = String::kMaxOneByteCharCode;
2490 } else {
2491 char_mask = String::kMaxUtf16CodeUnit;
2492 }
2493 if ((mask & char_mask) == char_mask) need_mask = false;
2494 mask &= char_mask;
2495 } else {
2496 // For 2-character preloads in ASCII mode or 1-character preloads in
2497 // TWO_BYTE mode we also use a 16 bit load with zero extend.
2498 if (details->characters() == 2 && compiler->ascii()) {
2499 if ((mask & 0xffff) == 0xffff) need_mask = false;
2500 } else if (details->characters() == 1 && !compiler->ascii()) {
2501 if ((mask & 0xffff) == 0xffff) need_mask = false;
2502 } else {
2503 if (mask == 0xffffffff) need_mask = false;
2504 }
2505 }
2506
2507 if (fall_through_on_failure) {
2508 if (need_mask) {
2509 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2510 } else {
2511 assembler->CheckCharacter(value, on_possible_success);
2512 }
2513 } else {
2514 if (need_mask) {
2515 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2516 } else {
2517 assembler->CheckNotCharacter(value, trace->backtrack());
2518 }
2519 }
2520 return true;
2521 }
2522
2523
2524 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2525 // super-class in the .h file).
2526 //
2527 // We iterate along the text object, building up for each character a
2528 // mask and value that can be used to test for a quick failure to match.
2529 // The masks and values for the positions will be combined into a single
2530 // machine word for the current character width in order to be used in
2531 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2532 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2533 RegExpCompiler* compiler,
2534 int characters_filled_in,
2535 bool not_at_start) {
2536 Isolate* isolate = compiler->macro_assembler()->zone()->isolate();
2537 ASSERT(characters_filled_in < details->characters());
2538 int characters = details->characters();
2539 int char_mask;
2540 if (compiler->ascii()) {
2541 char_mask = String::kMaxOneByteCharCode;
2542 } else {
2543 char_mask = String::kMaxUtf16CodeUnit;
2544 }
2545 for (int k = 0; k < elms_->length(); k++) {
2546 TextElement elm = elms_->at(k);
2547 if (elm.text_type() == TextElement::ATOM) {
2548 Vector<const uc16> quarks = elm.atom()->data();
2549 for (int i = 0; i < characters && i < quarks.length(); i++) {
2550 QuickCheckDetails::Position* pos =
2551 details->positions(characters_filled_in);
2552 uc16 c = quarks[i];
2553 if (c > char_mask) {
2554 // If we expect a non-ASCII character from an ASCII string,
2555 // there is no way we can match. Not even case independent
2556 // matching can turn an ASCII character into non-ASCII or
2557 // vice versa.
2558 details->set_cannot_match();
2559 pos->determines_perfectly = false;
2560 return;
2561 }
2562 if (compiler->ignore_case()) {
2563 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2564 int length = GetCaseIndependentLetters(isolate, c, compiler->ascii(),
2565 chars);
2566 ASSERT(length != 0); // Can only happen if c > char_mask (see above).
2567 if (length == 1) {
2568 // This letter has no case equivalents, so it's nice and simple
2569 // and the mask-compare will determine definitely whether we have
2570 // a match at this character position.
2571 pos->mask = char_mask;
2572 pos->value = c;
2573 pos->determines_perfectly = true;
2574 } else {
2575 uint32_t common_bits = char_mask;
2576 uint32_t bits = chars[0];
2577 for (int j = 1; j < length; j++) {
2578 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2579 common_bits ^= differing_bits;
2580 bits &= common_bits;
2581 }
2582 // If length is 2 and common bits has only one zero in it then
2583 // our mask and compare instruction will determine definitely
2584 // whether we have a match at this character position. Otherwise
2585 // it can only be an approximate check.
2586 uint32_t one_zero = (common_bits | ~char_mask);
2587 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2588 pos->determines_perfectly = true;
2589 }
2590 pos->mask = common_bits;
2591 pos->value = bits;
2592 }
2593 } else {
2594 // Don't ignore case. Nice simple case where the mask-compare will
2595 // determine definitely whether we have a match at this character
2596 // position.
2597 pos->mask = char_mask;
2598 pos->value = c;
2599 pos->determines_perfectly = true;
2600 }
2601 characters_filled_in++;
2602 ASSERT(characters_filled_in <= details->characters());
2603 if (characters_filled_in == details->characters()) {
2604 return;
2605 }
2606 }
2607 } else {
2608 QuickCheckDetails::Position* pos =
2609 details->positions(characters_filled_in);
2610 RegExpCharacterClass* tree = elm.char_class();
2611 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2612 if (tree->is_negated()) {
2613 // A quick check uses multi-character mask and compare. There is no
2614 // useful way to incorporate a negative char class into this scheme
2615 // so we just conservatively create a mask and value that will always
2616 // succeed.
2617 pos->mask = 0;
2618 pos->value = 0;
2619 } else {
2620 int first_range = 0;
2621 while (ranges->at(first_range).from() > char_mask) {
2622 first_range++;
2623 if (first_range == ranges->length()) {
2624 details->set_cannot_match();
2625 pos->determines_perfectly = false;
2626 return;
2627 }
2628 }
2629 CharacterRange range = ranges->at(first_range);
2630 uc16 from = range.from();
2631 uc16 to = range.to();
2632 if (to > char_mask) {
2633 to = char_mask;
2634 }
2635 uint32_t differing_bits = (from ^ to);
2636 // A mask and compare is only perfect if the differing bits form a
2637 // number like 00011111 with one single block of trailing 1s.
2638 if ((differing_bits & (differing_bits + 1)) == 0 &&
2639 from + differing_bits == to) {
2640 pos->determines_perfectly = true;
2641 }
2642 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2643 uint32_t bits = (from & common_bits);
2644 for (int i = first_range + 1; i < ranges->length(); i++) {
2645 CharacterRange range = ranges->at(i);
2646 uc16 from = range.from();
2647 uc16 to = range.to();
2648 if (from > char_mask) continue;
2649 if (to > char_mask) to = char_mask;
2650 // Here we are combining more ranges into the mask and compare
2651 // value. With each new range the mask becomes more sparse and
2652 // so the chances of a false positive rise. A character class
2653 // with multiple ranges is assumed never to be equivalent to a
2654 // mask and compare operation.
2655 pos->determines_perfectly = false;
2656 uint32_t new_common_bits = (from ^ to);
2657 new_common_bits = ~SmearBitsRight(new_common_bits);
2658 common_bits &= new_common_bits;
2659 bits &= new_common_bits;
2660 uint32_t differing_bits = (from & common_bits) ^ bits;
2661 common_bits ^= differing_bits;
2662 bits &= common_bits;
2663 }
2664 pos->mask = common_bits;
2665 pos->value = bits;
2666 }
2667 characters_filled_in++;
2668 ASSERT(characters_filled_in <= details->characters());
2669 if (characters_filled_in == details->characters()) {
2670 return;
2671 }
2672 }
2673 }
2674 ASSERT(characters_filled_in != details->characters());
2675 if (!details->cannot_match()) {
2676 on_success()-> GetQuickCheckDetails(details,
2677 compiler,
2678 characters_filled_in,
2679 true);
2680 }
2681 }
2682
2683
Clear()2684 void QuickCheckDetails::Clear() {
2685 for (int i = 0; i < characters_; i++) {
2686 positions_[i].mask = 0;
2687 positions_[i].value = 0;
2688 positions_[i].determines_perfectly = false;
2689 }
2690 characters_ = 0;
2691 }
2692
2693
Advance(int by,bool ascii)2694 void QuickCheckDetails::Advance(int by, bool ascii) {
2695 ASSERT(by >= 0);
2696 if (by >= characters_) {
2697 Clear();
2698 return;
2699 }
2700 for (int i = 0; i < characters_ - by; i++) {
2701 positions_[i] = positions_[by + i];
2702 }
2703 for (int i = characters_ - by; i < characters_; i++) {
2704 positions_[i].mask = 0;
2705 positions_[i].value = 0;
2706 positions_[i].determines_perfectly = false;
2707 }
2708 characters_ -= by;
2709 // We could change mask_ and value_ here but we would never advance unless
2710 // they had already been used in a check and they won't be used again because
2711 // it would gain us nothing. So there's no point.
2712 }
2713
2714
Merge(QuickCheckDetails * other,int from_index)2715 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2716 ASSERT(characters_ == other->characters_);
2717 if (other->cannot_match_) {
2718 return;
2719 }
2720 if (cannot_match_) {
2721 *this = *other;
2722 return;
2723 }
2724 for (int i = from_index; i < characters_; i++) {
2725 QuickCheckDetails::Position* pos = positions(i);
2726 QuickCheckDetails::Position* other_pos = other->positions(i);
2727 if (pos->mask != other_pos->mask ||
2728 pos->value != other_pos->value ||
2729 !other_pos->determines_perfectly) {
2730 // Our mask-compare operation will be approximate unless we have the
2731 // exact same operation on both sides of the alternation.
2732 pos->determines_perfectly = false;
2733 }
2734 pos->mask &= other_pos->mask;
2735 pos->value &= pos->mask;
2736 other_pos->value &= pos->mask;
2737 uc16 differing_bits = (pos->value ^ other_pos->value);
2738 pos->mask &= ~differing_bits;
2739 pos->value &= pos->mask;
2740 }
2741 }
2742
2743
2744 class VisitMarker {
2745 public:
VisitMarker(NodeInfo * info)2746 explicit VisitMarker(NodeInfo* info) : info_(info) {
2747 ASSERT(!info->visited);
2748 info->visited = true;
2749 }
~VisitMarker()2750 ~VisitMarker() {
2751 info_->visited = false;
2752 }
2753 private:
2754 NodeInfo* info_;
2755 };
2756
2757
FilterASCII(int depth,bool ignore_case)2758 RegExpNode* SeqRegExpNode::FilterASCII(int depth, bool ignore_case) {
2759 if (info()->replacement_calculated) return replacement();
2760 if (depth < 0) return this;
2761 ASSERT(!info()->visited);
2762 VisitMarker marker(info());
2763 return FilterSuccessor(depth - 1, ignore_case);
2764 }
2765
2766
FilterSuccessor(int depth,bool ignore_case)2767 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2768 RegExpNode* next = on_success_->FilterASCII(depth - 1, ignore_case);
2769 if (next == NULL) return set_replacement(NULL);
2770 on_success_ = next;
2771 return set_replacement(this);
2772 }
2773
2774
2775 // We need to check for the following characters: 0x39c 0x3bc 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)2776 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2777 // TODO(dcarney): this could be a lot more efficient.
2778 return range.Contains(0x39c) ||
2779 range.Contains(0x3bc) || range.Contains(0x178);
2780 }
2781
2782
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)2783 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2784 for (int i = 0; i < ranges->length(); i++) {
2785 // TODO(dcarney): this could be a lot more efficient.
2786 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2787 }
2788 return false;
2789 }
2790
2791
FilterASCII(int depth,bool ignore_case)2792 RegExpNode* TextNode::FilterASCII(int depth, bool ignore_case) {
2793 if (info()->replacement_calculated) return replacement();
2794 if (depth < 0) return this;
2795 ASSERT(!info()->visited);
2796 VisitMarker marker(info());
2797 int element_count = elms_->length();
2798 for (int i = 0; i < element_count; i++) {
2799 TextElement elm = elms_->at(i);
2800 if (elm.text_type() == TextElement::ATOM) {
2801 Vector<const uc16> quarks = elm.atom()->data();
2802 for (int j = 0; j < quarks.length(); j++) {
2803 uint16_t c = quarks[j];
2804 if (c <= String::kMaxOneByteCharCode) continue;
2805 if (!ignore_case) return set_replacement(NULL);
2806 // Here, we need to check for characters whose upper and lower cases
2807 // are outside the Latin-1 range.
2808 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2809 // Character is outside Latin-1 completely
2810 if (converted == 0) return set_replacement(NULL);
2811 // Convert quark to Latin-1 in place.
2812 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2813 copy[j] = converted;
2814 }
2815 } else {
2816 ASSERT(elm.text_type() == TextElement::CHAR_CLASS);
2817 RegExpCharacterClass* cc = elm.char_class();
2818 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2819 if (!CharacterRange::IsCanonical(ranges)) {
2820 CharacterRange::Canonicalize(ranges);
2821 }
2822 // Now they are in order so we only need to look at the first.
2823 int range_count = ranges->length();
2824 if (cc->is_negated()) {
2825 if (range_count != 0 &&
2826 ranges->at(0).from() == 0 &&
2827 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2828 // This will be handled in a later filter.
2829 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2830 return set_replacement(NULL);
2831 }
2832 } else {
2833 if (range_count == 0 ||
2834 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2835 // This will be handled in a later filter.
2836 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2837 return set_replacement(NULL);
2838 }
2839 }
2840 }
2841 }
2842 return FilterSuccessor(depth - 1, ignore_case);
2843 }
2844
2845
FilterASCII(int depth,bool ignore_case)2846 RegExpNode* LoopChoiceNode::FilterASCII(int depth, bool ignore_case) {
2847 if (info()->replacement_calculated) return replacement();
2848 if (depth < 0) return this;
2849 if (info()->visited) return this;
2850 {
2851 VisitMarker marker(info());
2852
2853 RegExpNode* continue_replacement =
2854 continue_node_->FilterASCII(depth - 1, ignore_case);
2855 // If we can't continue after the loop then there is no sense in doing the
2856 // loop.
2857 if (continue_replacement == NULL) return set_replacement(NULL);
2858 }
2859
2860 return ChoiceNode::FilterASCII(depth - 1, ignore_case);
2861 }
2862
2863
FilterASCII(int depth,bool ignore_case)2864 RegExpNode* ChoiceNode::FilterASCII(int depth, bool ignore_case) {
2865 if (info()->replacement_calculated) return replacement();
2866 if (depth < 0) return this;
2867 if (info()->visited) return this;
2868 VisitMarker marker(info());
2869 int choice_count = alternatives_->length();
2870
2871 for (int i = 0; i < choice_count; i++) {
2872 GuardedAlternative alternative = alternatives_->at(i);
2873 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2874 set_replacement(this);
2875 return this;
2876 }
2877 }
2878
2879 int surviving = 0;
2880 RegExpNode* survivor = NULL;
2881 for (int i = 0; i < choice_count; i++) {
2882 GuardedAlternative alternative = alternatives_->at(i);
2883 RegExpNode* replacement =
2884 alternative.node()->FilterASCII(depth - 1, ignore_case);
2885 ASSERT(replacement != this); // No missing EMPTY_MATCH_CHECK.
2886 if (replacement != NULL) {
2887 alternatives_->at(i).set_node(replacement);
2888 surviving++;
2889 survivor = replacement;
2890 }
2891 }
2892 if (surviving < 2) return set_replacement(survivor);
2893
2894 set_replacement(this);
2895 if (surviving == choice_count) {
2896 return this;
2897 }
2898 // Only some of the nodes survived the filtering. We need to rebuild the
2899 // alternatives list.
2900 ZoneList<GuardedAlternative>* new_alternatives =
2901 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2902 for (int i = 0; i < choice_count; i++) {
2903 RegExpNode* replacement =
2904 alternatives_->at(i).node()->FilterASCII(depth - 1, ignore_case);
2905 if (replacement != NULL) {
2906 alternatives_->at(i).set_node(replacement);
2907 new_alternatives->Add(alternatives_->at(i), zone());
2908 }
2909 }
2910 alternatives_ = new_alternatives;
2911 return this;
2912 }
2913
2914
FilterASCII(int depth,bool ignore_case)2915 RegExpNode* NegativeLookaheadChoiceNode::FilterASCII(int depth,
2916 bool ignore_case) {
2917 if (info()->replacement_calculated) return replacement();
2918 if (depth < 0) return this;
2919 if (info()->visited) return this;
2920 VisitMarker marker(info());
2921 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2922 // afterwards.
2923 RegExpNode* node = alternatives_->at(1).node();
2924 RegExpNode* replacement = node->FilterASCII(depth - 1, ignore_case);
2925 if (replacement == NULL) return set_replacement(NULL);
2926 alternatives_->at(1).set_node(replacement);
2927
2928 RegExpNode* neg_node = alternatives_->at(0).node();
2929 RegExpNode* neg_replacement = neg_node->FilterASCII(depth - 1, ignore_case);
2930 // If the negative lookahead is always going to fail then
2931 // we don't need to check it.
2932 if (neg_replacement == NULL) return set_replacement(replacement);
2933 alternatives_->at(0).set_node(neg_replacement);
2934 return set_replacement(this);
2935 }
2936
2937
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2938 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2939 RegExpCompiler* compiler,
2940 int characters_filled_in,
2941 bool not_at_start) {
2942 if (body_can_be_zero_length_ || info()->visited) return;
2943 VisitMarker marker(info());
2944 return ChoiceNode::GetQuickCheckDetails(details,
2945 compiler,
2946 characters_filled_in,
2947 not_at_start);
2948 }
2949
2950
FillInBMInfo(int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2951 void LoopChoiceNode::FillInBMInfo(int offset,
2952 int budget,
2953 BoyerMooreLookahead* bm,
2954 bool not_at_start) {
2955 if (body_can_be_zero_length_ || budget <= 0) {
2956 bm->SetRest(offset);
2957 SaveBMInfo(bm, not_at_start, offset);
2958 return;
2959 }
2960 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
2961 SaveBMInfo(bm, not_at_start, offset);
2962 }
2963
2964
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2965 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2966 RegExpCompiler* compiler,
2967 int characters_filled_in,
2968 bool not_at_start) {
2969 not_at_start = (not_at_start || not_at_start_);
2970 int choice_count = alternatives_->length();
2971 ASSERT(choice_count > 0);
2972 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2973 compiler,
2974 characters_filled_in,
2975 not_at_start);
2976 for (int i = 1; i < choice_count; i++) {
2977 QuickCheckDetails new_details(details->characters());
2978 RegExpNode* node = alternatives_->at(i).node();
2979 node->GetQuickCheckDetails(&new_details, compiler,
2980 characters_filled_in,
2981 not_at_start);
2982 // Here we merge the quick match details of the two branches.
2983 details->Merge(&new_details, characters_filled_in);
2984 }
2985 }
2986
2987
2988 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2989 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2990 Label* word,
2991 Label* non_word,
2992 bool fall_through_on_word) {
2993 if (assembler->CheckSpecialCharacterClass(
2994 fall_through_on_word ? 'w' : 'W',
2995 fall_through_on_word ? non_word : word)) {
2996 // Optimized implementation available.
2997 return;
2998 }
2999 assembler->CheckCharacterGT('z', non_word);
3000 assembler->CheckCharacterLT('0', non_word);
3001 assembler->CheckCharacterGT('a' - 1, word);
3002 assembler->CheckCharacterLT('9' + 1, word);
3003 assembler->CheckCharacterLT('A', non_word);
3004 assembler->CheckCharacterLT('Z' + 1, word);
3005 if (fall_through_on_word) {
3006 assembler->CheckNotCharacter('_', non_word);
3007 } else {
3008 assembler->CheckCharacter('_', word);
3009 }
3010 }
3011
3012
3013 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
3014 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)3015 static void EmitHat(RegExpCompiler* compiler,
3016 RegExpNode* on_success,
3017 Trace* trace) {
3018 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3019 // We will be loading the previous character into the current character
3020 // register.
3021 Trace new_trace(*trace);
3022 new_trace.InvalidateCurrentCharacter();
3023
3024 Label ok;
3025 if (new_trace.cp_offset() == 0) {
3026 // The start of input counts as a newline in this context, so skip to
3027 // ok if we are at the start.
3028 assembler->CheckAtStart(&ok);
3029 }
3030 // We already checked that we are not at the start of input so it must be
3031 // OK to load the previous character.
3032 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3033 new_trace.backtrack(),
3034 false);
3035 if (!assembler->CheckSpecialCharacterClass('n',
3036 new_trace.backtrack())) {
3037 // Newline means \n, \r, 0x2028 or 0x2029.
3038 if (!compiler->ascii()) {
3039 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3040 }
3041 assembler->CheckCharacter('\n', &ok);
3042 assembler->CheckNotCharacter('\r', new_trace.backtrack());
3043 }
3044 assembler->Bind(&ok);
3045 on_success->Emit(compiler, &new_trace);
3046 }
3047
3048
3049 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)3050 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3051 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3052 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3053 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3054 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3055 if (lookahead == NULL) {
3056 int eats_at_least =
3057 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3058 kRecursionBudget,
3059 not_at_start));
3060 if (eats_at_least >= 1) {
3061 BoyerMooreLookahead* bm =
3062 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3063 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
3064 if (bm->at(0)->is_non_word())
3065 next_is_word_character = Trace::FALSE_VALUE;
3066 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3067 }
3068 } else {
3069 if (lookahead->at(0)->is_non_word())
3070 next_is_word_character = Trace::FALSE_VALUE;
3071 if (lookahead->at(0)->is_word())
3072 next_is_word_character = Trace::TRUE_VALUE;
3073 }
3074 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3075 if (next_is_word_character == Trace::UNKNOWN) {
3076 Label before_non_word;
3077 Label before_word;
3078 if (trace->characters_preloaded() != 1) {
3079 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3080 }
3081 // Fall through on non-word.
3082 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3083 // Next character is not a word character.
3084 assembler->Bind(&before_non_word);
3085 Label ok;
3086 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3087 assembler->GoTo(&ok);
3088
3089 assembler->Bind(&before_word);
3090 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3091 assembler->Bind(&ok);
3092 } else if (next_is_word_character == Trace::TRUE_VALUE) {
3093 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3094 } else {
3095 ASSERT(next_is_word_character == Trace::FALSE_VALUE);
3096 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3097 }
3098 }
3099
3100
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)3101 void AssertionNode::BacktrackIfPrevious(
3102 RegExpCompiler* compiler,
3103 Trace* trace,
3104 AssertionNode::IfPrevious backtrack_if_previous) {
3105 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3106 Trace new_trace(*trace);
3107 new_trace.InvalidateCurrentCharacter();
3108
3109 Label fall_through, dummy;
3110
3111 Label* non_word = backtrack_if_previous == kIsNonWord ?
3112 new_trace.backtrack() :
3113 &fall_through;
3114 Label* word = backtrack_if_previous == kIsNonWord ?
3115 &fall_through :
3116 new_trace.backtrack();
3117
3118 if (new_trace.cp_offset() == 0) {
3119 // The start of input counts as a non-word character, so the question is
3120 // decided if we are at the start.
3121 assembler->CheckAtStart(non_word);
3122 }
3123 // We already checked that we are not at the start of input so it must be
3124 // OK to load the previous character.
3125 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3126 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3127
3128 assembler->Bind(&fall_through);
3129 on_success()->Emit(compiler, &new_trace);
3130 }
3131
3132
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)3133 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3134 RegExpCompiler* compiler,
3135 int filled_in,
3136 bool not_at_start) {
3137 if (assertion_type_ == AT_START && not_at_start) {
3138 details->set_cannot_match();
3139 return;
3140 }
3141 return on_success()->GetQuickCheckDetails(details,
3142 compiler,
3143 filled_in,
3144 not_at_start);
3145 }
3146
3147
Emit(RegExpCompiler * compiler,Trace * trace)3148 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3149 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3150 switch (assertion_type_) {
3151 case AT_END: {
3152 Label ok;
3153 assembler->CheckPosition(trace->cp_offset(), &ok);
3154 assembler->GoTo(trace->backtrack());
3155 assembler->Bind(&ok);
3156 break;
3157 }
3158 case AT_START: {
3159 if (trace->at_start() == Trace::FALSE_VALUE) {
3160 assembler->GoTo(trace->backtrack());
3161 return;
3162 }
3163 if (trace->at_start() == Trace::UNKNOWN) {
3164 assembler->CheckNotAtStart(trace->backtrack());
3165 Trace at_start_trace = *trace;
3166 at_start_trace.set_at_start(true);
3167 on_success()->Emit(compiler, &at_start_trace);
3168 return;
3169 }
3170 }
3171 break;
3172 case AFTER_NEWLINE:
3173 EmitHat(compiler, on_success(), trace);
3174 return;
3175 case AT_BOUNDARY:
3176 case AT_NON_BOUNDARY: {
3177 EmitBoundaryCheck(compiler, trace);
3178 return;
3179 }
3180 }
3181 on_success()->Emit(compiler, trace);
3182 }
3183
3184
DeterminedAlready(QuickCheckDetails * quick_check,int offset)3185 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3186 if (quick_check == NULL) return false;
3187 if (offset >= quick_check->characters()) return false;
3188 return quick_check->positions(offset)->determines_perfectly;
3189 }
3190
3191
UpdateBoundsCheck(int index,int * checked_up_to)3192 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3193 if (index > *checked_up_to) {
3194 *checked_up_to = index;
3195 }
3196 }
3197
3198
3199 // We call this repeatedly to generate code for each pass over the text node.
3200 // The passes are in increasing order of difficulty because we hope one
3201 // of the first passes will fail in which case we are saved the work of the
3202 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3203 // we will check the '%' in the first pass, the case independent 'a' in the
3204 // second pass and the character class in the last pass.
3205 //
3206 // The passes are done from right to left, so for example to test for /bar/
3207 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3208 // and then a 'b' with offset 0. This means we can avoid the end-of-input
3209 // bounds check most of the time. In the example we only need to check for
3210 // end-of-input when loading the putative 'r'.
3211 //
3212 // A slight complication involves the fact that the first character may already
3213 // be fetched into a register by the previous node. In this case we want to
3214 // do the test for that character first. We do this in separate passes. The
3215 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3216 // pass has been performed then subsequent passes will have true in
3217 // first_element_checked to indicate that that character does not need to be
3218 // checked again.
3219 //
3220 // In addition to all this we are passed a Trace, which can
3221 // contain an AlternativeGeneration object. In this AlternativeGeneration
3222 // object we can see details of any quick check that was already passed in
3223 // order to get to the code we are now generating. The quick check can involve
3224 // loading characters, which means we do not need to recheck the bounds
3225 // up to the limit the quick check already checked. In addition the quick
3226 // check can have involved a mask and compare operation which may simplify
3227 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)3228 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3229 TextEmitPassType pass,
3230 bool preloaded,
3231 Trace* trace,
3232 bool first_element_checked,
3233 int* checked_up_to) {
3234 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3235 Isolate* isolate = assembler->zone()->isolate();
3236 bool ascii = compiler->ascii();
3237 Label* backtrack = trace->backtrack();
3238 QuickCheckDetails* quick_check = trace->quick_check_performed();
3239 int element_count = elms_->length();
3240 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3241 TextElement elm = elms_->at(i);
3242 int cp_offset = trace->cp_offset() + elm.cp_offset();
3243 if (elm.text_type() == TextElement::ATOM) {
3244 Vector<const uc16> quarks = elm.atom()->data();
3245 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3246 if (first_element_checked && i == 0 && j == 0) continue;
3247 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3248 EmitCharacterFunction* emit_function = NULL;
3249 switch (pass) {
3250 case NON_ASCII_MATCH:
3251 ASSERT(ascii);
3252 if (quarks[j] > String::kMaxOneByteCharCode) {
3253 assembler->GoTo(backtrack);
3254 return;
3255 }
3256 break;
3257 case NON_LETTER_CHARACTER_MATCH:
3258 emit_function = &EmitAtomNonLetter;
3259 break;
3260 case SIMPLE_CHARACTER_MATCH:
3261 emit_function = &EmitSimpleCharacter;
3262 break;
3263 case CASE_CHARACTER_MATCH:
3264 emit_function = &EmitAtomLetter;
3265 break;
3266 default:
3267 break;
3268 }
3269 if (emit_function != NULL) {
3270 bool bound_checked = emit_function(isolate,
3271 compiler,
3272 quarks[j],
3273 backtrack,
3274 cp_offset + j,
3275 *checked_up_to < cp_offset + j,
3276 preloaded);
3277 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3278 }
3279 }
3280 } else {
3281 ASSERT_EQ(TextElement::CHAR_CLASS, elm.text_type());
3282 if (pass == CHARACTER_CLASS_MATCH) {
3283 if (first_element_checked && i == 0) continue;
3284 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3285 RegExpCharacterClass* cc = elm.char_class();
3286 EmitCharClass(assembler,
3287 cc,
3288 ascii,
3289 backtrack,
3290 cp_offset,
3291 *checked_up_to < cp_offset,
3292 preloaded,
3293 zone());
3294 UpdateBoundsCheck(cp_offset, checked_up_to);
3295 }
3296 }
3297 }
3298 }
3299
3300
Length()3301 int TextNode::Length() {
3302 TextElement elm = elms_->last();
3303 ASSERT(elm.cp_offset() >= 0);
3304 return elm.cp_offset() + elm.length();
3305 }
3306
3307
SkipPass(int int_pass,bool ignore_case)3308 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3309 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3310 if (ignore_case) {
3311 return pass == SIMPLE_CHARACTER_MATCH;
3312 } else {
3313 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3314 }
3315 }
3316
3317
3318 // This generates the code to match a text node. A text node can contain
3319 // straight character sequences (possibly to be matched in a case-independent
3320 // way) and character classes. For efficiency we do not do this in a single
3321 // pass from left to right. Instead we pass over the text node several times,
3322 // emitting code for some character positions every time. See the comment on
3323 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)3324 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3325 LimitResult limit_result = LimitVersions(compiler, trace);
3326 if (limit_result == DONE) return;
3327 ASSERT(limit_result == CONTINUE);
3328
3329 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3330 compiler->SetRegExpTooBig();
3331 return;
3332 }
3333
3334 if (compiler->ascii()) {
3335 int dummy = 0;
3336 TextEmitPass(compiler, NON_ASCII_MATCH, false, trace, false, &dummy);
3337 }
3338
3339 bool first_elt_done = false;
3340 int bound_checked_to = trace->cp_offset() - 1;
3341 bound_checked_to += trace->bound_checked_up_to();
3342
3343 // If a character is preloaded into the current character register then
3344 // check that now.
3345 if (trace->characters_preloaded() == 1) {
3346 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3347 if (!SkipPass(pass, compiler->ignore_case())) {
3348 TextEmitPass(compiler,
3349 static_cast<TextEmitPassType>(pass),
3350 true,
3351 trace,
3352 false,
3353 &bound_checked_to);
3354 }
3355 }
3356 first_elt_done = true;
3357 }
3358
3359 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3360 if (!SkipPass(pass, compiler->ignore_case())) {
3361 TextEmitPass(compiler,
3362 static_cast<TextEmitPassType>(pass),
3363 false,
3364 trace,
3365 first_elt_done,
3366 &bound_checked_to);
3367 }
3368 }
3369
3370 Trace successor_trace(*trace);
3371 successor_trace.set_at_start(false);
3372 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
3373 RecursionCheck rc(compiler);
3374 on_success()->Emit(compiler, &successor_trace);
3375 }
3376
3377
InvalidateCurrentCharacter()3378 void Trace::InvalidateCurrentCharacter() {
3379 characters_preloaded_ = 0;
3380 }
3381
3382
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)3383 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3384 ASSERT(by > 0);
3385 // We don't have an instruction for shifting the current character register
3386 // down or for using a shifted value for anything so lets just forget that
3387 // we preloaded any characters into it.
3388 characters_preloaded_ = 0;
3389 // Adjust the offsets of the quick check performed information. This
3390 // information is used to find out what we already determined about the
3391 // characters by means of mask and compare.
3392 quick_check_performed_.Advance(by, compiler->ascii());
3393 cp_offset_ += by;
3394 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3395 compiler->SetRegExpTooBig();
3396 cp_offset_ = 0;
3397 }
3398 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3399 }
3400
3401
MakeCaseIndependent(bool is_ascii)3402 void TextNode::MakeCaseIndependent(bool is_ascii) {
3403 int element_count = elms_->length();
3404 for (int i = 0; i < element_count; i++) {
3405 TextElement elm = elms_->at(i);
3406 if (elm.text_type() == TextElement::CHAR_CLASS) {
3407 RegExpCharacterClass* cc = elm.char_class();
3408 // None of the standard character classes is different in the case
3409 // independent case and it slows us down if we don't know that.
3410 if (cc->is_standard(zone())) continue;
3411 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3412 int range_count = ranges->length();
3413 for (int j = 0; j < range_count; j++) {
3414 ranges->at(j).AddCaseEquivalents(ranges, is_ascii, zone());
3415 }
3416 }
3417 }
3418 }
3419
3420
GreedyLoopTextLength()3421 int TextNode::GreedyLoopTextLength() {
3422 TextElement elm = elms_->at(elms_->length() - 1);
3423 return elm.cp_offset() + elm.length();
3424 }
3425
3426
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)3427 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3428 RegExpCompiler* compiler) {
3429 if (elms_->length() != 1) return NULL;
3430 TextElement elm = elms_->at(0);
3431 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3432 RegExpCharacterClass* node = elm.char_class();
3433 ZoneList<CharacterRange>* ranges = node->ranges(zone());
3434 if (!CharacterRange::IsCanonical(ranges)) {
3435 CharacterRange::Canonicalize(ranges);
3436 }
3437 if (node->is_negated()) {
3438 return ranges->length() == 0 ? on_success() : NULL;
3439 }
3440 if (ranges->length() != 1) return NULL;
3441 uint32_t max_char;
3442 if (compiler->ascii()) {
3443 max_char = String::kMaxOneByteCharCode;
3444 } else {
3445 max_char = String::kMaxUtf16CodeUnit;
3446 }
3447 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3448 }
3449
3450
3451 // Finds the fixed match length of a sequence of nodes that goes from
3452 // this alternative and back to this choice node. If there are variable
3453 // length nodes or other complications in the way then return a sentinel
3454 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)3455 int ChoiceNode::GreedyLoopTextLengthForAlternative(
3456 GuardedAlternative* alternative) {
3457 int length = 0;
3458 RegExpNode* node = alternative->node();
3459 // Later we will generate code for all these text nodes using recursion
3460 // so we have to limit the max number.
3461 int recursion_depth = 0;
3462 while (node != this) {
3463 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3464 return kNodeIsTooComplexForGreedyLoops;
3465 }
3466 int node_length = node->GreedyLoopTextLength();
3467 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3468 return kNodeIsTooComplexForGreedyLoops;
3469 }
3470 length += node_length;
3471 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3472 node = seq_node->on_success();
3473 }
3474 return length;
3475 }
3476
3477
AddLoopAlternative(GuardedAlternative alt)3478 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3479 ASSERT_EQ(loop_node_, NULL);
3480 AddAlternative(alt);
3481 loop_node_ = alt.node();
3482 }
3483
3484
AddContinueAlternative(GuardedAlternative alt)3485 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3486 ASSERT_EQ(continue_node_, NULL);
3487 AddAlternative(alt);
3488 continue_node_ = alt.node();
3489 }
3490
3491
Emit(RegExpCompiler * compiler,Trace * trace)3492 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3493 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3494 if (trace->stop_node() == this) {
3495 int text_length =
3496 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3497 ASSERT(text_length != kNodeIsTooComplexForGreedyLoops);
3498 // Update the counter-based backtracking info on the stack. This is an
3499 // optimization for greedy loops (see below).
3500 ASSERT(trace->cp_offset() == text_length);
3501 macro_assembler->AdvanceCurrentPosition(text_length);
3502 macro_assembler->GoTo(trace->loop_label());
3503 return;
3504 }
3505 ASSERT(trace->stop_node() == NULL);
3506 if (!trace->is_trivial()) {
3507 trace->Flush(compiler, this);
3508 return;
3509 }
3510 ChoiceNode::Emit(compiler, trace);
3511 }
3512
3513
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)3514 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3515 int eats_at_least) {
3516 int preload_characters = Min(4, eats_at_least);
3517 if (compiler->macro_assembler()->CanReadUnaligned()) {
3518 bool ascii = compiler->ascii();
3519 if (ascii) {
3520 if (preload_characters > 4) preload_characters = 4;
3521 // We can't preload 3 characters because there is no machine instruction
3522 // to do that. We can't just load 4 because we could be reading
3523 // beyond the end of the string, which could cause a memory fault.
3524 if (preload_characters == 3) preload_characters = 2;
3525 } else {
3526 if (preload_characters > 2) preload_characters = 2;
3527 }
3528 } else {
3529 if (preload_characters > 1) preload_characters = 1;
3530 }
3531 return preload_characters;
3532 }
3533
3534
3535 // This class is used when generating the alternatives in a choice node. It
3536 // records the way the alternative is being code generated.
3537 class AlternativeGeneration: public Malloced {
3538 public:
AlternativeGeneration()3539 AlternativeGeneration()
3540 : possible_success(),
3541 expects_preload(false),
3542 after(),
3543 quick_check_details() { }
3544 Label possible_success;
3545 bool expects_preload;
3546 Label after;
3547 QuickCheckDetails quick_check_details;
3548 };
3549
3550
3551 // Creates a list of AlternativeGenerations. If the list has a reasonable
3552 // size then it is on the stack, otherwise the excess is on the heap.
3553 class AlternativeGenerationList {
3554 public:
AlternativeGenerationList(int count,Zone * zone)3555 AlternativeGenerationList(int count, Zone* zone)
3556 : alt_gens_(count, zone) {
3557 for (int i = 0; i < count && i < kAFew; i++) {
3558 alt_gens_.Add(a_few_alt_gens_ + i, zone);
3559 }
3560 for (int i = kAFew; i < count; i++) {
3561 alt_gens_.Add(new AlternativeGeneration(), zone);
3562 }
3563 }
~AlternativeGenerationList()3564 ~AlternativeGenerationList() {
3565 for (int i = kAFew; i < alt_gens_.length(); i++) {
3566 delete alt_gens_[i];
3567 alt_gens_[i] = NULL;
3568 }
3569 }
3570
at(int i)3571 AlternativeGeneration* at(int i) {
3572 return alt_gens_[i];
3573 }
3574
3575 private:
3576 static const int kAFew = 10;
3577 ZoneList<AlternativeGeneration*> alt_gens_;
3578 AlternativeGeneration a_few_alt_gens_[kAFew];
3579 };
3580
3581
3582 // The '2' variant is has inclusive from and exclusive to.
3583 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3584 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3585 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
3586 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
3587 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
3588 0xFEFF, 0xFF00, 0x10000 };
3589 static const int kSpaceRangeCount = ARRAY_SIZE(kSpaceRanges);
3590
3591 static const int kWordRanges[] = {
3592 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3593 static const int kWordRangeCount = ARRAY_SIZE(kWordRanges);
3594 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
3595 static const int kDigitRangeCount = ARRAY_SIZE(kDigitRanges);
3596 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
3597 static const int kSurrogateRangeCount = ARRAY_SIZE(kSurrogateRanges);
3598 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3599 0x2028, 0x202A, 0x10000 };
3600 static const int kLineTerminatorRangeCount = ARRAY_SIZE(kLineTerminatorRanges);
3601
3602
Set(int character)3603 void BoyerMoorePositionInfo::Set(int character) {
3604 SetInterval(Interval(character, character));
3605 }
3606
3607
SetInterval(const Interval & interval)3608 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3609 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3610 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3611 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3612 surrogate_ =
3613 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3614 if (interval.to() - interval.from() >= kMapSize - 1) {
3615 if (map_count_ != kMapSize) {
3616 map_count_ = kMapSize;
3617 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3618 }
3619 return;
3620 }
3621 for (int i = interval.from(); i <= interval.to(); i++) {
3622 int mod_character = (i & kMask);
3623 if (!map_->at(mod_character)) {
3624 map_count_++;
3625 map_->at(mod_character) = true;
3626 }
3627 if (map_count_ == kMapSize) return;
3628 }
3629 }
3630
3631
SetAll()3632 void BoyerMoorePositionInfo::SetAll() {
3633 s_ = w_ = d_ = kLatticeUnknown;
3634 if (map_count_ != kMapSize) {
3635 map_count_ = kMapSize;
3636 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3637 }
3638 }
3639
3640
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)3641 BoyerMooreLookahead::BoyerMooreLookahead(
3642 int length, RegExpCompiler* compiler, Zone* zone)
3643 : length_(length),
3644 compiler_(compiler) {
3645 if (compiler->ascii()) {
3646 max_char_ = String::kMaxOneByteCharCode;
3647 } else {
3648 max_char_ = String::kMaxUtf16CodeUnit;
3649 }
3650 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3651 for (int i = 0; i < length; i++) {
3652 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3653 }
3654 }
3655
3656
3657 // Find the longest range of lookahead that has the fewest number of different
3658 // characters that can occur at a given position. Since we are optimizing two
3659 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)3660 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3661 int biggest_points = 0;
3662 // If more than 32 characters out of 128 can occur it is unlikely that we can
3663 // be lucky enough to step forwards much of the time.
3664 const int kMaxMax = 32;
3665 for (int max_number_of_chars = 4;
3666 max_number_of_chars < kMaxMax;
3667 max_number_of_chars *= 2) {
3668 biggest_points =
3669 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3670 }
3671 if (biggest_points == 0) return false;
3672 return true;
3673 }
3674
3675
3676 // Find the highest-points range between 0 and length_ where the character
3677 // information is not too vague. 'Too vague' means that there are more than
3678 // max_number_of_chars that can occur at this position. Calculates the number
3679 // of points as the product of width-of-the-range and
3680 // probability-of-finding-one-of-the-characters, where the probability is
3681 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)3682 int BoyerMooreLookahead::FindBestInterval(
3683 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3684 int biggest_points = old_biggest_points;
3685 static const int kSize = RegExpMacroAssembler::kTableSize;
3686 for (int i = 0; i < length_; ) {
3687 while (i < length_ && Count(i) > max_number_of_chars) i++;
3688 if (i == length_) break;
3689 int remembered_from = i;
3690 bool union_map[kSize];
3691 for (int j = 0; j < kSize; j++) union_map[j] = false;
3692 while (i < length_ && Count(i) <= max_number_of_chars) {
3693 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3694 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3695 i++;
3696 }
3697 int frequency = 0;
3698 for (int j = 0; j < kSize; j++) {
3699 if (union_map[j]) {
3700 // Add 1 to the frequency to give a small per-character boost for
3701 // the cases where our sampling is not good enough and many
3702 // characters have a frequency of zero. This means the frequency
3703 // can theoretically be up to 2*kSize though we treat it mostly as
3704 // a fraction of kSize.
3705 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3706 }
3707 }
3708 // We use the probability of skipping times the distance we are skipping to
3709 // judge the effectiveness of this. Actually we have a cut-off: By
3710 // dividing by 2 we switch off the skipping if the probability of skipping
3711 // is less than 50%. This is because the multibyte mask-and-compare
3712 // skipping in quickcheck is more likely to do well on this case.
3713 bool in_quickcheck_range = ((i - remembered_from < 4) ||
3714 (compiler_->ascii() ? remembered_from <= 4 : remembered_from <= 2));
3715 // Called 'probability' but it is only a rough estimate and can actually
3716 // be outside the 0-kSize range.
3717 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3718 int points = (i - remembered_from) * probability;
3719 if (points > biggest_points) {
3720 *from = remembered_from;
3721 *to = i - 1;
3722 biggest_points = points;
3723 }
3724 }
3725 return biggest_points;
3726 }
3727
3728
3729 // Take all the characters that will not prevent a successful match if they
3730 // occur in the subject string in the range between min_lookahead and
3731 // max_lookahead (inclusive) measured from the current position. If the
3732 // character at max_lookahead offset is not one of these characters, then we
3733 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)3734 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3735 int max_lookahead,
3736 Handle<ByteArray> boolean_skip_table) {
3737 const int kSize = RegExpMacroAssembler::kTableSize;
3738
3739 const int kSkipArrayEntry = 0;
3740 const int kDontSkipArrayEntry = 1;
3741
3742 for (int i = 0; i < kSize; i++) {
3743 boolean_skip_table->set(i, kSkipArrayEntry);
3744 }
3745 int skip = max_lookahead + 1 - min_lookahead;
3746
3747 for (int i = max_lookahead; i >= min_lookahead; i--) {
3748 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3749 for (int j = 0; j < kSize; j++) {
3750 if (map->at(j)) {
3751 boolean_skip_table->set(j, kDontSkipArrayEntry);
3752 }
3753 }
3754 }
3755
3756 return skip;
3757 }
3758
3759
3760 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)3761 bool BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3762 const int kSize = RegExpMacroAssembler::kTableSize;
3763
3764 int min_lookahead = 0;
3765 int max_lookahead = 0;
3766
3767 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return false;
3768
3769 bool found_single_character = false;
3770 int single_character = 0;
3771 for (int i = max_lookahead; i >= min_lookahead; i--) {
3772 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3773 if (map->map_count() > 1 ||
3774 (found_single_character && map->map_count() != 0)) {
3775 found_single_character = false;
3776 break;
3777 }
3778 for (int j = 0; j < kSize; j++) {
3779 if (map->at(j)) {
3780 found_single_character = true;
3781 single_character = j;
3782 break;
3783 }
3784 }
3785 }
3786
3787 int lookahead_width = max_lookahead + 1 - min_lookahead;
3788
3789 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3790 // The mask-compare can probably handle this better.
3791 return false;
3792 }
3793
3794 if (found_single_character) {
3795 Label cont, again;
3796 masm->Bind(&again);
3797 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3798 if (max_char_ > kSize) {
3799 masm->CheckCharacterAfterAnd(single_character,
3800 RegExpMacroAssembler::kTableMask,
3801 &cont);
3802 } else {
3803 masm->CheckCharacter(single_character, &cont);
3804 }
3805 masm->AdvanceCurrentPosition(lookahead_width);
3806 masm->GoTo(&again);
3807 masm->Bind(&cont);
3808 return true;
3809 }
3810
3811 Factory* factory = masm->zone()->isolate()->factory();
3812 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3813 int skip_distance = GetSkipTable(
3814 min_lookahead, max_lookahead, boolean_skip_table);
3815 ASSERT(skip_distance != 0);
3816
3817 Label cont, again;
3818 masm->Bind(&again);
3819 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3820 masm->CheckBitInTable(boolean_skip_table, &cont);
3821 masm->AdvanceCurrentPosition(skip_distance);
3822 masm->GoTo(&again);
3823 masm->Bind(&cont);
3824
3825 return true;
3826 }
3827
3828
3829 /* Code generation for choice nodes.
3830 *
3831 * We generate quick checks that do a mask and compare to eliminate a
3832 * choice. If the quick check succeeds then it jumps to the continuation to
3833 * do slow checks and check subsequent nodes. If it fails (the common case)
3834 * it falls through to the next choice.
3835 *
3836 * Here is the desired flow graph. Nodes directly below each other imply
3837 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3838 * 3 doesn't have a quick check so we have to call the slow check.
3839 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3840 * regexp continuation is generated directly after the Sn node, up to the
3841 * next GoTo if we decide to reuse some already generated code. Some
3842 * nodes expect preload_characters to be preloaded into the current
3843 * character register. R nodes do this preloading. Vertices are marked
3844 * F for failures and S for success (possible success in the case of quick
3845 * nodes). L, V, < and > are used as arrow heads.
3846 *
3847 * ----------> R
3848 * |
3849 * V
3850 * Q1 -----> S1
3851 * | S /
3852 * F| /
3853 * | F/
3854 * | /
3855 * | R
3856 * | /
3857 * V L
3858 * Q2 -----> S2
3859 * | S /
3860 * F| /
3861 * | F/
3862 * | /
3863 * | R
3864 * | /
3865 * V L
3866 * S3
3867 * |
3868 * F|
3869 * |
3870 * R
3871 * |
3872 * backtrack V
3873 * <----------Q4
3874 * \ F |
3875 * \ |S
3876 * \ F V
3877 * \-----S4
3878 *
3879 * For greedy loops we reverse our expectation and expect to match rather
3880 * than fail. Therefore we want the loop code to look like this (U is the
3881 * unwind code that steps back in the greedy loop). The following alternatives
3882 * look the same as above.
3883 * _____
3884 * / \
3885 * V |
3886 * ----------> S1 |
3887 * /| |
3888 * / |S |
3889 * F/ \_____/
3890 * /
3891 * |<-----------
3892 * | \
3893 * V \
3894 * Q2 ---> S2 \
3895 * | S / |
3896 * F| / |
3897 * | F/ |
3898 * | / |
3899 * | R |
3900 * | / |
3901 * F VL |
3902 * <------U |
3903 * back |S |
3904 * \______________/
3905 */
3906
Emit(RegExpCompiler * compiler,Trace * trace)3907 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3908 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3909 int choice_count = alternatives_->length();
3910 #ifdef DEBUG
3911 for (int i = 0; i < choice_count - 1; i++) {
3912 GuardedAlternative alternative = alternatives_->at(i);
3913 ZoneList<Guard*>* guards = alternative.guards();
3914 int guard_count = (guards == NULL) ? 0 : guards->length();
3915 for (int j = 0; j < guard_count; j++) {
3916 ASSERT(!trace->mentions_reg(guards->at(j)->reg()));
3917 }
3918 }
3919 #endif
3920
3921 LimitResult limit_result = LimitVersions(compiler, trace);
3922 if (limit_result == DONE) return;
3923 ASSERT(limit_result == CONTINUE);
3924
3925 int new_flush_budget = trace->flush_budget() / choice_count;
3926 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3927 trace->Flush(compiler, this);
3928 return;
3929 }
3930
3931 RecursionCheck rc(compiler);
3932
3933 Trace* current_trace = trace;
3934
3935 int text_length = GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3936 bool greedy_loop = false;
3937 Label greedy_loop_label;
3938 Trace counter_backtrack_trace;
3939 counter_backtrack_trace.set_backtrack(&greedy_loop_label);
3940 if (not_at_start()) counter_backtrack_trace.set_at_start(false);
3941
3942 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3943 // Here we have special handling for greedy loops containing only text nodes
3944 // and other simple nodes. These are handled by pushing the current
3945 // position on the stack and then incrementing the current position each
3946 // time around the switch. On backtrack we decrement the current position
3947 // and check it against the pushed value. This avoids pushing backtrack
3948 // information for each iteration of the loop, which could take up a lot of
3949 // space.
3950 greedy_loop = true;
3951 ASSERT(trace->stop_node() == NULL);
3952 macro_assembler->PushCurrentPosition();
3953 current_trace = &counter_backtrack_trace;
3954 Label greedy_match_failed;
3955 Trace greedy_match_trace;
3956 if (not_at_start()) greedy_match_trace.set_at_start(false);
3957 greedy_match_trace.set_backtrack(&greedy_match_failed);
3958 Label loop_label;
3959 macro_assembler->Bind(&loop_label);
3960 greedy_match_trace.set_stop_node(this);
3961 greedy_match_trace.set_loop_label(&loop_label);
3962 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3963 macro_assembler->Bind(&greedy_match_failed);
3964 }
3965
3966 Label second_choice; // For use in greedy matches.
3967 macro_assembler->Bind(&second_choice);
3968
3969 int first_normal_choice = greedy_loop ? 1 : 0;
3970
3971 bool not_at_start = current_trace->at_start() == Trace::FALSE_VALUE;
3972 const int kEatsAtLeastNotYetInitialized = -1;
3973 int eats_at_least = kEatsAtLeastNotYetInitialized;
3974
3975 bool skip_was_emitted = false;
3976
3977 if (!greedy_loop && choice_count == 2) {
3978 GuardedAlternative alt1 = alternatives_->at(1);
3979 if (alt1.guards() == NULL || alt1.guards()->length() == 0) {
3980 RegExpNode* eats_anything_node = alt1.node();
3981 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) ==
3982 this) {
3983 // At this point we know that we are at a non-greedy loop that will eat
3984 // any character one at a time. Any non-anchored regexp has such a
3985 // loop prepended to it in order to find where it starts. We look for
3986 // a pattern of the form ...abc... where we can look 6 characters ahead
3987 // and step forwards 3 if the character is not one of abc. Abc need
3988 // not be atoms, they can be any reasonably limited character class or
3989 // small alternation.
3990 ASSERT(trace->is_trivial()); // This is the case on LoopChoiceNodes.
3991 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3992 if (lookahead == NULL) {
3993 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
3994 EatsAtLeast(kMaxLookaheadForBoyerMoore,
3995 kRecursionBudget,
3996 not_at_start));
3997 if (eats_at_least >= 1) {
3998 BoyerMooreLookahead* bm =
3999 new(zone()) BoyerMooreLookahead(eats_at_least,
4000 compiler,
4001 zone());
4002 GuardedAlternative alt0 = alternatives_->at(0);
4003 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
4004 skip_was_emitted = bm->EmitSkipInstructions(macro_assembler);
4005 }
4006 } else {
4007 skip_was_emitted = lookahead->EmitSkipInstructions(macro_assembler);
4008 }
4009 }
4010 }
4011 }
4012
4013 if (eats_at_least == kEatsAtLeastNotYetInitialized) {
4014 // Save some time by looking at most one machine word ahead.
4015 eats_at_least =
4016 EatsAtLeast(compiler->ascii() ? 4 : 2, kRecursionBudget, not_at_start);
4017 }
4018 int preload_characters = CalculatePreloadCharacters(compiler, eats_at_least);
4019
4020 bool preload_is_current = !skip_was_emitted &&
4021 (current_trace->characters_preloaded() == preload_characters);
4022 bool preload_has_checked_bounds = preload_is_current;
4023
4024 AlternativeGenerationList alt_gens(choice_count, zone());
4025
4026 // For now we just call all choices one after the other. The idea ultimately
4027 // is to use the Dispatch table to try only the relevant ones.
4028 for (int i = first_normal_choice; i < choice_count; i++) {
4029 GuardedAlternative alternative = alternatives_->at(i);
4030 AlternativeGeneration* alt_gen = alt_gens.at(i);
4031 alt_gen->quick_check_details.set_characters(preload_characters);
4032 ZoneList<Guard*>* guards = alternative.guards();
4033 int guard_count = (guards == NULL) ? 0 : guards->length();
4034 Trace new_trace(*current_trace);
4035 new_trace.set_characters_preloaded(preload_is_current ?
4036 preload_characters :
4037 0);
4038 if (preload_has_checked_bounds) {
4039 new_trace.set_bound_checked_up_to(preload_characters);
4040 }
4041 new_trace.quick_check_performed()->Clear();
4042 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4043 alt_gen->expects_preload = preload_is_current;
4044 bool generate_full_check_inline = false;
4045 if (FLAG_regexp_optimization &&
4046 try_to_emit_quick_check_for_alternative(i) &&
4047 alternative.node()->EmitQuickCheck(compiler,
4048 &new_trace,
4049 preload_has_checked_bounds,
4050 &alt_gen->possible_success,
4051 &alt_gen->quick_check_details,
4052 i < choice_count - 1)) {
4053 // Quick check was generated for this choice.
4054 preload_is_current = true;
4055 preload_has_checked_bounds = true;
4056 // On the last choice in the ChoiceNode we generated the quick
4057 // check to fall through on possible success. So now we need to
4058 // generate the full check inline.
4059 if (i == choice_count - 1) {
4060 macro_assembler->Bind(&alt_gen->possible_success);
4061 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4062 new_trace.set_characters_preloaded(preload_characters);
4063 new_trace.set_bound_checked_up_to(preload_characters);
4064 generate_full_check_inline = true;
4065 }
4066 } else if (alt_gen->quick_check_details.cannot_match()) {
4067 if (i == choice_count - 1 && !greedy_loop) {
4068 macro_assembler->GoTo(trace->backtrack());
4069 }
4070 continue;
4071 } else {
4072 // No quick check was generated. Put the full code here.
4073 // If this is not the first choice then there could be slow checks from
4074 // previous cases that go here when they fail. There's no reason to
4075 // insist that they preload characters since the slow check we are about
4076 // to generate probably can't use it.
4077 if (i != first_normal_choice) {
4078 alt_gen->expects_preload = false;
4079 new_trace.InvalidateCurrentCharacter();
4080 }
4081 if (i < choice_count - 1) {
4082 new_trace.set_backtrack(&alt_gen->after);
4083 }
4084 generate_full_check_inline = true;
4085 }
4086 if (generate_full_check_inline) {
4087 if (new_trace.actions() != NULL) {
4088 new_trace.set_flush_budget(new_flush_budget);
4089 }
4090 for (int j = 0; j < guard_count; j++) {
4091 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4092 }
4093 alternative.node()->Emit(compiler, &new_trace);
4094 preload_is_current = false;
4095 }
4096 macro_assembler->Bind(&alt_gen->after);
4097 }
4098 if (greedy_loop) {
4099 macro_assembler->Bind(&greedy_loop_label);
4100 // If we have unwound to the bottom then backtrack.
4101 macro_assembler->CheckGreedyLoop(trace->backtrack());
4102 // Otherwise try the second priority at an earlier position.
4103 macro_assembler->AdvanceCurrentPosition(-text_length);
4104 macro_assembler->GoTo(&second_choice);
4105 }
4106
4107 // At this point we need to generate slow checks for the alternatives where
4108 // the quick check was inlined. We can recognize these because the associated
4109 // label was bound.
4110 for (int i = first_normal_choice; i < choice_count - 1; i++) {
4111 AlternativeGeneration* alt_gen = alt_gens.at(i);
4112 Trace new_trace(*current_trace);
4113 // If there are actions to be flushed we have to limit how many times
4114 // they are flushed. Take the budget of the parent trace and distribute
4115 // it fairly amongst the children.
4116 if (new_trace.actions() != NULL) {
4117 new_trace.set_flush_budget(new_flush_budget);
4118 }
4119 EmitOutOfLineContinuation(compiler,
4120 &new_trace,
4121 alternatives_->at(i),
4122 alt_gen,
4123 preload_characters,
4124 alt_gens.at(i + 1)->expects_preload);
4125 }
4126 }
4127
4128
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)4129 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4130 Trace* trace,
4131 GuardedAlternative alternative,
4132 AlternativeGeneration* alt_gen,
4133 int preload_characters,
4134 bool next_expects_preload) {
4135 if (!alt_gen->possible_success.is_linked()) return;
4136
4137 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4138 macro_assembler->Bind(&alt_gen->possible_success);
4139 Trace out_of_line_trace(*trace);
4140 out_of_line_trace.set_characters_preloaded(preload_characters);
4141 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4142 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4143 ZoneList<Guard*>* guards = alternative.guards();
4144 int guard_count = (guards == NULL) ? 0 : guards->length();
4145 if (next_expects_preload) {
4146 Label reload_current_char;
4147 out_of_line_trace.set_backtrack(&reload_current_char);
4148 for (int j = 0; j < guard_count; j++) {
4149 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4150 }
4151 alternative.node()->Emit(compiler, &out_of_line_trace);
4152 macro_assembler->Bind(&reload_current_char);
4153 // Reload the current character, since the next quick check expects that.
4154 // We don't need to check bounds here because we only get into this
4155 // code through a quick check which already did the checked load.
4156 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4157 NULL,
4158 false,
4159 preload_characters);
4160 macro_assembler->GoTo(&(alt_gen->after));
4161 } else {
4162 out_of_line_trace.set_backtrack(&(alt_gen->after));
4163 for (int j = 0; j < guard_count; j++) {
4164 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4165 }
4166 alternative.node()->Emit(compiler, &out_of_line_trace);
4167 }
4168 }
4169
4170
Emit(RegExpCompiler * compiler,Trace * trace)4171 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4172 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4173 LimitResult limit_result = LimitVersions(compiler, trace);
4174 if (limit_result == DONE) return;
4175 ASSERT(limit_result == CONTINUE);
4176
4177 RecursionCheck rc(compiler);
4178
4179 switch (action_type_) {
4180 case STORE_POSITION: {
4181 Trace::DeferredCapture
4182 new_capture(data_.u_position_register.reg,
4183 data_.u_position_register.is_capture,
4184 trace);
4185 Trace new_trace = *trace;
4186 new_trace.add_action(&new_capture);
4187 on_success()->Emit(compiler, &new_trace);
4188 break;
4189 }
4190 case INCREMENT_REGISTER: {
4191 Trace::DeferredIncrementRegister
4192 new_increment(data_.u_increment_register.reg);
4193 Trace new_trace = *trace;
4194 new_trace.add_action(&new_increment);
4195 on_success()->Emit(compiler, &new_trace);
4196 break;
4197 }
4198 case SET_REGISTER: {
4199 Trace::DeferredSetRegister
4200 new_set(data_.u_store_register.reg, data_.u_store_register.value);
4201 Trace new_trace = *trace;
4202 new_trace.add_action(&new_set);
4203 on_success()->Emit(compiler, &new_trace);
4204 break;
4205 }
4206 case CLEAR_CAPTURES: {
4207 Trace::DeferredClearCaptures
4208 new_capture(Interval(data_.u_clear_captures.range_from,
4209 data_.u_clear_captures.range_to));
4210 Trace new_trace = *trace;
4211 new_trace.add_action(&new_capture);
4212 on_success()->Emit(compiler, &new_trace);
4213 break;
4214 }
4215 case BEGIN_SUBMATCH:
4216 if (!trace->is_trivial()) {
4217 trace->Flush(compiler, this);
4218 } else {
4219 assembler->WriteCurrentPositionToRegister(
4220 data_.u_submatch.current_position_register, 0);
4221 assembler->WriteStackPointerToRegister(
4222 data_.u_submatch.stack_pointer_register);
4223 on_success()->Emit(compiler, trace);
4224 }
4225 break;
4226 case EMPTY_MATCH_CHECK: {
4227 int start_pos_reg = data_.u_empty_match_check.start_register;
4228 int stored_pos = 0;
4229 int rep_reg = data_.u_empty_match_check.repetition_register;
4230 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4231 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4232 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4233 // If we know we haven't advanced and there is no minimum we
4234 // can just backtrack immediately.
4235 assembler->GoTo(trace->backtrack());
4236 } else if (know_dist && stored_pos < trace->cp_offset()) {
4237 // If we know we've advanced we can generate the continuation
4238 // immediately.
4239 on_success()->Emit(compiler, trace);
4240 } else if (!trace->is_trivial()) {
4241 trace->Flush(compiler, this);
4242 } else {
4243 Label skip_empty_check;
4244 // If we have a minimum number of repetitions we check the current
4245 // number first and skip the empty check if it's not enough.
4246 if (has_minimum) {
4247 int limit = data_.u_empty_match_check.repetition_limit;
4248 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4249 }
4250 // If the match is empty we bail out, otherwise we fall through
4251 // to the on-success continuation.
4252 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4253 trace->backtrack());
4254 assembler->Bind(&skip_empty_check);
4255 on_success()->Emit(compiler, trace);
4256 }
4257 break;
4258 }
4259 case POSITIVE_SUBMATCH_SUCCESS: {
4260 if (!trace->is_trivial()) {
4261 trace->Flush(compiler, this);
4262 return;
4263 }
4264 assembler->ReadCurrentPositionFromRegister(
4265 data_.u_submatch.current_position_register);
4266 assembler->ReadStackPointerFromRegister(
4267 data_.u_submatch.stack_pointer_register);
4268 int clear_register_count = data_.u_submatch.clear_register_count;
4269 if (clear_register_count == 0) {
4270 on_success()->Emit(compiler, trace);
4271 return;
4272 }
4273 int clear_registers_from = data_.u_submatch.clear_register_from;
4274 Label clear_registers_backtrack;
4275 Trace new_trace = *trace;
4276 new_trace.set_backtrack(&clear_registers_backtrack);
4277 on_success()->Emit(compiler, &new_trace);
4278
4279 assembler->Bind(&clear_registers_backtrack);
4280 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4281 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4282
4283 ASSERT(trace->backtrack() == NULL);
4284 assembler->Backtrack();
4285 return;
4286 }
4287 default:
4288 UNREACHABLE();
4289 }
4290 }
4291
4292
Emit(RegExpCompiler * compiler,Trace * trace)4293 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4294 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4295 if (!trace->is_trivial()) {
4296 trace->Flush(compiler, this);
4297 return;
4298 }
4299
4300 LimitResult limit_result = LimitVersions(compiler, trace);
4301 if (limit_result == DONE) return;
4302 ASSERT(limit_result == CONTINUE);
4303
4304 RecursionCheck rc(compiler);
4305
4306 ASSERT_EQ(start_reg_ + 1, end_reg_);
4307 if (compiler->ignore_case()) {
4308 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
4309 trace->backtrack());
4310 } else {
4311 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
4312 }
4313 on_success()->Emit(compiler, trace);
4314 }
4315
4316
4317 // -------------------------------------------------------------------
4318 // Dot/dotty output
4319
4320
4321 #ifdef DEBUG
4322
4323
4324 class DotPrinter: public NodeVisitor {
4325 public:
DotPrinter(bool ignore_case)4326 explicit DotPrinter(bool ignore_case)
4327 : ignore_case_(ignore_case),
4328 stream_(&alloc_) { }
4329 void PrintNode(const char* label, RegExpNode* node);
4330 void Visit(RegExpNode* node);
4331 void PrintAttributes(RegExpNode* from);
stream()4332 StringStream* stream() { return &stream_; }
4333 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4334 #define DECLARE_VISIT(Type) \
4335 virtual void Visit##Type(Type##Node* that);
4336 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4337 #undef DECLARE_VISIT
4338 private:
4339 bool ignore_case_;
4340 HeapStringAllocator alloc_;
4341 StringStream stream_;
4342 };
4343
4344
PrintNode(const char * label,RegExpNode * node)4345 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4346 stream()->Add("digraph G {\n graph [label=\"");
4347 for (int i = 0; label[i]; i++) {
4348 switch (label[i]) {
4349 case '\\':
4350 stream()->Add("\\\\");
4351 break;
4352 case '"':
4353 stream()->Add("\"");
4354 break;
4355 default:
4356 stream()->Put(label[i]);
4357 break;
4358 }
4359 }
4360 stream()->Add("\"];\n");
4361 Visit(node);
4362 stream()->Add("}\n");
4363 printf("%s", stream()->ToCString().get());
4364 }
4365
4366
Visit(RegExpNode * node)4367 void DotPrinter::Visit(RegExpNode* node) {
4368 if (node->info()->visited) return;
4369 node->info()->visited = true;
4370 node->Accept(this);
4371 }
4372
4373
PrintOnFailure(RegExpNode * from,RegExpNode * on_failure)4374 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4375 stream()->Add(" n%p -> n%p [style=dotted];\n", from, on_failure);
4376 Visit(on_failure);
4377 }
4378
4379
4380 class TableEntryBodyPrinter {
4381 public:
TableEntryBodyPrinter(StringStream * stream,ChoiceNode * choice)4382 TableEntryBodyPrinter(StringStream* stream, ChoiceNode* choice)
4383 : stream_(stream), choice_(choice) { }
Call(uc16 from,DispatchTable::Entry entry)4384 void Call(uc16 from, DispatchTable::Entry entry) {
4385 OutSet* out_set = entry.out_set();
4386 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4387 if (out_set->Get(i)) {
4388 stream()->Add(" n%p:s%io%i -> n%p;\n",
4389 choice(),
4390 from,
4391 i,
4392 choice()->alternatives()->at(i).node());
4393 }
4394 }
4395 }
4396 private:
stream()4397 StringStream* stream() { return stream_; }
choice()4398 ChoiceNode* choice() { return choice_; }
4399 StringStream* stream_;
4400 ChoiceNode* choice_;
4401 };
4402
4403
4404 class TableEntryHeaderPrinter {
4405 public:
TableEntryHeaderPrinter(StringStream * stream)4406 explicit TableEntryHeaderPrinter(StringStream* stream)
4407 : first_(true), stream_(stream) { }
Call(uc16 from,DispatchTable::Entry entry)4408 void Call(uc16 from, DispatchTable::Entry entry) {
4409 if (first_) {
4410 first_ = false;
4411 } else {
4412 stream()->Add("|");
4413 }
4414 stream()->Add("{\\%k-\\%k|{", from, entry.to());
4415 OutSet* out_set = entry.out_set();
4416 int priority = 0;
4417 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4418 if (out_set->Get(i)) {
4419 if (priority > 0) stream()->Add("|");
4420 stream()->Add("<s%io%i> %i", from, i, priority);
4421 priority++;
4422 }
4423 }
4424 stream()->Add("}}");
4425 }
4426
4427 private:
4428 bool first_;
stream()4429 StringStream* stream() { return stream_; }
4430 StringStream* stream_;
4431 };
4432
4433
4434 class AttributePrinter {
4435 public:
AttributePrinter(DotPrinter * out)4436 explicit AttributePrinter(DotPrinter* out)
4437 : out_(out), first_(true) { }
PrintSeparator()4438 void PrintSeparator() {
4439 if (first_) {
4440 first_ = false;
4441 } else {
4442 out_->stream()->Add("|");
4443 }
4444 }
PrintBit(const char * name,bool value)4445 void PrintBit(const char* name, bool value) {
4446 if (!value) return;
4447 PrintSeparator();
4448 out_->stream()->Add("{%s}", name);
4449 }
PrintPositive(const char * name,int value)4450 void PrintPositive(const char* name, int value) {
4451 if (value < 0) return;
4452 PrintSeparator();
4453 out_->stream()->Add("{%s|%x}", name, value);
4454 }
4455 private:
4456 DotPrinter* out_;
4457 bool first_;
4458 };
4459
4460
PrintAttributes(RegExpNode * that)4461 void DotPrinter::PrintAttributes(RegExpNode* that) {
4462 stream()->Add(" a%p [shape=Mrecord, color=grey, fontcolor=grey, "
4463 "margin=0.1, fontsize=10, label=\"{",
4464 that);
4465 AttributePrinter printer(this);
4466 NodeInfo* info = that->info();
4467 printer.PrintBit("NI", info->follows_newline_interest);
4468 printer.PrintBit("WI", info->follows_word_interest);
4469 printer.PrintBit("SI", info->follows_start_interest);
4470 Label* label = that->label();
4471 if (label->is_bound())
4472 printer.PrintPositive("@", label->pos());
4473 stream()->Add("}\"];\n");
4474 stream()->Add(" a%p -> n%p [style=dashed, color=grey, "
4475 "arrowhead=none];\n", that, that);
4476 }
4477
4478
4479 static const bool kPrintDispatchTable = false;
VisitChoice(ChoiceNode * that)4480 void DotPrinter::VisitChoice(ChoiceNode* that) {
4481 if (kPrintDispatchTable) {
4482 stream()->Add(" n%p [shape=Mrecord, label=\"", that);
4483 TableEntryHeaderPrinter header_printer(stream());
4484 that->GetTable(ignore_case_)->ForEach(&header_printer);
4485 stream()->Add("\"]\n", that);
4486 PrintAttributes(that);
4487 TableEntryBodyPrinter body_printer(stream(), that);
4488 that->GetTable(ignore_case_)->ForEach(&body_printer);
4489 } else {
4490 stream()->Add(" n%p [shape=Mrecord, label=\"?\"];\n", that);
4491 for (int i = 0; i < that->alternatives()->length(); i++) {
4492 GuardedAlternative alt = that->alternatives()->at(i);
4493 stream()->Add(" n%p -> n%p;\n", that, alt.node());
4494 }
4495 }
4496 for (int i = 0; i < that->alternatives()->length(); i++) {
4497 GuardedAlternative alt = that->alternatives()->at(i);
4498 alt.node()->Accept(this);
4499 }
4500 }
4501
4502
VisitText(TextNode * that)4503 void DotPrinter::VisitText(TextNode* that) {
4504 Zone* zone = that->zone();
4505 stream()->Add(" n%p [label=\"", that);
4506 for (int i = 0; i < that->elements()->length(); i++) {
4507 if (i > 0) stream()->Add(" ");
4508 TextElement elm = that->elements()->at(i);
4509 switch (elm.text_type()) {
4510 case TextElement::ATOM: {
4511 stream()->Add("'%w'", elm.atom()->data());
4512 break;
4513 }
4514 case TextElement::CHAR_CLASS: {
4515 RegExpCharacterClass* node = elm.char_class();
4516 stream()->Add("[");
4517 if (node->is_negated())
4518 stream()->Add("^");
4519 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4520 CharacterRange range = node->ranges(zone)->at(j);
4521 stream()->Add("%k-%k", range.from(), range.to());
4522 }
4523 stream()->Add("]");
4524 break;
4525 }
4526 default:
4527 UNREACHABLE();
4528 }
4529 }
4530 stream()->Add("\", shape=box, peripheries=2];\n");
4531 PrintAttributes(that);
4532 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
4533 Visit(that->on_success());
4534 }
4535
4536
VisitBackReference(BackReferenceNode * that)4537 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4538 stream()->Add(" n%p [label=\"$%i..$%i\", shape=doubleoctagon];\n",
4539 that,
4540 that->start_register(),
4541 that->end_register());
4542 PrintAttributes(that);
4543 stream()->Add(" n%p -> n%p;\n", that, that->on_success());
4544 Visit(that->on_success());
4545 }
4546
4547
VisitEnd(EndNode * that)4548 void DotPrinter::VisitEnd(EndNode* that) {
4549 stream()->Add(" n%p [style=bold, shape=point];\n", that);
4550 PrintAttributes(that);
4551 }
4552
4553
VisitAssertion(AssertionNode * that)4554 void DotPrinter::VisitAssertion(AssertionNode* that) {
4555 stream()->Add(" n%p [", that);
4556 switch (that->assertion_type()) {
4557 case AssertionNode::AT_END:
4558 stream()->Add("label=\"$\", shape=septagon");
4559 break;
4560 case AssertionNode::AT_START:
4561 stream()->Add("label=\"^\", shape=septagon");
4562 break;
4563 case AssertionNode::AT_BOUNDARY:
4564 stream()->Add("label=\"\\b\", shape=septagon");
4565 break;
4566 case AssertionNode::AT_NON_BOUNDARY:
4567 stream()->Add("label=\"\\B\", shape=septagon");
4568 break;
4569 case AssertionNode::AFTER_NEWLINE:
4570 stream()->Add("label=\"(?<=\\n)\", shape=septagon");
4571 break;
4572 }
4573 stream()->Add("];\n");
4574 PrintAttributes(that);
4575 RegExpNode* successor = that->on_success();
4576 stream()->Add(" n%p -> n%p;\n", that, successor);
4577 Visit(successor);
4578 }
4579
4580
VisitAction(ActionNode * that)4581 void DotPrinter::VisitAction(ActionNode* that) {
4582 stream()->Add(" n%p [", that);
4583 switch (that->action_type_) {
4584 case ActionNode::SET_REGISTER:
4585 stream()->Add("label=\"$%i:=%i\", shape=octagon",
4586 that->data_.u_store_register.reg,
4587 that->data_.u_store_register.value);
4588 break;
4589 case ActionNode::INCREMENT_REGISTER:
4590 stream()->Add("label=\"$%i++\", shape=octagon",
4591 that->data_.u_increment_register.reg);
4592 break;
4593 case ActionNode::STORE_POSITION:
4594 stream()->Add("label=\"$%i:=$pos\", shape=octagon",
4595 that->data_.u_position_register.reg);
4596 break;
4597 case ActionNode::BEGIN_SUBMATCH:
4598 stream()->Add("label=\"$%i:=$pos,begin\", shape=septagon",
4599 that->data_.u_submatch.current_position_register);
4600 break;
4601 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4602 stream()->Add("label=\"escape\", shape=septagon");
4603 break;
4604 case ActionNode::EMPTY_MATCH_CHECK:
4605 stream()->Add("label=\"$%i=$pos?,$%i<%i?\", shape=septagon",
4606 that->data_.u_empty_match_check.start_register,
4607 that->data_.u_empty_match_check.repetition_register,
4608 that->data_.u_empty_match_check.repetition_limit);
4609 break;
4610 case ActionNode::CLEAR_CAPTURES: {
4611 stream()->Add("label=\"clear $%i to $%i\", shape=septagon",
4612 that->data_.u_clear_captures.range_from,
4613 that->data_.u_clear_captures.range_to);
4614 break;
4615 }
4616 }
4617 stream()->Add("];\n");
4618 PrintAttributes(that);
4619 RegExpNode* successor = that->on_success();
4620 stream()->Add(" n%p -> n%p;\n", that, successor);
4621 Visit(successor);
4622 }
4623
4624
4625 class DispatchTableDumper {
4626 public:
DispatchTableDumper(StringStream * stream)4627 explicit DispatchTableDumper(StringStream* stream) : stream_(stream) { }
4628 void Call(uc16 key, DispatchTable::Entry entry);
stream()4629 StringStream* stream() { return stream_; }
4630 private:
4631 StringStream* stream_;
4632 };
4633
4634
Call(uc16 key,DispatchTable::Entry entry)4635 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4636 stream()->Add("[%k-%k]: {", key, entry.to());
4637 OutSet* set = entry.out_set();
4638 bool first = true;
4639 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4640 if (set->Get(i)) {
4641 if (first) {
4642 first = false;
4643 } else {
4644 stream()->Add(", ");
4645 }
4646 stream()->Add("%i", i);
4647 }
4648 }
4649 stream()->Add("}\n");
4650 }
4651
4652
Dump()4653 void DispatchTable::Dump() {
4654 HeapStringAllocator alloc;
4655 StringStream stream(&alloc);
4656 DispatchTableDumper dumper(&stream);
4657 tree()->ForEach(&dumper);
4658 OS::PrintError("%s", stream.ToCString().get());
4659 }
4660
4661
DotPrint(const char * label,RegExpNode * node,bool ignore_case)4662 void RegExpEngine::DotPrint(const char* label,
4663 RegExpNode* node,
4664 bool ignore_case) {
4665 DotPrinter printer(ignore_case);
4666 printer.PrintNode(label, node);
4667 }
4668
4669
4670 #endif // DEBUG
4671
4672
4673 // -------------------------------------------------------------------
4674 // Tree to graph conversion
4675
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4676 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4677 RegExpNode* on_success) {
4678 ZoneList<TextElement>* elms =
4679 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4680 elms->Add(TextElement::Atom(this), compiler->zone());
4681 return new(compiler->zone()) TextNode(elms, on_success);
4682 }
4683
4684
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4685 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4686 RegExpNode* on_success) {
4687 return new(compiler->zone()) TextNode(elements(), on_success);
4688 }
4689
4690
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4691 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4692 const int* special_class,
4693 int length) {
4694 length--; // Remove final 0x10000.
4695 ASSERT(special_class[length] == 0x10000);
4696 ASSERT(ranges->length() != 0);
4697 ASSERT(length != 0);
4698 ASSERT(special_class[0] != 0);
4699 if (ranges->length() != (length >> 1) + 1) {
4700 return false;
4701 }
4702 CharacterRange range = ranges->at(0);
4703 if (range.from() != 0) {
4704 return false;
4705 }
4706 for (int i = 0; i < length; i += 2) {
4707 if (special_class[i] != (range.to() + 1)) {
4708 return false;
4709 }
4710 range = ranges->at((i >> 1) + 1);
4711 if (special_class[i+1] != range.from()) {
4712 return false;
4713 }
4714 }
4715 if (range.to() != 0xffff) {
4716 return false;
4717 }
4718 return true;
4719 }
4720
4721
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4722 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4723 const int* special_class,
4724 int length) {
4725 length--; // Remove final 0x10000.
4726 ASSERT(special_class[length] == 0x10000);
4727 if (ranges->length() * 2 != length) {
4728 return false;
4729 }
4730 for (int i = 0; i < length; i += 2) {
4731 CharacterRange range = ranges->at(i >> 1);
4732 if (range.from() != special_class[i] ||
4733 range.to() != special_class[i + 1] - 1) {
4734 return false;
4735 }
4736 }
4737 return true;
4738 }
4739
4740
is_standard(Zone * zone)4741 bool RegExpCharacterClass::is_standard(Zone* zone) {
4742 // TODO(lrn): Remove need for this function, by not throwing away information
4743 // along the way.
4744 if (is_negated_) {
4745 return false;
4746 }
4747 if (set_.is_standard()) {
4748 return true;
4749 }
4750 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4751 set_.set_standard_set_type('s');
4752 return true;
4753 }
4754 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4755 set_.set_standard_set_type('S');
4756 return true;
4757 }
4758 if (CompareInverseRanges(set_.ranges(zone),
4759 kLineTerminatorRanges,
4760 kLineTerminatorRangeCount)) {
4761 set_.set_standard_set_type('.');
4762 return true;
4763 }
4764 if (CompareRanges(set_.ranges(zone),
4765 kLineTerminatorRanges,
4766 kLineTerminatorRangeCount)) {
4767 set_.set_standard_set_type('n');
4768 return true;
4769 }
4770 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4771 set_.set_standard_set_type('w');
4772 return true;
4773 }
4774 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4775 set_.set_standard_set_type('W');
4776 return true;
4777 }
4778 return false;
4779 }
4780
4781
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4782 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4783 RegExpNode* on_success) {
4784 return new(compiler->zone()) TextNode(this, on_success);
4785 }
4786
4787
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4788 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4789 RegExpNode* on_success) {
4790 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4791 int length = alternatives->length();
4792 ChoiceNode* result =
4793 new(compiler->zone()) ChoiceNode(length, compiler->zone());
4794 for (int i = 0; i < length; i++) {
4795 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
4796 on_success));
4797 result->AddAlternative(alternative);
4798 }
4799 return result;
4800 }
4801
4802
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4803 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
4804 RegExpNode* on_success) {
4805 return ToNode(min(),
4806 max(),
4807 is_greedy(),
4808 body(),
4809 compiler,
4810 on_success);
4811 }
4812
4813
4814 // Scoped object to keep track of how much we unroll quantifier loops in the
4815 // regexp graph generator.
4816 class RegExpExpansionLimiter {
4817 public:
4818 static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)4819 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
4820 : compiler_(compiler),
4821 saved_expansion_factor_(compiler->current_expansion_factor()),
4822 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4823 ASSERT(factor > 0);
4824 if (ok_to_expand_) {
4825 if (factor > kMaxExpansionFactor) {
4826 // Avoid integer overflow of the current expansion factor.
4827 ok_to_expand_ = false;
4828 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
4829 } else {
4830 int new_factor = saved_expansion_factor_ * factor;
4831 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4832 compiler->set_current_expansion_factor(new_factor);
4833 }
4834 }
4835 }
4836
~RegExpExpansionLimiter()4837 ~RegExpExpansionLimiter() {
4838 compiler_->set_current_expansion_factor(saved_expansion_factor_);
4839 }
4840
ok_to_expand()4841 bool ok_to_expand() { return ok_to_expand_; }
4842
4843 private:
4844 RegExpCompiler* compiler_;
4845 int saved_expansion_factor_;
4846 bool ok_to_expand_;
4847
4848 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4849 };
4850
4851
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)4852 RegExpNode* RegExpQuantifier::ToNode(int min,
4853 int max,
4854 bool is_greedy,
4855 RegExpTree* body,
4856 RegExpCompiler* compiler,
4857 RegExpNode* on_success,
4858 bool not_at_start) {
4859 // x{f, t} becomes this:
4860 //
4861 // (r++)<-.
4862 // | `
4863 // | (x)
4864 // v ^
4865 // (r=0)-->(?)---/ [if r < t]
4866 // |
4867 // [if r >= f] \----> ...
4868 //
4869
4870 // 15.10.2.5 RepeatMatcher algorithm.
4871 // The parser has already eliminated the case where max is 0. In the case
4872 // where max_match is zero the parser has removed the quantifier if min was
4873 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4874
4875 // If we know that we cannot match zero length then things are a little
4876 // simpler since we don't need to make the special zero length match check
4877 // from step 2.1. If the min and max are small we can unroll a little in
4878 // this case.
4879 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
4880 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
4881 if (max == 0) return on_success; // This can happen due to recursion.
4882 bool body_can_be_empty = (body->min_match() == 0);
4883 int body_start_reg = RegExpCompiler::kNoRegister;
4884 Interval capture_registers = body->CaptureRegisters();
4885 bool needs_capture_clearing = !capture_registers.is_empty();
4886 Zone* zone = compiler->zone();
4887
4888 if (body_can_be_empty) {
4889 body_start_reg = compiler->AllocateRegister();
4890 } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
4891 // Only unroll if there are no captures and the body can't be
4892 // empty.
4893 {
4894 RegExpExpansionLimiter limiter(
4895 compiler, min + ((max != min) ? 1 : 0));
4896 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4897 int new_max = (max == kInfinity) ? max : max - min;
4898 // Recurse once to get the loop or optional matches after the fixed
4899 // ones.
4900 RegExpNode* answer = ToNode(
4901 0, new_max, is_greedy, body, compiler, on_success, true);
4902 // Unroll the forced matches from 0 to min. This can cause chains of
4903 // TextNodes (which the parser does not generate). These should be
4904 // combined if it turns out they hinder good code generation.
4905 for (int i = 0; i < min; i++) {
4906 answer = body->ToNode(compiler, answer);
4907 }
4908 return answer;
4909 }
4910 }
4911 if (max <= kMaxUnrolledMaxMatches && min == 0) {
4912 ASSERT(max > 0); // Due to the 'if' above.
4913 RegExpExpansionLimiter limiter(compiler, max);
4914 if (limiter.ok_to_expand()) {
4915 // Unroll the optional matches up to max.
4916 RegExpNode* answer = on_success;
4917 for (int i = 0; i < max; i++) {
4918 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
4919 if (is_greedy) {
4920 alternation->AddAlternative(
4921 GuardedAlternative(body->ToNode(compiler, answer)));
4922 alternation->AddAlternative(GuardedAlternative(on_success));
4923 } else {
4924 alternation->AddAlternative(GuardedAlternative(on_success));
4925 alternation->AddAlternative(
4926 GuardedAlternative(body->ToNode(compiler, answer)));
4927 }
4928 answer = alternation;
4929 if (not_at_start) alternation->set_not_at_start();
4930 }
4931 return answer;
4932 }
4933 }
4934 }
4935 bool has_min = min > 0;
4936 bool has_max = max < RegExpTree::kInfinity;
4937 bool needs_counter = has_min || has_max;
4938 int reg_ctr = needs_counter
4939 ? compiler->AllocateRegister()
4940 : RegExpCompiler::kNoRegister;
4941 LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0,
4942 zone);
4943 if (not_at_start) center->set_not_at_start();
4944 RegExpNode* loop_return = needs_counter
4945 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
4946 : static_cast<RegExpNode*>(center);
4947 if (body_can_be_empty) {
4948 // If the body can be empty we need to check if it was and then
4949 // backtrack.
4950 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
4951 reg_ctr,
4952 min,
4953 loop_return);
4954 }
4955 RegExpNode* body_node = body->ToNode(compiler, loop_return);
4956 if (body_can_be_empty) {
4957 // If the body can be empty we need to store the start position
4958 // so we can bail out if it was empty.
4959 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
4960 }
4961 if (needs_capture_clearing) {
4962 // Before entering the body of this loop we need to clear captures.
4963 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
4964 }
4965 GuardedAlternative body_alt(body_node);
4966 if (has_max) {
4967 Guard* body_guard =
4968 new(zone) Guard(reg_ctr, Guard::LT, max);
4969 body_alt.AddGuard(body_guard, zone);
4970 }
4971 GuardedAlternative rest_alt(on_success);
4972 if (has_min) {
4973 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
4974 rest_alt.AddGuard(rest_guard, zone);
4975 }
4976 if (is_greedy) {
4977 center->AddLoopAlternative(body_alt);
4978 center->AddContinueAlternative(rest_alt);
4979 } else {
4980 center->AddContinueAlternative(rest_alt);
4981 center->AddLoopAlternative(body_alt);
4982 }
4983 if (needs_counter) {
4984 return ActionNode::SetRegister(reg_ctr, 0, center);
4985 } else {
4986 return center;
4987 }
4988 }
4989
4990
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4991 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
4992 RegExpNode* on_success) {
4993 NodeInfo info;
4994 Zone* zone = compiler->zone();
4995
4996 switch (assertion_type()) {
4997 case START_OF_LINE:
4998 return AssertionNode::AfterNewline(on_success);
4999 case START_OF_INPUT:
5000 return AssertionNode::AtStart(on_success);
5001 case BOUNDARY:
5002 return AssertionNode::AtBoundary(on_success);
5003 case NON_BOUNDARY:
5004 return AssertionNode::AtNonBoundary(on_success);
5005 case END_OF_INPUT:
5006 return AssertionNode::AtEnd(on_success);
5007 case END_OF_LINE: {
5008 // Compile $ in multiline regexps as an alternation with a positive
5009 // lookahead in one side and an end-of-input on the other side.
5010 // We need two registers for the lookahead.
5011 int stack_pointer_register = compiler->AllocateRegister();
5012 int position_register = compiler->AllocateRegister();
5013 // The ChoiceNode to distinguish between a newline and end-of-input.
5014 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5015 // Create a newline atom.
5016 ZoneList<CharacterRange>* newline_ranges =
5017 new(zone) ZoneList<CharacterRange>(3, zone);
5018 CharacterRange::AddClassEscape('n', newline_ranges, zone);
5019 RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n');
5020 TextNode* newline_matcher = new(zone) TextNode(
5021 newline_atom,
5022 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5023 position_register,
5024 0, // No captures inside.
5025 -1, // Ignored if no captures.
5026 on_success));
5027 // Create an end-of-input matcher.
5028 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5029 stack_pointer_register,
5030 position_register,
5031 newline_matcher);
5032 // Add the two alternatives to the ChoiceNode.
5033 GuardedAlternative eol_alternative(end_of_line);
5034 result->AddAlternative(eol_alternative);
5035 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5036 result->AddAlternative(end_alternative);
5037 return result;
5038 }
5039 default:
5040 UNREACHABLE();
5041 }
5042 return on_success;
5043 }
5044
5045
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5046 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5047 RegExpNode* on_success) {
5048 return new(compiler->zone())
5049 BackReferenceNode(RegExpCapture::StartRegister(index()),
5050 RegExpCapture::EndRegister(index()),
5051 on_success);
5052 }
5053
5054
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5055 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5056 RegExpNode* on_success) {
5057 return on_success;
5058 }
5059
5060
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5061 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
5062 RegExpNode* on_success) {
5063 int stack_pointer_register = compiler->AllocateRegister();
5064 int position_register = compiler->AllocateRegister();
5065
5066 const int registers_per_capture = 2;
5067 const int register_of_first_capture = 2;
5068 int register_count = capture_count_ * registers_per_capture;
5069 int register_start =
5070 register_of_first_capture + capture_from_ * registers_per_capture;
5071
5072 RegExpNode* success;
5073 if (is_positive()) {
5074 RegExpNode* node = ActionNode::BeginSubmatch(
5075 stack_pointer_register,
5076 position_register,
5077 body()->ToNode(
5078 compiler,
5079 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5080 position_register,
5081 register_count,
5082 register_start,
5083 on_success)));
5084 return node;
5085 } else {
5086 // We use a ChoiceNode for a negative lookahead because it has most of
5087 // the characteristics we need. It has the body of the lookahead as its
5088 // first alternative and the expression after the lookahead of the second
5089 // alternative. If the first alternative succeeds then the
5090 // NegativeSubmatchSuccess will unwind the stack including everything the
5091 // choice node set up and backtrack. If the first alternative fails then
5092 // the second alternative is tried, which is exactly the desired result
5093 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
5094 // ChoiceNode that knows to ignore the first exit when calculating quick
5095 // checks.
5096 Zone* zone = compiler->zone();
5097
5098 GuardedAlternative body_alt(
5099 body()->ToNode(
5100 compiler,
5101 success = new(zone) NegativeSubmatchSuccess(stack_pointer_register,
5102 position_register,
5103 register_count,
5104 register_start,
5105 zone)));
5106 ChoiceNode* choice_node =
5107 new(zone) NegativeLookaheadChoiceNode(body_alt,
5108 GuardedAlternative(on_success),
5109 zone);
5110 return ActionNode::BeginSubmatch(stack_pointer_register,
5111 position_register,
5112 choice_node);
5113 }
5114 }
5115
5116
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5117 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5118 RegExpNode* on_success) {
5119 return ToNode(body(), index(), compiler, on_success);
5120 }
5121
5122
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)5123 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5124 int index,
5125 RegExpCompiler* compiler,
5126 RegExpNode* on_success) {
5127 int start_reg = RegExpCapture::StartRegister(index);
5128 int end_reg = RegExpCapture::EndRegister(index);
5129 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5130 RegExpNode* body_node = body->ToNode(compiler, store_end);
5131 return ActionNode::StorePosition(start_reg, true, body_node);
5132 }
5133
5134
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5135 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5136 RegExpNode* on_success) {
5137 ZoneList<RegExpTree*>* children = nodes();
5138 RegExpNode* current = on_success;
5139 for (int i = children->length() - 1; i >= 0; i--) {
5140 current = children->at(i)->ToNode(compiler, current);
5141 }
5142 return current;
5143 }
5144
5145
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5146 static void AddClass(const int* elmv,
5147 int elmc,
5148 ZoneList<CharacterRange>* ranges,
5149 Zone* zone) {
5150 elmc--;
5151 ASSERT(elmv[elmc] == 0x10000);
5152 for (int i = 0; i < elmc; i += 2) {
5153 ASSERT(elmv[i] < elmv[i + 1]);
5154 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
5155 }
5156 }
5157
5158
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5159 static void AddClassNegated(const int *elmv,
5160 int elmc,
5161 ZoneList<CharacterRange>* ranges,
5162 Zone* zone) {
5163 elmc--;
5164 ASSERT(elmv[elmc] == 0x10000);
5165 ASSERT(elmv[0] != 0x0000);
5166 ASSERT(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
5167 uc16 last = 0x0000;
5168 for (int i = 0; i < elmc; i += 2) {
5169 ASSERT(last <= elmv[i] - 1);
5170 ASSERT(elmv[i] < elmv[i + 1]);
5171 ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
5172 last = elmv[i + 1];
5173 }
5174 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
5175 }
5176
5177
AddClassEscape(uc16 type,ZoneList<CharacterRange> * ranges,Zone * zone)5178 void CharacterRange::AddClassEscape(uc16 type,
5179 ZoneList<CharacterRange>* ranges,
5180 Zone* zone) {
5181 switch (type) {
5182 case 's':
5183 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5184 break;
5185 case 'S':
5186 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5187 break;
5188 case 'w':
5189 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5190 break;
5191 case 'W':
5192 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5193 break;
5194 case 'd':
5195 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5196 break;
5197 case 'D':
5198 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5199 break;
5200 case '.':
5201 AddClassNegated(kLineTerminatorRanges,
5202 kLineTerminatorRangeCount,
5203 ranges,
5204 zone);
5205 break;
5206 // This is not a character range as defined by the spec but a
5207 // convenient shorthand for a character class that matches any
5208 // character.
5209 case '*':
5210 ranges->Add(CharacterRange::Everything(), zone);
5211 break;
5212 // This is the set of characters matched by the $ and ^ symbols
5213 // in multiline mode.
5214 case 'n':
5215 AddClass(kLineTerminatorRanges,
5216 kLineTerminatorRangeCount,
5217 ranges,
5218 zone);
5219 break;
5220 default:
5221 UNREACHABLE();
5222 }
5223 }
5224
5225
GetWordBounds()5226 Vector<const int> CharacterRange::GetWordBounds() {
5227 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5228 }
5229
5230
5231 class CharacterRangeSplitter {
5232 public:
CharacterRangeSplitter(ZoneList<CharacterRange> ** included,ZoneList<CharacterRange> ** excluded,Zone * zone)5233 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
5234 ZoneList<CharacterRange>** excluded,
5235 Zone* zone)
5236 : included_(included),
5237 excluded_(excluded),
5238 zone_(zone) { }
5239 void Call(uc16 from, DispatchTable::Entry entry);
5240
5241 static const int kInBase = 0;
5242 static const int kInOverlay = 1;
5243
5244 private:
5245 ZoneList<CharacterRange>** included_;
5246 ZoneList<CharacterRange>** excluded_;
5247 Zone* zone_;
5248 };
5249
5250
Call(uc16 from,DispatchTable::Entry entry)5251 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
5252 if (!entry.out_set()->Get(kInBase)) return;
5253 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
5254 ? included_
5255 : excluded_;
5256 if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
5257 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
5258 }
5259
5260
Split(ZoneList<CharacterRange> * base,Vector<const int> overlay,ZoneList<CharacterRange> ** included,ZoneList<CharacterRange> ** excluded,Zone * zone)5261 void CharacterRange::Split(ZoneList<CharacterRange>* base,
5262 Vector<const int> overlay,
5263 ZoneList<CharacterRange>** included,
5264 ZoneList<CharacterRange>** excluded,
5265 Zone* zone) {
5266 ASSERT_EQ(NULL, *included);
5267 ASSERT_EQ(NULL, *excluded);
5268 DispatchTable table(zone);
5269 for (int i = 0; i < base->length(); i++)
5270 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
5271 for (int i = 0; i < overlay.length(); i += 2) {
5272 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
5273 CharacterRangeSplitter::kInOverlay, zone);
5274 }
5275 CharacterRangeSplitter callback(included, excluded, zone);
5276 table.ForEach(&callback);
5277 }
5278
5279
AddCaseEquivalents(ZoneList<CharacterRange> * ranges,bool is_ascii,Zone * zone)5280 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
5281 bool is_ascii,
5282 Zone* zone) {
5283 Isolate* isolate = zone->isolate();
5284 uc16 bottom = from();
5285 uc16 top = to();
5286 if (is_ascii && !RangeContainsLatin1Equivalents(*this)) {
5287 if (bottom > String::kMaxOneByteCharCode) return;
5288 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5289 }
5290 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5291 if (top == bottom) {
5292 // If this is a singleton we just expand the one character.
5293 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5294 for (int i = 0; i < length; i++) {
5295 uc32 chr = chars[i];
5296 if (chr != bottom) {
5297 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5298 }
5299 }
5300 } else {
5301 // If this is a range we expand the characters block by block,
5302 // expanding contiguous subranges (blocks) one at a time.
5303 // The approach is as follows. For a given start character we
5304 // look up the remainder of the block that contains it (represented
5305 // by the end point), for instance we find 'z' if the character
5306 // is 'c'. A block is characterized by the property
5307 // that all characters uncanonicalize in the same way, except that
5308 // each entry in the result is incremented by the distance from the first
5309 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
5310 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
5311 // Once we've found the end point we look up its uncanonicalization
5312 // and produce a range for each element. For instance for [c-f]
5313 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
5314 // add a range if it is not already contained in the input, so [c-f]
5315 // will be skipped but [C-F] will be added. If this range is not
5316 // completely contained in a block we do this for all the blocks
5317 // covered by the range (handling characters that is not in a block
5318 // as a "singleton block").
5319 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5320 int pos = bottom;
5321 while (pos <= top) {
5322 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
5323 uc16 block_end;
5324 if (length == 0) {
5325 block_end = pos;
5326 } else {
5327 ASSERT_EQ(1, length);
5328 block_end = range[0];
5329 }
5330 int end = (block_end > top) ? top : block_end;
5331 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
5332 for (int i = 0; i < length; i++) {
5333 uc32 c = range[i];
5334 uc16 range_from = c - (block_end - pos);
5335 uc16 range_to = c - (block_end - end);
5336 if (!(bottom <= range_from && range_to <= top)) {
5337 ranges->Add(CharacterRange(range_from, range_to), zone);
5338 }
5339 }
5340 pos = end + 1;
5341 }
5342 }
5343 }
5344
5345
IsCanonical(ZoneList<CharacterRange> * ranges)5346 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5347 ASSERT_NOT_NULL(ranges);
5348 int n = ranges->length();
5349 if (n <= 1) return true;
5350 int max = ranges->at(0).to();
5351 for (int i = 1; i < n; i++) {
5352 CharacterRange next_range = ranges->at(i);
5353 if (next_range.from() <= max + 1) return false;
5354 max = next_range.to();
5355 }
5356 return true;
5357 }
5358
5359
ranges(Zone * zone)5360 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5361 if (ranges_ == NULL) {
5362 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5363 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5364 }
5365 return ranges_;
5366 }
5367
5368
5369 // Move a number of elements in a zonelist to another position
5370 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)5371 static void MoveRanges(ZoneList<CharacterRange>* list,
5372 int from,
5373 int to,
5374 int count) {
5375 // Ranges are potentially overlapping.
5376 if (from < to) {
5377 for (int i = count - 1; i >= 0; i--) {
5378 list->at(to + i) = list->at(from + i);
5379 }
5380 } else {
5381 for (int i = 0; i < count; i++) {
5382 list->at(to + i) = list->at(from + i);
5383 }
5384 }
5385 }
5386
5387
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)5388 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5389 int count,
5390 CharacterRange insert) {
5391 // Inserts a range into list[0..count[, which must be sorted
5392 // by from value and non-overlapping and non-adjacent, using at most
5393 // list[0..count] for the result. Returns the number of resulting
5394 // canonicalized ranges. Inserting a range may collapse existing ranges into
5395 // fewer ranges, so the return value can be anything in the range 1..count+1.
5396 uc16 from = insert.from();
5397 uc16 to = insert.to();
5398 int start_pos = 0;
5399 int end_pos = count;
5400 for (int i = count - 1; i >= 0; i--) {
5401 CharacterRange current = list->at(i);
5402 if (current.from() > to + 1) {
5403 end_pos = i;
5404 } else if (current.to() + 1 < from) {
5405 start_pos = i + 1;
5406 break;
5407 }
5408 }
5409
5410 // Inserted range overlaps, or is adjacent to, ranges at positions
5411 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
5412 // not affected by the insertion.
5413 // If start_pos == end_pos, the range must be inserted before start_pos.
5414 // if start_pos < end_pos, the entire range from start_pos to end_pos
5415 // must be merged with the insert range.
5416
5417 if (start_pos == end_pos) {
5418 // Insert between existing ranges at position start_pos.
5419 if (start_pos < count) {
5420 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
5421 }
5422 list->at(start_pos) = insert;
5423 return count + 1;
5424 }
5425 if (start_pos + 1 == end_pos) {
5426 // Replace single existing range at position start_pos.
5427 CharacterRange to_replace = list->at(start_pos);
5428 int new_from = Min(to_replace.from(), from);
5429 int new_to = Max(to_replace.to(), to);
5430 list->at(start_pos) = CharacterRange(new_from, new_to);
5431 return count;
5432 }
5433 // Replace a number of existing ranges from start_pos to end_pos - 1.
5434 // Move the remaining ranges down.
5435
5436 int new_from = Min(list->at(start_pos).from(), from);
5437 int new_to = Max(list->at(end_pos - 1).to(), to);
5438 if (end_pos < count) {
5439 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
5440 }
5441 list->at(start_pos) = CharacterRange(new_from, new_to);
5442 return count - (end_pos - start_pos) + 1;
5443 }
5444
5445
Canonicalize()5446 void CharacterSet::Canonicalize() {
5447 // Special/default classes are always considered canonical. The result
5448 // of calling ranges() will be sorted.
5449 if (ranges_ == NULL) return;
5450 CharacterRange::Canonicalize(ranges_);
5451 }
5452
5453
Canonicalize(ZoneList<CharacterRange> * character_ranges)5454 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
5455 if (character_ranges->length() <= 1) return;
5456 // Check whether ranges are already canonical (increasing, non-overlapping,
5457 // non-adjacent).
5458 int n = character_ranges->length();
5459 int max = character_ranges->at(0).to();
5460 int i = 1;
5461 while (i < n) {
5462 CharacterRange current = character_ranges->at(i);
5463 if (current.from() <= max + 1) {
5464 break;
5465 }
5466 max = current.to();
5467 i++;
5468 }
5469 // Canonical until the i'th range. If that's all of them, we are done.
5470 if (i == n) return;
5471
5472 // The ranges at index i and forward are not canonicalized. Make them so by
5473 // doing the equivalent of insertion sort (inserting each into the previous
5474 // list, in order).
5475 // Notice that inserting a range can reduce the number of ranges in the
5476 // result due to combining of adjacent and overlapping ranges.
5477 int read = i; // Range to insert.
5478 int num_canonical = i; // Length of canonicalized part of list.
5479 do {
5480 num_canonical = InsertRangeInCanonicalList(character_ranges,
5481 num_canonical,
5482 character_ranges->at(read));
5483 read++;
5484 } while (read < n);
5485 character_ranges->Rewind(num_canonical);
5486
5487 ASSERT(CharacterRange::IsCanonical(character_ranges));
5488 }
5489
5490
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)5491 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
5492 ZoneList<CharacterRange>* negated_ranges,
5493 Zone* zone) {
5494 ASSERT(CharacterRange::IsCanonical(ranges));
5495 ASSERT_EQ(0, negated_ranges->length());
5496 int range_count = ranges->length();
5497 uc16 from = 0;
5498 int i = 0;
5499 if (range_count > 0 && ranges->at(0).from() == 0) {
5500 from = ranges->at(0).to();
5501 i = 1;
5502 }
5503 while (i < range_count) {
5504 CharacterRange range = ranges->at(i);
5505 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
5506 from = range.to();
5507 i++;
5508 }
5509 if (from < String::kMaxUtf16CodeUnit) {
5510 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),
5511 zone);
5512 }
5513 }
5514
5515
5516 // -------------------------------------------------------------------
5517 // Splay tree
5518
5519
Extend(unsigned value,Zone * zone)5520 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
5521 if (Get(value))
5522 return this;
5523 if (successors(zone) != NULL) {
5524 for (int i = 0; i < successors(zone)->length(); i++) {
5525 OutSet* successor = successors(zone)->at(i);
5526 if (successor->Get(value))
5527 return successor;
5528 }
5529 } else {
5530 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
5531 }
5532 OutSet* result = new(zone) OutSet(first_, remaining_);
5533 result->Set(value, zone);
5534 successors(zone)->Add(result, zone);
5535 return result;
5536 }
5537
5538
Set(unsigned value,Zone * zone)5539 void OutSet::Set(unsigned value, Zone *zone) {
5540 if (value < kFirstLimit) {
5541 first_ |= (1 << value);
5542 } else {
5543 if (remaining_ == NULL)
5544 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
5545 if (remaining_->is_empty() || !remaining_->Contains(value))
5546 remaining_->Add(value, zone);
5547 }
5548 }
5549
5550
Get(unsigned value) const5551 bool OutSet::Get(unsigned value) const {
5552 if (value < kFirstLimit) {
5553 return (first_ & (1 << value)) != 0;
5554 } else if (remaining_ == NULL) {
5555 return false;
5556 } else {
5557 return remaining_->Contains(value);
5558 }
5559 }
5560
5561
5562 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
5563
5564
AddRange(CharacterRange full_range,int value,Zone * zone)5565 void DispatchTable::AddRange(CharacterRange full_range, int value,
5566 Zone* zone) {
5567 CharacterRange current = full_range;
5568 if (tree()->is_empty()) {
5569 // If this is the first range we just insert into the table.
5570 ZoneSplayTree<Config>::Locator loc;
5571 ASSERT_RESULT(tree()->Insert(current.from(), &loc));
5572 loc.set_value(Entry(current.from(), current.to(),
5573 empty()->Extend(value, zone)));
5574 return;
5575 }
5576 // First see if there is a range to the left of this one that
5577 // overlaps.
5578 ZoneSplayTree<Config>::Locator loc;
5579 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
5580 Entry* entry = &loc.value();
5581 // If we've found a range that overlaps with this one, and it
5582 // starts strictly to the left of this one, we have to fix it
5583 // because the following code only handles ranges that start on
5584 // or after the start point of the range we're adding.
5585 if (entry->from() < current.from() && entry->to() >= current.from()) {
5586 // Snap the overlapping range in half around the start point of
5587 // the range we're adding.
5588 CharacterRange left(entry->from(), current.from() - 1);
5589 CharacterRange right(current.from(), entry->to());
5590 // The left part of the overlapping range doesn't overlap.
5591 // Truncate the whole entry to be just the left part.
5592 entry->set_to(left.to());
5593 // The right part is the one that overlaps. We add this part
5594 // to the map and let the next step deal with merging it with
5595 // the range we're adding.
5596 ZoneSplayTree<Config>::Locator loc;
5597 ASSERT_RESULT(tree()->Insert(right.from(), &loc));
5598 loc.set_value(Entry(right.from(),
5599 right.to(),
5600 entry->out_set()));
5601 }
5602 }
5603 while (current.is_valid()) {
5604 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5605 (loc.value().from() <= current.to()) &&
5606 (loc.value().to() >= current.from())) {
5607 Entry* entry = &loc.value();
5608 // We have overlap. If there is space between the start point of
5609 // the range we're adding and where the overlapping range starts
5610 // then we have to add a range covering just that space.
5611 if (current.from() < entry->from()) {
5612 ZoneSplayTree<Config>::Locator ins;
5613 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
5614 ins.set_value(Entry(current.from(),
5615 entry->from() - 1,
5616 empty()->Extend(value, zone)));
5617 current.set_from(entry->from());
5618 }
5619 ASSERT_EQ(current.from(), entry->from());
5620 // If the overlapping range extends beyond the one we want to add
5621 // we have to snap the right part off and add it separately.
5622 if (entry->to() > current.to()) {
5623 ZoneSplayTree<Config>::Locator ins;
5624 ASSERT_RESULT(tree()->Insert(current.to() + 1, &ins));
5625 ins.set_value(Entry(current.to() + 1,
5626 entry->to(),
5627 entry->out_set()));
5628 entry->set_to(current.to());
5629 }
5630 ASSERT(entry->to() <= current.to());
5631 // The overlapping range is now completely contained by the range
5632 // we're adding so we can just update it and move the start point
5633 // of the range we're adding just past it.
5634 entry->AddValue(value, zone);
5635 // Bail out if the last interval ended at 0xFFFF since otherwise
5636 // adding 1 will wrap around to 0.
5637 if (entry->to() == String::kMaxUtf16CodeUnit)
5638 break;
5639 ASSERT(entry->to() + 1 > current.from());
5640 current.set_from(entry->to() + 1);
5641 } else {
5642 // There is no overlap so we can just add the range
5643 ZoneSplayTree<Config>::Locator ins;
5644 ASSERT_RESULT(tree()->Insert(current.from(), &ins));
5645 ins.set_value(Entry(current.from(),
5646 current.to(),
5647 empty()->Extend(value, zone)));
5648 break;
5649 }
5650 }
5651 }
5652
5653
Get(uc16 value)5654 OutSet* DispatchTable::Get(uc16 value) {
5655 ZoneSplayTree<Config>::Locator loc;
5656 if (!tree()->FindGreatestLessThan(value, &loc))
5657 return empty();
5658 Entry* entry = &loc.value();
5659 if (value <= entry->to())
5660 return entry->out_set();
5661 else
5662 return empty();
5663 }
5664
5665
5666 // -------------------------------------------------------------------
5667 // Analysis
5668
5669
EnsureAnalyzed(RegExpNode * that)5670 void Analysis::EnsureAnalyzed(RegExpNode* that) {
5671 StackLimitCheck check(that->zone()->isolate());
5672 if (check.HasOverflowed()) {
5673 fail("Stack overflow");
5674 return;
5675 }
5676 if (that->info()->been_analyzed || that->info()->being_analyzed)
5677 return;
5678 that->info()->being_analyzed = true;
5679 that->Accept(this);
5680 that->info()->being_analyzed = false;
5681 that->info()->been_analyzed = true;
5682 }
5683
5684
VisitEnd(EndNode * that)5685 void Analysis::VisitEnd(EndNode* that) {
5686 // nothing to do
5687 }
5688
5689
CalculateOffsets()5690 void TextNode::CalculateOffsets() {
5691 int element_count = elements()->length();
5692 // Set up the offsets of the elements relative to the start. This is a fixed
5693 // quantity since a TextNode can only contain fixed-width things.
5694 int cp_offset = 0;
5695 for (int i = 0; i < element_count; i++) {
5696 TextElement& elm = elements()->at(i);
5697 elm.set_cp_offset(cp_offset);
5698 cp_offset += elm.length();
5699 }
5700 }
5701
5702
VisitText(TextNode * that)5703 void Analysis::VisitText(TextNode* that) {
5704 if (ignore_case_) {
5705 that->MakeCaseIndependent(is_ascii_);
5706 }
5707 EnsureAnalyzed(that->on_success());
5708 if (!has_failed()) {
5709 that->CalculateOffsets();
5710 }
5711 }
5712
5713
VisitAction(ActionNode * that)5714 void Analysis::VisitAction(ActionNode* that) {
5715 RegExpNode* target = that->on_success();
5716 EnsureAnalyzed(target);
5717 if (!has_failed()) {
5718 // If the next node is interested in what it follows then this node
5719 // has to be interested too so it can pass the information on.
5720 that->info()->AddFromFollowing(target->info());
5721 }
5722 }
5723
5724
VisitChoice(ChoiceNode * that)5725 void Analysis::VisitChoice(ChoiceNode* that) {
5726 NodeInfo* info = that->info();
5727 for (int i = 0; i < that->alternatives()->length(); i++) {
5728 RegExpNode* node = that->alternatives()->at(i).node();
5729 EnsureAnalyzed(node);
5730 if (has_failed()) return;
5731 // Anything the following nodes need to know has to be known by
5732 // this node also, so it can pass it on.
5733 info->AddFromFollowing(node->info());
5734 }
5735 }
5736
5737
VisitLoopChoice(LoopChoiceNode * that)5738 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
5739 NodeInfo* info = that->info();
5740 for (int i = 0; i < that->alternatives()->length(); i++) {
5741 RegExpNode* node = that->alternatives()->at(i).node();
5742 if (node != that->loop_node()) {
5743 EnsureAnalyzed(node);
5744 if (has_failed()) return;
5745 info->AddFromFollowing(node->info());
5746 }
5747 }
5748 // Check the loop last since it may need the value of this node
5749 // to get a correct result.
5750 EnsureAnalyzed(that->loop_node());
5751 if (!has_failed()) {
5752 info->AddFromFollowing(that->loop_node()->info());
5753 }
5754 }
5755
5756
VisitBackReference(BackReferenceNode * that)5757 void Analysis::VisitBackReference(BackReferenceNode* that) {
5758 EnsureAnalyzed(that->on_success());
5759 }
5760
5761
VisitAssertion(AssertionNode * that)5762 void Analysis::VisitAssertion(AssertionNode* that) {
5763 EnsureAnalyzed(that->on_success());
5764 }
5765
5766
FillInBMInfo(int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)5767 void BackReferenceNode::FillInBMInfo(int offset,
5768 int budget,
5769 BoyerMooreLookahead* bm,
5770 bool not_at_start) {
5771 // Working out the set of characters that a backreference can match is too
5772 // hard, so we just say that any character can match.
5773 bm->SetRest(offset);
5774 SaveBMInfo(bm, not_at_start, offset);
5775 }
5776
5777
5778 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
5779 RegExpMacroAssembler::kTableSize);
5780
5781
FillInBMInfo(int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)5782 void ChoiceNode::FillInBMInfo(int offset,
5783 int budget,
5784 BoyerMooreLookahead* bm,
5785 bool not_at_start) {
5786 ZoneList<GuardedAlternative>* alts = alternatives();
5787 budget = (budget - 1) / alts->length();
5788 for (int i = 0; i < alts->length(); i++) {
5789 GuardedAlternative& alt = alts->at(i);
5790 if (alt.guards() != NULL && alt.guards()->length() != 0) {
5791 bm->SetRest(offset); // Give up trying to fill in info.
5792 SaveBMInfo(bm, not_at_start, offset);
5793 return;
5794 }
5795 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
5796 }
5797 SaveBMInfo(bm, not_at_start, offset);
5798 }
5799
5800
FillInBMInfo(int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)5801 void TextNode::FillInBMInfo(int initial_offset,
5802 int budget,
5803 BoyerMooreLookahead* bm,
5804 bool not_at_start) {
5805 if (initial_offset >= bm->length()) return;
5806 int offset = initial_offset;
5807 int max_char = bm->max_char();
5808 for (int i = 0; i < elements()->length(); i++) {
5809 if (offset >= bm->length()) {
5810 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5811 return;
5812 }
5813 TextElement text = elements()->at(i);
5814 if (text.text_type() == TextElement::ATOM) {
5815 RegExpAtom* atom = text.atom();
5816 for (int j = 0; j < atom->length(); j++, offset++) {
5817 if (offset >= bm->length()) {
5818 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5819 return;
5820 }
5821 uc16 character = atom->data()[j];
5822 if (bm->compiler()->ignore_case()) {
5823 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5824 int length = GetCaseIndependentLetters(
5825 Isolate::Current(),
5826 character,
5827 bm->max_char() == String::kMaxOneByteCharCode,
5828 chars);
5829 for (int j = 0; j < length; j++) {
5830 bm->Set(offset, chars[j]);
5831 }
5832 } else {
5833 if (character <= max_char) bm->Set(offset, character);
5834 }
5835 }
5836 } else {
5837 ASSERT_EQ(TextElement::CHAR_CLASS, text.text_type());
5838 RegExpCharacterClass* char_class = text.char_class();
5839 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
5840 if (char_class->is_negated()) {
5841 bm->SetAll(offset);
5842 } else {
5843 for (int k = 0; k < ranges->length(); k++) {
5844 CharacterRange& range = ranges->at(k);
5845 if (range.from() > max_char) continue;
5846 int to = Min(max_char, static_cast<int>(range.to()));
5847 bm->SetInterval(offset, Interval(range.from(), to));
5848 }
5849 }
5850 offset++;
5851 }
5852 }
5853 if (offset >= bm->length()) {
5854 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5855 return;
5856 }
5857 on_success()->FillInBMInfo(offset,
5858 budget - 1,
5859 bm,
5860 true); // Not at start after a text node.
5861 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5862 }
5863
5864
5865 // -------------------------------------------------------------------
5866 // Dispatch table construction
5867
5868
VisitEnd(EndNode * that)5869 void DispatchTableConstructor::VisitEnd(EndNode* that) {
5870 AddRange(CharacterRange::Everything());
5871 }
5872
5873
BuildTable(ChoiceNode * node)5874 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
5875 node->set_being_calculated(true);
5876 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
5877 for (int i = 0; i < alternatives->length(); i++) {
5878 set_choice_index(i);
5879 alternatives->at(i).node()->Accept(this);
5880 }
5881 node->set_being_calculated(false);
5882 }
5883
5884
5885 class AddDispatchRange {
5886 public:
AddDispatchRange(DispatchTableConstructor * constructor)5887 explicit AddDispatchRange(DispatchTableConstructor* constructor)
5888 : constructor_(constructor) { }
5889 void Call(uc32 from, DispatchTable::Entry entry);
5890 private:
5891 DispatchTableConstructor* constructor_;
5892 };
5893
5894
Call(uc32 from,DispatchTable::Entry entry)5895 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
5896 CharacterRange range(from, entry.to());
5897 constructor_->AddRange(range);
5898 }
5899
5900
VisitChoice(ChoiceNode * node)5901 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
5902 if (node->being_calculated())
5903 return;
5904 DispatchTable* table = node->GetTable(ignore_case_);
5905 AddDispatchRange adder(this);
5906 table->ForEach(&adder);
5907 }
5908
5909
VisitBackReference(BackReferenceNode * that)5910 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
5911 // TODO(160): Find the node that we refer back to and propagate its start
5912 // set back to here. For now we just accept anything.
5913 AddRange(CharacterRange::Everything());
5914 }
5915
5916
VisitAssertion(AssertionNode * that)5917 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
5918 RegExpNode* target = that->on_success();
5919 target->Accept(this);
5920 }
5921
5922
CompareRangeByFrom(const CharacterRange * a,const CharacterRange * b)5923 static int CompareRangeByFrom(const CharacterRange* a,
5924 const CharacterRange* b) {
5925 return Compare<uc16>(a->from(), b->from());
5926 }
5927
5928
AddInverse(ZoneList<CharacterRange> * ranges)5929 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
5930 ranges->Sort(CompareRangeByFrom);
5931 uc16 last = 0;
5932 for (int i = 0; i < ranges->length(); i++) {
5933 CharacterRange range = ranges->at(i);
5934 if (last < range.from())
5935 AddRange(CharacterRange(last, range.from() - 1));
5936 if (range.to() >= last) {
5937 if (range.to() == String::kMaxUtf16CodeUnit) {
5938 return;
5939 } else {
5940 last = range.to() + 1;
5941 }
5942 }
5943 }
5944 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
5945 }
5946
5947
VisitText(TextNode * that)5948 void DispatchTableConstructor::VisitText(TextNode* that) {
5949 TextElement elm = that->elements()->at(0);
5950 switch (elm.text_type()) {
5951 case TextElement::ATOM: {
5952 uc16 c = elm.atom()->data()[0];
5953 AddRange(CharacterRange(c, c));
5954 break;
5955 }
5956 case TextElement::CHAR_CLASS: {
5957 RegExpCharacterClass* tree = elm.char_class();
5958 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
5959 if (tree->is_negated()) {
5960 AddInverse(ranges);
5961 } else {
5962 for (int i = 0; i < ranges->length(); i++)
5963 AddRange(ranges->at(i));
5964 }
5965 break;
5966 }
5967 default: {
5968 UNIMPLEMENTED();
5969 }
5970 }
5971 }
5972
5973
VisitAction(ActionNode * that)5974 void DispatchTableConstructor::VisitAction(ActionNode* that) {
5975 RegExpNode* target = that->on_success();
5976 target->Accept(this);
5977 }
5978
5979
Compile(RegExpCompileData * data,bool ignore_case,bool is_global,bool is_multiline,Handle<String> pattern,Handle<String> sample_subject,bool is_ascii,Zone * zone)5980 RegExpEngine::CompilationResult RegExpEngine::Compile(
5981 RegExpCompileData* data,
5982 bool ignore_case,
5983 bool is_global,
5984 bool is_multiline,
5985 Handle<String> pattern,
5986 Handle<String> sample_subject,
5987 bool is_ascii,
5988 Zone* zone) {
5989 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
5990 return IrregexpRegExpTooBig(zone->isolate());
5991 }
5992 RegExpCompiler compiler(data->capture_count, ignore_case, is_ascii, zone);
5993
5994 // Sample some characters from the middle of the string.
5995 static const int kSampleSize = 128;
5996
5997 sample_subject = String::Flatten(sample_subject);
5998 int chars_sampled = 0;
5999 int half_way = (sample_subject->length() - kSampleSize) / 2;
6000 for (int i = Max(0, half_way);
6001 i < sample_subject->length() && chars_sampled < kSampleSize;
6002 i++, chars_sampled++) {
6003 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6004 }
6005
6006 // Wrap the body of the regexp in capture #0.
6007 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6008 0,
6009 &compiler,
6010 compiler.accept());
6011 RegExpNode* node = captured_body;
6012 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6013 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6014 int max_length = data->tree->max_match();
6015 if (!is_start_anchored) {
6016 // Add a .*? at the beginning, outside the body capture, unless
6017 // this expression is anchored at the beginning.
6018 RegExpNode* loop_node =
6019 RegExpQuantifier::ToNode(0,
6020 RegExpTree::kInfinity,
6021 false,
6022 new(zone) RegExpCharacterClass('*'),
6023 &compiler,
6024 captured_body,
6025 data->contains_anchor);
6026
6027 if (data->contains_anchor) {
6028 // Unroll loop once, to take care of the case that might start
6029 // at the start of input.
6030 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6031 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6032 first_step_node->AddAlternative(GuardedAlternative(
6033 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));
6034 node = first_step_node;
6035 } else {
6036 node = loop_node;
6037 }
6038 }
6039 if (is_ascii) {
6040 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
6041 // Do it again to propagate the new nodes to places where they were not
6042 // put because they had not been calculated yet.
6043 if (node != NULL) {
6044 node = node->FilterASCII(RegExpCompiler::kMaxRecursion, ignore_case);
6045 }
6046 }
6047
6048 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6049 data->node = node;
6050 Analysis analysis(ignore_case, is_ascii);
6051 analysis.EnsureAnalyzed(node);
6052 if (analysis.has_failed()) {
6053 const char* error_message = analysis.error_message();
6054 return CompilationResult(zone->isolate(), error_message);
6055 }
6056
6057 // Create the correct assembler for the architecture.
6058 #ifndef V8_INTERPRETED_REGEXP
6059 // Native regexp implementation.
6060
6061 NativeRegExpMacroAssembler::Mode mode =
6062 is_ascii ? NativeRegExpMacroAssembler::ASCII
6063 : NativeRegExpMacroAssembler::UC16;
6064
6065 #if V8_TARGET_ARCH_IA32
6066 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2,
6067 zone);
6068 #elif V8_TARGET_ARCH_X64
6069 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2,
6070 zone);
6071 #elif V8_TARGET_ARCH_ARM
6072 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2,
6073 zone);
6074 #elif V8_TARGET_ARCH_ARM64
6075 RegExpMacroAssemblerARM64 macro_assembler(mode, (data->capture_count + 1) * 2,
6076 zone);
6077 #elif V8_TARGET_ARCH_MIPS
6078 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2,
6079 zone);
6080 #elif V8_TARGET_ARCH_X87
6081 RegExpMacroAssemblerX87 macro_assembler(mode, (data->capture_count + 1) * 2,
6082 zone);
6083 #else
6084 #error "Unsupported architecture"
6085 #endif
6086
6087 #else // V8_INTERPRETED_REGEXP
6088 // Interpreted regexp implementation.
6089 EmbeddedVector<byte, 1024> codes;
6090 RegExpMacroAssemblerIrregexp macro_assembler(codes, zone);
6091 #endif // V8_INTERPRETED_REGEXP
6092
6093 // Inserted here, instead of in Assembler, because it depends on information
6094 // in the AST that isn't replicated in the Node structure.
6095 static const int kMaxBacksearchLimit = 1024;
6096 if (is_end_anchored &&
6097 !is_start_anchored &&
6098 max_length < kMaxBacksearchLimit) {
6099 macro_assembler.SetCurrentPositionFromEnd(max_length);
6100 }
6101
6102 if (is_global) {
6103 macro_assembler.set_global_mode(
6104 (data->tree->min_match() > 0)
6105 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
6106 : RegExpMacroAssembler::GLOBAL);
6107 }
6108
6109 return compiler.Assemble(¯o_assembler,
6110 node,
6111 data->capture_count,
6112 pattern);
6113 }
6114
6115
6116 }} // namespace v8::internal
6117